diff --git a/arrow-arith/Cargo.toml b/arrow-arith/Cargo.toml index f2a4604c116e..6a02f543ecda 100644 --- a/arrow-arith/Cargo.toml +++ b/arrow-arith/Cargo.toml @@ -42,3 +42,6 @@ arrow-data = { workspace = true } arrow-schema = { workspace = true } chrono = { workspace = true } num-traits = { version = "0.2.19", default-features = false, features = ["std"] } + +[dev-dependencies] +rand = { version = "0.9", default-features = false, features = ["std", "std_rng", "thread_rng"] } diff --git a/arrow-arith/src/bitwise.rs b/arrow-arith/src/bitwise.rs index aedeecd5b835..1daefa6cfd86 100644 --- a/arrow-arith/src/bitwise.rs +++ b/arrow-arith/src/bitwise.rs @@ -390,3 +390,244 @@ mod tests { assert_eq!(expected, result); } } + +// Helper functions for reference implementations +fn ref_bitwise_and>(a: Option, b: Option) -> Option { + match (a, b) { + (Some(a), Some(b)) => Some(a & b), + _ => None, + } +} + +fn ref_bitwise_or>(a: Option, b: Option) -> Option { + match (a, b) { + (Some(a), Some(b)) => Some(a | b), + _ => None, + } +} + +fn ref_bitwise_xor>(a: Option, b: Option) -> Option { + match (a, b) { + (Some(a), Some(b)) => Some(a ^ b), + _ => None, + } +} + +fn ref_bitwise_shift_left(a: Option, b: Option) -> Option +where + T: num_traits::WrappingShl + num_traits::AsPrimitive, +{ + match (a, b) { + (Some(a), Some(b)) => { + let b = b.as_(); + Some(a.wrapping_shl(b as u32)) + } + _ => None, + } +} + +fn ref_bitwise_shift_right(a: Option, b: Option) -> Option +where + T: num_traits::WrappingShr + num_traits::AsPrimitive, +{ + match (a, b) { + (Some(a), Some(b)) => { + let b = b.as_(); + Some(a.wrapping_shr(b as u32)) + } + _ => None, + } +} + +fn ref_bitwise_and_not(a: Option, b: Option) -> Option +where + T: std::ops::BitAnd + std::ops::Not, +{ + match (a, b) { + (Some(a), Some(b)) => Some(a & !b), + _ => None, + } +} + +fn ref_bitwise_not>(a: Option) -> Option { + a.map(|x| !x) +} + +#[test] +fn test_primitive_bitwise_binary_random_equivalence() { + use rand::{Rng, SeedableRng}; + + // Use a fixed seed for reproducible tests + let mut rng = rand::rngs::StdRng::from_seed([42u8; 32]); + + for _ in 0..10 { // 10 iterations + let len = rng.random_range(1..=64); + + // Generate for i32 + let mut left_vec_i32 = Vec::with_capacity(len); + let mut right_vec_i32 = Vec::with_capacity(len); + for _ in 0..len { + let is_null = rng.random_bool(0.2); + let val = if is_null { None } else { Some(rng.random::()) }; + left_vec_i32.push(val); + let is_null = rng.random_bool(0.2); + let val = if is_null { None } else { Some(rng.random::()) }; + right_vec_i32.push(val); + } + let left_i32 = Int32Array::from(left_vec_i32.clone()); + let right_i32 = Int32Array::from(right_vec_i32.clone()); + let (left_slice_i32, right_slice_i32) = if len > 1 { + let slice_len = len - 1; + (left_i32.slice(1, slice_len), right_i32.slice(1, slice_len)) + } else { + (left_i32.clone(), right_i32.clone()) + }; + + // Test bitwise_and for i32 + let result = bitwise_and(&left_i32, &right_i32).unwrap(); + let expected: Vec> = left_vec_i32.iter().zip(&right_vec_i32).map(|(a, b)| ref_bitwise_and(*a, *b)).collect(); + let result_vec: Vec> = result.iter().collect(); + assert_eq!(result_vec, expected, "bitwise_and full i32 mismatch"); + + if len > 1 { + let result_slice = bitwise_and(&left_slice_i32, &right_slice_i32).unwrap(); + let expected_slice: Vec> = left_vec_i32[1..].iter().zip(&right_vec_i32[1..]).map(|(a, b)| ref_bitwise_and(*a, *b)).collect(); + let result_slice_vec: Vec> = result_slice.iter().collect(); + assert_eq!(result_slice_vec, expected_slice, "bitwise_and sliced i32 mismatch"); + } + + // Test bitwise_or for i32 + let result = bitwise_or(&left_i32, &right_i32).unwrap(); + let expected: Vec> = left_vec_i32.iter().zip(&right_vec_i32).map(|(a, b)| ref_bitwise_or(*a, *b)).collect(); + let result_vec: Vec> = result.iter().collect(); + assert_eq!(result_vec, expected, "bitwise_or full i32 mismatch"); + + // Test bitwise_xor for i32 + let result = bitwise_xor(&left_i32, &right_i32).unwrap(); + let expected: Vec> = left_vec_i32.iter().zip(&right_vec_i32).map(|(a, b)| ref_bitwise_xor(*a, *b)).collect(); + let result_vec: Vec> = result.iter().collect(); + assert_eq!(result_vec, expected, "bitwise_xor full i32 mismatch"); + + // Test bitwise_shift_left for i32 + let result = bitwise_shift_left(&left_i32, &right_i32).unwrap(); + let expected: Vec> = left_vec_i32.iter().zip(&right_vec_i32).map(|(a, b)| ref_bitwise_shift_left(*a, *b)).collect(); + let result_vec: Vec> = result.iter().collect(); + assert_eq!(result_vec, expected, "bitwise_shift_left full i32 mismatch"); + + // Test bitwise_shift_right for i32 + let result = bitwise_shift_right(&left_i32, &right_i32).unwrap(); + let expected: Vec> = left_vec_i32.iter().zip(&right_vec_i32).map(|(a, b)| ref_bitwise_shift_right(*a, *b)).collect(); + let result_vec: Vec> = result.iter().collect(); + assert_eq!(result_vec, expected, "bitwise_shift_right full i32 mismatch"); + + // Test bitwise_and_not for i32 + let result = bitwise_and_not(&left_i32, &right_i32).unwrap(); + let expected: Vec> = left_vec_i32.iter().zip(&right_vec_i32).map(|(a, b)| ref_bitwise_and_not(*a, *b)).collect(); + let result_vec: Vec> = result.iter().collect(); + assert_eq!(result_vec, expected, "bitwise_and_not full i32 mismatch"); + + // Generate for u32 + let mut left_vec_u32 = Vec::with_capacity(len); + let mut right_vec_u32 = Vec::with_capacity(len); + for _ in 0..len { + let is_null = rng.random_bool(0.2); + let val = if is_null { None } else { Some(rng.random::()) }; + left_vec_u32.push(val); + let is_null = rng.random_bool(0.2); + let val = if is_null { None } else { Some(rng.random::()) }; + right_vec_u32.push(val); + } + let left_u32 = UInt32Array::from(left_vec_u32.clone()); + let right_u32 = UInt32Array::from(right_vec_u32.clone()); + + // Test bitwise_and for u32 + let result = bitwise_and(&left_u32, &right_u32).unwrap(); + let expected: Vec> = left_vec_u32.iter().zip(&right_vec_u32).map(|(a, b)| ref_bitwise_and(*a, *b)).collect(); + let result_vec: Vec> = result.iter().collect(); + assert_eq!(result_vec, expected, "bitwise_and full u32 mismatch"); + + // Test bitwise_or for u32 + let result = bitwise_or(&left_u32, &right_u32).unwrap(); + let expected: Vec> = left_vec_u32.iter().zip(&right_vec_u32).map(|(a, b)| ref_bitwise_or(*a, *b)).collect(); + let result_vec: Vec> = result.iter().collect(); + assert_eq!(result_vec, expected, "bitwise_or full u32 mismatch"); + + // Test bitwise_xor for u32 + let result = bitwise_xor(&left_u32, &right_u32).unwrap(); + let expected: Vec> = left_vec_u32.iter().zip(&right_vec_u32).map(|(a, b)| ref_bitwise_xor(*a, *b)).collect(); + let result_vec: Vec> = result.iter().collect(); + assert_eq!(result_vec, expected, "bitwise_xor full u32 mismatch"); + + // Test bitwise_shift_left for u32 + let result = bitwise_shift_left(&left_u32, &right_u32).unwrap(); + let expected: Vec> = left_vec_u32.iter().zip(&right_vec_u32).map(|(a, b)| ref_bitwise_shift_left(*a, *b)).collect(); + let result_vec: Vec> = result.iter().collect(); + assert_eq!(result_vec, expected, "bitwise_shift_left full u32 mismatch"); + + // Test bitwise_shift_right for u32 + let result = bitwise_shift_right(&left_u32, &right_u32).unwrap(); + let expected: Vec> = left_vec_u32.iter().zip(&right_vec_u32).map(|(a, b)| ref_bitwise_shift_right(*a, *b)).collect(); + let result_vec: Vec> = result.iter().collect(); + assert_eq!(result_vec, expected, "bitwise_shift_right full u32 mismatch"); + + // Test bitwise_and_not for u32 + let result = bitwise_and_not(&left_u32, &right_u32).unwrap(); + let expected: Vec> = left_vec_u32.iter().zip(&right_vec_u32).map(|(a, b)| ref_bitwise_and_not(*a, *b)).collect(); + let result_vec: Vec> = result.iter().collect(); + assert_eq!(result_vec, expected, "bitwise_and_not full u32 mismatch"); + } +} + +#[test] +fn test_primitive_bitwise_unary_random_equivalence() { + use rand::{Rng, SeedableRng}; + + // Use a fixed seed for reproducible tests + let mut rng = rand::rngs::StdRng::from_seed([43u8; 32]); + + for _ in 0..10 { // 10 iterations + let len = rng.random_range(1..=64); + + // Generate for i32 + let mut vec_i32 = Vec::with_capacity(len); + for _ in 0..len { + let is_null = rng.random_bool(0.2); + let val = if is_null { None } else { Some(rng.random::()) }; + vec_i32.push(val); + } + let array_i32 = Int32Array::from(vec_i32.clone()); + let array_slice_i32 = if len > 1 { + array_i32.slice(1, len - 1) + } else { + array_i32.clone() + }; + + // Test bitwise_not for i32 + let result = bitwise_not(&array_i32).unwrap(); + let expected: Vec> = vec_i32.iter().map(|a| ref_bitwise_not(*a)).collect(); + let result_vec: Vec> = result.iter().collect(); + assert_eq!(result_vec, expected, "bitwise_not full i32 mismatch"); + + if len > 1 { + let result_slice = bitwise_not(&array_slice_i32).unwrap(); + let expected_slice: Vec> = vec_i32[1..].iter().map(|a| ref_bitwise_not(*a)).collect(); + let result_slice_vec: Vec> = result_slice.iter().collect(); + assert_eq!(result_slice_vec, expected_slice, "bitwise_not sliced i32 mismatch"); + } + + // Generate for u32 + let mut vec_u32 = Vec::with_capacity(len); + for _ in 0..len { + let is_null = rng.random_bool(0.2); + let val = if is_null { None } else { Some(rng.random::()) }; + vec_u32.push(val); + } + let array_u32 = UInt32Array::from(vec_u32.clone()); + + // Test bitwise_not for u32 + let result = bitwise_not(&array_u32).unwrap(); + let expected: Vec> = vec_u32.iter().map(|a| ref_bitwise_not(*a)).collect(); + let result_vec: Vec> = result.iter().collect(); + assert_eq!(result_vec, expected, "bitwise_not full u32 mismatch"); + } +} diff --git a/arrow-arith/src/boolean.rs b/arrow-arith/src/boolean.rs index d94df49de256..ef9f085284a6 100644 --- a/arrow-arith/src/boolean.rs +++ b/arrow-arith/src/boolean.rs @@ -24,7 +24,7 @@ use arrow_array::*; use arrow_buffer::buffer::{bitwise_bin_op_helper, bitwise_quaternary_op_helper}; -use arrow_buffer::{BooleanBuffer, NullBuffer, buffer_bin_and_not}; +use arrow_buffer::{BooleanBuffer, NullBuffer}; use arrow_schema::ArrowError; /// Logical 'and' boolean values with Kleene logic @@ -252,7 +252,7 @@ where /// assert_eq!(and_ab, BooleanArray::from(vec![Some(false), Some(true), None])); /// ``` pub fn and(left: &BooleanArray, right: &BooleanArray) -> Result { - binary_boolean_kernel(left, right, |a, b| a & b) + left.binary(right, |a, b| a & b) } /// Performs `OR` operation on two arrays. If either left or right value is null then the @@ -269,7 +269,7 @@ pub fn and(left: &BooleanArray, right: &BooleanArray) -> Result Result { - binary_boolean_kernel(left, right, |a, b| a | b) + left.binary(right, |a, b| a | b) } /// Performs `AND_NOT` operation on two arrays. If either left or right value is null then the @@ -287,10 +287,7 @@ pub fn or(left: &BooleanArray, right: &BooleanArray) -> Result Result { - binary_boolean_kernel(left, right, |a, b| { - let buffer = buffer_bin_and_not(a.inner(), b.offset(), b.inner(), a.offset(), a.len()); - BooleanBuffer::new(buffer, left.offset(), left.len()) - }) + left.binary(right, |a, b| a & !b) } /// Performs unary `NOT` operation on an arrays. If value is null then the result is also @@ -306,9 +303,7 @@ pub fn and_not(left: &BooleanArray, right: &BooleanArray) -> Result Result { - let nulls = left.nulls().cloned(); - let values = !left.values(); - Ok(BooleanArray::new(values, nulls)) + Ok(left.unary(|a| !a)) } /// Returns a non-null [BooleanArray] with whether each value of the array is null. @@ -971,4 +966,336 @@ mod tests { .into_iter() .collect() } + + #[test] + fn test_boolean_kernels_with_nulls_and_offsets() { + // Construct BooleanArrays with mixed values and nulls + let left = BooleanArray::from(vec![ + Some(true), Some(false), None, Some(true), Some(false), None, Some(true) + ]); + let right = BooleanArray::from(vec![ + None, Some(true), Some(false), None, Some(true), Some(false), Some(true) + ]); + + // Create sliced views with non-zero offsets + let left_sliced = left.slice(1, 5); // Some(false), None, Some(true), Some(false), None + let right_sliced = right.slice(2, 5); // Some(false), None, Some(true), Some(false), Some(true) + + // Test and + let result_full = and(&left, &right).unwrap(); + let result_sliced = and(&left_sliced, &right_sliced).unwrap(); + + let expected_full = BooleanArray::from(vec![ + None, Some(false), None, None, Some(false), None, Some(true) + ]); + let expected_sliced = BooleanArray::from(vec![ + Some(false), None, Some(true), Some(false), None + ]); + + assert_eq!(result_full, expected_full); + assert_eq!(result_sliced, expected_sliced); + + // Test or + let result_full = or(&left, &right).unwrap(); + let result_sliced = or(&left_sliced, &right_sliced).unwrap(); + + let expected_full = BooleanArray::from(vec![ + None, Some(true), None, None, Some(true), None, Some(true) + ]); + let expected_sliced = BooleanArray::from(vec![ + Some(false), None, Some(true), Some(false), None + ]); + + assert_eq!(result_full, expected_full); + assert_eq!(result_sliced, expected_sliced); + + // Test and_kleene: true if both true, false if either false, null otherwise + let result_full = and_kleene(&left, &right).unwrap(); + let result_sliced = and_kleene(&left_sliced, &right_sliced).unwrap(); + + let expected_full = BooleanArray::from(vec![ + None, Some(false), Some(false), None, Some(false), Some(false), Some(true) + ]); + let expected_sliced = BooleanArray::from(vec![ + Some(false), None, Some(true), Some(false), None + ]); + + assert_eq!(result_full, expected_full); + assert_eq!(result_sliced, expected_sliced); + + // Test or_kleene: false if both false, true if either true, null otherwise + let result_full = or_kleene(&left, &right).unwrap(); + let result_sliced = or_kleene(&left_sliced, &right_sliced).unwrap(); + + let expected_full = BooleanArray::from(vec![ + Some(true), Some(true), None, Some(true), Some(true), None, Some(true) + ]); + let expected_sliced = BooleanArray::from(vec![ + Some(false), None, Some(true), Some(false), Some(true) + ]); + + assert_eq!(result_full, expected_full); + assert_eq!(result_sliced, expected_sliced); + + // Test not + let result_full = not(&left).unwrap(); + let result_sliced = not(&left_sliced).unwrap(); + + let expected_full = BooleanArray::from(vec![ + Some(false), Some(true), None, Some(false), Some(true), None, Some(false) + ]); + let expected_sliced = BooleanArray::from(vec![ + Some(true), None, Some(false), Some(true), None + ]); + + assert_eq!(result_full, expected_full); + assert_eq!(result_sliced, expected_sliced); + } + + #[test] + fn test_boolean_kernels_zero_length_and_all_null() { + // Empty arrays + let empty = BooleanArray::from(Vec::>::new()); + let result_and = and(&empty, &empty).unwrap(); + let result_or = or(&empty, &empty).unwrap(); + let result_not = not(&empty).unwrap(); + let result_and_kleene = and_kleene(&empty, &empty).unwrap(); + let result_or_kleene = or_kleene(&empty, &empty).unwrap(); + + assert_eq!(result_and.len(), 0); + assert_eq!(result_or.len(), 0); + assert_eq!(result_not.len(), 0); + assert_eq!(result_and_kleene.len(), 0); + assert_eq!(result_or_kleene.len(), 0); + + // All-null arrays + let all_null = BooleanArray::new_null(5); + let result_and = and(&all_null, &all_null).unwrap(); + let result_or = or(&all_null, &all_null).unwrap(); + let result_not = not(&all_null).unwrap(); + let result_and_kleene = and_kleene(&all_null, &all_null).unwrap(); + let result_or_kleene = or_kleene(&all_null, &all_null).unwrap(); + + assert_eq!(result_and, all_null); + assert_eq!(result_or, all_null); + assert_eq!(result_not, all_null); + assert_eq!(result_and_kleene, all_null); + assert_eq!(result_or_kleene, all_null); + + // Array with only first element non-null + let partial = BooleanArray::from(vec![Some(true), None, None, None, None]); + let result_not = not(&partial).unwrap(); + let expected_not = BooleanArray::from(vec![Some(false), None, None, None, None]); + assert_eq!(result_not, expected_not); + + // Array with only last element non-null + let partial = BooleanArray::from(vec![None, None, None, None, Some(false)]); + let result_not = not(&partial).unwrap(); + let expected_not = BooleanArray::from(vec![None, None, None, None, Some(true)]); + assert_eq!(result_not, expected_not); + } + + // Helper functions for reference implementations + fn ref_and_sql(a: Option, b: Option) -> Option { + match (a, b) { + (Some(a), Some(b)) => Some(a & b), + _ => None, + } + } + + fn ref_or_sql(a: Option, b: Option) -> Option { + match (a, b) { + (Some(a), Some(b)) => Some(a | b), + _ => None, + } + } + + fn ref_and_kleene(a: Option, b: Option) -> Option { + match (a, b) { + (Some(a), Some(b)) => Some(a & b), + (None, Some(b)) => if !b { Some(false) } else { None }, + (Some(a), None) => if !a { Some(false) } else { None }, + (None, None) => None, + } + } + + fn ref_or_kleene(a: Option, b: Option) -> Option { + match (a, b) { + (Some(a), Some(b)) => Some(a | b), + (None, Some(b)) => if b { Some(true) } else { None }, + (Some(a), None) => if a { Some(true) } else { None }, + (None, None) => None, + } + } + + fn ref_not(a: Option) -> Option { + a.map(|x| !x) + } + + #[test] + fn test_boolean_kernels_random_equivalence() { + use rand::{Rng, SeedableRng}; + + // Use a fixed seed for reproducible tests + let mut rng = rand::rngs::StdRng::from_seed([48u8; 32]); + + for _ in 0..20 { // 20 random iterations + // Pick random length 1..64 + let len = rng.random_range(1..=64); + + // Generate random Vec> for left and right + let mut left_vec = Vec::with_capacity(len); + let mut right_vec = Vec::with_capacity(len); + for _ in 0..len { + let is_null = rng.random_bool(0.2); // 20% chance of null + let val = if is_null { None } else { Some(rng.random_bool(0.5)) }; + left_vec.push(val); + let is_null = rng.random_bool(0.2); + let val = if is_null { None } else { Some(rng.random_bool(0.5)) }; + right_vec.push(val); + } + + // Construct BooleanArrays + let left = BooleanArray::from(left_vec.clone()); + let right = BooleanArray::from(right_vec.clone()); + + // Construct sliced variants if possible + let (left_slice, right_slice) = if len > 1 { + let slice_len = len - 1; + (left.slice(1, slice_len), right.slice(1, slice_len)) + } else { + (left.clone(), right.clone()) // fallback for len=1 + }; + + // Test each kernel + let kernels = vec![ + ("and", Box::new(|l: &BooleanArray, r: &BooleanArray| and(l, r).unwrap()) as Box BooleanArray>), + ("or", Box::new(|l, r| or(l, r).unwrap())), + ("and_kleene", Box::new(|l, r| and_kleene(l, r).unwrap())), + ("or_kleene", Box::new(|l, r| or_kleene(l, r).unwrap())), + ]; + + for (name, kernel) in kernels { + // Full arrays + let result = kernel(&left, &right); + let expected: Vec> = left_vec.iter().zip(&right_vec).map(|(a, b)| match name { + "and" => ref_and_sql(*a, *b), + "or" => ref_or_sql(*a, *b), + "and_kleene" => ref_and_kleene(*a, *b), + "or_kleene" => ref_or_kleene(*a, *b), + _ => unreachable!(), + }).collect(); + let result_vec: Vec> = result.iter().collect(); + assert_eq!(result_vec, expected, "Full {} mismatch", name); + + // Sliced arrays + if len > 1 { + let result_slice = kernel(&left_slice, &right_slice); + let expected_slice: Vec> = left_vec[1..].iter().zip(&right_vec[1..]).map(|(a, b)| match name { + "and" => ref_and_sql(*a, *b), + "or" => ref_or_sql(*a, *b), + "and_kleene" => ref_and_kleene(*a, *b), + "or_kleene" => ref_or_kleene(*a, *b), + _ => unreachable!(), + }).collect(); + let result_slice_vec: Vec> = result_slice.iter().collect(); + assert_eq!(result_slice_vec, expected_slice, "Sliced {} mismatch", name); + } + } + + // Test not separately + let result_not = not(&left).unwrap(); + let expected_not: Vec> = left_vec.iter().map(|a| ref_not(*a)).collect(); + let result_not_vec: Vec> = result_not.iter().collect(); + assert_eq!(result_not_vec, expected_not, "Full not mismatch"); + + if len > 1 { + let result_not_slice = not(&left_slice).unwrap(); + let expected_not_slice: Vec> = left_vec[1..].iter().map(|a| ref_not(*a)).collect(); + let result_not_slice_vec: Vec> = result_not_slice.iter().collect(); + assert_eq!(result_not_slice_vec, expected_not_slice, "Sliced not mismatch"); + } + } + } + + #[test] + fn test_boolean_array_byte_boundary_regressions() { + // Test historically dangerous bitmap patterns for BooleanArray binary/unary operations + // Construct BooleanArray from Vec> with length 10: [T, F, None, T, F, None, T, F, None, T] + // Underlying bitmap: bits for values and nulls + let data = vec![Some(true), Some(false), None, Some(true), Some(false), None, Some(true), Some(false), None, Some(true)]; + let array = BooleanArray::from(data.clone()); + + // Slice cases: (slice_start, slice_len, description) + let slice_cases = vec![ + (0, 5, "start=0, len=5"), + (1, 4, "start=1, len=4 (offset+len=5)"), + (3, 5, "start=3, len=5 (cross potential boundary)"), + (5, 5, "start=5, len=5"), + ]; + + for (start, len, desc) in slice_cases { + let slice = array.slice(start, len); + let slice_data = &data[start..start+len]; + + // Test unary NOT + let result_not = slice.unary(|a| !a); + let expected_not: Vec> = slice_data.iter().map(|x| x.map(|b| !b)).collect(); + let result_not_vec: Vec> = result_not.iter().collect(); + assert_eq!(result_not_vec, expected_not, "NOT {} mismatch", desc); + + // For binary, need another slice; use the same slice for simplicity, but with different op + // Test binary AND with itself (should be identity for non-null) + let result_and = slice.binary(&slice, |a, b| a & b).unwrap(); + let expected_and: Vec> = slice_data.iter().map(|x| match x { + Some(b) => Some(b & b), + None => None, + }).collect(); + let result_and_vec: Vec> = result_and.iter().collect(); + assert_eq!(result_and_vec, expected_and, "AND self {} mismatch", desc); + + // Test binary OR with itself + let result_or = slice.binary(&slice, |a, b| a | b).unwrap(); + let expected_or: Vec> = slice_data.iter().map(|x| match x { + Some(b) => Some(b | b), + None => None, + }).collect(); + let result_or_vec: Vec> = result_or.iter().collect(); + assert_eq!(result_or_vec, expected_or, "OR self {} mismatch", desc); + } + } + + #[test] + fn test_and_kleene_byte_boundary_regressions() { + // Test and_kleene with slices that hit byte boundaries + let left_data = vec![Some(true), Some(false), None, Some(true), Some(false), None, Some(true), Some(false), None, Some(true)]; + let right_data = vec![Some(false), Some(true), Some(true), Some(false), Some(true), Some(false), Some(true), Some(false), Some(true), Some(false)]; + let left = BooleanArray::from(left_data.clone()); + let right = BooleanArray::from(right_data.clone()); + + // Slice cases + let slice_cases = vec![ + (0, 5), + (1, 4), + (3, 5), + (5, 5), + ]; + + for (start, len) in slice_cases { + let left_slice = left.slice(start, len); + let right_slice = right.slice(start, len); + let left_slice_data = &left_data[start..start+len]; + let right_slice_data = &right_data[start..start+len]; + + let result = and_kleene(&left_slice, &right_slice).unwrap(); + let expected: Vec> = left_slice_data.iter().zip(right_slice_data).map(|(a, b)| match (a, b) { + (Some(a), Some(b)) => Some(a & b), + (None, Some(b)) => if !b { Some(false) } else { None }, + (Some(a), None) => if !a { Some(false) } else { None }, + (None, None) => None, + }).collect(); + let result_vec: Vec> = result.iter().collect(); + assert_eq!(result_vec, expected, "and_kleene slice start={}, len={} mismatch", start, len); + } + } } diff --git a/arrow-array/src/array/boolean_array.rs b/arrow-array/src/array/boolean_array.rs index 7967084aa7ab..9cc22ff242ec 100644 --- a/arrow-array/src/array/boolean_array.rs +++ b/arrow-array/src/array/boolean_array.rs @@ -21,7 +21,7 @@ use crate::iterator::BooleanIter; use crate::{Array, ArrayAccessor, ArrayRef, Scalar}; use arrow_buffer::{BooleanBuffer, Buffer, MutableBuffer, NullBuffer, bit_util}; use arrow_data::{ArrayData, ArrayDataBuilder}; -use arrow_schema::DataType; +use arrow_schema::{ArrowError, DataType}; use std::any::Any; use std::sync::Arc; @@ -284,6 +284,52 @@ impl BooleanArray { pub fn into_parts(self) -> (BooleanBuffer, Option) { (self.values, self.nulls) } + + /// Apply a bitwise unary operation to this array, returning a new array. + /// + /// The operation is applied to the values, and nulls are cloned. + /// + /// # Arguments + /// + /// * `op` - The unary operation to apply. + pub fn unary(&self, op: F) -> Self + where + F: Fn(u64) -> u64 + Copy, + { + let buffer = self.values().inner().bitwise_unary(self.values().offset(), self.len(), op); + let values = BooleanBuffer::new(buffer, 0, self.len()); + let nulls = self.nulls().cloned(); + Self::new(values, nulls) + } + + /// Apply a bitwise binary operation between this array and another, returning a new array. + /// + /// The operation is applied to the values, and nulls are combined as union. + /// + /// # Arguments + /// + /// * `other` - The other array. + /// * `op` - The binary operation to apply. + pub fn binary(&self, other: &BooleanArray, op: F) -> Result + where + F: Fn(u64, u64) -> u64 + Copy, + { + if self.len() != other.len() { + return Err(ArrowError::ComputeError( + "Cannot perform bitwise operation on arrays of different length".to_string(), + )); + } + let buffer = self.values().inner().bitwise_binary( + other.values().inner(), + self.values().offset(), + other.values().offset(), + self.len(), + op, + ); + let values = BooleanBuffer::new(buffer, 0, self.len()); + let nulls = NullBuffer::union(self.nulls(), other.nulls()); + Ok(Self::new(values, nulls)) + } } impl Array for BooleanArray { @@ -829,4 +875,45 @@ mod tests { assert_eq!(values.values(), &[0b1000_0000]); assert!(nulls.is_none()); } + + #[test] + fn test_boolean_array_binary_nulls() { + // Test BooleanArray::binary with nulls + let left = BooleanArray::from(vec![Some(true), None, Some(false), Some(true)]); + let right = BooleanArray::from(vec![Some(false), Some(true), None, Some(false)]); + + // Test and + let result = BooleanArray::binary(&left, &right, |a, b| a & b).unwrap(); + let expected = BooleanArray::from(vec![Some(false), None, None, Some(false)]); + assert_eq!(result.iter().collect::>(), expected.iter().collect::>()); + + // Test or + let result = BooleanArray::binary(&left, &right, |a, b| a | b).unwrap(); + let expected = BooleanArray::from(vec![Some(true), None, None, Some(true)]); + assert_eq!(result.iter().collect::>(), expected.iter().collect::>()); + + // Test with offsets (sliced arrays) + let left_sliced = left.slice(1, 3); // [None, false, true] + let right_sliced = right.slice(1, 3); // [true, None, false] + + let result = BooleanArray::binary(&left_sliced, &right_sliced, |a, b| a & b).unwrap(); + let expected = BooleanArray::from(vec![None, None, Some(false)]); + assert_eq!(result.iter().collect::>(), expected.iter().collect::>()); + } + + #[test] + fn test_boolean_array_unary_nulls() { + // Test BooleanArray::unary with nulls + let array = BooleanArray::from(vec![Some(true), None, Some(false), Some(true)]); + + let result = BooleanArray::unary(&array, |a| !a); + let expected = BooleanArray::from(vec![Some(false), None, Some(true), Some(false)]); + assert_eq!(result.iter().collect::>(), expected.iter().collect::>()); + + // Test with offsets + let sliced = array.slice(1, 3); // [None, false, true] + let result = BooleanArray::unary(&sliced, |a| !a); + let expected = BooleanArray::from(vec![None, Some(true), Some(false)]); + assert_eq!(result.iter().collect::>(), expected.iter().collect::>()); + } } diff --git a/arrow-buffer/src/buffer/boolean.rs b/arrow-buffer/src/buffer/boolean.rs index a3bcabbfdb34..0cd169b4dff7 100644 --- a/arrow-buffer/src/buffer/boolean.rs +++ b/arrow-buffer/src/buffer/boolean.rs @@ -18,8 +18,7 @@ use crate::bit_chunk_iterator::BitChunks; use crate::bit_iterator::{BitIndexIterator, BitIndexU32Iterator, BitIterator, BitSliceIterator}; use crate::{ - BooleanBufferBuilder, Buffer, MutableBuffer, bit_util, buffer_bin_and, buffer_bin_or, - buffer_bin_xor, buffer_unary_not, + BooleanBufferBuilder, Buffer, MutableBuffer, bit_util, }; use std::ops::{BitAnd, BitOr, BitXor, Not}; @@ -224,7 +223,7 @@ impl Not for &BooleanBuffer { fn not(self) -> Self::Output { BooleanBuffer { - buffer: buffer_unary_not(&self.buffer, self.offset, self.len), + buffer: self.buffer.bitwise_unary(self.offset, self.len, |a| !a), offset: 0, len: self.len, } @@ -237,7 +236,7 @@ impl BitAnd<&BooleanBuffer> for &BooleanBuffer { fn bitand(self, rhs: &BooleanBuffer) -> Self::Output { assert_eq!(self.len, rhs.len); BooleanBuffer { - buffer: buffer_bin_and(&self.buffer, self.offset, &rhs.buffer, rhs.offset, self.len), + buffer: self.buffer.bitwise_binary(&rhs.buffer, self.offset, rhs.offset, self.len, |a, b| a & b), offset: 0, len: self.len, } @@ -250,7 +249,7 @@ impl BitOr<&BooleanBuffer> for &BooleanBuffer { fn bitor(self, rhs: &BooleanBuffer) -> Self::Output { assert_eq!(self.len, rhs.len); BooleanBuffer { - buffer: buffer_bin_or(&self.buffer, self.offset, &rhs.buffer, rhs.offset, self.len), + buffer: self.buffer.bitwise_binary(&rhs.buffer, self.offset, rhs.offset, self.len, |a, b| a | b), offset: 0, len: self.len, } @@ -263,7 +262,7 @@ impl BitXor<&BooleanBuffer> for &BooleanBuffer { fn bitxor(self, rhs: &BooleanBuffer) -> Self::Output { assert_eq!(self.len, rhs.len); BooleanBuffer { - buffer: buffer_bin_xor(&self.buffer, self.offset, &rhs.buffer, rhs.offset, self.len), + buffer: self.buffer.bitwise_binary(&rhs.buffer, self.offset, rhs.offset, self.len, |a, b| a ^ b), offset: 0, len: self.len, } diff --git a/arrow-buffer/src/buffer/immutable.rs b/arrow-buffer/src/buffer/immutable.rs index 55a4621540c8..170e2740d628 100644 --- a/arrow-buffer/src/buffer/immutable.rs +++ b/arrow-buffer/src/buffer/immutable.rs @@ -28,7 +28,6 @@ use crate::{bit_util, bytes::Bytes, native::ArrowNativeType}; #[cfg(feature = "pool")] use crate::pool::MemoryPool; -use super::ops::bitwise_unary_op_helper; use super::{MutableBuffer, ScalarBuffer}; /// A contiguous memory region that can be shared with other buffers and across @@ -344,7 +343,7 @@ impl Buffer { return self.slice_with_length(offset / 8, bit_util::ceil(len, 8)); } - bitwise_unary_op_helper(self, offset, len, |a| a) + self.bitwise_unary(offset, len, |a| a) } /// Returns a `BitChunks` instance which can be used to iterate over this buffers bits @@ -444,6 +443,77 @@ impl Buffer { pub fn claim(&self, pool: &dyn MemoryPool) { self.data.claim(pool) } + + /// Apply a bitwise unary operation to this buffer, returning a new buffer. + /// + /// The operation is applied to `len` bits starting at `offset` bits. + /// + /// # Arguments + /// + /// * `offset` - The bit offset to start the operation. + /// * `len` - The number of bits to process. + /// * `op` - The unary operation to apply. + pub fn bitwise_unary(&self, offset: usize, len: usize, op: F) -> Buffer + where + F: Fn(u64) -> u64, + { + // reserve capacity and set length so we can get a typed view of u64 chunks + let mut result = + MutableBuffer::new(crate::util::bit_util::ceil(len, 8)).with_bitset(len / 64 * 8, false); + + let left_chunks = self.bit_chunks(offset, len); + + let result_chunks = result.typed_data_mut::().iter_mut(); + + result_chunks + .zip(left_chunks.iter()) + .for_each(|(res, left)| { + *res = op(left); + }); + + let remainder_bytes = crate::util::bit_util::ceil(left_chunks.remainder_len(), 8); + let rem = op(left_chunks.remainder_bits()); + // we are counting its starting from the least significant bit, to to_le_bytes should be correct + let rem = &rem.to_le_bytes()[0..remainder_bytes]; + result.extend_from_slice(rem); + + result.into() + } + + /// Apply a bitwise binary operation between this buffer and another, returning a new buffer. + /// + /// The operation is applied to `len` bits starting at `self_offset` in self and `other_offset` in other. + /// + /// # Arguments + /// + /// * `other` - The other buffer. + /// * `self_offset` - The bit offset in self. + /// * `other_offset` - The bit offset in other. + /// * `len` - The number of bits to process. + /// * `op` - The binary operation to apply. + pub fn bitwise_binary(&self, other: &Buffer, self_offset: usize, other_offset: usize, len: usize, op: F) -> Buffer + where + F: Fn(u64, u64) -> u64, + { + let left_chunks = self.bit_chunks(self_offset, len); + let right_chunks = other.bit_chunks(other_offset, len); + + let chunks = left_chunks + .iter() + .zip(right_chunks.iter()) + .map(|(left, right)| op(left, right)); + // Soundness: `BitChunks` is a `BitChunks` iterator which + // correctly reports its upper bound + let mut buffer = unsafe { MutableBuffer::from_trusted_len_iter(chunks) }; + + let remainder_bytes = crate::util::bit_util::ceil(left_chunks.remainder_len(), 8); + let rem = op(left_chunks.remainder_bits(), right_chunks.remainder_bits()); + // we are counting its starting from the least significant bit, to to_le_bytes should be correct + let rem = &rem.to_le_bytes()[0..remainder_bytes]; + buffer.extend_from_slice(rem); + + buffer.into() + } } /// Note that here we deliberately do not implement @@ -598,6 +668,7 @@ impl FromIterator for Buffer { #[cfg(test)] mod tests { use crate::i256; + use rand::{Rng, SeedableRng}; use std::panic::{RefUnwindSafe, UnwindSafe}; use std::thread; @@ -1065,4 +1136,252 @@ mod tests { drop(capture); assert_eq!(buffer2.strong_count(), 1); } + + #[test] + fn test_buffer_bitwise_binary_offsets() { + use crate::buffer::ops::{buffer_bin_and, buffer_bin_or}; + + let left = Buffer::from(vec![0b10101010u8, 0b01010101u8]); + let right = Buffer::from(vec![0b11110000u8, 0b00001111u8]); + + // Test with offset 0, len full (16 bits) + let new_result = left.bitwise_binary(&right, 0, 0, 16, |a, b| a & b); + let old_result = buffer_bin_and(&left, 0, &right, 0, 16); + assert_eq!(new_result, old_result); + + // Test with offset 1, len 7 (crosses byte boundary) + let new_result = left.bitwise_binary(&right, 1, 1, 7, |a, b| a & b); + let old_result = buffer_bin_and(&left, 1, &right, 1, 7); + assert_eq!(new_result, old_result); + + // Test tail bits, len not multiple of 8 (10 bits) + let new_result = left.bitwise_binary(&right, 0, 0, 10, |a, b| a & b); + let old_result = buffer_bin_and(&left, 0, &right, 0, 10); + assert_eq!(new_result, old_result); + + // Test or operation + let new_result = left.bitwise_binary(&right, 0, 0, 16, |a, b| a | b); + let old_result = buffer_bin_or(&left, 0, &right, 0, 16); + assert_eq!(new_result, old_result); + } + + #[test] + fn test_buffer_bitwise_unary_offsets() { + use crate::buffer::ops::buffer_unary_not; + + let buffer = Buffer::from(vec![0b10101010u8, 0b01010101u8]); + + // Test with offset 0, len full + let new_result = buffer.bitwise_unary(0, 16, |a| !a); + let old_result = buffer_unary_not(&buffer, 0, 16); + assert_eq!(new_result, old_result); + + // Test with offset 2, len 10 (crosses byte, tail bits) + let new_result = buffer.bitwise_unary(2, 10, |a| !a); + let old_result = buffer_unary_not(&buffer, 2, 10); + assert_eq!(new_result, old_result); + } + + #[test] + fn test_buffer_bitwise_binary_random_equivalence() { + use crate::buffer::ops::{buffer_bin_and, buffer_bin_or, buffer_bin_xor}; + + // Use a fixed seed for reproducible tests + let mut rng = rand::rngs::StdRng::from_seed([42u8; 32]); + + let buffer_sizes = [8, 16, 32, 64]; + let offsets = [0, 1, 3, 7, 8, 13]; + let lengths = [1, 2, 7, 8, 9, 15, 16]; + + for &size in &buffer_sizes { + for _ in 0..5 { // Generate 5 random pairs per size + let left_vec: Vec = (0..size).map(|_| rng.random::()).collect(); + let right_vec: Vec = (0..size).map(|_| rng.random::()).collect(); + let left = Buffer::from(left_vec); + let right = Buffer::from(right_vec); + let buffer_bits = left.len() * 8; + + for &offset in &offsets { + if offset >= buffer_bits { continue; } + for &len in &lengths { + if offset + len > buffer_bits { continue; } + + // Test AND + let new_and = left.bitwise_binary(&right, offset, offset, len, |a, b| a & b); + let old_and = buffer_bin_and(&left, offset, &right, offset, len); + assert_eq!(new_and, old_and, "AND failed for offset={}, len={}", offset, len); + + // Test OR + let new_or = left.bitwise_binary(&right, offset, offset, len, |a, b| a | b); + let old_or = buffer_bin_or(&left, offset, &right, offset, len); + assert_eq!(new_or, old_or, "OR failed for offset={}, len={}", offset, len); + + // Test XOR + let new_xor = left.bitwise_binary(&right, offset, offset, len, |a, b| a ^ b); + let old_xor = buffer_bin_xor(&left, offset, &right, offset, len); + assert_eq!(new_xor, old_xor, "XOR failed for offset={}, len={}", offset, len); + } + } + } + } + } + + #[test] + fn test_buffer_bitwise_unary_random_equivalence() { + use crate::buffer::ops::buffer_unary_not; + + // Use a fixed seed for reproducible tests + let mut rng = rand::rngs::StdRng::from_seed([43u8; 32]); + + let buffer_sizes = [8, 16, 32, 64]; + let offsets = [0, 1, 3, 7, 8, 13]; + let lengths = [1, 2, 7, 8, 9, 15, 16]; + + for &size in &buffer_sizes { + for _ in 0..5 { // Generate 5 random buffers per size + let vec: Vec = (0..size).map(|_| rng.random::()).collect(); + let buffer = Buffer::from(vec); + let buffer_bits = buffer.len() * 8; + + for &offset in &offsets { + if offset >= buffer_bits { continue; } + for &len in &lengths { + if offset + len > buffer_bits { continue; } + + let new_not = buffer.bitwise_unary(offset, len, |a| !a); + let old_not = buffer_unary_not(&buffer, offset, len); + assert_eq!(new_not, old_not, "NOT failed for offset={}, len={}", offset, len); + } + } + } + } + } + + #[test] + fn test_buffer_bitwise_boundaries() { + // Use a fixed seed for reproducible tests + let mut rng = rand::rngs::StdRng::from_seed([45u8; 32]); + + // Create a large buffer: 1024 bytes + let data: Vec = (0..1024).map(|_| rng.random::()).collect(); + let buffer = Buffer::from(data); + let total_bits = buffer.len() * 8; + + // Boundary configurations for unary + let boundary_configs = vec![ + (0, 0), // zero length + (0, total_bits), // full length + (1, total_bits - 1), // offset 1, to end + (7, total_bits - 7), // offset 7, crosses byte + (8, total_bits - 8), // offset at byte boundary + (total_bits - 1, 1), // last bit + (total_bits - 8, 8), // last byte + (total_bits - 9, 9), // last byte plus one + ]; + + for (offset, len) in boundary_configs { + if offset + len > total_bits { + continue; // skip invalid + } + + // Test bitwise_unary + let result = buffer.bitwise_unary(offset, len, |a| !a); + let expected_len = (len + 7) / 8; + assert_eq!(result.len(), expected_len, "Wrong length for offset={}, len={}", offset, len); + + // Idempotence: NOT twice should be identity, but since result is packed, compare lengths + let result_twice = Buffer::from(result.clone()).bitwise_unary(0, len, |a| !a); + assert_eq!(result_twice.len(), expected_len, "NOT twice length mismatch for offset={}, len={}", offset, len); + } + } + + #[test] + fn test_buffer_bitwise_binary_boundaries() { + use crate::buffer::ops::{buffer_bin_and, buffer_bin_or}; + + // Use a fixed seed for reproducible tests + let mut rng = rand::rngs::StdRng::from_seed([46u8; 32]); + + // Create two large buffers: 1024 bytes each + let data_left: Vec = (0..1024).map(|_| rng.random::()).collect(); + let data_right: Vec = (0..1024).map(|_| rng.random::()).collect(); + let left = Buffer::from(data_left); + let right = Buffer::from(data_right); + let total_bits = left.len() * 8; + + // Boundary configurations for binary + let boundary_configs = vec![ + (0, 0), // zero length + (0, total_bits), // full length + (1, total_bits - 1), // offset 1, to end + (7, total_bits - 7), // offset 7, crosses byte + (8, total_bits - 8), // offset at byte boundary + (total_bits - 1, 1), // last bit + (total_bits - 8, 8), // last byte + (total_bits - 9, 9), // last byte plus one + ]; + + for (offset, len) in boundary_configs { + if offset + len > total_bits { + continue; // skip invalid + } + + // Test bitwise_binary AND + let result_and = left.bitwise_binary(&right, offset, offset, len, |a, b| a & b); + let expected_len = (len + 7) / 8; + assert_eq!(result_and.len(), expected_len, "AND wrong length for offset={}, len={}", offset, len); + + // Compare with legacy for a few cases + if len <= 64 { // to keep test fast + let old_and = buffer_bin_and(&left, offset, &right, offset, len); + assert_eq!(result_and, old_and, "AND mismatch with legacy for offset={}, len={}", offset, len); + } + + // Test bitwise_binary OR + let result_or = left.bitwise_binary(&right, offset, offset, len, |a, b| a | b); + assert_eq!(result_or.len(), expected_len, "OR wrong length for offset={}, len={}", offset, len); + + // Compare with legacy for a few cases + if len <= 64 { + let old_or = buffer_bin_or(&left, offset, &right, offset, len); + assert_eq!(result_or, old_or, "OR mismatch with legacy for offset={}, len={}", offset, len); + } + } + } + + + + #[test] + fn test_buffer_bitwise_byte_boundary_regressions() { + use crate::buffer::ops::{buffer_bin_and, buffer_bin_or, buffer_unary_not}; + + // Construct small buffers + let left = Buffer::from(vec![0b11110000u8, 0b00001111u8]); // 240, 15 + let right = Buffer::from(vec![0b10101010u8, 0b01010101u8]); // 170, 85 + + // (offset, len, description) + let cases: &[(usize, usize, &str)] = &[ + (0, 8, "(offset=0, len=8) exact byte"), + (1, 7, "(offset=1, len=7) (offset+len)%8 == 0"), + (3, 5, "(offset=3, len=5) partial byte"), + (4, 8, "(offset=4, len=8) cross-byte"), + ]; + + for &(offset, len, desc) in cases { + // New AND vs legacy AND + let result_and = left.bitwise_binary(&right, offset, offset, len, |a, b| a & b); + let legacy_and = buffer_bin_and(&left, offset, &right, offset, len); + assert_eq!(result_and.as_slice(), &legacy_and[..], "AND {}", desc); + + // New OR vs legacy OR + let result_or = left.bitwise_binary(&right, offset, offset, len, |a, b| a | b); + let legacy_or = buffer_bin_or(&left, offset, &right, offset, len); + assert_eq!(result_or.as_slice(), &legacy_or[..], "OR {}", desc); + + // New NOT vs legacy NOT (unary, using only `left`) + let result_not = left.bitwise_unary(offset, len, |a| !a); + let legacy_not = buffer_unary_not(&left, offset, len); + assert_eq!(result_not.as_slice(), &legacy_not[..], "NOT {}", desc); + } + } } diff --git a/arrow-buffer/src/buffer/mutable.rs b/arrow-buffer/src/buffer/mutable.rs index 83c4d878346c..eae7fba82d8b 100644 --- a/arrow-buffer/src/buffer/mutable.rs +++ b/arrow-buffer/src/buffer/mutable.rs @@ -23,7 +23,7 @@ use crate::alloc::{ALIGNMENT, Deallocation}; use crate::{ bytes::Bytes, native::{ArrowNativeType, ToByteSlice}, - util::bit_util, + util::bit_util::{self, apply_bitwise_binary_op, apply_bitwise_unary_op}, }; #[cfg(feature = "pool")] @@ -596,6 +596,40 @@ impl MutableBuffer { pub fn claim(&self, pool: &dyn MemoryPool) { *self.reservation.lock().unwrap() = Some(pool.reserve(self.capacity())); } + + /// Apply a bitwise unary operation in place to this buffer. + /// + /// The operation is applied to `len` bits starting at `offset` bits. + /// + /// # Arguments + /// + /// * `offset` - The bit offset to start the operation. + /// * `len` - The number of bits to process. + /// * `op` - The unary operation to apply. + pub fn bitwise_unary_inplace(&mut self, offset: usize, len: usize, op: F) + where + F: Fn(u64) -> u64, + { + apply_bitwise_unary_op(self.as_slice_mut(), offset, len, op) + } + + /// Apply a bitwise binary operation in place between this buffer and another. + /// + /// The operation is applied to `len` bits starting at `self_offset` in self and `other_offset` in other. + /// + /// # Arguments + /// + /// * `other` - The other buffer. + /// * `self_offset` - The bit offset in self. + /// * `other_offset` - The bit offset in other. + /// * `len` - The number of bits to process. + /// * `op` - The binary operation to apply. + pub fn bitwise_binary_inplace(&mut self, other: &Buffer, self_offset: usize, other_offset: usize, len: usize, op: F) + where + F: Fn(u64, u64) -> u64, + { + apply_bitwise_binary_op(self.as_slice_mut(), self_offset, other.as_slice(), other_offset, len, op) + } } /// Creates a non-null pointer with alignment of [`ALIGNMENT`] @@ -931,6 +965,7 @@ impl std::iter::FromIterator for MutableBuffer { #[cfg(test)] mod tests { + use rand::{Rng, SeedableRng}; use super::*; #[test] @@ -1382,4 +1417,179 @@ mod tests { let data_1000: Vec = (0..1000).collect(); test_repeat_count(repeat_count, &data_1000); } + + #[test] + fn test_mutable_buffer_bitwise_inplace() { + let initial = vec![0b10101010u8, 0b01010101u8]; + let other = Buffer::from(vec![0b11110000u8, 0b00001111u8]); + + // Test in-place and vs allocating + let mut mutable = MutableBuffer::from(initial.clone()); + mutable.bitwise_binary_inplace(&other, 0, 0, 16, |a, b| a & b); + let inplace_result = Buffer::from(mutable); + + let allocating_result = Buffer::from(initial.clone()).bitwise_binary(&other, 0, 0, 16, |a, b| a & b); + + assert_eq!(inplace_result, allocating_result); + + // Test with offsets + let mut mutable = MutableBuffer::from(initial.clone()); + mutable.bitwise_binary_inplace(&other, 1, 1, 7, |a, b| a | b); + let inplace_result = Buffer::from(mutable); + + let allocating_result = Buffer::from(initial.clone()).bitwise_binary(&other, 1, 1, 7, |a, b| a | b); + + // Extract the operated bits: bits 1 to 7 from inplace_result (all in byte 0) + let operated_value = ((inplace_result.as_slice()[0] >> 1) & 0x7F) as u8; + assert_eq!(operated_value, allocating_result.as_slice()[0]); + + // Test unary in-place + let mut mutable = MutableBuffer::from(initial.clone()); + mutable.bitwise_unary_inplace(0, 16, |a| !a); + let inplace_result = Buffer::from(mutable); + + let allocating_result = Buffer::from(initial).bitwise_unary(0, 16, |a| !a); + + assert_eq!(inplace_result, allocating_result); + } + + #[test] + fn test_mutable_buffer_bitwise_inplace_random_equivalence() { + // Use a fixed seed for reproducible tests + let mut rng = rand::rngs::StdRng::from_seed([44u8; 32]); + + let buffer_sizes = [8, 16, 32, 64]; + let offsets = [0, 1, 3, 7, 8, 13]; + let lengths = [1, 2, 7, 8, 9, 15, 16]; + + for &size in &buffer_sizes { + for _ in 0..5 { // Generate 5 random pairs per size + let initial_vec: Vec = (0..size).map(|_| rng.random::()).collect(); + let other_vec: Vec = (0..size).map(|_| rng.random::()).collect(); + let other = Buffer::from(other_vec); + let buffer_bits = initial_vec.len() * 8; + + for &offset in &offsets { + if offset >= buffer_bits { continue; } + for &len in &lengths { + if offset + len > buffer_bits { continue; } + + // Test binary AND in-place + let mut mutable = MutableBuffer::from(initial_vec.clone()); + mutable.bitwise_binary_inplace(&other, offset, offset, len, |a, b| a & b); + let inplace_result = Buffer::from(mutable); + + let allocating_result = Buffer::from(initial_vec.clone()).bitwise_binary(&other, offset, offset, len, |a, b| a & b); + + // Compare bit by bit + for i in 0..len { + let inplace_bit = crate::bit_util::get_bit(inplace_result.as_slice(), offset + i); + let expected_bit = crate::bit_util::get_bit(allocating_result.as_slice(), i); + assert_eq!(inplace_bit, expected_bit, "AND bit {} mismatch for offset={}, len={}", i, offset, len); + } + + // Test binary OR in-place + let mut mutable = MutableBuffer::from(initial_vec.clone()); + mutable.bitwise_binary_inplace(&other, offset, offset, len, |a, b| a | b); + let inplace_result = Buffer::from(mutable); + + let allocating_result = Buffer::from(initial_vec.clone()).bitwise_binary(&other, offset, offset, len, |a, b| a | b); + + // Compare bit by bit + for i in 0..len { + let inplace_bit = crate::bit_util::get_bit(inplace_result.as_slice(), offset + i); + let expected_bit = crate::bit_util::get_bit(allocating_result.as_slice(), i); + assert_eq!(inplace_bit, expected_bit, "OR bit {} mismatch for offset={}, len={}", i, offset, len); + } + + // Test unary NOT in-place + let mut mutable = MutableBuffer::from(initial_vec.clone()); + mutable.bitwise_unary_inplace(offset, len, |a| !a); + let inplace_result = Buffer::from(mutable); + + let allocating_result = Buffer::from(initial_vec.clone()).bitwise_unary(offset, len, |a| !a); + + // Compare bit by bit + for i in 0..len { + let inplace_bit = crate::bit_util::get_bit(inplace_result.as_slice(), offset + i); + let expected_bit = crate::bit_util::get_bit(allocating_result.as_slice(), i); + assert_eq!(inplace_bit, expected_bit, "NOT bit {} mismatch for offset={}, len={}", i, offset, len); + } + } + } + } + } + } + + #[test] + fn test_mutable_buffer_bitwise_inplace_boundaries() { + // Use a fixed seed for reproducible tests + let mut rng = rand::rngs::StdRng::from_seed([47u8; 32]); + + // Create large buffers: 1024 bytes + let data: Vec = (0..1024).map(|_| rng.random::()).collect(); + let other_data: Vec = (0..1024).map(|_| rng.random::()).collect(); + let other = Buffer::from(other_data); + let total_bits = data.len() * 8; + + // Boundary configurations + let boundary_configs = vec![ + (0, 0), // zero length + (0, total_bits), // full length + (1, total_bits - 1), // offset 1, to end + (7, total_bits - 7), // offset 7, crosses byte + (8, total_bits - 8), // offset at byte boundary + (total_bits - 1, 1), // last bit + (total_bits - 8, 8), // last byte + (total_bits - 9, 9), // last byte plus one + ]; + + for (offset, len) in boundary_configs { + if offset + len > total_bits { + continue; // skip invalid + } + + // Test unary in-place + let mut mutable = MutableBuffer::from(data.clone()); + mutable.bitwise_unary_inplace(offset, len, |a| !a); + let inplace_result = Buffer::from(mutable); + + let allocating_result = Buffer::from(data.clone()).bitwise_unary(offset, len, |a| !a); + + // Compare bit by bit + for i in 0..len { + let inplace_bit = crate::bit_util::get_bit(inplace_result.as_slice(), offset + i); + let expected_bit = crate::bit_util::get_bit(allocating_result.as_slice(), i); + assert_eq!(inplace_bit, expected_bit, "Unary bit {} mismatch for offset={}, len={}", i, offset, len); + } + + // Test binary in-place AND + let mut mutable = MutableBuffer::from(data.clone()); + mutable.bitwise_binary_inplace(&other, offset, offset, len, |a, b| a & b); + let inplace_result = Buffer::from(mutable); + + let allocating_result = Buffer::from(data.clone()).bitwise_binary(&other, offset, offset, len, |a, b| a & b); + + // Compare bit by bit + for i in 0..len { + let inplace_bit = crate::bit_util::get_bit(inplace_result.as_slice(), offset + i); + let expected_bit = crate::bit_util::get_bit(allocating_result.as_slice(), i); + assert_eq!(inplace_bit, expected_bit, "Binary AND bit {} mismatch for offset={}, len={}", i, offset, len); + } + + // Test binary in-place OR + let mut mutable = MutableBuffer::from(data.clone()); + mutable.bitwise_binary_inplace(&other, offset, offset, len, |a, b| a | b); + let inplace_result = Buffer::from(mutable); + + let allocating_result = Buffer::from(data.clone()).bitwise_binary(&other, offset, offset, len, |a, b| a | b); + + // Compare bit by bit + for i in 0..len { + let inplace_bit = crate::bit_util::get_bit(inplace_result.as_slice(), offset + i); + let expected_bit = crate::bit_util::get_bit(allocating_result.as_slice(), i); + assert_eq!(inplace_bit, expected_bit, "Binary OR bit {} mismatch for offset={}, len={}", i, offset, len); + } + } + } } diff --git a/arrow-buffer/src/buffer/ops.rs b/arrow-buffer/src/buffer/ops.rs index c69e5c6deb10..f0c591ab8dd8 100644 --- a/arrow-buffer/src/buffer/ops.rs +++ b/arrow-buffer/src/buffer/ops.rs @@ -60,73 +60,39 @@ where /// Apply a bitwise operation `op` to two inputs and return the result as a Buffer. /// The inputs are treated as bitmaps, meaning that offsets and length are specified in number of bits. +#[deprecated(note = "use Buffer::bitwise_binary instead")] pub fn bitwise_bin_op_helper( left: &Buffer, left_offset_in_bits: usize, right: &Buffer, right_offset_in_bits: usize, len_in_bits: usize, - mut op: F, + op: F, ) -> Buffer where - F: FnMut(u64, u64) -> u64, + F: Fn(u64, u64) -> u64 + Copy, { - let left_chunks = left.bit_chunks(left_offset_in_bits, len_in_bits); - let right_chunks = right.bit_chunks(right_offset_in_bits, len_in_bits); - - let chunks = left_chunks - .iter() - .zip(right_chunks.iter()) - .map(|(left, right)| op(left, right)); - // Soundness: `BitChunks` is a `BitChunks` iterator which - // correctly reports its upper bound - let mut buffer = unsafe { MutableBuffer::from_trusted_len_iter(chunks) }; - - let remainder_bytes = ceil(left_chunks.remainder_len(), 8); - let rem = op(left_chunks.remainder_bits(), right_chunks.remainder_bits()); - // we are counting its starting from the least significant bit, to to_le_bytes should be correct - let rem = &rem.to_le_bytes()[0..remainder_bytes]; - buffer.extend_from_slice(rem); - - buffer.into() + left.bitwise_binary(right, left_offset_in_bits, right_offset_in_bits, len_in_bits, op) } /// Apply a bitwise operation `op` to one input and return the result as a Buffer. /// The input is treated as a bitmap, meaning that offset and length are specified in number of bits. +#[deprecated(note = "use Buffer::bitwise_unary instead")] pub fn bitwise_unary_op_helper( left: &Buffer, offset_in_bits: usize, len_in_bits: usize, - mut op: F, + op: F, ) -> Buffer where - F: FnMut(u64) -> u64, + F: Fn(u64) -> u64 + Copy, { - // reserve capacity and set length so we can get a typed view of u64 chunks - let mut result = - MutableBuffer::new(ceil(len_in_bits, 8)).with_bitset(len_in_bits / 64 * 8, false); - - let left_chunks = left.bit_chunks(offset_in_bits, len_in_bits); - - let result_chunks = result.typed_data_mut::().iter_mut(); - - result_chunks - .zip(left_chunks.iter()) - .for_each(|(res, left)| { - *res = op(left); - }); - - let remainder_bytes = ceil(left_chunks.remainder_len(), 8); - let rem = op(left_chunks.remainder_bits()); - // we are counting its starting from the least significant bit, to to_le_bytes should be correct - let rem = &rem.to_le_bytes()[0..remainder_bytes]; - result.extend_from_slice(rem); - - result.into() + left.bitwise_unary(offset_in_bits, len_in_bits, op) } /// Apply a bitwise and to two inputs and return the result as a Buffer. /// The inputs are treated as bitmaps, meaning that offsets and length are specified in number of bits. +#[deprecated(note = "use Buffer::bitwise_binary with |a, b| a & b or BooleanArray::binary instead")] pub fn buffer_bin_and( left: &Buffer, left_offset_in_bits: usize, @@ -134,18 +100,12 @@ pub fn buffer_bin_and( right_offset_in_bits: usize, len_in_bits: usize, ) -> Buffer { - bitwise_bin_op_helper( - left, - left_offset_in_bits, - right, - right_offset_in_bits, - len_in_bits, - |a, b| a & b, - ) + left.bitwise_binary(right, left_offset_in_bits, right_offset_in_bits, len_in_bits, |a, b| a & b) } /// Apply a bitwise or to two inputs and return the result as a Buffer. /// The inputs are treated as bitmaps, meaning that offsets and length are specified in number of bits. +#[deprecated(note = "use Buffer::bitwise_binary with |a, b| a | b or BooleanArray::binary instead")] pub fn buffer_bin_or( left: &Buffer, left_offset_in_bits: usize, @@ -153,18 +113,12 @@ pub fn buffer_bin_or( right_offset_in_bits: usize, len_in_bits: usize, ) -> Buffer { - bitwise_bin_op_helper( - left, - left_offset_in_bits, - right, - right_offset_in_bits, - len_in_bits, - |a, b| a | b, - ) + left.bitwise_binary(right, left_offset_in_bits, right_offset_in_bits, len_in_bits, |a, b| a | b) } /// Apply a bitwise xor to two inputs and return the result as a Buffer. /// The inputs are treated as bitmaps, meaning that offsets and length are specified in number of bits. +#[deprecated(note = "use Buffer::bitwise_binary with |a, b| a ^ b or BooleanArray::binary instead")] pub fn buffer_bin_xor( left: &Buffer, left_offset_in_bits: usize, @@ -172,18 +126,12 @@ pub fn buffer_bin_xor( right_offset_in_bits: usize, len_in_bits: usize, ) -> Buffer { - bitwise_bin_op_helper( - left, - left_offset_in_bits, - right, - right_offset_in_bits, - len_in_bits, - |a, b| a ^ b, - ) + left.bitwise_binary(right, left_offset_in_bits, right_offset_in_bits, len_in_bits, |a, b| a ^ b) } /// Apply a bitwise and_not to two inputs and return the result as a Buffer. /// The inputs are treated as bitmaps, meaning that offsets and length are specified in number of bits. +#[deprecated(note = "use Buffer::bitwise_binary with |a, b| a & !b or BooleanArray::binary instead")] pub fn buffer_bin_and_not( left: &Buffer, left_offset_in_bits: usize, @@ -191,18 +139,12 @@ pub fn buffer_bin_and_not( right_offset_in_bits: usize, len_in_bits: usize, ) -> Buffer { - bitwise_bin_op_helper( - left, - left_offset_in_bits, - right, - right_offset_in_bits, - len_in_bits, - |a, b| a & !b, - ) + left.bitwise_binary(right, left_offset_in_bits, right_offset_in_bits, len_in_bits, |a, b| a & !b) } /// Apply a bitwise not to one input and return the result as a Buffer. /// The input is treated as a bitmap, meaning that offset and length are specified in number of bits. +#[deprecated(note = "use Buffer::bitwise_unary with |a| !a or BooleanArray::unary instead")] pub fn buffer_unary_not(left: &Buffer, offset_in_bits: usize, len_in_bits: usize) -> Buffer { - bitwise_unary_op_helper(left, offset_in_bits, len_in_bits, |a| !a) + left.bitwise_unary(offset_in_bits, len_in_bits, |a| !a) } diff --git a/arrow-select/src/nullif.rs b/arrow-select/src/nullif.rs index 8e3cc7d56c71..cc7603f1ca32 100644 --- a/arrow-select/src/nullif.rs +++ b/arrow-select/src/nullif.rs @@ -17,11 +17,50 @@ //! Implements the `nullif` function for Arrow arrays. +/* + * NULLIF Implementation Contract + * + * For any ArrayData: + * len = data.len() // logical elements + * offset = data.offset() // logical starting index into buffers + * + * Validity bitmap (if present) is a Buffer B. + * Invariant: + * Logical index i in [0, len) is valid iff get_bit(B, offset + i) == true. + * + * For the result of nullif: + * We will build a fresh ArrayData with offset = 0. + * For that result: + * Logical index i is valid iff get_bit(result_validity, i) == true. + * Values buffer is laid out so element 0 is first result value, etc. + * + * For nullif semantics: + * Let V(i) = left is valid at i + * C(i) = condition "nullify at i" is true (depends on left, right, type) + * Then: + * result_valid(i) = V(i) & !C(i) + * result_value(i) = left_value(i) // when result_valid(i) == true + * + * This contract is the law. All nullif implementations must follow it. + * See docs/arraydata_bitmap_layout_contract.md for full details. + */ + use arrow_array::{Array, ArrayRef, BooleanArray, make_array}; -use arrow_buffer::buffer::{bitwise_bin_op_helper, bitwise_unary_op_helper}; -use arrow_buffer::{BooleanBuffer, NullBuffer}; +use arrow_buffer::{BooleanBuffer, Buffer, NullBuffer}; +use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{ArrowError, DataType}; +fn get_element_size(data_type: &DataType) -> usize { + match data_type { + DataType::Boolean => 1, + DataType::Int8 | DataType::UInt8 => 1, + DataType::Int16 | DataType::UInt16 => 2, + DataType::Int32 | DataType::UInt32 | DataType::Float32 => 4, + DataType::Int64 | DataType::UInt64 | DataType::Float64 => 8, + _ => panic!("unsupported"), + } +} + /// Returns a new array with the same values and the validity bit to false where /// the corresponding element of`right` is true. /// @@ -55,65 +94,164 @@ pub fn nullif(left: &dyn Array, right: &BooleanArray) -> Result right.values() & nulls.inner(), None => right.values().clone(), }; - - // Compute left null bitmap & !right - - let (combined, null_count) = match left_data.nulls() { - Some(left) => { - let mut valid_count = 0; - let b = bitwise_bin_op_helper( - left.buffer(), - left.offset(), - right.inner(), - right.offset(), - len, - |l, r| { - let t = l & !r; - valid_count += t.count_ones() as usize; - t - }, - ); - (b, len - valid_count) - } - None => { - let mut null_count = 0; - let buffer = bitwise_unary_op_helper(right.inner(), right.offset(), len, |b| { - let t = !b; - null_count += t.count_zeros() as usize; - t - }); - (buffer, null_count) + let cond_offset = cond_mask.offset(); + + let validity = if let Some(left_nulls) = left_data.nulls() { + let left_valid = left_nulls.inner().inner(); + let left_bit_offset = left_nulls.inner().offset(); + compute_nullif_validity(&left_valid, left_bit_offset, &cond_mask.inner(), cond_offset, len) + } else { + // left is all valid + let left_valid_len = (len + 7) / 8; + let mut left_valid_data = vec![255u8; left_valid_len]; + if len % 8 != 0 { + let mask = (1u8 << (len % 8)) - 1; + left_valid_data[left_valid_len - 1] = mask; } + let left_valid = Buffer::from(left_valid_data); + let left_bit_offset = 0; + compute_nullif_validity(&left_valid, left_bit_offset, &cond_mask.inner(), cond_offset, len) }; - let combined = BooleanBuffer::new(combined, 0, len); - // Safety: - // Counted nulls whilst computing - let nulls = unsafe { NullBuffer::new_unchecked(combined, null_count) }; - let data = left_data.into_builder().nulls(Some(nulls)); + let (null_buffer, _) = compute_null_buffer(&validity, len); + let nulls = NullBuffer::new(null_buffer); + + let data_without_nulls = copy_array_data_with_offset_zero(&left_data)?; + let data = ArrayDataBuilder::new(left_data.data_type().clone()) + .len(len) + .nulls(Some(nulls)) + .offset(0) + .buffers(data_without_nulls.buffers().to_vec()) + .child_data(data_without_nulls.child_data().to_vec()) + .build() + .unwrap(); + + Ok(make_array(data)) +} + +/// Computes the null buffer and null count from the validity buffer. +/// +/// The null buffer has bits set where validity is 1 (not null). +/// The null count is computed as len - number of set bits in validity. +fn compute_null_buffer(validity: &Buffer, len: usize) -> (BooleanBuffer, usize) { + let null_buffer = BooleanBuffer::new(validity.clone(), 0, len); + let null_count = len - validity.count_set_bits_offset(0, len); + (null_buffer, null_count) +} + +/// Computes the NULLIF validity bitmap from left validity and condition mask. +/// +/// For each logical index `i` in `0..len`: +/// - `left_bit = bit(left_valid, left_offset + i)` +/// - `cond_bit = bit(cond_mask, cond_offset + i)` +/// - `result_bit = left_bit & !cond_bit` +/// +/// This implements the core bitmap logic for NULLIF, isolating it from array-level concerns. +fn compute_nullif_validity( + left_valid: &Buffer, + left_offset: usize, + cond_mask: &Buffer, + cond_offset: usize, + len: usize, +) -> Buffer { + left_valid.bitwise_binary(cond_mask, left_offset, cond_offset, len, |l, c| l & !c) +} + +/// Creates a new ArrayData with offset=0 by slicing the buffers from the given ArrayData. +/// This ensures the result ArrayData has offset=0 as per the nullif contract. +fn copy_array_data_with_offset_zero(left_data: &ArrayData) -> Result { + let len = left_data.len(); + let offset = left_data.offset(); + let data_type = left_data.data_type(); + let mut buffers = Vec::new(); + let mut child_data = Vec::new(); + + match data_type { + DataType::Boolean => { + let values = left_data.buffers()[0].bitwise_unary(offset, len, |b| b); + buffers.push(values); + } + DataType::Int8 | DataType::UInt8 | DataType::Int16 | DataType::UInt16 | + DataType::Int32 | DataType::UInt32 | DataType::Float32 | + DataType::Int64 | DataType::UInt64 | DataType::Float64 => { + let element_size = get_element_size(data_type); + let start = offset * element_size; + let end = start + len * element_size; + let values = Buffer::from(&left_data.buffers()[0].as_slice()[start..end]); + buffers.push(values); + } + DataType::Utf8 => { + let offsets_buf = &left_data.buffers()[0]; + let data_buf = &left_data.buffers()[1]; + let offsets_start = offset; + let offsets_end = offset + len + 1; + let offsets_slice = &offsets_buf.as_slice()[offsets_start * 4..offsets_end * 4]; + let base_offset = i32::from_le_bytes(offsets_slice[0..4].try_into().unwrap()) as usize; + let data_end = i32::from_le_bytes(offsets_slice[offsets_slice.len() - 4..].try_into().unwrap()) as usize; + let data_slice = &data_buf.as_slice()[base_offset..data_end]; + let mut new_offsets = Vec::with_capacity(len + 1); + for i in 0..=len { + let old_offset = i32::from_le_bytes(offsets_slice[i * 4..(i + 1) * 4].try_into().unwrap()) as usize; + new_offsets.push((old_offset - base_offset) as i32); + } + let new_offsets_buf = Buffer::from(new_offsets.into_iter().flat_map(|x| x.to_le_bytes()).collect::>()); + buffers.push(new_offsets_buf); + buffers.push(Buffer::from(data_slice)); + } + DataType::LargeUtf8 => { + let offsets_buf = &left_data.buffers()[0]; + let data_buf = &left_data.buffers()[1]; + let offsets_start = offset; + let offsets_end = offset + len + 1; + let offsets_slice = &offsets_buf.as_slice()[offsets_start * 8..offsets_end * 8]; + let base_offset = i64::from_le_bytes(offsets_slice[0..8].try_into().unwrap()) as usize; + let data_end = i64::from_le_bytes(offsets_slice[offsets_slice.len() - 8..].try_into().unwrap()) as usize; + let data_slice = &data_buf.as_slice()[base_offset..data_end]; + let mut new_offsets = Vec::with_capacity(len + 1); + for i in 0..=len { + let old_offset = i64::from_le_bytes(offsets_slice[i * 8..(i + 1) * 8].try_into().unwrap()) as usize; + new_offsets.push((old_offset - base_offset) as i64); + } + let new_offsets_buf = Buffer::from(new_offsets.into_iter().flat_map(|x| x.to_le_bytes()).collect::>()); + buffers.push(new_offsets_buf); + buffers.push(Buffer::from(data_slice)); + } + DataType::Struct(_) => { + // No buffers for struct + child_data = left_data.child_data().iter() + .map(|child| copy_array_data_with_offset_zero(child)) + .collect::, _>>()?; + } + _ => { + // For other types, copy buffers as is and keep offset (for now, to avoid breaking) + // TODO: implement slicing for other types like List, Binary, etc. + buffers = left_data.buffers().to_vec(); + child_data = left_data.child_data().to_vec(); + return Ok(ArrayDataBuilder::new(data_type.clone()) + .len(len) + .offset(offset) + .buffers(buffers) + .child_data(child_data) + .build()?); + } + } - // SAFETY: - // Only altered null mask - Ok(make_array(unsafe { data.build_unchecked() })) + Ok(ArrayDataBuilder::new(data_type.clone()) + .len(len) + .offset(0) + .buffers(buffers) + .child_data(child_data) + .build()?) } #[cfg(test)] mod tests { use super::*; + use arrow_buffer::bit_util::get_bit; use arrow_array::builder::{BooleanBuilder, Int32Builder, StructBuilder}; use arrow_array::cast::AsArray; use arrow_array::types::Int32Type; @@ -526,4 +664,230 @@ mod tests { } } } + + #[test] + fn test_nullif_with_offsets_and_nulls() { + // Build arrays using builder to ensure correct null count + let array = Int32Array::from(vec![Some(15), None, Some(8), Some(1), Some(9)]); + + let condition = BooleanArray::from(vec![Some(false), None, Some(true), Some(false), None]); + + // Test on full arrays + let result_full = nullif(&array, &condition).unwrap(); + let expected_full = Int32Array::from(vec![ + Some(15), // false -> keep + None, // null condition -> keep (was null) + None, // true -> null + Some(1), // false -> keep + Some(9), // null condition -> keep + ]); + assert_eq!(result_full.as_primitive::(), &expected_full); + + // Test on sliced arrays + let array_sliced = array.slice(1, 3); // None, Some(8), Some(1) + let condition_sliced = condition.slice(1, 3); // None, Some(true), Some(false) + let result_sliced = nullif(&array_sliced, &condition_sliced).unwrap(); + + let expected_sliced = Int32Array::from(vec![ + None, // null condition -> keep (was null) + None, // true -> null + Some(1), // false -> keep + ]); + assert_eq!(result_sliced.as_primitive::(), &expected_sliced); + + // Test with different condition + let array2 = Int32Array::from(vec![Some(10), None, Some(20), Some(30), None]); + + let condition2 = BooleanArray::from(vec![Some(false), Some(true), Some(false), Some(true), Some(false)]); + + let result = nullif(&array2, &condition2).unwrap(); + let expected = Int32Array::from(vec![ + Some(10), // false -> keep + None, // true -> null (was null) + Some(20), // false -> keep + None, // true -> null + None, // false -> keep (was null) + ]); + assert_eq!(result.as_primitive::(), &expected); + } + + #[test] + fn test_nullif_null_count_and_offsets_regressions() { + // Construct a small Int32Array with nulls + let values = Int32Array::from(vec![Some(1), None, Some(2), Some(1), None, Some(3)]); + // Construct a "condition" array (equals values for nullif) + let equals = BooleanArray::from(vec![Some(true), Some(false), None, Some(true), Some(false), None]); + + // Helper to build expected Vec> + fn build_expected(values: &[Option], equals: &[Option]) -> Vec> { + values.iter().zip(equals.iter()).map(|(&v, &e)| { + if v.is_none() { + None + } else if e == Some(true) { + None + } else { + v + } + }).collect() + } + + // Test full arrays + let result_full = nullif(&values, &equals).unwrap(); + assert_eq!(result_full.len(), values.len()); + let actual_nulls: Vec> = result_full.as_primitive::().iter().collect(); + let expected_full = build_expected(&[Some(1), None, Some(2), Some(1), None, Some(3)], &[Some(true), Some(false), None, Some(true), Some(false), None]); + assert_eq!(actual_nulls, expected_full); + assert_eq!(result_full.null_count(), actual_nulls.iter().filter(|x| x.is_none()).count()); + + // Test sliced arrays: values.slice(1, 4) and equals.slice(1, 4) + // values[1..5]: None, Some(2), Some(1), None + // equals[1..5]: Some(false), None, Some(true), Some(false) + let values_slice1 = values.slice(1, 4); + let equals_slice1 = equals.slice(1, 4); + let result_slice1 = nullif(&values_slice1, &equals_slice1).unwrap(); + assert_eq!(result_slice1.len(), 4); + let actual_slice1: Vec> = result_slice1.as_primitive::().iter().collect(); + let expected_slice1 = build_expected(&[None, Some(2), Some(1), None], &[Some(false), None, Some(true), Some(false)]); + assert_eq!(actual_slice1, expected_slice1); + assert_eq!(result_slice1.null_count(), actual_slice1.iter().filter(|x| x.is_none()).count()); + + // Test another slice: values.slice(2, 3) and equals.slice(2, 3) + // values[2..5]: Some(2), Some(1), None + // equals[2..5]: None, Some(true), Some(false) + let values_slice2 = values.slice(2, 3); + let equals_slice2 = equals.slice(2, 3); + let result_slice2 = nullif(&values_slice2, &equals_slice2).unwrap(); + assert_eq!(result_slice2.len(), 3); + let actual_slice2: Vec> = result_slice2.as_primitive::().iter().collect(); + let expected_slice2 = build_expected(&[Some(2), Some(1), None], &[None, Some(true), Some(false)]); + assert_eq!(actual_slice2, expected_slice2); + assert_eq!(result_slice2.null_count(), actual_slice2.iter().filter(|x| x.is_none()).count()); + } + + #[test] + fn test_nullif_boolean_offsets_regression() { + // Use BooleanArray to reproduce BooleanBuffer slicing issues + let values = BooleanArray::from(vec![Some(true), None, Some(false), Some(true), None, Some(false), Some(true), Some(false)]); + let equals = BooleanArray::from(vec![Some(false), Some(true), None, Some(true), Some(false), None, Some(true), Some(false)]); + + // Helper to build expected Vec> + fn build_expected_bool(values: &[Option], equals: &[Option]) -> Vec> { + values.iter().zip(equals.iter()).map(|(&v, &e)| { + if v.is_none() { + None + } else if e == Some(true) { + None + } else { + v + } + }).collect() + } + + // Test slices that cross byte boundaries (e.g., offset=1,len=5; offset=3,len=5; offset=7,len=1) + // Slice 1: offset=1, len=5 (values[1..6]: None, Some(false), Some(true), None, Some(false)) + // equals[1..6]: Some(true), None, Some(true), Some(false), None + let values_slice1 = values.slice(1, 5); + let equals_slice1 = equals.slice(1, 5); + let result_slice1 = nullif(&values_slice1, &equals_slice1).unwrap(); + assert_eq!(result_slice1.len(), 5); + let actual_slice1: Vec> = result_slice1.as_boolean().iter().collect(); + let expected_slice1 = build_expected_bool(&[None, Some(false), Some(true), None, Some(false)], &[Some(true), None, Some(true), Some(false), None]); + assert_eq!(actual_slice1, expected_slice1); + assert_eq!(result_slice1.null_count(), actual_slice1.iter().filter(|x| x.is_none()).count()); + + // Slice 2: offset=3, len=5 (values[3..8]: Some(true), None, Some(false), Some(true), Some(false)) + // equals[3..8]: Some(true), Some(false), None, Some(true), Some(false) + let values_slice2 = values.slice(3, 5); + let equals_slice2 = equals.slice(3, 5); + let result_slice2 = nullif(&values_slice2, &equals_slice2).unwrap(); + assert_eq!(result_slice2.len(), 5); + let actual_slice2: Vec> = result_slice2.as_boolean().iter().collect(); + let expected_slice2 = build_expected_bool(&[Some(true), None, Some(false), Some(true), Some(false)], &[Some(true), Some(false), None, Some(true), Some(false)]); + assert_eq!(actual_slice2, expected_slice2); + assert_eq!(result_slice2.null_count(), actual_slice2.iter().filter(|x| x.is_none()).count()); + + // Slice 3: offset=7, len=1 (values[7..8]: Some(false)) + // equals[7..8]: Some(false) + let values_slice3 = values.slice(7, 1); + let equals_slice3 = equals.slice(7, 1); + let result_slice3 = nullif(&values_slice3, &equals_slice3).unwrap(); + assert_eq!(result_slice3.len(), 1); + let actual_slice3: Vec> = result_slice3.as_boolean().iter().collect(); + let expected_slice3 = build_expected_bool(&[Some(false)], &[Some(false)]); + assert_eq!(actual_slice3, expected_slice3); + assert_eq!(result_slice3.null_count(), actual_slice3.iter().filter(|x| x.is_none()).count()); + } + + #[test] + fn test_nullif_all_null_and_empty() { + // Test all-null input + let all_null = Int32Array::from(vec![None, None, None]); + let condition = BooleanArray::from(vec![Some(true), Some(false), None]); + let result = nullif(&all_null, &condition).unwrap(); + assert_eq!(result.len(), 3); + assert_eq!(result.null_count(), 3); + let actual: Vec> = result.as_primitive::().iter().collect(); + assert_eq!(actual, vec![None, None, None]); + + // Test no-null input + let no_null = Int32Array::from(vec![Some(1), Some(2), Some(3)]); + let condition = BooleanArray::from(vec![Some(false), Some(true), Some(false)]); + let result = nullif(&no_null, &condition).unwrap(); + assert_eq!(result.len(), 3); + assert_eq!(result.null_count(), 1); + let actual: Vec> = result.as_primitive::().iter().collect(); + assert_eq!(actual, vec![Some(1), None, Some(3)]); + + // Test empty arrays + let empty_values = Int32Array::from(Vec::>::new()); + let empty_condition = BooleanArray::from(Vec::>::new()); + let result = nullif(&empty_values, &empty_condition).unwrap(); + assert_eq!(result.len(), 0); + assert_eq!(result.null_count(), 0); + } + + #[test] + fn test_compute_nullif_validity_basic_and_offsets() { + // left: 0b11110000, 0b00001111 + // cond: 0b10101010, 0b01010101 + let left = Buffer::from(vec![0b11110000u8, 0b00001111u8]); + let cond = Buffer::from(vec![0b10101010u8, 0b01010101u8]); + + // Cases: (left_offset, cond_offset, len, expected_bits as Vec) + let cases = vec![ + // No offset, 8 bits + (0, 0, 8, vec![ + // left bits: 0 0 0 0 1 1 1 1 + // cond bits: 0 1 0 1 0 1 0 1 + // result: 0&!0=0, 0&!1=0, 0&!0=0, 0&!1=0, 1&!0=1, 1&!1=0, 1&!0=1, 1&!1=0 + // => 0,0,0,0,1,0,1,0 + false, false, false, false, true, false, true, false + ]), + // Offsets that make (offset+len)%8==0 + (1, 1, 7, vec![ + // left_offset=1, left bits 1-7: 0,0,0,1,1,1,1 + // cond_offset=1, cond bits 1-7: 1,0,1,0,1,0,1 + // result: 0&!1=0, 0&!0=0, 0&!1=0, 1&!0=1, 1&!1=0, 1&!0=1, 1&!1=0 + // => 0,0,0,1,0,1,0 + false, false, false, true, false, true, false + ]), + // A cross-byte case + (3, 3, 5, vec![ + // left_offset=3, left bits 3-7: 0,1,1,1,1 + // cond_offset=3, cond bits 3-7: 1,0,1,0,1 + // result: 0&!1=0, 1&!0=1, 1&!1=0, 1&!0=1, 1&!1=0 + // => 0,1,0,1,0 + false, true, false, true, false + ]), + ]; + + for (loff, coff, len, expected) in cases { + let mask = compute_nullif_validity(&left, loff, &cond, coff, len); + // Check each bit in the resulting Buffer against expected + for i in 0..len { + let bit = get_bit(mask.as_slice(), i); + assert_eq!(bit, expected[i], "mismatch at bit {} for offsets ({}, {})", i, loff, coff); + } + } + } } diff --git a/arrow/benches/buffer_bit_ops.rs b/arrow/benches/buffer_bit_ops.rs index c569224b0f9b..2d0cca0e601d 100644 --- a/arrow/benches/buffer_bit_ops.rs +++ b/arrow/benches/buffer_bit_ops.rs @@ -22,7 +22,7 @@ use criterion::{Criterion, Throughput}; extern crate arrow; -use arrow::buffer::{Buffer, MutableBuffer, buffer_bin_and, buffer_bin_or, buffer_unary_not}; +use arrow::buffer::{Buffer, MutableBuffer}; use std::hint; /// Helper function to create arrays @@ -37,15 +37,15 @@ fn create_buffer(size: usize) -> Buffer { } fn bench_buffer_and(left: &Buffer, right: &Buffer) { - hint::black_box(buffer_bin_and(left, 0, right, 0, left.len() * 8)); + hint::black_box(left.bitwise_binary(right, 0, 0, left.len() * 8, |a, b| a & b)); } fn bench_buffer_or(left: &Buffer, right: &Buffer) { - hint::black_box(buffer_bin_or(left, 0, right, 0, left.len() * 8)); + hint::black_box(left.bitwise_binary(right, 0, 0, left.len() * 8, |a, b| a | b)); } fn bench_buffer_not(buffer: &Buffer) { - hint::black_box(buffer_unary_not(buffer, 0, buffer.len() * 8)); + hint::black_box(buffer.bitwise_unary(0, buffer.len() * 8, |a| !a)); } fn bench_buffer_and_with_offsets( @@ -55,7 +55,7 @@ fn bench_buffer_and_with_offsets( right_offset: usize, len: usize, ) { - hint::black_box(buffer_bin_and(left, left_offset, right, right_offset, len)); + hint::black_box(left.bitwise_binary(right, left_offset, right_offset, len, |a, b| a & b)); } fn bench_buffer_or_with_offsets( @@ -65,18 +65,33 @@ fn bench_buffer_or_with_offsets( right_offset: usize, len: usize, ) { - hint::black_box(buffer_bin_or(left, left_offset, right, right_offset, len)); + hint::black_box(left.bitwise_binary(right, left_offset, right_offset, len, |a, b| a | b)); } fn bench_buffer_not_with_offsets(buffer: &Buffer, offset: usize, len: usize) { - hint::black_box(buffer_unary_not(buffer, offset, len)); + hint::black_box(buffer.bitwise_unary(offset, len, |a| !a)); +} + +fn bench_mutable_buffer_and(left: &mut MutableBuffer, right: &Buffer) { + hint::black_box(left.bitwise_binary_inplace(right, 0, 0, left.len() * 8, |a, b| a & b)); +} + +fn bench_mutable_buffer_or(left: &mut MutableBuffer, right: &Buffer) { + hint::black_box(left.bitwise_binary_inplace(right, 0, 0, left.len() * 8, |a, b| a | b)); +} + +fn bench_mutable_buffer_not(buffer: &mut MutableBuffer) { + hint::black_box(buffer.bitwise_unary_inplace(0, buffer.len() * 8, |a| !a)); } fn bit_ops_benchmark(c: &mut Criterion) { let left = create_buffer(512 * 10); let right = create_buffer(512 * 10); + let mut mutable_left = MutableBuffer::from(left.as_slice().to_vec()); + let mutable_right = Buffer::from(right.as_slice().to_vec()); - c.benchmark_group("buffer_binary_ops") + // Allocating benchmarks + c.benchmark_group("buffer_bitwise_alloc_binary_ops") .throughput(Throughput::Bytes(3 * left.len() as u64)) .bench_function("and", |b| b.iter(|| bench_buffer_and(&left, &right))) .bench_function("or", |b| b.iter(|| bench_buffer_or(&left, &right))) @@ -87,12 +102,22 @@ fn bit_ops_benchmark(c: &mut Criterion) { b.iter(|| bench_buffer_or_with_offsets(&left, 1, &right, 2, left.len() * 8 - 5)) }); - c.benchmark_group("buffer_unary_ops") + c.benchmark_group("buffer_bitwise_alloc_unary_ops") .throughput(Throughput::Bytes(2 * left.len() as u64)) .bench_function("not", |b| b.iter(|| bench_buffer_not(&left))) .bench_function("not_with_offset", |b| { b.iter(|| bench_buffer_not_with_offsets(&left, 1, left.len() * 8 - 5)) }); + + // In-place benchmarks + c.benchmark_group("buffer_bitwise_inplace_binary_ops") + .throughput(Throughput::Bytes(2 * mutable_left.len() as u64)) + .bench_function("and", |b| b.iter(|| bench_mutable_buffer_and(&mut mutable_left, &mutable_right))) + .bench_function("or", |b| b.iter(|| bench_mutable_buffer_or(&mut mutable_left, &mutable_right))); + + c.benchmark_group("buffer_bitwise_inplace_unary_ops") + .throughput(Throughput::Bytes(mutable_left.len() as u64)) + .bench_function("not", |b| b.iter(|| bench_mutable_buffer_not(&mut mutable_left))); } criterion_group!(benches, bit_ops_benchmark); diff --git a/docs/arraydata_bitmap_layout_contract.md b/docs/arraydata_bitmap_layout_contract.md new file mode 100644 index 000000000000..73d401878414 --- /dev/null +++ b/docs/arraydata_bitmap_layout_contract.md @@ -0,0 +1,76 @@ +# ArrayData + Bitmap Layout Contract + +This document specifies the invariants for `ArrayData`, offsets, and bitmap buffers (validity and condition bitmaps) in Arrow-rs. These rules are derived from the actual implementation and must be treated as ground truth for bitwise and null-handling operations like `nullif`. + +## 1. Basics + +- **Bit Numbering**: Bits are numbered starting from 0 as the least significant bit (LSB) of byte 0, then bit 1 as the next LSB, and so on, across bytes in little-endian order. +- **Validity Bitmap Invariant**: + > For an `ArrayData` with `len = L`, `offset = O` (in elements), and validity bitmap buffer `B`: + > Logical index `i` in `[0, L)` is valid iff `get_bit(B, O + i) == true`. +- **Null Count**: If `null_count` is set, it MUST equal the count of `false` bits in the validity bitmap over the logical range `[0, L)`. + +## 2. Slicing + +- **Slice Operation**: `array.slice(offset', len')` produces a new `ArrayData` with: + - `len = len'`, + - `offset = O + offset'`, + - Shared underlying buffers (no copying). +- **Sliced Validity Invariant**: + > For the sliced `ArrayData` with `len = L'`, `offset = O'`, and the same validity bitmap buffer `B`: + > Logical index `i` in `[0, L')` is valid iff `get_bit(B, O' + i) == true`. + +## 3. Constructing New ArrayData with a Fresh Validity Buffer + +- **Case 1: Offset = 0 and New Validity Bitmap**: + - Bit `i` in the validity bitmap `B'` corresponds directly to logical index `i` for `i` in `[0, L)`. + - `null_count` MUST equal the number of `false` bits in `B'` over `[0, L)`, or be left unset for Arrow to compute. +- **Case 2: Reusing Existing Null Buffer with Offset**: + - When reusing a `NullBuffer` from an existing array, the `offset` in the new `ArrayData` adjusts the logical-to-physical bit mapping as per the basic invariant. + - Ensure the reused buffer's bits align with the new `len` and `offset`; `null_count` must reflect the new logical range. + +## 4. `nullif`-Specific Contract + +- **Condition Bitmap**: A condition bitmap `C` indicates elements to nullify: logical index `i` should be nulled if `C[i] == true`. +- **Validity Computation Rule**: + > Given left validity `V` and condition `C` (both in logical index space): + > `result_valid(i) = V(i) & !C(i)`. +- **Buffer Mapping**: + - For left array with `offset_left`, bit index for `V(i)` is `offset_left + i`. + - For condition array with `offset_cond`, bit index for `C(i)` is `offset_cond + i`. + - The result validity bitmap should be constructed with `offset = 0` and bits computed as above. + +## 5. Do/Don't Guidelines + +- **DO**: Centralize bitmap math in helpers that take `(buffer, bit_offset, len)` and return a new bitmap with `offset = 0`. +- **DO**: Ensure `null_count` matches the validity bitmap or leave it unset for automatic computation. +- **DON'T**: Manually slice buffers and add offsets (avoid double-compensation); use the slicing semantics instead. + +## 6. Nullif Implementation Contract + +For any ArrayData: + +len = data.len() // logical elements +offset = data.offset() // logical starting index into buffers + +Validity bitmap (if present) is a Buffer B. + +Invariant: + Logical index i in [0, len) is valid iff get_bit(B, offset + i) == true. + +For the result of nullif: + +We will build a fresh ArrayData with offset = 0. + +For that result: + Logical index i is valid iff get_bit(result_validity, i) == true. + Values buffer is laid out so element 0 is first result value, etc. + +For nullif semantics: + +Let V(i) = left is valid at i + C(i) = condition "nullify at i" is true (depends on left, right, type) + +Then: + result_valid(i) = V(i) & !C(i) + result_value(i) = left_value(i) // when result_valid(i) == true \ No newline at end of file