Open
Description
Describe the bug
There are 10 x 10k rows StringViewArray
, permute all elements randomly, and use interleave
kernel to generate the same shape 10 x 10k rows array, the memory consumption will grow by 10X
To Reproduce
(place in arrow/examples)
extern crate arrow;
use std::sync::Arc;
use arrow::array::ArrayRef;
use arrow_array::StringViewArray;
use arrow_select::interleave::interleave;
use rand::{seq::SliceRandom, thread_rng};
fn main() {
// Create 10 StringViewArray, each with 100 char long, and 10k elements
let num_arrays = 10;
let num_elements = 10_000;
let string_length = 100;
let mut arrays: Vec<ArrayRef> = Vec::with_capacity(num_arrays);
for _ in 0..num_arrays {
let strings: Vec<String> = (0..num_elements)
.map(|_| "a".repeat(string_length))
.collect();
let array = Arc::new(StringViewArray::from(strings)) as ArrayRef;
arrays.push(array);
}
// Measure memory consumption before interleaving
let memory_before = measure_memory(&arrays);
// Randomly permute indices for interleaving
let mut rng = thread_rng();
let mut all_indices: Vec<(usize, usize)> = (0..num_arrays)
.flat_map(|array_index| {
(0..num_elements).map(move |element_index| (array_index, element_index))
})
.collect();
all_indices.shuffle(&mut rng);
let indices: Vec<Vec<(usize, usize)>> = all_indices
.chunks(num_elements)
.map(|chunk| chunk.to_vec())
.collect();
// Interleave into 10 smaller arrays
let interleaved_arrays: Vec<ArrayRef> = indices
.iter()
.map(|index_set| {
interleave(
&arrays.iter().map(|a| a.as_ref()).collect::<Vec<_>>(),
index_set,
)
.expect("Interleave failed")
})
.collect();
// Measure memory consumption after interleaving
let memory_after = measure_memory(&interleaved_arrays);
println!("Memory before interleaving: {} bytes", memory_before);
println!("Memory after interleaving: {} bytes", memory_after);
}
fn measure_memory(arrays: &Vec<ArrayRef>) -> usize {
arrays
.iter()
.map(|array| array.get_array_memory_size())
.sum()
}
Result:
Memory before interleaving: 11923120 bytes
Memory after interleaving: 104820400 bytes
Expected behavior
Memory overhead should be similar
Additional context
Originally founded by apache/datafusion#12136 (comment)