Skip to content

Commit 894e797

Browse files
committed
v2 impl
1 parent 042d725 commit 894e797

File tree

2 files changed

+155
-19
lines changed

2 files changed

+155
-19
lines changed

arrow-array/src/array/byte_view_array.rs

+29
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,21 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
261261
unsafe { self.value_unchecked(i) }
262262
}
263263

264+
/// Returns the inline view data at index `i`
265+
pub unsafe fn prefix_bytes_unchecked(&self, prefix_len: usize, idx: usize) -> &[u8] {
266+
let v = self.views.get_unchecked(idx);
267+
let len = (*v as u32) as usize;
268+
269+
if prefix_len <= 4 || (prefix_len <= 12 && len <= 12) {
270+
Self::inline_value(v, prefix_len)
271+
} else {
272+
let view = ByteView::from(*v);
273+
let data = self.buffers.get_unchecked(view.buffer_index as usize);
274+
let offset = view.offset as usize;
275+
data.get_unchecked(offset..offset + prefix_len)
276+
}
277+
}
278+
264279
/// Returns the element at index `i`
265280
/// # Safety
266281
/// Caller is responsible for ensuring that the index is within the bounds of the array
@@ -278,6 +293,20 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
278293
T::Native::from_bytes_unchecked(b)
279294
}
280295

296+
/// Returns the bytes at index `i`
297+
pub unsafe fn bytes_unchecked(&self, idx: usize) -> &[u8] {
298+
let v = self.views.get_unchecked(idx);
299+
let len = *v as u32;
300+
if len <= 12 {
301+
Self::inline_value(v, len as usize)
302+
} else {
303+
let view = ByteView::from(*v);
304+
let data = self.buffers.get_unchecked(view.buffer_index as usize);
305+
let offset = view.offset as usize;
306+
data.get_unchecked(offset..offset + len as usize)
307+
}
308+
}
309+
281310
/// Returns the inline value of the view.
282311
///
283312
/// # Safety

arrow-string/src/predicate.rs

+126-19
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18-
use arrow_array::{ArrayAccessor, BooleanArray};
18+
use arrow_array::{Array, ArrayAccessor, BooleanArray, StringViewArray};
19+
use arrow_buffer::BooleanBuffer;
1920
use arrow_schema::ArrowError;
2021
use memchr::memchr2;
2122
use memchr::memmem::Finder;
@@ -111,24 +112,130 @@ impl<'a> Predicate<'a> {
111112
Predicate::Eq(v) => BooleanArray::from_unary(array, |haystack| {
112113
(haystack.len() == v.len() && haystack == *v) != negate
113114
}),
114-
Predicate::IEqAscii(v) => BooleanArray::from_unary(array, |haystack| {
115-
haystack.eq_ignore_ascii_case(v) != negate
116-
}),
117-
Predicate::Contains(finder) => BooleanArray::from_unary(array, |haystack| {
118-
finder.find(haystack.as_bytes()).is_some() != negate
119-
}),
120-
Predicate::StartsWith(v) => BooleanArray::from_unary(array, |haystack| {
121-
starts_with(haystack, v, equals_kernel) != negate
122-
}),
123-
Predicate::IStartsWithAscii(v) => BooleanArray::from_unary(array, |haystack| {
124-
starts_with(haystack, v, equals_ignore_ascii_case_kernel) != negate
125-
}),
126-
Predicate::EndsWith(v) => BooleanArray::from_unary(array, |haystack| {
127-
ends_with(haystack, v, equals_kernel) != negate
128-
}),
129-
Predicate::IEndsWithAscii(v) => BooleanArray::from_unary(array, |haystack| {
130-
ends_with(haystack, v, equals_ignore_ascii_case_kernel) != negate
131-
}),
115+
Predicate::IEqAscii(v) => match array.as_any().downcast_ref::<StringViewArray>() {
    // Fast path for string views: compare raw bytes and reuse the input's
    // null mask instead of going through the generic accessor.
    Some(views) => {
        let needle = v.as_bytes();
        let nulls = views.logical_nulls();
        let matches = BooleanBuffer::collect_bool(views.len(), |row| {
            // `row` is produced by `collect_bool` for a length of `views.len()`.
            let haystack = unsafe { views.bytes_unchecked(row) };
            haystack.eq_ignore_ascii_case(needle) != negate
        });

        BooleanArray::new(matches, nulls)
    }
    // Any other array type: generic element-wise evaluation.
    None => BooleanArray::from_unary(array, |haystack| {
        haystack.eq_ignore_ascii_case(v) != negate
    }),
}
133+
Predicate::Contains(finder) => match array.as_any().downcast_ref::<StringViewArray>() {
    // Fast path for string views: search the raw bytes and reuse the input's
    // null mask instead of going through the generic accessor.
    Some(views) => {
        let nulls = views.logical_nulls();
        let matches = BooleanBuffer::collect_bool(views.len(), |row| {
            // `row` is produced by `collect_bool` for a length of `views.len()`.
            let haystack = unsafe { views.bytes_unchecked(row) };
            finder.find(haystack).is_some() != negate
        });

        BooleanArray::new(matches, nulls)
    }
    // Any other array type: generic element-wise evaluation.
    None => BooleanArray::from_unary(array, |haystack| {
        finder.find(haystack.as_bytes()).is_some() != negate
    }),
}
151+
Predicate::StartsWith(v) => {
    if let Some(string_view_array) = array.as_any().downcast_ref::<StringViewArray>() {
        // Fast path for string views: most prefixes can be checked against
        // the bytes stored in the view without touching the data buffers.
        let needle_bytes = v.as_bytes();
        let needle_len = needle_bytes.len();
        let null_buffer = string_view_array.logical_nulls();
        let views = string_view_array.views();
        let boolean_buffer =
            BooleanBuffer::collect_bool(string_view_array.len(), |i| {
                // The low 32 bits of a view hold the value length. A value
                // shorter than the needle can never start with it, and must
                // not be asked for `needle_len` prefix bytes.
                let haystack_len = views[i] as u32 as usize;
                let matched = haystack_len >= needle_len
                    && zip(
                        // `i` is in bounds and the value has at least
                        // `needle_len` bytes (checked just above).
                        unsafe { string_view_array.prefix_bytes_unchecked(needle_len, i) },
                        needle_bytes,
                    )
                    .all(equals_kernel);
                // Honour `negate` exactly like the generic path below.
                matched != negate
            });

        BooleanArray::new(boolean_buffer, null_buffer)
    } else {
        BooleanArray::from_unary(array, |haystack| {
            starts_with(haystack, v, equals_kernel) != negate
        })
    }
}
172+
Predicate::IStartsWithAscii(v) => {
    if let Some(string_view_array) = array.as_any().downcast_ref::<StringViewArray>() {
        // Fast path for string views: most prefixes can be checked against
        // the bytes stored in the view without touching the data buffers.
        let needle_bytes = v.as_bytes();
        let needle_len = needle_bytes.len();
        let null_buffer = string_view_array.logical_nulls();
        let views = string_view_array.views();
        let boolean_buffer =
            BooleanBuffer::collect_bool(string_view_array.len(), |i| {
                // The low 32 bits of a view hold the value length. A value
                // shorter than the needle can never start with it, and must
                // not be asked for `needle_len` prefix bytes.
                let haystack_len = views[i] as u32 as usize;
                let matched = haystack_len >= needle_len
                    && zip(
                        // `i` is in bounds and the value has at least
                        // `needle_len` bytes (checked just above).
                        unsafe { string_view_array.prefix_bytes_unchecked(needle_len, i) },
                        needle_bytes,
                    )
                    .all(equals_ignore_ascii_case_kernel);
                // Honour `negate` exactly like the generic path below.
                matched != negate
            });

        BooleanArray::new(boolean_buffer, null_buffer)
    } else {
        BooleanArray::from_unary(array, |haystack| {
            starts_with(haystack, v, equals_ignore_ascii_case_kernel) != negate
        })
    }
}
193+
Predicate::EndsWith(v) => {
    if let Some(string_view_array) = array.as_any().downcast_ref::<StringViewArray>() {
        // Fast path for string views: compare raw bytes and reuse the
        // input's null mask instead of going through the generic accessor.
        let needle_bytes = v.as_bytes();
        let needle_len = needle_bytes.len();
        let null_buffer = string_view_array.logical_nulls();
        let boolean_buffer =
            BooleanBuffer::collect_bool(string_view_array.len(), |i| {
                // A suffix check needs the END of the value, so the whole
                // value is fetched rather than its prefix.
                // `i` is produced by `collect_bool` for a length of
                // `string_view_array.len()`.
                let haystack = unsafe { string_view_array.bytes_unchecked(i) };
                // Walk both slices backwards; `zip` stops after `needle_len`
                // pairs because the haystack is at least that long.
                let matched = haystack.len() >= needle_len
                    && zip(haystack.iter().rev(), needle_bytes.iter().rev())
                        .all(equals_kernel);
                // Honour `negate` exactly like the generic path below.
                matched != negate
            });

        BooleanArray::new(boolean_buffer, null_buffer)
    } else {
        BooleanArray::from_unary(array, |haystack| {
            ends_with(haystack, v, equals_kernel) != negate
        })
    }
}
216+
Predicate::IEndsWithAscii(v) => {
    if let Some(string_view_array) = array.as_any().downcast_ref::<StringViewArray>() {
        // Fast path for string views: compare raw bytes and reuse the
        // input's null mask instead of going through the generic accessor.
        let needle_bytes = v.as_bytes();
        let needle_len = needle_bytes.len();
        let null_buffer = string_view_array.logical_nulls();
        let boolean_buffer =
            BooleanBuffer::collect_bool(string_view_array.len(), |i| {
                // A suffix check needs the END of the value, so the whole
                // value is fetched rather than its prefix.
                // `i` is produced by `collect_bool` for a length of
                // `string_view_array.len()`.
                let haystack = unsafe { string_view_array.bytes_unchecked(i) };
                // Walk both slices backwards; `zip` stops after `needle_len`
                // pairs because the haystack is at least that long.
                let matched = haystack.len() >= needle_len
                    && zip(haystack.iter().rev(), needle_bytes.iter().rev())
                        .all(equals_ignore_ascii_case_kernel);
                // Honour `negate` exactly like the generic path below.
                matched != negate
            });

        BooleanArray::new(boolean_buffer, null_buffer)
    } else {
        BooleanArray::from_unary(array, |haystack| {
            ends_with(haystack, v, equals_ignore_ascii_case_kernel) != negate
        })
    }
}
132239
Predicate::Regex(v) => {
133240
BooleanArray::from_unary(array, |haystack| v.is_match(haystack) != negate)
134241
}

0 commit comments

Comments
 (0)