-
Notifications
You must be signed in to change notification settings - Fork 3.9k
GH-47532: [C++] Fixes for Arrow STL iterator for custom types #47531
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -248,6 +248,68 @@ TEST(ArrayIterator, StdMerge) { | |
| ASSERT_EQ(values, expected); | ||
| } | ||
|
|
||
| // Custom ValueAccessor for DictionaryArray that decodes values | ||
| struct TestDictionaryValueAccessor { | ||
| using ValueType = std::string_view; | ||
|
|
||
| inline ValueType operator()(const DictionaryArray& array, int64_t index) { | ||
| // Get the dictionary index for this position | ||
| int64_t dict_index = array.GetValueIndex(index); | ||
|
|
||
| // Get the dictionary and cast it to StringArray | ||
| auto dict = checked_pointer_cast<StringArray>(array.dictionary()); | ||
|
|
||
| // Return the decoded string value | ||
| return dict->GetView(dict_index); | ||
| } | ||
| }; | ||
|
|
||
| TEST(ArrayIterator, CustomValueAccessorDictionary) { | ||
| // Create a dictionary array with string values | ||
| auto dict = ArrayFromJSON(utf8(), R"(["apple", "banana", "cherry", "date"])"); | ||
| auto indices = ArrayFromJSON(int32(), "[0, 1, 2, 3, 2, 1, 0, null, 3]"); | ||
|
|
||
| auto dict_type = dictionary(int32(), utf8()); | ||
| auto dict_array = std::make_shared<DictionaryArray>(dict_type, indices, dict); | ||
|
|
||
| // Use custom accessor to iterate over decoded values | ||
| ArrayIterator<DictionaryArray, TestDictionaryValueAccessor> it(*dict_array); | ||
|
|
||
| // Test basic access | ||
| ASSERT_EQ(*it, "apple"); | ||
| ASSERT_EQ(it[1], "banana"); | ||
| ASSERT_EQ(it[2], "cherry"); | ||
| ASSERT_EQ(it[3], "date"); | ||
| ASSERT_EQ(it[4], "cherry"); | ||
| ASSERT_EQ(it[5], "banana"); | ||
| ASSERT_EQ(it[6], "apple"); | ||
| ASSERT_EQ(it[7], nullopt); // null index | ||
| ASSERT_EQ(it[8], "date"); | ||
|
|
||
| // Test iteration | ||
| std::vector<optional<std::string_view>> values; | ||
| for (auto end = it + 9; it != end; ++it) { | ||
| values.push_back(*it); | ||
| } | ||
|
|
||
| std::vector<optional<std::string_view>> expected{ | ||
| "apple", "banana", "cherry", "date", "cherry", "banana", "apple", nullopt, "date"}; | ||
| ASSERT_EQ(values, expected); | ||
|
|
||
| // Test with algorithms - find a specific value | ||
| ArrayIterator<DictionaryArray, TestDictionaryValueAccessor> begin(*dict_array); | ||
| ArrayIterator<DictionaryArray, TestDictionaryValueAccessor> end(*dict_array, | ||
| dict_array->length()); | ||
|
|
||
| auto found = std::find(begin, end, "cherry"); | ||
| ASSERT_NE(found, end); | ||
| ASSERT_EQ(found.index(), 2); // First occurrence of "cherry" | ||
|
|
||
| // Count occurrences of "banana" | ||
| auto count = std::count(begin, end, "banana"); | ||
| ASSERT_EQ(count, 2); | ||
| } | ||
|
|
||
| TEST(ChunkedArrayIterator, Basics) { | ||
| auto result = ChunkedArrayFromJSON(int32(), {R"([4, 5, null])", R"([6])"}); | ||
| auto it = Begin<Int32Type>(*result); | ||
|
|
@@ -545,5 +607,64 @@ TEST(ChunkedArrayIterator, ForEachIterator) { | |
| ASSERT_EQ(values, expected); | ||
| } | ||
|
|
||
| TEST(ChunkedArrayIterator, CustomValueAccessorDictionary) { | ||
| // Create multiple dictionary arrays with the same dictionary | ||
| auto dict = ArrayFromJSON(utf8(), R"(["red", "green", "blue", "yellow"])"); | ||
|
|
||
| auto indices1 = ArrayFromJSON(int32(), "[0, 1, 2]"); | ||
| auto indices2 = ArrayFromJSON(int32(), "[3, 2, null]"); | ||
| auto indices3 = ArrayFromJSON(int32(), "[1, 0, 3, 2]"); | ||
|
|
||
| auto dict_type = dictionary(int32(), utf8()); | ||
| auto dict_array1 = std::make_shared<DictionaryArray>(dict_type, indices1, dict); | ||
| auto dict_array2 = std::make_shared<DictionaryArray>(dict_type, indices2, dict); | ||
| auto dict_array3 = std::make_shared<DictionaryArray>(dict_type, indices3, dict); | ||
|
|
||
| // Create chunked array from dictionary arrays | ||
| auto chunked_array = std::make_shared<ChunkedArray>( | ||
| std::vector<std::shared_ptr<Array>>{dict_array1, dict_array2, dict_array3}, | ||
| dict_type); | ||
|
|
||
| // Use custom accessor to iterate over decoded values across chunks | ||
| auto it = | ||
| Begin<DictionaryType, DictionaryArray, TestDictionaryValueAccessor>(*chunked_array); | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think rather than requiring users to provide all of these template arguments we should add for (int i : chunked_array->range<Int32Type>()) {}Or for (int i : chunked_array->range<DictionaryType, TestDictionaryValueAccessor>()) {}The same member function could be added to ... Actually, using the auto accessor = [](const DictionaryArray& array, int64_t index) {
int64_t dict_index = array.GetValueIndex(index);
const auto& dict = checked_cast<const StringArray&>(*array.dictionary());
return dict->GetView(dict_index);
};
for (int i : chunked_array->range(accessor)) {}There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I looked at how to do this for a while, it seems to me that this will require adding the include to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Alright, but even if you'd prefer not to modify auto accessor = [](const DictionaryArray& array, int64_t index) {
int64_t dict_index = array.GetValueIndex(index);
const auto& dict = checked_cast<const StringArray&>(*array.dictionary());
return dict->GetView(dict_index);
};
for (int i : Iterate(chunked_array, accessor)) {}I think it'd be sufficient to write: template <typename ValueAccessor>
auto Iterate(const ChunkedArray& chunked_array, ValueAccessor value_accessor) {
using arrow::internal::call::traits;
static_assert(!call_traits::is_overloaded<ValueAccessor>::value,
"Cannot infer template arguments from overloaded ValueAccessor");
using ArrayType = call_traits::argument_type<ValueAccessor, 0>;
return stl::ChunkedArrayRange<ArrayType, ValueAccessor>{&chunked_array, value_accessor};
} |
||
| auto end = | ||
| End<DictionaryType, DictionaryArray, TestDictionaryValueAccessor>(*chunked_array); | ||
|
|
||
| // Test sequential access across chunks | ||
| ASSERT_EQ(*it, "red"); // chunk 0, index 0 | ||
| ASSERT_EQ(*(it + 1), "green"); // chunk 0, index 1 | ||
| ASSERT_EQ(*(it + 2), "blue"); // chunk 0, index 2 | ||
| ASSERT_EQ(*(it + 3), "yellow"); // chunk 1, index 0 | ||
| ASSERT_EQ(*(it + 4), "blue"); // chunk 1, index 1 | ||
| ASSERT_EQ(*(it + 5), nullopt); // chunk 1, index 2 (null) | ||
| ASSERT_EQ(*(it + 6), "green"); // chunk 2, index 0 | ||
| ASSERT_EQ(*(it + 7), "red"); // chunk 2, index 1 | ||
| ASSERT_EQ(*(it + 8), "yellow"); // chunk 2, index 2 | ||
| ASSERT_EQ(*(it + 9), "blue"); // chunk 2, index 3 | ||
|
|
||
| // Collect all values | ||
| std::vector<optional<std::string_view>> values; | ||
|
|
||
| for (auto elem : Iterate<DictionaryType, DictionaryArray, TestDictionaryValueAccessor>( | ||
| *chunked_array)) { | ||
| values.push_back(elem); | ||
| } | ||
|
|
||
| std::vector<optional<std::string_view>> expected{"red", "green", "blue", "yellow", | ||
| "blue", nullopt, "green", "red", | ||
| "yellow", "blue"}; | ||
| ASSERT_EQ(values, expected); | ||
|
|
||
| // Test with algorithms - count occurrences of "blue" | ||
| auto count = std::count(it, end, "blue"); | ||
| ASSERT_EQ(count, 3); | ||
|
|
||
| // Find first occurrence of "yellow" | ||
| auto found = std::find(it, end, "yellow"); | ||
| ASSERT_NE(found, end); | ||
| ASSERT_EQ(found.index(), 3); | ||
| } | ||
|
|
||
| } // namespace stl | ||
| } // namespace arrow | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please keep an instance of ValueAccessor as a data member here and elsewhere; otherwise it must be default constructible and empty. It wouldn't (for example) be able to reference a lookup table by reference unless that table was global.