Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 30 additions & 23 deletions cpp/src/arrow/stl_iterator.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,12 +64,13 @@ class ArrayIterator {
// Value access
value_type operator*() const {
assert(array_);
return array_->IsNull(index_) ? value_type{} : array_->GetView(index_);
return array_->IsNull(index_) ? value_type{} : ValueAccessor{}(*array_, index_);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please keep an instance of ValueAccessor as a data member here and elsewhere; otherwise it must be default constructible and empty. It wouldn't (for example) be able to reference a lookup table by reference unless that table was global.

}

value_type operator[](difference_type n) const {
assert(array_);
return array_->IsNull(index_ + n) ? value_type{} : array_->GetView(index_ + n);
return array_->IsNull(index_ + n) ? value_type{}
: ValueAccessor{}(*array_, index_ + n);
}

int64_t index() const { return index_; }
Expand Down Expand Up @@ -154,7 +155,7 @@ class ChunkedArrayIterator {
// Value access
value_type operator*() const {
auto chunk_location = GetChunkLocation(index_);
ArrayIterator<ArrayType> target_iterator{
ArrayIterator<ArrayType, ValueAccessor> target_iterator{
arrow::internal::checked_cast<const ArrayType&>(
*chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index)))};
return target_iterator[chunk_location.index_in_chunk];
Expand Down Expand Up @@ -247,53 +248,59 @@ class ChunkedArrayIterator {
};

/// Return an iterator to the beginning of the chunked array
template <typename Type, typename ArrayType = typename TypeTraits<Type>::ArrayType>
ChunkedArrayIterator<ArrayType> Begin(const ChunkedArray& chunked_array) {
return ChunkedArrayIterator<ArrayType>(chunked_array);
template <typename Type, typename ArrayType = typename TypeTraits<Type>::ArrayType,
typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
ChunkedArrayIterator<ArrayType, ValueAccessor> Begin(const ChunkedArray& chunked_array) {
return ChunkedArrayIterator<ArrayType, ValueAccessor>(chunked_array);
}

/// Return an iterator to the end of the chunked array
template <typename Type, typename ArrayType = typename TypeTraits<Type>::ArrayType>
ChunkedArrayIterator<ArrayType> End(const ChunkedArray& chunked_array) {
return ChunkedArrayIterator<ArrayType>(chunked_array, chunked_array.length());
template <typename Type, typename ArrayType = typename TypeTraits<Type>::ArrayType,
typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
ChunkedArrayIterator<ArrayType, ValueAccessor> End(const ChunkedArray& chunked_array) {
return ChunkedArrayIterator<ArrayType, ValueAccessor>(chunked_array,
chunked_array.length());
}

template <typename ArrayType>
template <typename ArrayType,
typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
struct ChunkedArrayRange {
const ChunkedArray* chunked_array;

ChunkedArrayIterator<ArrayType> begin() {
return stl::ChunkedArrayIterator<ArrayType>(*chunked_array);
ChunkedArrayIterator<ArrayType, ValueAccessor> begin() {
return stl::ChunkedArrayIterator<ArrayType, ValueAccessor>(*chunked_array);
}
ChunkedArrayIterator<ArrayType> end() {
return stl::ChunkedArrayIterator<ArrayType>(*chunked_array, chunked_array->length());
ChunkedArrayIterator<ArrayType, ValueAccessor> end() {
return stl::ChunkedArrayIterator<ArrayType, ValueAccessor>(*chunked_array,
chunked_array->length());
}
};

/// Return an iterable range over the chunked array
template <typename Type, typename ArrayType = typename TypeTraits<Type>::ArrayType>
ChunkedArrayRange<ArrayType> Iterate(const ChunkedArray& chunked_array) {
return stl::ChunkedArrayRange<ArrayType>{&chunked_array};
template <typename Type, typename ArrayType = typename TypeTraits<Type>::ArrayType,
typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
ChunkedArrayRange<ArrayType, ValueAccessor> Iterate(const ChunkedArray& chunked_array) {
return stl::ChunkedArrayRange<ArrayType, ValueAccessor>{&chunked_array};
}

} // namespace stl
} // namespace arrow

namespace std {

template <typename ArrayType>
struct iterator_traits<::arrow::stl::ArrayIterator<ArrayType>> {
using IteratorType = ::arrow::stl::ArrayIterator<ArrayType>;
template <typename ArrayType, typename ValueAccessor>
struct iterator_traits<::arrow::stl::ArrayIterator<ArrayType, ValueAccessor>> {
using IteratorType = ::arrow::stl::ArrayIterator<ArrayType, ValueAccessor>;
using difference_type = typename IteratorType::difference_type;
using value_type = typename IteratorType::value_type;
using pointer = typename IteratorType::pointer;
using reference = typename IteratorType::reference;
using iterator_category = typename IteratorType::iterator_category;
};

template <typename ArrayType>
struct iterator_traits<::arrow::stl::ChunkedArrayIterator<ArrayType>> {
using IteratorType = ::arrow::stl::ChunkedArrayIterator<ArrayType>;
template <typename ArrayType, typename ValueAccessor>
struct iterator_traits<::arrow::stl::ChunkedArrayIterator<ArrayType, ValueAccessor>> {
using IteratorType = ::arrow::stl::ChunkedArrayIterator<ArrayType, ValueAccessor>;
using difference_type = typename IteratorType::difference_type;
using value_type = typename IteratorType::value_type;
using pointer = typename IteratorType::pointer;
Expand Down
121 changes: 121 additions & 0 deletions cpp/src/arrow/stl_iterator_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,68 @@ TEST(ArrayIterator, StdMerge) {
ASSERT_EQ(values, expected);
}

// Custom ValueAccessor for DictionaryArray that decodes a slot into the string
// it refers to in the dictionary, instead of exposing the raw index.
//
// The accessor is stateless, so its call operator is const: it can be invoked
// on a temporary as well as through a const instance (e.g. one stored as a
// data member of an iterator and used from a const member function).
struct TestDictionaryValueAccessor {
  using ValueType = std::string_view;

  // Return the dictionary string referenced by the slot at `index`.
  //
  // NOTE(review): the dictionary is assumed to hold utf8 values (StringArray);
  // confirm before reusing this accessor with other dictionary value types.
  // NOTE(review): the returned view presumably stays valid only while `array`
  // (and therefore its dictionary) is alive -- verify against StringArray.
  ValueType operator()(const DictionaryArray& array, int64_t index) const {
    // Position of this slot's value inside the dictionary.
    const int64_t dict_index = array.GetValueIndex(index);

    // View the dictionary as the concrete string array it is expected to be.
    auto dict = checked_pointer_cast<StringArray>(array.dictionary());

    // Decode: look the index up in the dictionary.
    return dict->GetView(dict_index);
  }
};

// Exercises ArrayIterator with a user-supplied ValueAccessor that decodes
// dictionary indices into their string values (null slots yield nullopt).
TEST(ArrayIterator, CustomValueAccessorDictionary) {
  using DecodingIterator = ArrayIterator<DictionaryArray, TestDictionaryValueAccessor>;

  // Four distinct strings, referenced by nine indices; position 7 is null.
  auto string_values = ArrayFromJSON(utf8(), R"(["apple", "banana", "cherry", "date"])");
  auto encoded_indices = ArrayFromJSON(int32(), "[0, 1, 2, 3, 2, 1, 0, null, 3]");
  auto encoded_type = dictionary(int32(), utf8());
  auto encoded =
      std::make_shared<DictionaryArray>(encoded_type, encoded_indices, string_values);

  DecodingIterator it(*encoded);

  // Dereference and random access both go through the custom accessor.
  ASSERT_EQ(*it, "apple");
  ASSERT_EQ(it[1], "banana");
  ASSERT_EQ(it[2], "cherry");
  ASSERT_EQ(it[3], "date");
  ASSERT_EQ(it[4], "cherry");
  ASSERT_EQ(it[5], "banana");
  ASSERT_EQ(it[6], "apple");
  ASSERT_EQ(it[7], nullopt);  // null index
  ASSERT_EQ(it[8], "date");

  // Walk the iterator forward over all nine slots, collecting decoded values.
  std::vector<optional<std::string_view>> values;
  const auto stop = it + 9;
  while (it != stop) {
    values.push_back(*it);
    ++it;
  }

  std::vector<optional<std::string_view>> expected{
      "apple", "banana", "cherry", "date", "cherry", "banana", "apple", nullopt, "date"};
  ASSERT_EQ(values, expected);

  // Standard algorithms operate on the decoded values as well.
  DecodingIterator begin(*encoded);
  DecodingIterator end(*encoded, encoded->length());

  auto found = std::find(begin, end, "cherry");
  ASSERT_NE(found, end);
  ASSERT_EQ(found.index(), 2);  // First occurrence of "cherry"

  // "banana" appears twice in the decoded sequence.
  auto count = std::count(begin, end, "banana");
  ASSERT_EQ(count, 2);
}

TEST(ChunkedArrayIterator, Basics) {
auto result = ChunkedArrayFromJSON(int32(), {R"([4, 5, null])", R"([6])"});
auto it = Begin<Int32Type>(*result);
Expand Down Expand Up @@ -545,5 +607,64 @@ TEST(ChunkedArrayIterator, ForEachIterator) {
ASSERT_EQ(values, expected);
}

TEST(ChunkedArrayIterator, CustomValueAccessorDictionary) {
// Create multiple dictionary arrays with the same dictionary
auto dict = ArrayFromJSON(utf8(), R"(["red", "green", "blue", "yellow"])");

auto indices1 = ArrayFromJSON(int32(), "[0, 1, 2]");
auto indices2 = ArrayFromJSON(int32(), "[3, 2, null]");
auto indices3 = ArrayFromJSON(int32(), "[1, 0, 3, 2]");

auto dict_type = dictionary(int32(), utf8());
auto dict_array1 = std::make_shared<DictionaryArray>(dict_type, indices1, dict);
auto dict_array2 = std::make_shared<DictionaryArray>(dict_type, indices2, dict);
auto dict_array3 = std::make_shared<DictionaryArray>(dict_type, indices3, dict);

// Create chunked array from dictionary arrays
auto chunked_array = std::make_shared<ChunkedArray>(
std::vector<std::shared_ptr<Array>>{dict_array1, dict_array2, dict_array3},
dict_type);

// Use custom accessor to iterate over decoded values across chunks
auto it =
Begin<DictionaryType, DictionaryArray, TestDictionaryValueAccessor>(*chunked_array);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think rather than requiring users to provide all of these template arguments we should add ChunkedArray::range<ArrowType, ValueAccessor = DefaultEtc>(). Then we can write

for (int i : chunked_array->range<Int32Type>()) {}

Or

for (int i : chunked_array->range<DictionaryType, TestDictionaryValueAccessor>()) {}

The same member function could be added to Array, in which case, the above could also be used on a dictionary array.

... Actually, using the argument_type trait in util/functional.h it would be possible to infer all template arguments from a lambda value accessor:

auto accessor = [](const DictionaryArray& array, int64_t index) {
  int64_t dict_index = array.GetValueIndex(index);
  const auto& dict = checked_cast<const StringArray&>(*array.dictionary());
  return dict.GetView(dict_index);
};
for (int i : chunked_array->range(accessor)) {}

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I looked at how to do this for a while, it seems to me that this will require adding the include to stl_iterator.h in chunked_array.h. I think for now we can first fix the existing accessor, and I can add the range function in a later PR.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Alright, but even if you'd prefer not to modify chunked_array.h... I don't think that the current PR is very usable since it requires so many redundant template arguments. A non member function could still sidestep all the template arguments:

auto accessor = [](const DictionaryArray& array, int64_t index) {
  int64_t dict_index = array.GetValueIndex(index);
  const auto& dict = checked_cast<const StringArray&>(*array.dictionary());
  return dict.GetView(dict_index);
};
for (int i : Iterate(chunked_array, accessor)) {}

I think it'd be sufficient to write:

template <typename ValueAccessor>
auto Iterate(const ChunkedArray& chunked_array, ValueAccessor value_accessor) {
  using arrow::internal::call_traits;
  static_assert(!call_traits::is_overloaded<ValueAccessor>::value,
                "Cannot infer template arguments from overloaded ValueAccessor");

  using ArrayType = call_traits::argument_type<ValueAccessor, 0>;
  return stl::ChunkedArrayRange<ArrayType, ValueAccessor>{&chunked_array, value_accessor};
}

auto end =
End<DictionaryType, DictionaryArray, TestDictionaryValueAccessor>(*chunked_array);

// Test sequential access across chunks
ASSERT_EQ(*it, "red"); // chunk 0, index 0
ASSERT_EQ(*(it + 1), "green"); // chunk 0, index 1
ASSERT_EQ(*(it + 2), "blue"); // chunk 0, index 2
ASSERT_EQ(*(it + 3), "yellow"); // chunk 1, index 0
ASSERT_EQ(*(it + 4), "blue"); // chunk 1, index 1
ASSERT_EQ(*(it + 5), nullopt); // chunk 1, index 2 (null)
ASSERT_EQ(*(it + 6), "green"); // chunk 2, index 0
ASSERT_EQ(*(it + 7), "red"); // chunk 2, index 1
ASSERT_EQ(*(it + 8), "yellow"); // chunk 2, index 2
ASSERT_EQ(*(it + 9), "blue"); // chunk 2, index 3

// Collect all values
std::vector<optional<std::string_view>> values;

for (auto elem : Iterate<DictionaryType, DictionaryArray, TestDictionaryValueAccessor>(
*chunked_array)) {
values.push_back(elem);
}

std::vector<optional<std::string_view>> expected{"red", "green", "blue", "yellow",
"blue", nullopt, "green", "red",
"yellow", "blue"};
ASSERT_EQ(values, expected);

// Test with algorithms - count occurrences of "blue"
auto count = std::count(it, end, "blue");
ASSERT_EQ(count, 3);

// Find first occurrence of "yellow"
auto found = std::find(it, end, "yellow");
ASSERT_NE(found, end);
ASSERT_EQ(found.index(), 3);
}

} // namespace stl
} // namespace arrow
Loading