-
Notifications
You must be signed in to change notification settings - Fork 3.9k
GH-48202: [C++][Parquet] Fix encoder & decoder logic to enable Parque… #48203
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -41,6 +41,7 @@ | |
| #include "arrow/util/bitmap_ops.h" | ||
| #include "arrow/util/byte_stream_split_internal.h" | ||
| #include "arrow/util/checked_cast.h" | ||
| #include "arrow/util/endian.h" | ||
| #include "arrow/util/int_util_overflow.h" | ||
| #include "arrow/util/logging_internal.h" | ||
| #include "arrow/util/rle_encoding_internal.h" | ||
|
|
@@ -406,9 +407,20 @@ int PlainDecoder<DType>::DecodeArrow( | |
| VisitBitRuns(valid_bits, valid_bits_offset, num_values, | ||
| [&](int64_t position, int64_t run_length, bool is_valid) { | ||
| if (is_valid) { | ||
| #if ARROW_LITTLE_ENDIAN | ||
| RETURN_NOT_OK(builder->AppendValues( | ||
| reinterpret_cast<const value_type*>(data), run_length)); | ||
| data += run_length * sizeof(value_type); | ||
| #else | ||
| // On big-endian systems, we need to byte-swap each value | ||
| // since Parquet data is stored in little-endian format | ||
| for (int64_t i = 0; i < run_length; ++i) { | ||
| value_type value = ::arrow::bit_util::FromLittleEndian( | ||
| SafeLoadAs<value_type>(data)); | ||
| RETURN_NOT_OK(builder->Append(value)); | ||
| data += sizeof(value_type); | ||
| } | ||
| #endif | ||
| } else { | ||
| RETURN_NOT_OK(builder->AppendNulls(run_length)); | ||
| } | ||
|
|
@@ -458,7 +470,24 @@ inline int DecodePlain(const uint8_t* data, int64_t data_size, int num_values, | |
| } | ||
| // If bytes_to_decode == 0, data could be null | ||
| if (bytes_to_decode > 0) { | ||
| #if ARROW_LITTLE_ENDIAN | ||
| memcpy(out, data, static_cast<size_t>(bytes_to_decode)); | ||
| #else | ||
| // On big-endian systems, we need to byte-swap each value | ||
| // since Parquet data is stored in little-endian format. | ||
| // Only apply to integer and floating-point types that have FromLittleEndian support. | ||
| if constexpr (std::is_same_v<T, int32_t> || std::is_same_v<T, uint32_t> || | ||
| std::is_same_v<T, int64_t> || std::is_same_v<T, uint64_t> || | ||
| std::is_same_v<T, float> || std::is_same_v<T, double>) { | ||
Vishwanatha-HD marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| for (int i = 0; i < num_values; ++i) { | ||
| out[i] = ::arrow::bit_util::FromLittleEndian(SafeLoadAs<T>(data)); | ||
| data += sizeof(T); | ||
| } | ||
| } else { | ||
| // For other types (bool, Int96, etc.), just do memcpy | ||
Vishwanatha-HD marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| memcpy(out, data, static_cast<size_t>(bytes_to_decode)); | ||
| } | ||
| #endif | ||
| } | ||
| return static_cast<int>(bytes_to_decode); | ||
| } | ||
|
|
@@ -471,7 +500,7 @@ static inline int64_t ReadByteArray(const uint8_t* data, int64_t data_size, | |
| if (ARROW_PREDICT_FALSE(data_size < 4)) { | ||
| ParquetException::EofException(); | ||
| } | ||
| const int32_t len = SafeLoadAs<int32_t>(data); | ||
| const int32_t len = ::arrow::bit_util::FromLittleEndian(SafeLoadAs<int32_t>(data)); | ||
| if (len < 0) { | ||
| throw ParquetException("Invalid BYTE_ARRAY value"); | ||
| } | ||
|
|
@@ -772,7 +801,8 @@ class PlainByteArrayDecoder : public PlainDecoder<ByteArrayType> { | |
| return Status::Invalid( | ||
| "Invalid or truncated PLAIN-encoded BYTE_ARRAY data"); | ||
| } | ||
| auto value_len = SafeLoadAs<int32_t>(data_); | ||
| auto value_len = | ||
| ::arrow::bit_util::FromLittleEndian(SafeLoadAs<int32_t>(data_)); | ||
| if (ARROW_PREDICT_FALSE(value_len < 0 || value_len > len_ - 4)) { | ||
| return Status::Invalid( | ||
| "Invalid or truncated PLAIN-encoded BYTE_ARRAY data"); | ||
|
|
@@ -817,7 +847,8 @@ class PlainByteArrayDecoder : public PlainDecoder<ByteArrayType> { | |
| return Status::Invalid( | ||
| "Invalid or truncated PLAIN-encoded BYTE_ARRAY data"); | ||
| } | ||
| auto value_len = SafeLoadAs<int32_t>(data_); | ||
| auto value_len = | ||
| ::arrow::bit_util::FromLittleEndian(SafeLoadAs<int32_t>(data_)); | ||
| if (ARROW_PREDICT_FALSE(value_len < 0 || value_len > len_ - 4)) { | ||
| return Status::Invalid( | ||
| "Invalid or truncated PLAIN-encoded BYTE_ARRAY data"); | ||
|
|
@@ -1616,9 +1647,17 @@ class DeltaBitPackDecoder : public TypedDecoderImpl<DType> { | |
| for (int j = 0; j < values_decode; ++j) { | ||
| // Addition between min_delta, packed int and last_value should be treated as | ||
| // unsigned addition. Overflow is as expected. | ||
| #if ARROW_LITTLE_ENDIAN | ||
| buffer[i + j] = static_cast<UT>(min_delta_) + static_cast<UT>(buffer[i + j]) + | ||
| static_cast<UT>(last_value_); | ||
| last_value_ = buffer[i + j]; | ||
| #else | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why this change? The added code for BE doesn't seem to do anything different. |
||
| UT temp = static_cast<UT>(min_delta_) + | ||
| static_cast<UT>(static_cast<uint64_t>(buffer[i + j])) + | ||
| static_cast<UT>(last_value_); | ||
| buffer[i + j] = static_cast<T>(temp); | ||
| last_value_ = static_cast<T>(temp); | ||
| #endif | ||
| } | ||
| values_remaining_current_mini_block_ -= values_decode; | ||
| i += values_decode; | ||
|
|
@@ -2306,6 +2345,17 @@ class ByteStreamSplitDecoder<FLBAType> : public ByteStreamSplitDecoderBase<FLBAT | |
| const int num_decoded = this->DecodeRaw(decode_out, max_values); | ||
| DCHECK_EQ(num_decoded, max_values); | ||
|
|
||
| #if !ARROW_LITTLE_ENDIAN | ||
| // On big-endian, ByteStreamSplitDecode (DoMergeStreams) reverses stream positions | ||
| // to produce numeric values in native byte order. For FLBA (opaque byte arrays), | ||
| // we need to undo this reversal to preserve the original byte sequence. | ||
|
Comment on lines
+2349
to
+2351
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm, doing a second byteswap to undo the first one sounds entirely wasteful. Perhaps the ByteStreamSplit routines should instead take an optional flag to do the byte-swapping? For example have something like: // If fix_endianness is `true`, output will be converted from little endian to native endian
inline void ByteStreamSplitDecode(const uint8_t* data, int width, int64_t num_values,
int64_t stride, bool fix_endianness, uint8_t* out) { |
||
| const int type_length = this->type_length_; | ||
| for (int i = 0; i < num_decoded; ++i) { | ||
| uint8_t* value_ptr = decode_out + static_cast<int64_t>(type_length) * i; | ||
| std::reverse(value_ptr, value_ptr + type_length); | ||
| } | ||
| #endif | ||
|
|
||
| for (int i = 0; i < num_decoded; ++i) { | ||
| buffer[i] = | ||
| FixedLenByteArray(decode_out + static_cast<int64_t>(this->type_length_) * i); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -162,7 +162,8 @@ class PlainEncoder : public EncoderImpl, virtual public TypedEncoder<DType> { | |
|
|
||
| void UnsafePutByteArray(const void* data, uint32_t length) { | ||
| DCHECK(length == 0 || data != nullptr) << "Value ptr cannot be NULL"; | ||
| sink_.UnsafeAppend(&length, sizeof(uint32_t)); | ||
| uint32_t length_le = ::arrow::bit_util::ToLittleEndian(length); | ||
| sink_.UnsafeAppend(&length_le, sizeof(uint32_t)); | ||
| sink_.UnsafeAppend(data, static_cast<int64_t>(length)); | ||
| unencoded_byte_array_data_bytes_ += length; | ||
| } | ||
|
|
@@ -201,7 +202,27 @@ class PlainEncoder : public EncoderImpl, virtual public TypedEncoder<DType> { | |
| template <typename DType> | ||
| void PlainEncoder<DType>::Put(const T* buffer, int num_values) { | ||
| if (num_values > 0) { | ||
| #if ARROW_LITTLE_ENDIAN | ||
| PARQUET_THROW_NOT_OK(sink_.Append(buffer, num_values * sizeof(T))); | ||
| #else | ||
| // On big-endian systems, we need to byte-swap each value | ||
| // since Parquet data must be stored in little-endian format. | ||
| if constexpr (std::is_same_v<T, int32_t> || std::is_same_v<T, uint32_t> || | ||
| std::is_same_v<T, int64_t> || std::is_same_v<T, uint64_t> || | ||
| std::is_same_v<T, float> || std::is_same_v<T, double>) { | ||
| PARQUET_ASSIGN_OR_THROW( | ||
| auto temp_buffer, | ||
| ::arrow::AllocateBuffer(num_values * sizeof(T), this->memory_pool())); | ||
|
Comment on lines
+213
to
+215
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Wow, can you use a small-sized stack-allocated buffer instead? (for example a |
||
| T* temp_data = temp_buffer->template mutable_data_as<T>(); | ||
| for (int i = 0; i < num_values; ++i) { | ||
| temp_data[i] = ::arrow::bit_util::ToLittleEndian(buffer[i]); | ||
| } | ||
| PARQUET_THROW_NOT_OK(sink_.Append(temp_data, num_values * sizeof(T))); | ||
| } else { | ||
| // For other types (Int96, etc.), just do memcpy | ||
| PARQUET_THROW_NOT_OK(sink_.Append(buffer, num_values * sizeof(T))); | ||
| } | ||
| #endif | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -224,6 +245,7 @@ void DirectPutImpl(const ::arrow::Array& values, ::arrow::BufferBuilder* sink) { | |
| constexpr auto value_size = sizeof(value_type); | ||
| auto raw_values = checked_cast<const ArrayType&>(values).raw_values(); | ||
|
|
||
| #if ARROW_LITTLE_ENDIAN | ||
| if (values.null_count() == 0) { | ||
| // no nulls, just dump the data | ||
| PARQUET_THROW_NOT_OK(sink->Append(raw_values, values.length() * value_size)); | ||
|
|
@@ -237,6 +259,32 @@ void DirectPutImpl(const ::arrow::Array& values, ::arrow::BufferBuilder* sink) { | |
| } | ||
| } | ||
| } | ||
| #else | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
| // On big-endian systems, we need to byte-swap each value | ||
| // since Parquet data must be stored in little-endian format. | ||
| if constexpr (std::is_same_v<value_type, int32_t> || | ||
| std::is_same_v<value_type, uint32_t> || | ||
| std::is_same_v<value_type, int64_t> || | ||
| std::is_same_v<value_type, uint64_t> || | ||
| std::is_same_v<value_type, float> || std::is_same_v<value_type, double>) { | ||
| PARQUET_THROW_NOT_OK( | ||
| sink->Reserve((values.length() - values.null_count()) * value_size)); | ||
| for (int64_t i = 0; i < values.length(); i++) { | ||
| if (values.IsValid(i)) { | ||
| auto le_value = ::arrow::bit_util::ToLittleEndian(raw_values[i]); | ||
| sink->UnsafeAppend(&le_value, value_size); | ||
| } | ||
| } | ||
| } else { | ||
| PARQUET_THROW_NOT_OK( | ||
| sink->Reserve((values.length() - values.null_count()) * value_size)); | ||
| for (int64_t i = 0; i < values.length(); i++) { | ||
| if (values.IsValid(i)) { | ||
| sink->UnsafeAppend(&raw_values[i], value_size); | ||
| } | ||
| } | ||
| } | ||
| #endif | ||
| } | ||
|
|
||
| template <> | ||
|
|
@@ -649,17 +697,36 @@ class DictEncoderImpl : public EncoderImpl, virtual public DictEncoder<DType> { | |
|
|
||
| template <typename DType> | ||
| void DictEncoderImpl<DType>::WriteDict(uint8_t* buffer) const { | ||
| // For primitive types, only a memcpy | ||
| // For primitive types, copy values with endianness conversion | ||
| DCHECK_EQ(static_cast<size_t>(dict_encoded_size_), sizeof(T) * memo_table_.size()); | ||
| #if ARROW_LITTLE_ENDIAN | ||
| memo_table_.CopyValues(0 /* start_pos */, reinterpret_cast<T*>(buffer)); | ||
| #else | ||
| // On big-endian systems, we need to byte-swap each value | ||
| // since Parquet data must be stored in little-endian format. | ||
| if constexpr (std::is_same_v<T, int32_t> || std::is_same_v<T, uint32_t> || | ||
| std::is_same_v<T, int64_t> || std::is_same_v<T, uint64_t> || | ||
| std::is_same_v<T, float> || std::is_same_v<T, double>) { | ||
| std::vector<T> temp(memo_table_.size()); | ||
| memo_table_.CopyValues(0 /* start_pos */, temp.data()); | ||
| T* out = reinterpret_cast<T*>(buffer); | ||
| for (size_t i = 0; i < temp.size(); ++i) { | ||
| out[i] = ::arrow::bit_util::ToLittleEndian(temp[i]); | ||
| } | ||
| } else { | ||
| // For other types (Int96, etc.), just do memcpy | ||
| memo_table_.CopyValues(0 /* start_pos */, reinterpret_cast<T*>(buffer)); | ||
| } | ||
| #endif | ||
| } | ||
|
|
||
| // ByteArray and FLBA already have the dictionary encoded in their data heaps | ||
| template <> | ||
| void DictEncoderImpl<ByteArrayType>::WriteDict(uint8_t* buffer) const { | ||
| memo_table_.VisitValues(0, [&buffer](::std::string_view v) { | ||
| uint32_t len = static_cast<uint32_t>(v.length()); | ||
| memcpy(buffer, &len, sizeof(len)); | ||
| uint32_t len_le = ::arrow::bit_util::ToLittleEndian(len); | ||
| memcpy(buffer, &len_le, sizeof(len_le)); | ||
| buffer += sizeof(len); | ||
| memcpy(buffer, v.data(), len); | ||
| buffer += len; | ||
|
|
@@ -924,6 +991,8 @@ class ByteStreamSplitEncoder : public ByteStreamSplitEncoderBase<DType> { | |
|
|
||
| void Put(const T* buffer, int num_values) override { | ||
| if (num_values > 0) { | ||
| // ByteStreamSplitEncode (DoSplitStreams) handles endianness correctly, | ||
| // so we can directly append the native byte representation | ||
| PARQUET_THROW_NOT_OK( | ||
| this->sink_.Append(reinterpret_cast<const uint8_t*>(buffer), | ||
| num_values * static_cast<int64_t>(sizeof(T)))); | ||
|
|
@@ -964,10 +1033,22 @@ class ByteStreamSplitEncoder<FLBAType> : public ByteStreamSplitEncoderBase<FLBAT | |
| if (byte_width_ > 0) { | ||
| const int64_t total_bytes = static_cast<int64_t>(num_values) * byte_width_; | ||
| PARQUET_THROW_NOT_OK(sink_.Reserve(total_bytes)); | ||
| #if !ARROW_LITTLE_ENDIAN | ||
| // On big-endian, reverse bytes before encoding to compensate for | ||
| // DoSplitStreams reversal, ensuring FLBA bytes are preserved as-is | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same comment as for the decoder part. |
||
| std::vector<uint8_t> temp_buffer(byte_width_); | ||
| #endif | ||
| for (int i = 0; i < num_values; ++i) { | ||
| // Write the result to the output stream | ||
| DCHECK(buffer[i].ptr != nullptr) << "Value ptr cannot be NULL"; | ||
| #if !ARROW_LITTLE_ENDIAN | ||
| // Reverse bytes before appending | ||
| std::reverse_copy(buffer[i].ptr, buffer[i].ptr + byte_width_, | ||
| temp_buffer.begin()); | ||
| sink_.UnsafeAppend(temp_buffer.data(), byte_width_); | ||
| #else | ||
| sink_.UnsafeAppend(buffer[i].ptr, byte_width_); | ||
| #endif | ||
| } | ||
| } | ||
| this->num_values_in_buffer_ += num_values; | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is fine, but of course this is less performant than calling
AppendValueson the run.