Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 53 additions & 3 deletions cpp/src/parquet/decoder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
#include "arrow/util/bitmap_ops.h"
#include "arrow/util/byte_stream_split_internal.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/endian.h"
#include "arrow/util/int_util_overflow.h"
#include "arrow/util/logging_internal.h"
#include "arrow/util/rle_encoding_internal.h"
Expand Down Expand Up @@ -406,9 +407,20 @@ int PlainDecoder<DType>::DecodeArrow(
VisitBitRuns(valid_bits, valid_bits_offset, num_values,
[&](int64_t position, int64_t run_length, bool is_valid) {
if (is_valid) {
#if ARROW_LITTLE_ENDIAN
RETURN_NOT_OK(builder->AppendValues(
reinterpret_cast<const value_type*>(data), run_length));
data += run_length * sizeof(value_type);
#else
// On big-endian systems, we need to byte-swap each value
// since Parquet data is stored in little-endian format
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is fine, but of course this is less performant than calling AppendValues on the run.

for (int64_t i = 0; i < run_length; ++i) {
value_type value = ::arrow::bit_util::FromLittleEndian(
SafeLoadAs<value_type>(data));
RETURN_NOT_OK(builder->Append(value));
data += sizeof(value_type);
}
#endif
} else {
RETURN_NOT_OK(builder->AppendNulls(run_length));
}
Expand Down Expand Up @@ -458,7 +470,24 @@ inline int DecodePlain(const uint8_t* data, int64_t data_size, int num_values,
}
// If bytes_to_decode == 0, data could be null
if (bytes_to_decode > 0) {
#if ARROW_LITTLE_ENDIAN
memcpy(out, data, static_cast<size_t>(bytes_to_decode));
#else
// On big-endian systems, we need to byte-swap each value
// since Parquet data is stored in little-endian format.
// Only apply to integer and floating-point types that have FromLittleEndian support.
if constexpr (std::is_same_v<T, int32_t> || std::is_same_v<T, uint32_t> ||
std::is_same_v<T, int64_t> || std::is_same_v<T, uint64_t> ||
std::is_same_v<T, float> || std::is_same_v<T, double>) {
for (int i = 0; i < num_values; ++i) {
out[i] = ::arrow::bit_util::FromLittleEndian(SafeLoadAs<T>(data));
data += sizeof(T);
}
} else {
// For other types (bool, Int96, etc.), just do memcpy
memcpy(out, data, static_cast<size_t>(bytes_to_decode));
}
#endif
}
return static_cast<int>(bytes_to_decode);
}
Expand All @@ -471,7 +500,7 @@ static inline int64_t ReadByteArray(const uint8_t* data, int64_t data_size,
if (ARROW_PREDICT_FALSE(data_size < 4)) {
ParquetException::EofException();
}
const int32_t len = SafeLoadAs<int32_t>(data);
const int32_t len = ::arrow::bit_util::FromLittleEndian(SafeLoadAs<int32_t>(data));
if (len < 0) {
throw ParquetException("Invalid BYTE_ARRAY value");
}
Expand Down Expand Up @@ -772,7 +801,8 @@ class PlainByteArrayDecoder : public PlainDecoder<ByteArrayType> {
return Status::Invalid(
"Invalid or truncated PLAIN-encoded BYTE_ARRAY data");
}
auto value_len = SafeLoadAs<int32_t>(data_);
auto value_len =
::arrow::bit_util::FromLittleEndian(SafeLoadAs<int32_t>(data_));
if (ARROW_PREDICT_FALSE(value_len < 0 || value_len > len_ - 4)) {
return Status::Invalid(
"Invalid or truncated PLAIN-encoded BYTE_ARRAY data");
Expand Down Expand Up @@ -817,7 +847,8 @@ class PlainByteArrayDecoder : public PlainDecoder<ByteArrayType> {
return Status::Invalid(
"Invalid or truncated PLAIN-encoded BYTE_ARRAY data");
}
auto value_len = SafeLoadAs<int32_t>(data_);
auto value_len =
::arrow::bit_util::FromLittleEndian(SafeLoadAs<int32_t>(data_));
if (ARROW_PREDICT_FALSE(value_len < 0 || value_len > len_ - 4)) {
return Status::Invalid(
"Invalid or truncated PLAIN-encoded BYTE_ARRAY data");
Expand Down Expand Up @@ -1616,9 +1647,17 @@ class DeltaBitPackDecoder : public TypedDecoderImpl<DType> {
for (int j = 0; j < values_decode; ++j) {
// Addition between min_delta, packed int and last_value should be treated as
// unsigned addition. Overflow is as expected.
#if ARROW_LITTLE_ENDIAN
buffer[i + j] = static_cast<UT>(min_delta_) + static_cast<UT>(buffer[i + j]) +
static_cast<UT>(last_value_);
last_value_ = buffer[i + j];
#else
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why this change? The added code for BE doesn't seem to do anything different.

UT temp = static_cast<UT>(min_delta_) +
static_cast<UT>(static_cast<uint64_t>(buffer[i + j])) +
static_cast<UT>(last_value_);
buffer[i + j] = static_cast<T>(temp);
last_value_ = static_cast<T>(temp);
#endif
}
values_remaining_current_mini_block_ -= values_decode;
i += values_decode;
Expand Down Expand Up @@ -2306,6 +2345,17 @@ class ByteStreamSplitDecoder<FLBAType> : public ByteStreamSplitDecoderBase<FLBAT
const int num_decoded = this->DecodeRaw(decode_out, max_values);
DCHECK_EQ(num_decoded, max_values);

#if !ARROW_LITTLE_ENDIAN
// On big-endian, ByteStreamSplitDecode (DoMergeStreams) reverses stream positions
// to produce numeric values in native byte order. For FLBA (opaque byte arrays),
// we need to undo this reversal to preserve the original byte sequence.
Comment on lines +2349 to +2351
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, doing a second byteswap to undo the first one sounds entirely wasteful. Perhaps the ByteStreamSplit routines should instead take an optional flag to do the byte-swapping?

For example have something like:

// If fix_endianness is `true`, output will be converted from little endian to native endian
inline void ByteStreamSplitDecode(const uint8_t* data, int width, int64_t num_values,
                                  int64_t stride, bool fix_endianness, uint8_t* out) {

const int type_length = this->type_length_;
for (int i = 0; i < num_decoded; ++i) {
uint8_t* value_ptr = decode_out + static_cast<int64_t>(type_length) * i;
std::reverse(value_ptr, value_ptr + type_length);
}
#endif

for (int i = 0; i < num_decoded; ++i) {
buffer[i] =
FixedLenByteArray(decode_out + static_cast<int64_t>(this->type_length_) * i);
Expand Down
87 changes: 84 additions & 3 deletions cpp/src/parquet/encoder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,8 @@ class PlainEncoder : public EncoderImpl, virtual public TypedEncoder<DType> {

void UnsafePutByteArray(const void* data, uint32_t length) {
DCHECK(length == 0 || data != nullptr) << "Value ptr cannot be NULL";
sink_.UnsafeAppend(&length, sizeof(uint32_t));
uint32_t length_le = ::arrow::bit_util::ToLittleEndian(length);
sink_.UnsafeAppend(&length_le, sizeof(uint32_t));
sink_.UnsafeAppend(data, static_cast<int64_t>(length));
unencoded_byte_array_data_bytes_ += length;
}
Expand Down Expand Up @@ -201,7 +202,27 @@ class PlainEncoder : public EncoderImpl, virtual public TypedEncoder<DType> {
template <typename DType>
void PlainEncoder<DType>::Put(const T* buffer, int num_values) {
if (num_values > 0) {
#if ARROW_LITTLE_ENDIAN
PARQUET_THROW_NOT_OK(sink_.Append(buffer, num_values * sizeof(T)));
#else
// On big-endian systems, we need to byte-swap each value
// since Parquet data must be stored in little-endian format.
if constexpr (std::is_same_v<T, int32_t> || std::is_same_v<T, uint32_t> ||
std::is_same_v<T, int64_t> || std::is_same_v<T, uint64_t> ||
std::is_same_v<T, float> || std::is_same_v<T, double>) {
PARQUET_ASSIGN_OR_THROW(
auto temp_buffer,
::arrow::AllocateBuffer(num_values * sizeof(T), this->memory_pool()));
Comment on lines +213 to +215
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wow, can you use a small-sized stack-allocated buffer instead? (for example a std::array<T, 128>).

T* temp_data = temp_buffer->template mutable_data_as<T>();
for (int i = 0; i < num_values; ++i) {
temp_data[i] = ::arrow::bit_util::ToLittleEndian(buffer[i]);
}
PARQUET_THROW_NOT_OK(sink_.Append(temp_data, num_values * sizeof(T)));
} else {
// For other types (Int96, etc.), just do memcpy
PARQUET_THROW_NOT_OK(sink_.Append(buffer, num_values * sizeof(T)));
}
#endif
}
}

Expand All @@ -224,6 +245,7 @@ void DirectPutImpl(const ::arrow::Array& values, ::arrow::BufferBuilder* sink) {
constexpr auto value_size = sizeof(value_type);
auto raw_values = checked_cast<const ArrayType&>(values).raw_values();

#if ARROW_LITTLE_ENDIAN
if (values.null_count() == 0) {
// no nulls, just dump the data
PARQUET_THROW_NOT_OK(sink->Append(raw_values, values.length() * value_size));
Expand All @@ -237,6 +259,32 @@ void DirectPutImpl(const ::arrow::Array& values, ::arrow::BufferBuilder* sink) {
}
}
}
#else
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ToLittleEndian works on all systems, please let's not duplicate codepaths for no good reason.

// On big-endian systems, we need to byte-swap each value
// since Parquet data must be stored in little-endian format.
if constexpr (std::is_same_v<value_type, int32_t> ||
std::is_same_v<value_type, uint32_t> ||
std::is_same_v<value_type, int64_t> ||
std::is_same_v<value_type, uint64_t> ||
std::is_same_v<value_type, float> || std::is_same_v<value_type, double>) {
PARQUET_THROW_NOT_OK(
sink->Reserve((values.length() - values.null_count()) * value_size));
for (int64_t i = 0; i < values.length(); i++) {
if (values.IsValid(i)) {
auto le_value = ::arrow::bit_util::ToLittleEndian(raw_values[i]);
sink->UnsafeAppend(&le_value, value_size);
}
}
} else {
PARQUET_THROW_NOT_OK(
sink->Reserve((values.length() - values.null_count()) * value_size));
for (int64_t i = 0; i < values.length(); i++) {
if (values.IsValid(i)) {
sink->UnsafeAppend(&raw_values[i], value_size);
}
}
}
#endif
}

template <>
Expand Down Expand Up @@ -649,17 +697,36 @@ class DictEncoderImpl : public EncoderImpl, virtual public DictEncoder<DType> {

template <typename DType>
void DictEncoderImpl<DType>::WriteDict(uint8_t* buffer) const {
// For primitive types, only a memcpy
// For primitive types, copy values with endianness conversion
DCHECK_EQ(static_cast<size_t>(dict_encoded_size_), sizeof(T) * memo_table_.size());
#if ARROW_LITTLE_ENDIAN
memo_table_.CopyValues(0 /* start_pos */, reinterpret_cast<T*>(buffer));
#else
// On big-endian systems, we need to byte-swap each value
// since Parquet data must be stored in little-endian format.
if constexpr (std::is_same_v<T, int32_t> || std::is_same_v<T, uint32_t> ||
std::is_same_v<T, int64_t> || std::is_same_v<T, uint64_t> ||
std::is_same_v<T, float> || std::is_same_v<T, double>) {
std::vector<T> temp(memo_table_.size());
memo_table_.CopyValues(0 /* start_pos */, temp.data());
T* out = reinterpret_cast<T*>(buffer);
for (size_t i = 0; i < temp.size(); ++i) {
out[i] = ::arrow::bit_util::ToLittleEndian(temp[i]);
}
} else {
// For other types (Int96, etc.), just do memcpy
memo_table_.CopyValues(0 /* start_pos */, reinterpret_cast<T*>(buffer));
}
#endif
}

// ByteArray and FLBA already have the dictionary encoded in their data heaps
template <>
void DictEncoderImpl<ByteArrayType>::WriteDict(uint8_t* buffer) const {
memo_table_.VisitValues(0, [&buffer](::std::string_view v) {
uint32_t len = static_cast<uint32_t>(v.length());
memcpy(buffer, &len, sizeof(len));
uint32_t len_le = ::arrow::bit_util::ToLittleEndian(len);
memcpy(buffer, &len_le, sizeof(len_le));
buffer += sizeof(len);
memcpy(buffer, v.data(), len);
buffer += len;
Expand Down Expand Up @@ -924,6 +991,8 @@ class ByteStreamSplitEncoder : public ByteStreamSplitEncoderBase<DType> {

void Put(const T* buffer, int num_values) override {
if (num_values > 0) {
// ByteStreamSplitEncode (DoSplitStreams) handles endianness correctly,
// so we can directly append the native byte representation
PARQUET_THROW_NOT_OK(
this->sink_.Append(reinterpret_cast<const uint8_t*>(buffer),
num_values * static_cast<int64_t>(sizeof(T))));
Expand Down Expand Up @@ -964,10 +1033,22 @@ class ByteStreamSplitEncoder<FLBAType> : public ByteStreamSplitEncoderBase<FLBAT
if (byte_width_ > 0) {
const int64_t total_bytes = static_cast<int64_t>(num_values) * byte_width_;
PARQUET_THROW_NOT_OK(sink_.Reserve(total_bytes));
#if !ARROW_LITTLE_ENDIAN
// On big-endian, reverse bytes before encoding to compensate for
// DoSplitStreams reversal, ensuring FLBA bytes are preserved as-is
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same comment as for the decoder part.

std::vector<uint8_t> temp_buffer(byte_width_);
#endif
for (int i = 0; i < num_values; ++i) {
// Write the result to the output stream
DCHECK(buffer[i].ptr != nullptr) << "Value ptr cannot be NULL";
#if !ARROW_LITTLE_ENDIAN
// Reverse bytes before appending
std::reverse_copy(buffer[i].ptr, buffer[i].ptr + byte_width_,
temp_buffer.begin());
sink_.UnsafeAppend(temp_buffer.data(), byte_width_);
#else
sink_.UnsafeAppend(buffer[i].ptr, byte_width_);
#endif
}
}
this->num_values_in_buffer_ += num_values;
Expand Down