Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion cpp/src/parquet/column_reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
#include "arrow/util/checked_cast.h"
#include "arrow/util/compression.h"
#include "arrow/util/crc32.h"
#include "arrow/util/endian.h"
#include "arrow/util/int_util_overflow.h"
#include "arrow/util/logging.h"
#include "arrow/util/rle_encoding_internal.h"
Expand Down Expand Up @@ -112,7 +113,8 @@ int LevelDecoder::SetData(Encoding::type encoding, int16_t max_level,
if (data_size < 4) {
throw ParquetException("Received invalid levels (corrupt data page?)");
}
num_bytes = ::arrow::util::SafeLoadAs<int32_t>(data);
num_bytes =
::arrow::bit_util::FromLittleEndian(::arrow::util::SafeLoadAs<int32_t>(data));
if (num_bytes < 0 || num_bytes > data_size - 4) {
throw ParquetException("Received invalid number of bytes (corrupt data page?)");
}
Expand Down
43 changes: 41 additions & 2 deletions cpp/src/parquet/column_writer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -953,7 +953,8 @@ int64_t ColumnWriterImpl::RleEncodeLevels(const void* src_buffer,
DCHECK_EQ(encoded, num_buffered_values_);

if (include_length_prefix) {
reinterpret_cast<int32_t*>(dest_buffer->mutable_data())[0] = level_encoder_.len();
::arrow::util::SafeStore(dest_buffer->mutable_data(),
::arrow::bit_util::ToLittleEndian(level_encoder_.len()));
}

return level_encoder_.len() + prefix_size;
Expand Down Expand Up @@ -2578,13 +2579,31 @@ struct SerializeFunctor<
if constexpr (std::is_same_v<ArrowType, ::arrow::Decimal64Type>) {
*p++ = ::arrow::bit_util::ToBigEndian(u64_in[0]);
} else if constexpr (std::is_same_v<ArrowType, ::arrow::Decimal128Type>) {
#if ARROW_LITTLE_ENDIAN
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please take a step back and read the comments above:

// Requires a custom serializer because decimal in parquet are in big-endian
// format. Thus, a temporary local buffer is required.

If we're on a big-endian system, this entire code is unnecessary and we can just use the FIXED_LEN_BYTE_ARRAY SerializeFunctor.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@pitrou.. Thanks for your review comments.. I will probably work on this change in the next pass. Thanks..

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Vishwanatha-HD Please don't resolve discussions until they are actually resolved. This one hasn't been addressed.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@pitrou.. Ok.. sure.. thanks..

@Vishwanatha-HD Please don't resolve discussions until they are actually resolved. This one hasn't been addressed.

Copy link
Contributor Author

@Vishwanatha-HD Vishwanatha-HD Nov 28, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@pitrou.. I have rebased this to git main and removed the below piece of code..

-#if ARROW_LITTLE_ENDIAN
-      if (num_bytes < 0 || num_bytes > data_size - 4) {
-#else
       if (num_bytes < 0 || num_bytes > data_size) {            ----------->>>>> Only retaining this line now.. 
-#endif

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Vishwanatha-HD Sure, but you haven't addressed the initial comment.

// On little-endian: u64_in[0] = low, u64_in[1] = high
// Write high first for big-endian output
*p++ = ::arrow::bit_util::ToBigEndian(u64_in[1]);
*p++ = ::arrow::bit_util::ToBigEndian(u64_in[0]);
#else
// On big-endian: u64_in[0] = high, u64_in[1] = low
// Write high first for big-endian output
*p++ = ::arrow::bit_util::ToBigEndian(u64_in[0]);
*p++ = ::arrow::bit_util::ToBigEndian(u64_in[1]);
#endif
} else if constexpr (std::is_same_v<ArrowType, ::arrow::Decimal256Type>) {
#if ARROW_LITTLE_ENDIAN
// On little-endian: write words in reverse order (high to low)
*p++ = ::arrow::bit_util::ToBigEndian(u64_in[3]);
*p++ = ::arrow::bit_util::ToBigEndian(u64_in[2]);
*p++ = ::arrow::bit_util::ToBigEndian(u64_in[1]);
*p++ = ::arrow::bit_util::ToBigEndian(u64_in[0]);
#else
// On big-endian: write words in natural order (high to low)
*p++ = ::arrow::bit_util::ToBigEndian(u64_in[0]);
*p++ = ::arrow::bit_util::ToBigEndian(u64_in[1]);
*p++ = ::arrow::bit_util::ToBigEndian(u64_in[2]);
*p++ = ::arrow::bit_util::ToBigEndian(u64_in[3]);
#endif
}
scratch = reinterpret_cast<uint8_t*>(p);
}
Expand All @@ -2603,7 +2622,24 @@ struct SerializeFunctor<
template <>
struct SerializeFunctor<::parquet::FLBAType, ::arrow::HalfFloatType> {
Status Serialize(const ::arrow::HalfFloatArray& array, ArrowWriteContext*, FLBA* out) {
#if ARROW_LITTLE_ENDIAN
return SerializeLittleEndianValues(array, array.raw_values(), out);
#else
const uint16_t* values = array.raw_values();
const int64_t length = array.length();
Copy link
Member

@kiszk kiszk Dec 3, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we create or do versioning SerializeLittleEndianValues() for big-endian, which calls ::arrow::bit_util::ToLittleEndian() when &values[i] is called?
Then we can use it instead of storing converted values to a vector? This can reduce memory accesses.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We already have endian-fixing logic for FLBA in the decimal serializer above. It should be factored out and reused instead of reinventing something different here.

converted_values_.resize(length);
for (int64_t i = 0; i < length; ++i) {
// We don't need IsValid() here. Non valid values are just ignored in
// SerializeLittleEndianValues().
converted_values_[i] = ::arrow::bit_util::ToLittleEndian(values[i]);
}
return SerializeLittleEndianValues(array, converted_values_.data(), out);
#endif
}

private:
Status SerializeLittleEndianValues(const ::arrow::HalfFloatArray& array,
const uint16_t* values, FLBA* out) {
if (array.null_count() == 0) {
for (int64_t i = 0; i < array.length(); ++i) {
out[i] = ToFLBA(&values[i]);
Expand All @@ -2616,10 +2652,13 @@ struct SerializeFunctor<::parquet::FLBAType, ::arrow::HalfFloatType> {
return Status::OK();
}

private:
FLBA ToFLBA(const uint16_t* value_ptr) const {
return FLBA{reinterpret_cast<const uint8_t*>(value_ptr)};
}

#if !ARROW_LITTLE_ENDIAN
std::vector<uint16_t> converted_values_;
#endif
};

template <>
Expand Down
7 changes: 6 additions & 1 deletion cpp/src/parquet/column_writer.h
Original file line number Diff line number Diff line change
Expand Up @@ -267,9 +267,14 @@ inline void ArrowTimestampToImpalaTimestamp(const int64_t time, Int96* impala_ti
}
impala_timestamp->value[2] = static_cast<uint32_t>(julian_days);
uint64_t last_day_nanos = static_cast<uint64_t>(last_day_units) * NanosecondsPerUnit;
#if ARROW_LITTLE_ENDIAN
// impala_timestamp will be unaligned every other entry so do memcpy instead
// of assign and reinterpret cast to avoid undefined behavior.
std::memcpy(impala_timestamp, &last_day_nanos, sizeof(uint64_t));
std::memcpy(impala_timestamp, &last_day_nanos, sizeof(int64_t));
#else
(*impala_timestamp).value[0] = static_cast<uint32_t>(last_day_nanos);
(*impala_timestamp).value[1] = static_cast<uint32_t>(last_day_nanos >> 32);
Comment on lines +275 to +276
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this not changing the endianness of values?

#endif
}

constexpr int64_t kSecondsInNanos = INT64_C(1000000000);
Expand Down
Loading