diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc
index 8ecb774022f..d6ff09b710b 100644
--- a/cpp/src/parquet/column_reader.cc
+++ b/cpp/src/parquet/column_reader.cc
@@ -41,6 +41,7 @@
 #include "arrow/util/checked_cast.h"
 #include "arrow/util/compression.h"
 #include "arrow/util/crc32.h"
+#include "arrow/util/endian.h"
 #include "arrow/util/int_util_overflow.h"
 #include "arrow/util/logging.h"
 #include "arrow/util/rle_encoding_internal.h"
@@ -112,7 +113,8 @@ int LevelDecoder::SetData(Encoding::type encoding, int16_t max_level,
       if (data_size < 4) {
         throw ParquetException("Received invalid levels (corrupt data page?)");
       }
-      num_bytes = ::arrow::util::SafeLoadAs<int32_t>(data);
+      num_bytes =
+          ::arrow::bit_util::FromLittleEndian(::arrow::util::SafeLoadAs<int32_t>(data));
       if (num_bytes < 0 || num_bytes > data_size - 4) {
         throw ParquetException("Received invalid number of bytes (corrupt data page?)");
       }
@@ -132,7 +134,11 @@ int LevelDecoder::SetData(Encoding::type encoding, int16_t max_level,
             "Number of buffered values too large (corrupt data page?)");
       }
       num_bytes = static_cast<int32_t>(bit_util::BytesForBits(num_bits));
+#if ARROW_LITTLE_ENDIAN
       if (num_bytes < 0 || num_bytes > data_size - 4) {
+#else
+      if (num_bytes < 0 || num_bytes > data_size) {
+#endif
         throw ParquetException("Received invalid number of bytes (corrupt data page?)");
       }
       if (!bit_packed_decoder_) {
diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc
index 94b67dfa807..8e07d67008f 100644
--- a/cpp/src/parquet/column_writer.cc
+++ b/cpp/src/parquet/column_writer.cc
@@ -953,7 +953,8 @@ int64_t ColumnWriterImpl::RleEncodeLevels(const void* src_buffer,
   DCHECK_EQ(encoded, num_buffered_values_);
 
   if (include_length_prefix) {
-    reinterpret_cast<int32_t*>(dest_buffer->mutable_data())[0] = level_encoder_.len();
+    ::arrow::util::SafeStore(dest_buffer->mutable_data(),
+                             ::arrow::bit_util::ToLittleEndian(level_encoder_.len()));
   }
 
   return level_encoder_.len() + prefix_size;
@@ -2578,13 +2579,31 @@ struct SerializeFunctor<
     if constexpr (std::is_same_v<ArrowType, ::arrow::Decimal64Type>) {
       *p++ = ::arrow::bit_util::ToBigEndian(u64_in[0]);
     } else if constexpr (std::is_same_v<ArrowType, ::arrow::Decimal128Type>) {
+#if ARROW_LITTLE_ENDIAN
+      // On little-endian: u64_in[0] = low, u64_in[1] = high
+      // Write high first for big-endian output
       *p++ = ::arrow::bit_util::ToBigEndian(u64_in[1]);
       *p++ = ::arrow::bit_util::ToBigEndian(u64_in[0]);
+#else
+      // On big-endian: u64_in[0] = high, u64_in[1] = low
+      // Write high first for big-endian output
+      *p++ = ::arrow::bit_util::ToBigEndian(u64_in[0]);
+      *p++ = ::arrow::bit_util::ToBigEndian(u64_in[1]);
+#endif
     } else if constexpr (std::is_same_v<ArrowType, ::arrow::Decimal256Type>) {
+#if ARROW_LITTLE_ENDIAN
+      // On little-endian: write words in reverse order (high to low)
       *p++ = ::arrow::bit_util::ToBigEndian(u64_in[3]);
       *p++ = ::arrow::bit_util::ToBigEndian(u64_in[2]);
       *p++ = ::arrow::bit_util::ToBigEndian(u64_in[1]);
       *p++ = ::arrow::bit_util::ToBigEndian(u64_in[0]);
+#else
+      // On big-endian: write words in natural order (high to low)
+      *p++ = ::arrow::bit_util::ToBigEndian(u64_in[0]);
+      *p++ = ::arrow::bit_util::ToBigEndian(u64_in[1]);
+      *p++ = ::arrow::bit_util::ToBigEndian(u64_in[2]);
+      *p++ = ::arrow::bit_util::ToBigEndian(u64_in[3]);
+#endif
     }
     scratch = reinterpret_cast<int64_t*>(p);
   }
@@ -2603,7 +2622,24 @@ struct SerializeFunctor<::parquet::FLBAType, ::arrow::HalfFloatType> {
 template <>
 struct SerializeFunctor<::parquet::FLBAType, ::arrow::HalfFloatType> {
   Status Serialize(const ::arrow::HalfFloatArray& array, ArrowWriteContext*, FLBA* out) {
+#if ARROW_LITTLE_ENDIAN
+    return SerializeLittleEndianValues(array, array.raw_values(), out);
+#else
     const uint16_t* values = array.raw_values();
+    const int64_t length = array.length();
+    converted_values_.resize(length);
+    for (int64_t i = 0; i < length; ++i) {
+      // We don't need IsValid() here. Non valid values are just ignored in
+      // SerializeLittleEndianValues().
+      converted_values_[i] = ::arrow::bit_util::ToLittleEndian(values[i]);
+    }
+    return SerializeLittleEndianValues(array, converted_values_.data(), out);
+#endif
+  }
+
+ private:
+  Status SerializeLittleEndianValues(const ::arrow::HalfFloatArray& array,
+                                     const uint16_t* values, FLBA* out) {
     if (array.null_count() == 0) {
       for (int64_t i = 0; i < array.length(); ++i) {
         out[i] = ToFLBA(&values[i]);
@@ -2616,10 +2652,13 @@ struct SerializeFunctor<::parquet::FLBAType, ::arrow::HalfFloatType> {
     return Status::OK();
   }
 
- private:
   FLBA ToFLBA(const uint16_t* value_ptr) const {
     return FLBA{reinterpret_cast<const uint8_t*>(value_ptr)};
   }
+
+#if !ARROW_LITTLE_ENDIAN
+  std::vector<uint16_t> converted_values_;
+#endif
 };
 
 template <>
diff --git a/cpp/src/parquet/column_writer.h b/cpp/src/parquet/column_writer.h
index 2a046a0ca5d..6173411d2fe 100644
--- a/cpp/src/parquet/column_writer.h
+++ b/cpp/src/parquet/column_writer.h
@@ -260,13 +260,18 @@ constexpr int64_t kJulianEpochOffsetDays = INT64_C(2440588);
 template <int64_t UnitPerDay, int64_t NanosecondsPerUnit>
 inline void ArrowTimestampToImpalaTimestamp(const int64_t time, Int96* impala_timestamp) {
   int64_t julian_days = (time / UnitPerDay) + kJulianEpochOffsetDays;
-  (*impala_timestamp).value[2] = (uint32_t)julian_days;
+  (*impala_timestamp).value[2] = static_cast<uint32_t>(julian_days);
 
   int64_t last_day_units = time % UnitPerDay;
   auto last_day_nanos = last_day_units * NanosecondsPerUnit;
+#if ARROW_LITTLE_ENDIAN
   // impala_timestamp will be unaligned every other entry so do memcpy instead
   // of assign and reinterpret cast to avoid undefined behavior.
   std::memcpy(impala_timestamp, &last_day_nanos, sizeof(int64_t));
+#else
+  (*impala_timestamp).value[0] = static_cast<uint32_t>(last_day_nanos);
+  (*impala_timestamp).value[1] = static_cast<uint32_t>(last_day_nanos >> 32);
+#endif
 }
 
 constexpr int64_t kSecondsInNanos = INT64_C(1000000000);
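
Note on the Decimal128 branch in the column_writer.cc hunk above: the added comments say that Arrow holds the two 64-bit words of a Decimal128 in native order (low/high on little-endian hosts, high/low on big-endian hosts), while Parquet's FIXED_LEN_BYTE_ARRAY decimals must be big-endian bytes, so the word order has to be chosen per host. The following standalone sketch (not part of the patch) illustrates that reasoning; ToBigEndian64 and SerializeDecimal128 are hypothetical stand-ins for ::arrow::bit_util::ToBigEndian and the serializer in the diff, and C++20 is assumed for std::endian.

// Minimal sketch: serialize a 128-bit value, stored as two native-order
// 64-bit words, into 16 big-endian bytes on either kind of host.
#include <bit>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Stand-in for ::arrow::bit_util::ToBigEndian on 64-bit values: identity on
// big-endian hosts, byte swap on little-endian hosts.
uint64_t ToBigEndian64(uint64_t v) {
  if constexpr (std::endian::native == std::endian::big) return v;
  uint64_t r = 0;
  for (int i = 0; i < 8; ++i) r = (r << 8) | ((v >> (8 * i)) & 0xff);
  return r;
}

// words[] is the 128-bit value in native word order, as Arrow stores Decimal128.
void SerializeDecimal128(const uint64_t words[2], uint8_t out[16]) {
  uint64_t be[2];
  if constexpr (std::endian::native == std::endian::little) {
    // words[0] = low, words[1] = high: emit the high word first.
    be[0] = ToBigEndian64(words[1]);
    be[1] = ToBigEndian64(words[0]);
  } else {
    // words[0] = high, words[1] = low: already high first.
    be[0] = ToBigEndian64(words[0]);
    be[1] = ToBigEndian64(words[1]);
  }
  std::memcpy(out, be, sizeof(be));
}

int main() {
  // The value 0x0102030405060708090a0b0c0d0e0f10 expressed as native words.
  const uint64_t high = 0x0102030405060708ULL, low = 0x090a0b0c0d0e0f10ULL;
  uint64_t words[2];
  if constexpr (std::endian::native == std::endian::little) {
    words[0] = low;
    words[1] = high;
  } else {
    words[0] = high;
    words[1] = low;
  }
  uint8_t out[16];
  SerializeDecimal128(words, out);
  for (uint8_t b : out) std::printf("%02x", static_cast<unsigned>(b));
  std::printf("\n");
  return 0;
}

On either host the program prints 0102030405060708090a0b0c0d0e0f10, i.e. the same big-endian byte sequence, which is the property the per-endianness branches in the patch are there to preserve.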