-
Notifications
You must be signed in to change notification settings - Fork 3.9k
GH-48204: [C++][Parquet] Fix Column Reader & Writer logic to enable Parquet DB support on s390x #48205
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
|
|
…support on s390x
82d9390 to
9959543
Compare
| int64_t julian_days = (time / UnitPerDay) + kJulianEpochOffsetDays; | ||
| #if ARROW_LITTLE_ENDIAN | ||
| (*impala_timestamp).value[2] = (uint32_t)julian_days; | ||
| #endif |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we need this #if?
It seems that `(*impala_timestamp).value[2] = static_cast<uint32_t>(julian_days);` below does the same thing.
| auto last_day_nanos = last_day_units * NanosecondsPerUnit; | ||
| #if ARROW_LITTLE_ENDIAN | ||
| // impala_timestamp will be unaligned every other entry so do memcpy instead | ||
| // of assign and reinterpret cast to avoid undefined behavior. | ||
| std::memcpy(impala_timestamp, &last_day_nanos, sizeof(int64_t)); | ||
| #else | ||
| (*impala_timestamp).value[0] = static_cast<uint32_t>(last_day_nanos); | ||
| (*impala_timestamp).value[1] = static_cast<uint32_t>(last_day_nanos >> 32); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we use the following instead of #if?
auto last_day_nanos = last_day_units * NanosecondsPerUnit;
auto last_day_nanos_little_endian = ::arrow::bit_util::ToLittleEndian(last_day_nanos);
std::memcpy(impala_timestamp, &last_day_nanos_little_endian, sizeof(int64_t));
| #else | ||
| template <> | ||
| struct SerializeFunctor<::parquet::FLBAType, ::arrow::HalfFloatType> { | ||
| Status Serialize(const ::arrow::HalfFloatArray& array, ArrowWriteContext*, FLBA* out) { | ||
| const uint16_t* values = array.raw_values(); | ||
| const int64_t length = array.length(); | ||
|
|
||
| // Allocate buffer for little-endian converted values | ||
| converted_values_.resize(length); | ||
|
|
||
| if (array.null_count() == 0) { | ||
| for (int64_t i = 0; i < length; ++i) { | ||
| converted_values_[i] = ::arrow::bit_util::ToLittleEndian(values[i]); | ||
| out[i] = FLBA{reinterpret_cast<const uint8_t*>(&converted_values_[i])}; | ||
| } | ||
| } else { | ||
| for (int64_t i = 0; i < length; ++i) { | ||
| if (array.IsValid(i)) { | ||
| converted_values_[i] = ::arrow::bit_util::ToLittleEndian(values[i]); | ||
| out[i] = FLBA{reinterpret_cast<const uint8_t*>(&converted_values_[i])}; | ||
| } else { | ||
| out[i] = FLBA{}; | ||
| } | ||
| } | ||
| } | ||
| return Status::OK(); | ||
| } | ||
|
|
||
| private: | ||
| std::vector<uint16_t> converted_values_; | ||
| }; | ||
| #endif |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could you share the implementation as much as possible, something like:
template <>
struct SerializeFunctor<::parquet::FLBAType, ::arrow::HalfFloatType> {
Status Serialize(const ::arrow::HalfFloatArray& array, ArrowWriteContext*, FLBA* out) {
#if ARROW_LITTLE_ENDIAN
return SerializeLittleEndianValues(array, array.raw_values(), out);
#else
const uint16_t* values = array.raw_values();
const int64_t length = array.length();
converted_values_.resize(length);
for (int64_t i = 0; i < length; ++i) {
// We don't need IsValid() here. Invalid values are just ignored in SerializeLittleEndianValues().
converted_values_[i] = ::arrow::bit_util::ToLittleEndian(values[i]);
}
return SerializeLittleEndianValues(array, converted_values_.data(), out);
#endif
}
private:
Status SerializeLittleEndianValues(const ::arrow::HalfFloatArray& array, const uint16_t* values, FLBA* out) {
if (array.null_count() == 0) {
for (int64_t i = 0; i < array.length(); ++i) {
out[i] = ToFLBA(&values[i]);
}
} else {
for (int64_t i = 0; i < array.length(); ++i) {
out[i] = array.IsValid(i) ? ToFLBA(&values[i]) : FLBA{};
}
}
return Status::OK();
}
FLBA ToFLBA(const uint16_t* value_ptr) const {
return FLBA{reinterpret_cast<const uint8_t*>(value_ptr)};
}
#if !ARROW_LITTLE_ENDIAN
std::vector<uint16_t> converted_values_;
#endif
};
| #if ARROW_LITTLE_ENDIAN | ||
| if (num_bytes < 0 || num_bytes > data_size - 4) { | ||
| #else | ||
| if (num_bytes < 0 || num_bytes > data_size) { | ||
| #endif |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Rationale for this change
This PR is intended to enable Parquet DB support on big-endian (s390x) systems. It fixes the column reader & writer logic, which is exercised by most of the parquet and arrow-parquet test cases.
What changes are included in this PR?
The fix includes changes to following files:
cpp/src/parquet/column_reader.cc
cpp/src/parquet/column_writer.cc
cpp/src/parquet/column_writer.h
Are these changes tested?
Yes. The changes are tested on s390x arch to make sure things are working fine. The fix is also tested on x86 arch, to make sure there is no new regression introduced.
Are there any user-facing changes?
No
GitHub main Issue link: #48151