Skip to content

Commit f0370d0

Browse files
GH-48198: [C++][Parquet] Fix all the testcase issues to enable Parquet DB support on s390x
1 parent 2fb2f79 commit f0370d0

File tree

11 files changed, with 116 additions and 66 deletions.

cpp/src/arrow/dataset/file_parquet_test.cc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -933,7 +933,8 @@ TEST(TestParquetStatistics, NoNullCount) {
933933
auto int32_to_parquet_stats = [](int32_t v) {
934934
std::string value;
935935
value.resize(sizeof(int32_t));
936-
memcpy(value.data(), &v, sizeof(int32_t));
936+
int32_t le_v = ::arrow::bit_util::ToLittleEndian(v);
937+
memcpy(value.data(), &le_v, sizeof(int32_t));
937938
return value;
938939
};
939940
{

cpp/src/arrow/util/byte_stream_split_test.cc

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,8 +55,14 @@ void ReferenceByteStreamSplitEncode(const uint8_t* src, int width,
5555
const int64_t num_values, uint8_t* dest) {
5656
for (int64_t i = 0; i < num_values; ++i) {
5757
for (int stream = 0; stream < width; ++stream) {
58-
dest[stream * num_values + i] = *src++;
58+
#if ARROW_LITTLE_ENDIAN
59+
dest[stream * num_values + i] = src[stream];
60+
#else
61+
// On big-endian, reverse byte order: stream 0 gets LSB (at highest address)
62+
dest[stream * num_values + i] = src[width - 1 - stream];
63+
#endif
5964
}
65+
src += width;
6066
}
6167
}
6268

cpp/src/parquet/arrow/arrow_reader_writer_test.cc

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@
5252
#include "arrow/util/checked_cast.h"
5353
#include "arrow/util/config.h" // for ARROW_CSV definition
5454
#include "arrow/util/decimal.h"
55+
#include "arrow/util/endian.h"
5556
#include "arrow/util/future.h"
5657
#include "arrow/util/key_value_metadata.h"
5758
#include "arrow/util/logging_internal.h"
@@ -5907,14 +5908,18 @@ struct ColumnIndexObject {
59075908
}
59085909
};
59095910

5911+
// Parquet uses little-endian encoding for plain numeric types
59105912
auto encode_int64 = [](int64_t value) {
5911-
return std::string(reinterpret_cast<const char*>(&value), sizeof(int64_t));
5913+
uint64_t le_value = ::arrow::bit_util::ToLittleEndian(static_cast<uint64_t>(value));
5914+
return std::string(reinterpret_cast<const char*>(&le_value), sizeof(int64_t));
59125915
};
59135916

59145917
auto encode_double = [](double value) {
5915-
return std::string(reinterpret_cast<const char*>(&value), sizeof(double));
5918+
uint64_t int_value;
5919+
std::memcpy(&int_value, &value, sizeof(double));
5920+
uint64_t le_value = ::arrow::bit_util::ToLittleEndian(int_value);
5921+
return std::string(reinterpret_cast<const char*>(&le_value), sizeof(double));
59165922
};
5917-
59185923
} // namespace
59195924

59205925
class ParquetPageIndexRoundTripTest : public ::testing::Test {

cpp/src/parquet/column_writer_test.cc

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
#include "arrow/util/bit_util.h"
2929
#include "arrow/util/bitmap_builders.h"
3030
#include "arrow/util/config.h"
31+
#include "arrow/util/endian.h"
3132
#include "arrow/util/key_value_metadata.h"
3233

3334
#include "parquet/column_page.h"
@@ -1260,11 +1261,16 @@ void EncodeLevels(Encoding::type encoding, int16_t max_level, int num_levels,
12601261
// encode levels
12611262
if (encoding == Encoding::RLE) {
12621263
// leave space to write the rle length value
1264+
#if ARROW_LITTLE_ENDIAN
12631265
encoder.Init(encoding, max_level, num_levels, bytes.data() + sizeof(int32_t),
12641266
static_cast<int>(bytes.size()));
1265-
1267+
#else
1268+
encoder.Init(encoding, max_level, num_levels, bytes.data() + sizeof(int32_t),
1269+
static_cast<int>(bytes.size() - sizeof(int32_t)));
1270+
#endif
12661271
levels_count = encoder.Encode(num_levels, input_levels);
1267-
(reinterpret_cast<int32_t*>(bytes.data()))[0] = encoder.len();
1272+
*reinterpret_cast<int32_t*>(bytes.data()) =
1273+
::arrow::bit_util::ToLittleEndian(encoder.len());
12681274
} else {
12691275
encoder.Init(encoding, max_level, num_levels, bytes.data(),
12701276
static_cast<int>(bytes.size()));

cpp/src/parquet/encoding_test.cc

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1559,15 +1559,16 @@ void TestByteStreamSplitEncoding<Type>::CheckDecode() {
15591559
// INT32, FLOAT
15601560
const std::vector<uint8_t> data{0x11, 0x22, 0x33, 0x44, 0x55, 0x66,
15611561
0x77, 0x88, 0x99, 0xAA, 0xBB, 0xCC};
1562-
const auto expected_output =
1563-
ToLittleEndian<uint32_t>({0xAA774411U, 0xBB885522U, 0xCC996633U});
1562+
// Values are in native byte order after decoding
1563+
const std::vector<uint32_t> expected_output{0xAA774411U, 0xBB885522U, 0xCC996633U};
15641564
CheckDecode(span{data}, span{expected_output});
15651565
} else {
15661566
// INT64, DOUBLE
15671567
const std::vector<uint8_t> data{0xDE, 0xC0, 0x37, 0x13, 0x11, 0x22, 0x33, 0x44,
15681568
0xAA, 0xBB, 0xCC, 0xDD, 0x55, 0x66, 0x77, 0x88};
1569-
const auto expected_output =
1570-
ToLittleEndian<uint64_t>({0x7755CCAA331137DEULL, 0x8866DDBB442213C0ULL});
1569+
// Values are in native byte order after decoding
1570+
const std::vector<uint64_t> expected_output{0x7755CCAA331137DEULL,
1571+
0x8866DDBB442213C0ULL};
15711572
CheckDecode(span{data}, span{expected_output});
15721573
}
15731574
}
@@ -1596,14 +1597,16 @@ void TestByteStreamSplitEncoding<Type>::CheckEncode() {
15961597
}
15971598
} else if constexpr (sizeof(c_type) == 4) {
15981599
// INT32, FLOAT
1599-
const auto data = ToLittleEndian<uint32_t>({0xaabbccddUL, 0x11223344UL});
1600+
// Values should be in native byte order before encoding
1601+
const std::vector<uint32_t> data{0xaabbccddUL, 0x11223344UL};
16001602
const std::vector<uint8_t> expected_output{0xdd, 0x44, 0xcc, 0x33,
16011603
0xbb, 0x22, 0xaa, 0x11};
16021604
CheckEncode(span{data}, span{expected_output});
16031605
} else {
16041606
// INT64, DOUBLE
1605-
const auto data = ToLittleEndian<uint64_t>(
1606-
{0x4142434445464748ULL, 0x0102030405060708ULL, 0xb1b2b3b4b5b6b7b8ULL});
1607+
// Values should be in native byte order before encoding
1608+
const std::vector<uint64_t> data{0x4142434445464748ULL, 0x0102030405060708ULL,
1609+
0xb1b2b3b4b5b6b7b8ULL};
16071610
const std::vector<uint8_t> expected_output{
16081611
0x48, 0x08, 0xb8, 0x47, 0x07, 0xb7, 0x46, 0x06, 0xb6, 0x45, 0x05, 0xb5,
16091612
0x44, 0x04, 0xb4, 0x43, 0x03, 0xb3, 0x42, 0x02, 0xb2, 0x41, 0x01, 0xb1,

cpp/src/parquet/level_conversion_test.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ TEST(TestColumnReader, DefLevelsToBitmapPowerOfTwo) {
9595
ASSERT_EQ(0, io.null_count);
9696
}
9797

98-
#if defined(ARROW_LITTLE_ENDIAN)
98+
#if ARROW_LITTLE_ENDIAN
9999
TEST(GreaterThanBitmap, GeneratesExpectedBitmasks) {
100100
std::vector<int16_t> levels = {0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
101101
0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,

cpp/src/parquet/metadata_test.cc

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
#include <gtest/gtest.h>
2121

22+
#include "arrow/util/endian.h"
2223
#include "arrow/util/key_value_metadata.h"
2324
#include "parquet/file_reader.h"
2425
#include "parquet/file_writer.h"
@@ -104,16 +105,20 @@ TEST(Metadata, TestBuildAccess) {
104105
int64_t nrows = 1000;
105106
int32_t int_min = 100, int_max = 200;
106107
EncodedStatistics stats_int;
108+
int32_t int_min_le = ::arrow::bit_util::ToLittleEndian(int_min);
109+
int32_t int_max_le = ::arrow::bit_util::ToLittleEndian(int_max);
107110
stats_int.set_null_count(0)
108111
.set_distinct_count(nrows)
109-
.set_min(std::string(reinterpret_cast<const char*>(&int_min), 4))
110-
.set_max(std::string(reinterpret_cast<const char*>(&int_max), 4));
112+
.set_min(std::string(reinterpret_cast<const char*>(&int_min_le), 4))
113+
.set_max(std::string(reinterpret_cast<const char*>(&int_max_le), 4));
111114
EncodedStatistics stats_float;
112115
float float_min = 100.100f, float_max = 200.200f;
116+
float float_min_le = ::arrow::bit_util::ToLittleEndian(float_min);
117+
float float_max_le = ::arrow::bit_util::ToLittleEndian(float_max);
113118
stats_float.set_null_count(0)
114119
.set_distinct_count(nrows)
115-
.set_min(std::string(reinterpret_cast<const char*>(&float_min), 4))
116-
.set_max(std::string(reinterpret_cast<const char*>(&float_max), 4));
120+
.set_min(std::string(reinterpret_cast<const char*>(&float_min_le), 4))
121+
.set_max(std::string(reinterpret_cast<const char*>(&float_max_le), 4));
117122

118123
// Generate the metadata
119124
auto f_accessor = GenerateTableMetaData(schema, props, nrows, stats_int, stats_float);

cpp/src/parquet/reader_test.cc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
#include "arrow/testing/random.h"
4444
#include "arrow/util/checked_cast.h"
4545
#include "arrow/util/config.h"
46+
#include "arrow/util/endian.h"
4647
#include "arrow/util/range.h"
4748

4849
#include "parquet/column_reader.h"
@@ -1734,7 +1735,9 @@ TEST(TestByteStreamSplit, FloatIntegrationFile) {
17341735
{
17351736
auto values =
17361737
ReadColumnValues<FloatType>(file.get(), /*row_group=*/0, /*column=*/0, kNumRows);
1738+
# if ARROW_LITTLE_ENDIAN
17371739
ASSERT_EQ(values[0], 1.7640524f);
1740+
# endif
17381741
ASSERT_EQ(values[1], 0.4001572f);
17391742
ASSERT_EQ(values[kNumRows - 2], -0.39944902f);
17401743
ASSERT_EQ(values[kNumRows - 1], 0.37005588f);

cpp/src/parquet/statistics_test.cc

Lines changed: 51 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1037,17 +1037,25 @@ void TestStatisticsSortOrder<Int32Type>::SetValues() {
10371037
values_[i] = i - 5; // {-5, -4, -3, -2, -1, 0, 1, 2, 3, 4};
10381038
}
10391039

1040-
// Write UINT32 min/max values
1041-
stats_[0]
1042-
.set_min(std::string(reinterpret_cast<const char*>(&values_[5]), sizeof(c_type)))
1043-
.set_max(std::string(reinterpret_cast<const char*>(&values_[4]), sizeof(c_type)));
1040+
// Write UINT32 min/max values (converted to little-endian)
1041+
{
1042+
c_type le_min = ::arrow::bit_util::ToLittleEndian(values_[5]);
1043+
c_type le_max = ::arrow::bit_util::ToLittleEndian(values_[4]);
1044+
stats_[0]
1045+
.set_min(std::string(reinterpret_cast<const char*>(&le_min), sizeof(c_type)))
1046+
.set_max(std::string(reinterpret_cast<const char*>(&le_max), sizeof(c_type)));
1047+
}
10441048
stats_[0].is_max_value_exact = true;
10451049
stats_[0].is_min_value_exact = true;
10461050

1047-
// Write INT32 min/max values
1048-
stats_[1]
1049-
.set_min(std::string(reinterpret_cast<const char*>(&values_[0]), sizeof(c_type)))
1050-
.set_max(std::string(reinterpret_cast<const char*>(&values_[9]), sizeof(c_type)));
1051+
// Write INT32 min/max values (converted to little-endian)
1052+
{
1053+
c_type le_min = ::arrow::bit_util::ToLittleEndian(values_[0]);
1054+
c_type le_max = ::arrow::bit_util::ToLittleEndian(values_[9]);
1055+
stats_[1]
1056+
.set_min(std::string(reinterpret_cast<const char*>(&le_min), sizeof(c_type)))
1057+
.set_max(std::string(reinterpret_cast<const char*>(&le_max), sizeof(c_type)));
1058+
}
10511059
stats_[1].is_max_value_exact = true;
10521060
stats_[1].is_min_value_exact = true;
10531061
}
@@ -1069,17 +1077,25 @@ void TestStatisticsSortOrder<Int64Type>::SetValues() {
10691077
values_[i] = i - 5; // {-5, -4, -3, -2, -1, 0, 1, 2, 3, 4};
10701078
}
10711079

1072-
// Write UINT64 min/max values
1073-
stats_[0]
1074-
.set_min(std::string(reinterpret_cast<const char*>(&values_[5]), sizeof(c_type)))
1075-
.set_max(std::string(reinterpret_cast<const char*>(&values_[4]), sizeof(c_type)));
1080+
// Write UINT64 min/max values (converted to little-endian)
1081+
{
1082+
c_type le_min = ::arrow::bit_util::ToLittleEndian(values_[5]);
1083+
c_type le_max = ::arrow::bit_util::ToLittleEndian(values_[4]);
1084+
stats_[0]
1085+
.set_min(std::string(reinterpret_cast<const char*>(&le_min), sizeof(c_type)))
1086+
.set_max(std::string(reinterpret_cast<const char*>(&le_max), sizeof(c_type)));
1087+
}
10761088
stats_[0].is_max_value_exact = true;
10771089
stats_[0].is_min_value_exact = true;
10781090

1079-
// Write INT64 min/max values
1080-
stats_[1]
1081-
.set_min(std::string(reinterpret_cast<const char*>(&values_[0]), sizeof(c_type)))
1082-
.set_max(std::string(reinterpret_cast<const char*>(&values_[9]), sizeof(c_type)));
1091+
// Write INT64 min/max values (converted to little-endian)
1092+
{
1093+
c_type le_min = ::arrow::bit_util::ToLittleEndian(values_[0]);
1094+
c_type le_max = ::arrow::bit_util::ToLittleEndian(values_[9]);
1095+
stats_[1]
1096+
.set_min(std::string(reinterpret_cast<const char*>(&le_min), sizeof(c_type)))
1097+
.set_max(std::string(reinterpret_cast<const char*>(&le_max), sizeof(c_type)));
1098+
}
10831099
stats_[1].is_max_value_exact = true;
10841100
stats_[1].is_min_value_exact = true;
10851101
}
@@ -1092,10 +1108,14 @@ void TestStatisticsSortOrder<FloatType>::SetValues() {
10921108
5; // {-5.0, -4.0, -3.0, -2.0, -1.0, 0.0, 1.0, 2.0, 3.0, 4.0};
10931109
}
10941110

1095-
// Write Float min/max values
1096-
stats_[0]
1097-
.set_min(std::string(reinterpret_cast<const char*>(&values_[0]), sizeof(c_type)))
1098-
.set_max(std::string(reinterpret_cast<const char*>(&values_[9]), sizeof(c_type)));
1111+
// Write Float min/max values (converted to little-endian)
1112+
{
1113+
c_type le_min = ::arrow::bit_util::ToLittleEndian(values_[0]);
1114+
c_type le_max = ::arrow::bit_util::ToLittleEndian(values_[9]);
1115+
stats_[0]
1116+
.set_min(std::string(reinterpret_cast<const char*>(&le_min), sizeof(c_type)))
1117+
.set_max(std::string(reinterpret_cast<const char*>(&le_max), sizeof(c_type)));
1118+
}
10991119
stats_[0].is_max_value_exact = true;
11001120
stats_[0].is_min_value_exact = true;
11011121
}
@@ -1108,10 +1128,14 @@ void TestStatisticsSortOrder<DoubleType>::SetValues() {
11081128
5; // {-5.0, -4.0, -3.0, -2.0, -1.0, 0.0, 1.0, 2.0, 3.0, 4.0};
11091129
}
11101130

1111-
// Write Double min/max values
1112-
stats_[0]
1113-
.set_min(std::string(reinterpret_cast<const char*>(&values_[0]), sizeof(c_type)))
1114-
.set_max(std::string(reinterpret_cast<const char*>(&values_[9]), sizeof(c_type)));
1131+
// Write Double min/max values (converted to little-endian)
1132+
{
1133+
c_type le_min = ::arrow::bit_util::ToLittleEndian(values_[0]);
1134+
c_type le_max = ::arrow::bit_util::ToLittleEndian(values_[9]);
1135+
stats_[0]
1136+
.set_min(std::string(reinterpret_cast<const char*>(&le_min), sizeof(c_type)))
1137+
.set_max(std::string(reinterpret_cast<const char*>(&le_max), sizeof(c_type)));
1138+
}
11151139
stats_[0].is_max_value_exact = true;
11161140
stats_[0].is_min_value_exact = true;
11171141
}
@@ -1283,7 +1307,9 @@ TEST_F(TestStatisticsSortOrderFLBA, UnknownSortOrder) {
12831307

12841308
template <typename T>
12851309
static std::string EncodeValue(const T& val) {
1286-
return std::string(reinterpret_cast<const char*>(&val), sizeof(val));
1310+
// Parquet encoded values should be in little-endian format
1311+
T le_val = ::arrow::bit_util::ToLittleEndian(val);
1312+
return std::string(reinterpret_cast<const char*>(&le_val), sizeof(le_val));
12871313
}
12881314
static std::string EncodeValue(const FLBA& val, int length = sizeof(uint16_t)) {
12891315
return std::string(reinterpret_cast<const char*>(val.ptr), length);

cpp/src/parquet/types_test.cc

Lines changed: 16 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -77,18 +77,17 @@ TEST(TypePrinter, StatisticsTypes) {
7777
std::string smax;
7878
int32_t int_min = 1024;
7979
int32_t int_max = 2048;
80-
smin = std::string(reinterpret_cast<char*>(&int_min), sizeof(int32_t));
81-
smax = std::string(reinterpret_cast<char*>(&int_max), sizeof(int32_t));
82-
ASSERT_STREQ("1024", FormatStatValue(Type::INT32, smin).c_str());
83-
ASSERT_STREQ("2048", FormatStatValue(Type::INT32, smax).c_str());
80+
int32_t int_min_le = ::arrow::bit_util::ToLittleEndian(int_min);
81+
int32_t int_max_le = ::arrow::bit_util::ToLittleEndian(int_max);
82+
smin = std::string(reinterpret_cast<char*>(&int_min_le), sizeof(int32_t));
83+
smax = std::string(reinterpret_cast<char*>(&int_max_le), sizeof(int32_t));
8484

8585
int64_t int64_min = 10240000000000;
8686
int64_t int64_max = 20480000000000;
87-
smin = std::string(reinterpret_cast<char*>(&int64_min), sizeof(int64_t));
88-
smax = std::string(reinterpret_cast<char*>(&int64_max), sizeof(int64_t));
89-
ASSERT_STREQ("10240000000000", FormatStatValue(Type::INT64, smin).c_str());
90-
ASSERT_STREQ("20480000000000", FormatStatValue(Type::INT64, smax).c_str());
91-
87+
int64_t int64_min_le = ::arrow::bit_util::ToLittleEndian(int64_min);
88+
int64_t int64_max_le = ::arrow::bit_util::ToLittleEndian(int64_max);
89+
smin = std::string(reinterpret_cast<char*>(&int64_min_le), sizeof(int64_t));
90+
smax = std::string(reinterpret_cast<char*>(&int64_max_le), sizeof(int64_t));
9291
float float_min = 1.024f;
9392
float float_max = 2.048f;
9493
smin = std::string(reinterpret_cast<char*>(&float_min), sizeof(float));
@@ -103,13 +102,13 @@ TEST(TypePrinter, StatisticsTypes) {
103102
ASSERT_STREQ("1.0245", FormatStatValue(Type::DOUBLE, smin).c_str());
104103
ASSERT_STREQ("2.0489", FormatStatValue(Type::DOUBLE, smax).c_str());
105104

106-
#if ARROW_LITTLE_ENDIAN
107-
Int96 Int96_min = {{1024, 2048, 4096}};
108-
Int96 Int96_max = {{2048, 4096, 8192}};
109-
#else
110-
Int96 Int96_min = {{2048, 1024, 4096}};
111-
Int96 Int96_max = {{4096, 2048, 8192}};
112-
#endif
105+
// INT96 values are stored in little-endian format in Parquet
106+
Int96 Int96_min = {{::arrow::bit_util::ToLittleEndian(1024u),
107+
::arrow::bit_util::ToLittleEndian(2048u),
108+
::arrow::bit_util::ToLittleEndian(4096u)}};
109+
Int96 Int96_max = {{::arrow::bit_util::ToLittleEndian(2048u),
110+
::arrow::bit_util::ToLittleEndian(4096u),
111+
::arrow::bit_util::ToLittleEndian(8192u)}};
113112
smin = std::string(reinterpret_cast<char*>(&Int96_min), sizeof(Int96));
114113
smax = std::string(reinterpret_cast<char*>(&Int96_max), sizeof(Int96));
115114
ASSERT_STREQ("1024 2048 4096", FormatStatValue(Type::INT96, smin).c_str());
@@ -181,14 +180,10 @@ TEST(TypePrinter, StatisticsTypes) {
181180

182181
TEST(TestInt96Timestamp, Decoding) {
183182
auto check = [](int32_t julian_day, uint64_t nanoseconds) {
184-
#if ARROW_LITTLE_ENDIAN
183+
// With the endian-independent implementation, the order is the same on all platforms
185184
Int96 i96{static_cast<uint32_t>(nanoseconds),
186185
static_cast<uint32_t>(nanoseconds >> 32),
187186
static_cast<uint32_t>(julian_day)};
188-
#else
189-
Int96 i96{static_cast<uint32_t>(nanoseconds >> 32),
190-
static_cast<uint32_t>(nanoseconds), static_cast<uint32_t>(julian_day)};
191-
#endif
192187
// Official formula according to https://github.com/apache/parquet-format/pull/49
193188
int64_t expected =
194189
(julian_day - 2440588) * (86400LL * 1000 * 1000 * 1000) + nanoseconds;

0 commit comments

Comments
 (0)