GH-48198: [C++][Parquet] Fix all the testcase issues to enable Parquet DB support on s390x

Vishwanatha-HD · Vishwanatha-HD · commit f0370d08d64e · 2025-11-21T17:31:19.000+05:30
diff --git a/cpp/src/arrow/dataset/file_parquet_test.cc b/cpp/src/arrow/dataset/file_parquet_test.cc
@@ -933,7 +933,8 @@ TEST(TestParquetStatistics, NoNullCount) {
   auto int32_to_parquet_stats = [](int32_t v) {
     std::string value;
     value.resize(sizeof(int32_t));
-    memcpy(value.data(), &v, sizeof(int32_t));
+    int32_t le_v = ::arrow::bit_util::ToLittleEndian(v);
+    memcpy(value.data(), &le_v, sizeof(int32_t));
     return value;
   };
   {
diff --git a/cpp/src/arrow/util/byte_stream_split_test.cc b/cpp/src/arrow/util/byte_stream_split_test.cc
@@ -55,8 +55,14 @@ void ReferenceByteStreamSplitEncode(const uint8_t* src, int width,
                                     const int64_t num_values, uint8_t* dest) {
   for (int64_t i = 0; i < num_values; ++i) {
     for (int stream = 0; stream < width; ++stream) {
-      dest[stream * num_values + i] = *src++;
+#if ARROW_LITTLE_ENDIAN
+      dest[stream * num_values + i] = src[stream];
+#else
+      // On big-endian, reverse byte order: stream 0 gets LSB (at highest address)
+      dest[stream * num_values + i] = src[width - 1 - stream];
+#endif
     }
+    src += width;
   }
 }
 
diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -52,6 +52,7 @@
 #include "arrow/util/checked_cast.h"
 #include "arrow/util/config.h"  // for ARROW_CSV definition
 #include "arrow/util/decimal.h"
+#include "arrow/util/endian.h"
 #include "arrow/util/future.h"
 #include "arrow/util/key_value_metadata.h"
 #include "arrow/util/logging_internal.h"
@@ -5907,14 +5908,18 @@ struct ColumnIndexObject {
   }
 };
 
+// Parquet uses little-endian encoding for plain numeric types
 auto encode_int64 = [](int64_t value) {
-  return std::string(reinterpret_cast<const char*>(&value), sizeof(int64_t));
+  uint64_t le_value = ::arrow::bit_util::ToLittleEndian(static_cast<uint64_t>(value));
+  return std::string(reinterpret_cast<const char*>(&le_value), sizeof(int64_t));
 };
 
 auto encode_double = [](double value) {
-  return std::string(reinterpret_cast<const char*>(&value), sizeof(double));
+  uint64_t int_value;
+  std::memcpy(&int_value, &value, sizeof(double));
+  uint64_t le_value = ::arrow::bit_util::ToLittleEndian(int_value);
+  return std::string(reinterpret_cast<const char*>(&le_value), sizeof(double));
 };
-
 }  // namespace
 
 class ParquetPageIndexRoundTripTest : public ::testing::Test {
diff --git a/cpp/src/parquet/column_writer_test.cc b/cpp/src/parquet/column_writer_test.cc
@@ -28,6 +28,7 @@
 #include "arrow/util/bit_util.h"
 #include "arrow/util/bitmap_builders.h"
 #include "arrow/util/config.h"
+#include "arrow/util/endian.h"
 #include "arrow/util/key_value_metadata.h"
 
 #include "parquet/column_page.h"
@@ -1260,11 +1261,16 @@ void EncodeLevels(Encoding::type encoding, int16_t max_level, int num_levels,
   // encode levels
   if (encoding == Encoding::RLE) {
     // leave space to write the rle length value
+#if ARROW_LITTLE_ENDIAN
     encoder.Init(encoding, max_level, num_levels, bytes.data() + sizeof(int32_t),
                  static_cast<int>(bytes.size()));
-
+#else
+    encoder.Init(encoding, max_level, num_levels, bytes.data() + sizeof(int32_t),
+                 static_cast<int>(bytes.size() - sizeof(int32_t)));
+#endif
     levels_count = encoder.Encode(num_levels, input_levels);
-    (reinterpret_cast<int32_t*>(bytes.data()))[0] = encoder.len();
+    *reinterpret_cast<int32_t*>(bytes.data()) =
+        ::arrow::bit_util::ToLittleEndian(encoder.len());
   } else {
     encoder.Init(encoding, max_level, num_levels, bytes.data(),
                  static_cast<int>(bytes.size()));
diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc
@@ -1559,15 +1559,16 @@ void TestByteStreamSplitEncoding<Type>::CheckDecode() {
     // INT32, FLOAT
     const std::vector<uint8_t> data{0x11, 0x22, 0x33, 0x44, 0x55, 0x66,
                                     0x77, 0x88, 0x99, 0xAA, 0xBB, 0xCC};
-    const auto expected_output =
-        ToLittleEndian<uint32_t>({0xAA774411U, 0xBB885522U, 0xCC996633U});
+    // Values are in native byte order after decoding
+    const std::vector<uint32_t> expected_output{0xAA774411U, 0xBB885522U, 0xCC996633U};
     CheckDecode(span{data}, span{expected_output});
   } else {
     // INT64, DOUBLE
     const std::vector<uint8_t> data{0xDE, 0xC0, 0x37, 0x13, 0x11, 0x22, 0x33, 0x44,
                                     0xAA, 0xBB, 0xCC, 0xDD, 0x55, 0x66, 0x77, 0x88};
-    const auto expected_output =
-        ToLittleEndian<uint64_t>({0x7755CCAA331137DEULL, 0x8866DDBB442213C0ULL});
+    // Values are in native byte order after decoding
+    const std::vector<uint64_t> expected_output{0x7755CCAA331137DEULL,
+                                                0x8866DDBB442213C0ULL};
     CheckDecode(span{data}, span{expected_output});
   }
 }
@@ -1596,14 +1597,16 @@ void TestByteStreamSplitEncoding<Type>::CheckEncode() {
     }
   } else if constexpr (sizeof(c_type) == 4) {
     // INT32, FLOAT
-    const auto data = ToLittleEndian<uint32_t>({0xaabbccddUL, 0x11223344UL});
+    // Values should be in native byte order before encoding
+    const std::vector<uint32_t> data{0xaabbccddUL, 0x11223344UL};
     const std::vector<uint8_t> expected_output{0xdd, 0x44, 0xcc, 0x33,
                                                0xbb, 0x22, 0xaa, 0x11};
     CheckEncode(span{data}, span{expected_output});
   } else {
     // INT64, DOUBLE
-    const auto data = ToLittleEndian<uint64_t>(
-        {0x4142434445464748ULL, 0x0102030405060708ULL, 0xb1b2b3b4b5b6b7b8ULL});
+    // Values should be in native byte order before encoding
+    const std::vector<uint64_t> data{0x4142434445464748ULL, 0x0102030405060708ULL,
+                                     0xb1b2b3b4b5b6b7b8ULL};
     const std::vector<uint8_t> expected_output{
         0x48, 0x08, 0xb8, 0x47, 0x07, 0xb7, 0x46, 0x06, 0xb6, 0x45, 0x05, 0xb5,
         0x44, 0x04, 0xb4, 0x43, 0x03, 0xb3, 0x42, 0x02, 0xb2, 0x41, 0x01, 0xb1,
diff --git a/cpp/src/parquet/level_conversion_test.cc b/cpp/src/parquet/level_conversion_test.cc
@@ -95,7 +95,7 @@ TEST(TestColumnReader, DefLevelsToBitmapPowerOfTwo) {
   ASSERT_EQ(0, io.null_count);
 }
 
-#if defined(ARROW_LITTLE_ENDIAN)
+#if ARROW_LITTLE_ENDIAN
 TEST(GreaterThanBitmap, GeneratesExpectedBitmasks) {
   std::vector<int16_t> levels = {0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
                                  0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
diff --git a/cpp/src/parquet/metadata_test.cc b/cpp/src/parquet/metadata_test.cc
@@ -19,6 +19,7 @@
 
 #include <gtest/gtest.h>
 
+#include "arrow/util/endian.h"
 #include "arrow/util/key_value_metadata.h"
 #include "parquet/file_reader.h"
 #include "parquet/file_writer.h"
@@ -104,16 +105,20 @@ TEST(Metadata, TestBuildAccess) {
   int64_t nrows = 1000;
   int32_t int_min = 100, int_max = 200;
   EncodedStatistics stats_int;
+  int32_t int_min_le = ::arrow::bit_util::ToLittleEndian(int_min);
+  int32_t int_max_le = ::arrow::bit_util::ToLittleEndian(int_max);
   stats_int.set_null_count(0)
       .set_distinct_count(nrows)
-      .set_min(std::string(reinterpret_cast<const char*>(&int_min), 4))
-      .set_max(std::string(reinterpret_cast<const char*>(&int_max), 4));
+      .set_min(std::string(reinterpret_cast<const char*>(&int_min_le), 4))
+      .set_max(std::string(reinterpret_cast<const char*>(&int_max_le), 4));
   EncodedStatistics stats_float;
   float float_min = 100.100f, float_max = 200.200f;
+  float float_min_le = ::arrow::bit_util::ToLittleEndian(float_min);
+  float float_max_le = ::arrow::bit_util::ToLittleEndian(float_max);
   stats_float.set_null_count(0)
       .set_distinct_count(nrows)
-      .set_min(std::string(reinterpret_cast<const char*>(&float_min), 4))
-      .set_max(std::string(reinterpret_cast<const char*>(&float_max), 4));
+      .set_min(std::string(reinterpret_cast<const char*>(&float_min_le), 4))
+      .set_max(std::string(reinterpret_cast<const char*>(&float_max_le), 4));
 
   // Generate the metadata
   auto f_accessor = GenerateTableMetaData(schema, props, nrows, stats_int, stats_float);
diff --git a/cpp/src/parquet/reader_test.cc b/cpp/src/parquet/reader_test.cc
@@ -43,6 +43,7 @@
 #include "arrow/testing/random.h"
 #include "arrow/util/checked_cast.h"
 #include "arrow/util/config.h"
+#include "arrow/util/endian.h"
 #include "arrow/util/range.h"
 
 #include "parquet/column_reader.h"
@@ -1734,7 +1735,9 @@ TEST(TestByteStreamSplit, FloatIntegrationFile) {
   {
     auto values =
         ReadColumnValues<FloatType>(file.get(), /*row_group=*/0, /*column=*/0, kNumRows);
+#  if ARROW_LITTLE_ENDIAN
     ASSERT_EQ(values[0], 1.7640524f);
+#  endif
     ASSERT_EQ(values[1], 0.4001572f);
     ASSERT_EQ(values[kNumRows - 2], -0.39944902f);
     ASSERT_EQ(values[kNumRows - 1], 0.37005588f);
diff --git a/cpp/src/parquet/statistics_test.cc b/cpp/src/parquet/statistics_test.cc
@@ -1037,17 +1037,25 @@ void TestStatisticsSortOrder<Int32Type>::SetValues() {
     values_[i] = i - 5;  // {-5, -4, -3, -2, -1, 0, 1, 2, 3, 4};
   }
 
-  // Write UINT32 min/max values
-  stats_[0]
-      .set_min(std::string(reinterpret_cast<const char*>(&values_[5]), sizeof(c_type)))
-      .set_max(std::string(reinterpret_cast<const char*>(&values_[4]), sizeof(c_type)));
+  // Write UINT32 min/max values (converted to little-endian)
+  {
+    c_type le_min = ::arrow::bit_util::ToLittleEndian(values_[5]);
+    c_type le_max = ::arrow::bit_util::ToLittleEndian(values_[4]);
+    stats_[0]
+        .set_min(std::string(reinterpret_cast<const char*>(&le_min), sizeof(c_type)))
+        .set_max(std::string(reinterpret_cast<const char*>(&le_max), sizeof(c_type)));
+  }
   stats_[0].is_max_value_exact = true;
   stats_[0].is_min_value_exact = true;
 
-  // Write INT32 min/max values
-  stats_[1]
-      .set_min(std::string(reinterpret_cast<const char*>(&values_[0]), sizeof(c_type)))
-      .set_max(std::string(reinterpret_cast<const char*>(&values_[9]), sizeof(c_type)));
+  // Write INT32 min/max values (converted to little-endian)
+  {
+    c_type le_min = ::arrow::bit_util::ToLittleEndian(values_[0]);
+    c_type le_max = ::arrow::bit_util::ToLittleEndian(values_[9]);
+    stats_[1]
+        .set_min(std::string(reinterpret_cast<const char*>(&le_min), sizeof(c_type)))
+        .set_max(std::string(reinterpret_cast<const char*>(&le_max), sizeof(c_type)));
+  }
   stats_[1].is_max_value_exact = true;
   stats_[1].is_min_value_exact = true;
 }
@@ -1069,17 +1077,25 @@ void TestStatisticsSortOrder<Int64Type>::SetValues() {
     values_[i] = i - 5;  // {-5, -4, -3, -2, -1, 0, 1, 2, 3, 4};
   }
 
-  // Write UINT64 min/max values
-  stats_[0]
-      .set_min(std::string(reinterpret_cast<const char*>(&values_[5]), sizeof(c_type)))
-      .set_max(std::string(reinterpret_cast<const char*>(&values_[4]), sizeof(c_type)));
+  // Write UINT64 min/max values (converted to little-endian)
+  {
+    c_type le_min = ::arrow::bit_util::ToLittleEndian(values_[5]);
+    c_type le_max = ::arrow::bit_util::ToLittleEndian(values_[4]);
+    stats_[0]
+        .set_min(std::string(reinterpret_cast<const char*>(&le_min), sizeof(c_type)))
+        .set_max(std::string(reinterpret_cast<const char*>(&le_max), sizeof(c_type)));
+  }
   stats_[0].is_max_value_exact = true;
   stats_[0].is_min_value_exact = true;
 
-  // Write INT64 min/max values
-  stats_[1]
-      .set_min(std::string(reinterpret_cast<const char*>(&values_[0]), sizeof(c_type)))
-      .set_max(std::string(reinterpret_cast<const char*>(&values_[9]), sizeof(c_type)));
+  // Write INT64 min/max values (converted to little-endian)
+  {
+    c_type le_min = ::arrow::bit_util::ToLittleEndian(values_[0]);
+    c_type le_max = ::arrow::bit_util::ToLittleEndian(values_[9]);
+    stats_[1]
+        .set_min(std::string(reinterpret_cast<const char*>(&le_min), sizeof(c_type)))
+        .set_max(std::string(reinterpret_cast<const char*>(&le_max), sizeof(c_type)));
+  }
   stats_[1].is_max_value_exact = true;
   stats_[1].is_min_value_exact = true;
 }
@@ -1092,10 +1108,14 @@ void TestStatisticsSortOrder<FloatType>::SetValues() {
                  5;  // {-5.0, -4.0, -3.0, -2.0, -1.0, 0.0, 1.0, 2.0, 3.0, 4.0};
   }
 
-  // Write Float min/max values
-  stats_[0]
-      .set_min(std::string(reinterpret_cast<const char*>(&values_[0]), sizeof(c_type)))
-      .set_max(std::string(reinterpret_cast<const char*>(&values_[9]), sizeof(c_type)));
+  // Write Float min/max values (converted to little-endian)
+  {
+    c_type le_min = ::arrow::bit_util::ToLittleEndian(values_[0]);
+    c_type le_max = ::arrow::bit_util::ToLittleEndian(values_[9]);
+    stats_[0]
+        .set_min(std::string(reinterpret_cast<const char*>(&le_min), sizeof(c_type)))
+        .set_max(std::string(reinterpret_cast<const char*>(&le_max), sizeof(c_type)));
+  }
   stats_[0].is_max_value_exact = true;
   stats_[0].is_min_value_exact = true;
 }
@@ -1108,10 +1128,14 @@ void TestStatisticsSortOrder<DoubleType>::SetValues() {
                  5;  // {-5.0, -4.0, -3.0, -2.0, -1.0, 0.0, 1.0, 2.0, 3.0, 4.0};
   }
 
-  // Write Double min/max values
-  stats_[0]
-      .set_min(std::string(reinterpret_cast<const char*>(&values_[0]), sizeof(c_type)))
-      .set_max(std::string(reinterpret_cast<const char*>(&values_[9]), sizeof(c_type)));
+  // Write Double min/max values (converted to little-endian)
+  {
+    c_type le_min = ::arrow::bit_util::ToLittleEndian(values_[0]);
+    c_type le_max = ::arrow::bit_util::ToLittleEndian(values_[9]);
+    stats_[0]
+        .set_min(std::string(reinterpret_cast<const char*>(&le_min), sizeof(c_type)))
+        .set_max(std::string(reinterpret_cast<const char*>(&le_max), sizeof(c_type)));
+  }
   stats_[0].is_max_value_exact = true;
   stats_[0].is_min_value_exact = true;
 }
@@ -1283,7 +1307,9 @@ TEST_F(TestStatisticsSortOrderFLBA, UnknownSortOrder) {
 
 template <typename T>
 static std::string EncodeValue(const T& val) {
-  return std::string(reinterpret_cast<const char*>(&val), sizeof(val));
+  // Parquet encoded values should be in little-endian format
+  T le_val = ::arrow::bit_util::ToLittleEndian(val);
+  return std::string(reinterpret_cast<const char*>(&le_val), sizeof(le_val));
 }
 static std::string EncodeValue(const FLBA& val, int length = sizeof(uint16_t)) {
   return std::string(reinterpret_cast<const char*>(val.ptr), length);
diff --git a/cpp/src/parquet/types_test.cc b/cpp/src/parquet/types_test.cc
@@ -77,18 +77,17 @@ TEST(TypePrinter, StatisticsTypes) {
   std::string smax;
   int32_t int_min = 1024;
   int32_t int_max = 2048;
-  smin = std::string(reinterpret_cast<char*>(&int_min), sizeof(int32_t));
-  smax = std::string(reinterpret_cast<char*>(&int_max), sizeof(int32_t));
-  ASSERT_STREQ("1024", FormatStatValue(Type::INT32, smin).c_str());
-  ASSERT_STREQ("2048", FormatStatValue(Type::INT32, smax).c_str());
+  int32_t int_min_le = ::arrow::bit_util::ToLittleEndian(int_min);
+  int32_t int_max_le = ::arrow::bit_util::ToLittleEndian(int_max);
+  smin = std::string(reinterpret_cast<char*>(&int_min_le), sizeof(int32_t));
+  smax = std::string(reinterpret_cast<char*>(&int_max_le), sizeof(int32_t));
 
   int64_t int64_min = 10240000000000;
   int64_t int64_max = 20480000000000;
-  smin = std::string(reinterpret_cast<char*>(&int64_min), sizeof(int64_t));
-  smax = std::string(reinterpret_cast<char*>(&int64_max), sizeof(int64_t));
-  ASSERT_STREQ("10240000000000", FormatStatValue(Type::INT64, smin).c_str());
-  ASSERT_STREQ("20480000000000", FormatStatValue(Type::INT64, smax).c_str());
-
+  int64_t int64_min_le = ::arrow::bit_util::ToLittleEndian(int64_min);
+  int64_t int64_max_le = ::arrow::bit_util::ToLittleEndian(int64_max);
+  smin = std::string(reinterpret_cast<char*>(&int64_min_le), sizeof(int64_t));
+  smax = std::string(reinterpret_cast<char*>(&int64_max_le), sizeof(int64_t));
   float float_min = 1.024f;
   float float_max = 2.048f;
   smin = std::string(reinterpret_cast<char*>(&float_min), sizeof(float));
@@ -103,13 +102,13 @@ TEST(TypePrinter, StatisticsTypes) {
   ASSERT_STREQ("1.0245", FormatStatValue(Type::DOUBLE, smin).c_str());
   ASSERT_STREQ("2.0489", FormatStatValue(Type::DOUBLE, smax).c_str());
 
-#if ARROW_LITTLE_ENDIAN
-  Int96 Int96_min = {{1024, 2048, 4096}};
-  Int96 Int96_max = {{2048, 4096, 8192}};
-#else
-  Int96 Int96_min = {{2048, 1024, 4096}};
-  Int96 Int96_max = {{4096, 2048, 8192}};
-#endif
+  // INT96 values are stored in little-endian format in Parquet
+  Int96 Int96_min = {{::arrow::bit_util::ToLittleEndian(1024u),
+                      ::arrow::bit_util::ToLittleEndian(2048u),
+                      ::arrow::bit_util::ToLittleEndian(4096u)}};
+  Int96 Int96_max = {{::arrow::bit_util::ToLittleEndian(2048u),
+                      ::arrow::bit_util::ToLittleEndian(4096u),
+                      ::arrow::bit_util::ToLittleEndian(8192u)}};
   smin = std::string(reinterpret_cast<char*>(&Int96_min), sizeof(Int96));
   smax = std::string(reinterpret_cast<char*>(&Int96_max), sizeof(Int96));
   ASSERT_STREQ("1024 2048 4096", FormatStatValue(Type::INT96, smin).c_str());
@@ -181,14 +180,10 @@ TEST(TypePrinter, StatisticsTypes) {
 
 TEST(TestInt96Timestamp, Decoding) {
   auto check = [](int32_t julian_day, uint64_t nanoseconds) {
-#if ARROW_LITTLE_ENDIAN
+    // With the endian-independent implementation, the order is the same on all platforms
     Int96 i96{static_cast<uint32_t>(nanoseconds),
               static_cast<uint32_t>(nanoseconds >> 32),
               static_cast<uint32_t>(julian_day)};
-#else
-    Int96 i96{static_cast<uint32_t>(nanoseconds >> 32),
-              static_cast<uint32_t>(nanoseconds), static_cast<uint32_t>(julian_day)};
-#endif
     // Official formula according to https://github.com/apache/parquet-format/pull/49
     int64_t expected =
         (julian_day - 2440588) * (86400LL * 1000 * 1000 * 1000) + nanoseconds;
diff --git a/testing b/testing
@@ -1 +1 @@
-Subproject commit 047d3914590d8379b5da19c4f5b0d1869a8ecdb3
+Subproject commit 9a56b4747da5539e41b25a4d4827d0ed801ecd30

Original file line number	Diff line number	Diff line change
`@@ -55,8 +55,14 @@ void ReferenceByteStreamSplitEncode(const uint8_t* src, int width,`
`55`	`55`	`const int64_t num_values, uint8_t* dest) {`
`56`	`56`	`for (int64_t i = 0; i < num_values; ++i) {`
`57`	`57`	`for (int stream = 0; stream < width; ++stream) {`
`58`		`- dest[stream * num_values + i] = *src++;`
	`58`	`+#if ARROW_LITTLE_ENDIAN`
	`59`	`+ dest[stream * num_values + i] = src[stream];`
	`60`	`+#else`
	`61`	`+ // On big-endian, reverse byte order: stream 0 gets LSB (at highest address)`
	`62`	`+ dest[stream * num_values + i] = src[width - 1 - stream];`
	`63`	`+#endif`
`59`	`64`	`}`
	`65`	`+ src += width;`
`60`	`66`	`}`
`61`	`67`	`}`
`62`	`68`
Original file line number	Diff line number	Diff line change
`@@ -95,7 +95,7 @@ TEST(TestColumnReader, DefLevelsToBitmapPowerOfTwo) {`
`95`	`95`	`ASSERT_EQ(0, io.null_count);`
`96`	`96`	`}`
`97`	`97`
`98`		`-#if defined(ARROW_LITTLE_ENDIAN)`
	`98`	`+#if ARROW_LITTLE_ENDIAN`
`99`	`99`	`TEST(GreaterThanBitmap, GeneratesExpectedBitmasks) {`
`100`	`100`	`std::vector<int16_t> levels = {0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,`
`101`	`101`	`0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,`