Skip to content

Commit b3dcb6d

Browse files
authored
GH-47657: [C++][Parquet] Check for integer overflow when coercing timestamps (#49615)
### Rationale for this change - When writing Arrow timestamps to Parquet, timestamp values are multiplied to convert units - This multiplication was performed without overflow checking which can silently write corrupt data. ### What changes are included in this PR? - Use MultiplyWithOverflowGeneric to handle/detect overflow ### Are these changes tested? - Yes ### Are there any user-facing changes? - Yes * GitHub Issue: #47657 Authored-by: Arnav Balyan <arnavbalyan1@gmail.com> Signed-off-by: Gang Wu <ustcwg@gmail.com>
1 parent caf7f5b commit b3dcb6d

2 files changed

Lines changed: 43 additions & 1 deletion

File tree

cpp/src/parquet/arrow/arrow_reader_writer_test.cc

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2178,6 +2178,41 @@ TEST(TestArrowReadWrite, ImplicitSecondToMillisecondTimestampCoercion) {
21782178
ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*tx, *to));
21792179
}
21802180

2181+
TEST(TestArrowReadWrite, TimestampCoercionOverflow) {
2182+
using ::arrow::ArrayFromVector;
2183+
using ::arrow::field;
2184+
using ::arrow::schema;
2185+
auto t_s = ::arrow::timestamp(TimeUnit::SECOND);
2186+
std::vector<int64_t> overflow_values = {9223372036854776LL};
2187+
std::vector<bool> is_valid = {true};
2188+
std::shared_ptr<Array> a_s;
2189+
ArrayFromVector<::arrow::TimestampType, int64_t>(t_s, is_valid, overflow_values, &a_s);
2190+
2191+
auto s = schema({field("timestamp", t_s)});
2192+
auto table = Table::Make(s, {a_s});
2193+
2194+
for (auto unit : {TimeUnit::SECOND, TimeUnit::MILLI, TimeUnit::MICRO, TimeUnit::NANO}) {
2195+
if (unit == TimeUnit::SECOND) {
2196+
ASSERT_RAISES(Invalid, WriteTable(*table, default_memory_pool(),
2197+
CreateOutputStream(), table->num_rows()));
2198+
} else {
2199+
auto coerce_props =
2200+
ArrowWriterProperties::Builder().coerce_timestamps(unit)->build();
2201+
ASSERT_RAISES(Invalid, WriteTable(*table, default_memory_pool(),
2202+
CreateOutputStream(), table->num_rows(),
2203+
default_writer_properties(), coerce_props));
2204+
}
2205+
}
2206+
2207+
std::vector<bool> null_valid = {false};
2208+
std::shared_ptr<Array> a_null;
2209+
ArrayFromVector<::arrow::TimestampType, int64_t>(t_s, null_valid, overflow_values,
2210+
&a_null);
2211+
auto null_table = Table::Make(s, {a_null});
2212+
ASSERT_OK_NO_THROW(WriteTable(*null_table, default_memory_pool(), CreateOutputStream(),
2213+
null_table->num_rows()));
2214+
}
2215+
21812216
TEST(TestArrowReadWrite, ParquetVersionTimestampDifferences) {
21822217
using ::arrow::ArrayFromVector;
21832218
using ::arrow::field;

cpp/src/parquet/column_writer.cc

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
#include "arrow/util/crc32.h"
4242
#include "arrow/util/endian.h"
4343
#include "arrow/util/float16.h"
44+
#include "arrow/util/int_util_overflow.h"
4445
#include "arrow/util/key_value_metadata.h"
4546
#include "arrow/util/logging_internal.h"
4647
#include "arrow/util/rle_encoding_internal.h"
@@ -2352,7 +2353,13 @@ struct SerializeFunctor<Int64Type, ::arrow::TimestampType> {
23522353

23532354
auto MultiplyBy = [&](const int64_t factor) {
23542355
for (int64_t i = 0; i < array.length(); i++) {
2355-
out[i] = values[i] * factor;
2356+
if (array.IsValid(i) &&
2357+
ARROW_PREDICT_FALSE(::arrow::internal::MultiplyWithOverflowGeneric(
2358+
values[i], factor, &out[i]))) {
2359+
return Status::Invalid("Integer overflow when casting timestamp value ",
2360+
values[i], " from ", source_type.ToString(), " to ",
2361+
target_type->ToString());
2362+
}
23562363
}
23572364
return Status::OK();
23582365
};

0 commit comments

Comments
 (0)