Skip to content

Commit 2c6ca2d

Browse files
committed
[C++] Test filter operations with random null probabilities
1 parent 727106f commit 2c6ca2d

1 file changed

Lines changed: 64 additions & 21 deletions

File tree

cpp/src/arrow/compute/kernels/vector_selection_test.cc

Lines changed: 64 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -484,27 +484,64 @@ Comparator<CType>* GetComparator(CompareOperator op) {
484484
}
485485

486486
template <typename T, typename Fn, typename CType = typename TypeTraits<T>::CType>
487-
std::shared_ptr<Array> CompareAndFilter(const CType* data, int64_t length, Fn&& fn) {
487+
std::shared_ptr<Array> CompareAndFilter(const std::shared_ptr<Array>& array, Fn&& fn) {
488+
using ArrayType = typename TypeTraits<T>::ArrayType;
489+
auto typed_array = checked_pointer_cast<ArrayType>(array);
490+
488491
std::vector<CType> filtered;
489-
filtered.reserve(length);
490-
std::copy_if(data, data + length, std::back_inserter(filtered), std::forward<Fn>(fn));
492+
filtered.reserve(array->length());
493+
494+
for (int64_t i = 0; i < array->length(); ++i) {
495+
if (array->IsNull(i)) {
496+
// Nulls are filtered out: comparing with null yields null, and Filter drops null selection slots by default
497+
continue;
498+
}
499+
CType value = typed_array->Value(i);
500+
if (fn(value)) {
501+
filtered.push_back(value);
502+
}
503+
}
504+
491505
std::shared_ptr<Array> filtered_array;
492506
ArrayFromVector<T, CType>(filtered, &filtered_array);
493507
return filtered_array;
494508
}
495509

496510
template <typename T, typename CType = typename TypeTraits<T>::CType>
497-
std::shared_ptr<Array> CompareAndFilter(const CType* data, int64_t length, CType val,
511+
std::shared_ptr<Array> CompareAndFilter(const std::shared_ptr<Array>& array, CType val,
498512
CompareOperator op) {
499513
auto cmp = GetComparator<CType>(op);
500-
return CompareAndFilter<T>(data, length, [&](CType e) { return cmp(e, val); });
514+
return CompareAndFilter<T>(array, [&](CType e) { return cmp(e, val); });
501515
}
502516

503-
template <typename T, typename CType = typename TypeTraits<T>::CType>
504-
std::shared_ptr<Array> CompareAndFilter(const CType* data, int64_t length,
505-
const CType* other, CompareOperator op) {
517+
template <typename T>
518+
std::shared_ptr<Array> CompareAndFilter(const std::shared_ptr<Array>& lhs,
519+
const std::shared_ptr<Array>& rhs,
520+
CompareOperator op) {
521+
using ArrayType = typename TypeTraits<T>::ArrayType;
522+
using CType = typename TypeTraits<T>::CType;
523+
auto lhs_typed = checked_pointer_cast<ArrayType>(lhs);
524+
auto rhs_typed = checked_pointer_cast<ArrayType>(rhs);
506525
auto cmp = GetComparator<CType>(op);
507-
return CompareAndFilter<T>(data, length, [&](CType e) { return cmp(e, *other++); });
526+
527+
std::vector<CType> filtered;
528+
filtered.reserve(lhs->length());
529+
530+
for (int64_t i = 0; i < lhs->length(); ++i) {
531+
// Skip if either element is null: the comparison yields null there, and Filter drops null selection slots by default
532+
if (lhs->IsNull(i) || rhs->IsNull(i)) {
533+
continue;
534+
}
535+
CType lhs_value = lhs_typed->Value(i);
536+
CType rhs_value = rhs_typed->Value(i);
537+
if (cmp(lhs_value, rhs_value)) {
538+
filtered.push_back(lhs_value);
539+
}
540+
}
541+
542+
std::shared_ptr<Array> filtered_array;
543+
ArrayFromVector<T, CType>(filtered, &filtered_array);
544+
return filtered_array;
508545
}
509546

510547
TYPED_TEST(TestFilterKernelWithNumeric, CompareScalarAndFilterRandomNumeric) {
@@ -513,11 +550,13 @@ TYPED_TEST(TestFilterKernelWithNumeric, CompareScalarAndFilterRandomNumeric) {
513550
using CType = typename TypeTraits<TypeParam>::CType;
514551

515552
auto rand = random::RandomArrayGenerator(kRandomSeed);
553+
std::default_random_engine gen(kRandomSeed);
554+
::arrow::random::uniform_real_distribution<double> null_dist(0.0, 1.0);
516555
for (size_t i = 3; i < 10; i++) {
517556
const int64_t length = static_cast<int64_t>(1ULL << i);
518-
// TODO(bkietz) rewrite with some nulls
519-
auto array =
520-
checked_pointer_cast<ArrayType>(rand.Numeric<TypeParam>(length, 0, 100, 0));
557+
double null_probability = null_dist(gen);
558+
auto array = checked_pointer_cast<ArrayType>(
559+
rand.Numeric<TypeParam>(length, 0, 100, null_probability));
521560
CType c_fifty = 50;
522561
auto fifty = std::make_shared<ScalarType>(c_fifty);
523562
for (auto op : {EQUAL, NOT_EQUAL, GREATER, LESS_EQUAL}) {
@@ -527,8 +566,7 @@ TYPED_TEST(TestFilterKernelWithNumeric, CompareScalarAndFilterRandomNumeric) {
527566
ASSERT_OK_AND_ASSIGN(Datum filtered, Filter(array, selection));
528567
auto filtered_array = filtered.make_array();
529568
ValidateOutput(*filtered_array);
530-
auto expected =
531-
CompareAndFilter<TypeParam>(array->raw_values(), array->length(), c_fifty, op);
569+
auto expected = CompareAndFilter<TypeParam>(array, c_fifty, op);
532570
ASSERT_ARRAYS_EQUAL(*filtered_array, *expected);
533571
}
534572
}
@@ -538,20 +576,23 @@ TYPED_TEST(TestFilterKernelWithNumeric, CompareArrayAndFilterRandomNumeric) {
538576
using ArrayType = typename TypeTraits<TypeParam>::ArrayType;
539577

540578
auto rand = random::RandomArrayGenerator(kRandomSeed);
579+
std::default_random_engine gen(kRandomSeed);
580+
::arrow::random::uniform_real_distribution<double> null_dist(0.0, 1.0);
541581
for (size_t i = 3; i < 10; i++) {
542582
const int64_t length = static_cast<int64_t>(1ULL << i);
583+
double null_probability_lhs = null_dist(gen);
584+
double null_probability_rhs = null_dist(gen);
543585
auto lhs = checked_pointer_cast<ArrayType>(
544-
rand.Numeric<TypeParam>(length, 0, 100, /*null_probability=*/0.0));
586+
rand.Numeric<TypeParam>(length, 0, 100, null_probability_lhs));
545587
auto rhs = checked_pointer_cast<ArrayType>(
546-
rand.Numeric<TypeParam>(length, 0, 100, /*null_probability=*/0.0));
588+
rand.Numeric<TypeParam>(length, 0, 100, null_probability_rhs));
547589
for (auto op : {EQUAL, NOT_EQUAL, GREATER, LESS_EQUAL}) {
548590
ASSERT_OK_AND_ASSIGN(Datum selection,
549591
CallFunction(CompareOperatorToFunctionName(op), {lhs, rhs}));
550592
ASSERT_OK_AND_ASSIGN(Datum filtered, Filter(lhs, selection));
551593
auto filtered_array = filtered.make_array();
552594
ValidateOutput(*filtered_array);
553-
auto expected = CompareAndFilter<TypeParam>(lhs->raw_values(), lhs->length(),
554-
rhs->raw_values(), op);
595+
auto expected = CompareAndFilter<TypeParam>(lhs, rhs, op);
555596
ASSERT_ARRAYS_EQUAL(*filtered_array, *expected);
556597
}
557598
}
@@ -563,10 +604,13 @@ TYPED_TEST(TestFilterKernelWithNumeric, ScalarInRangeAndFilterRandomNumeric) {
563604
using CType = typename TypeTraits<TypeParam>::CType;
564605

565606
auto rand = random::RandomArrayGenerator(kRandomSeed);
607+
std::default_random_engine gen(kRandomSeed);
608+
::arrow::random::uniform_real_distribution<double> null_dist(0.0, 1.0);
566609
for (size_t i = 3; i < 10; i++) {
567610
const int64_t length = static_cast<int64_t>(1ULL << i);
611+
double null_probability = null_dist(gen);
568612
auto array = checked_pointer_cast<ArrayType>(
569-
rand.Numeric<TypeParam>(length, 0, 100, /*null_probability=*/0.0));
613+
rand.Numeric<TypeParam>(length, 0, 100, null_probability));
570614
CType c_fifty = 50, c_hundred = 100;
571615
auto fifty = std::make_shared<ScalarType>(c_fifty);
572616
auto hundred = std::make_shared<ScalarType>(c_hundred);
@@ -579,8 +623,7 @@ TYPED_TEST(TestFilterKernelWithNumeric, ScalarInRangeAndFilterRandomNumeric) {
579623
auto filtered_array = filtered.make_array();
580624
ValidateOutput(*filtered_array);
581625
auto expected = CompareAndFilter<TypeParam>(
582-
array->raw_values(), array->length(),
583-
[&](CType e) { return (e > c_fifty) && (e < c_hundred); });
626+
array, [&](CType e) { return (e > c_fifty) && (e < c_hundred); });
584627
ASSERT_ARRAYS_EQUAL(*filtered_array, *expected);
585628
}
586629
}

0 commit comments

Comments
 (0)