Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 68 additions & 0 deletions cpp/src/arrow/compute/expression.cc
Original file line number Diff line number Diff line change
Expand Up @@ -918,6 +918,69 @@ std::vector<Expression> GuaranteeConjunctionMembers(
return FlattenedAssociativeChain(guaranteed_true_predicate).fringe;
}

/// \brief Expand guarantees of the form or_(and_(...), is_null(x)) into multiple
/// guarantees of the form or_(member, is_null(x)).
///
/// Parquet statistics for nullable columns generate guarantees like:
/// or_(and_(field >= min, field <= max), is_null(field))
///
/// By expanding this into:
/// [or_(field >= min, is_null(field)), or_(field <= max, is_null(field))]
///
/// each inequality can be extracted by Inequality::ExtractOne and used for
/// simplification of comparisons, is_in, etc.
///
/// This expansion is logically valid because:
/// (A AND B) OR C ≡ (A OR C) AND (B OR C)
///
/// So treating the expanded forms as separate guarantees (implicitly ANDed)
/// is equivalent to the original.
///
/// Also handles the reversed form: or_(is_null(x), and_(...))
///
/// See: https://github.com/apache/arrow/issues/36283
void ExpandNullableRangeGuarantees(std::vector<Expression>* conjunction_members) {
std::vector<Expression> expanded;

for (const auto& member : *conjunction_members) {
auto call = member.call();
if (!call || call->function_name != "or_kleene" || call->arguments.size() != 2) {
expanded.push_back(member);
continue;
}

const auto& lhs = call->arguments[0];
const auto& rhs = call->arguments[1];
auto lhs_call = lhs.call();
auto rhs_call = rhs.call();

// Detect pattern: or_(and_(...), is_null(x)) or or_(is_null(x), and_(...))
const Expression* and_expr = nullptr;
const Expression* null_expr = nullptr;

if (lhs_call && lhs_call->function_name == "and_kleene" &&
rhs_call && rhs_call->function_name == "is_null") {
and_expr = &lhs;
null_expr = &rhs;
} else if (lhs_call && lhs_call->function_name == "is_null" &&
rhs_call && rhs_call->function_name == "and_kleene") {
and_expr = &rhs;
null_expr = &lhs;
}

if (and_expr && null_expr) {
// Expand: for each member of the and_kleene, create or_(member, is_null(x))
for (const auto& and_member : FlattenedAssociativeChain(*and_expr).fringe) {
expanded.push_back(or_(and_member, *null_expr));
}
} else {
expanded.push_back(member);
}
}

*conjunction_members = std::move(expanded);
}

/// \brief Extract an equality from an expression.
///
/// Recognizes expressions of the form:
Expand Down Expand Up @@ -1436,6 +1499,11 @@ Result<Expression> SimplifyWithGuarantee(Expression expr,

RETURN_NOT_OK(ExtractKnownFieldValues(&conjunction_members, &known_values));

// Expand or_(and_(...), is_null(x)) guarantees into multiple or_(ineq, is_null(x))
// guarantees. This enables predicate pushdown for nullable columns with range
// statistics (the common case). See: https://github.com/apache/arrow/issues/36283
ExpandNullableRangeGuarantees(&conjunction_members);

ARROW_ASSIGN_OR_RAISE(expr,
ReplaceFieldsWithKnownValues(known_values, std::move(expr)));

Expand Down
127 changes: 127 additions & 0 deletions cpp/src/arrow/compute/expression_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2003,6 +2003,133 @@ TEST(Expression, SimplifyIsIn) {
.ExpectUnchanged();
}

// Simplification against nullable range guarantees of the form
//   or_(and_(x >= min, x <= max), is_null(x))
// which is the pattern generated by parquet statistics for nullable columns.
// See: https://github.com/apache/arrow/issues/36283
TEST(Expression, SimplifyWithNullableRangeGuarantee) {
  // Each factory below builds a fresh expression on every call.
  auto i32 = [] { return field_ref("i32"); };

  // i32 in [100, 200], or null -- is_null on the right-hand side of the or_.
  auto range_or_null = [&] {
    auto bounds =
        and_(greater_equal(i32(), literal(100)), less_equal(i32(), literal(200)));
    return or_(bounds, is_null(i32()));
  };

  // Same guarantee with the operands of or_ swapped.
  auto null_or_range = [&] {
    auto bounds =
        and_(greater_equal(i32(), literal(100)), less_equal(i32(), literal(200)));
    return or_(is_null(i32()), bounds);
  };

  // With a nullable guarantee a predicate cannot fold to a plain constant:
  // "holds for every row" is true_unless_null(x), and "holds for no row" is its
  // negation (false for non-null rows, null for null rows).
  auto always = [&] { return true_unless_null(i32()); };
  auto never = [&] { return not_(always()); };

  // equal: literal below the range
  Simplify{equal(i32(), literal(50))}.WithGuarantee(range_or_null()).Expect(never());

  // equal: literal inside the range, nothing can be concluded
  Simplify{equal(i32(), literal(150))}.WithGuarantee(range_or_null()).ExpectUnchanged();

  // less: literal above max
  Simplify{less(i32(), literal(300))}.WithGuarantee(range_or_null()).Expect(always());

  // less: literal below min
  Simplify{less(i32(), literal(50))}.WithGuarantee(range_or_null()).Expect(never());

  // greater: literal below min
  Simplify{greater(i32(), literal(50))}.WithGuarantee(range_or_null()).Expect(always());

  // greater: literal above max
  Simplify{greater(i32(), literal(250))}.WithGuarantee(range_or_null()).Expect(never());

  // less_equal: literal equal to max
  Simplify{less_equal(i32(), literal(200))}
      .WithGuarantee(range_or_null())
      .Expect(always());

  // greater_equal: literal equal to min
  Simplify{greater_equal(i32(), literal(100))}
      .WithGuarantee(range_or_null())
      .Expect(always());

  // Swapped guarantee form: or_(is_null(x), and_(...))
  Simplify{equal(i32(), literal(50))}.WithGuarantee(null_or_range()).Expect(never());

  Simplify{greater(i32(), literal(250))}.WithGuarantee(null_or_range()).Expect(never());
}

// is_in simplification against the nullable range guarantee
//   or_(and_(i32 >= 100, i32 <= 200), is_null(i32))
// and its mirrored form.
TEST(Expression, SimplifyIsInWithNullableRangeGuarantee) {
  using NullMatching = SetLookupOptions::NullMatchingBehavior;

  // Builds is_in(i32, <json_array>) with the requested null matching behavior.
  auto is_in_i32 = [](std::string json_array, NullMatching null_matching) {
    SetLookupOptions lookup{ArrayFromJSON(int32(), json_array), null_matching};
    return call("is_in", {field_ref("i32")}, lookup);
  };

  // i32 in [100, 200], or null.
  auto range_or_null = [] {
    auto bounds = and_(greater_equal(field_ref("i32"), literal(100)),
                       less_equal(field_ref("i32"), literal(200)));
    return or_(bounds, is_null(field_ref("i32")));
  };

  for (const NullMatching null_matching :
       {SetLookupOptions::MATCH, SetLookupOptions::SKIP}) {
    // Starts a check of is_in(i32, values) under the guarantee above.
    auto simplify = [&](std::string values) {
      return Simplify{is_in_i32(std::move(values), null_matching)}.WithGuarantee(
          range_or_null());
    };

    // Value set entirely outside the range -> false
    simplify("[1,2,3]").Expect(false);

    // Value set entirely below the range
    simplify("[10,20,30]").Expect(false);

    // Value set entirely above the range
    simplify("[300,400,500]").Expect(false);

    // Partially overlapping: only the in-range value survives
    simplify("[50,150,250]").Expect(is_in_i32("[150]", null_matching));

    // Boundaries are inclusive
    simplify("[99,100,200,201]").Expect(is_in_i32("[100,200]", null_matching));

    // Value set entirely inside the range -> unchanged
    simplify("[120,150,180]").ExpectUnchanged();
  }

  // Mirrored guarantee form: or_(is_null(x), and_(...))
  auto null_or_range = or_(is_null(field_ref("i32")),
                           and_(greater_equal(field_ref("i32"), literal(100)),
                                less_equal(field_ref("i32"), literal(200))));
  Simplify{is_in_i32("[1,2,3]", SetLookupOptions::MATCH)}
      .WithGuarantee(null_or_range)
      .Expect(false);
}

TEST(Expression, SimplifyThenExecute) {
auto filter =
or_({equal(field_ref("f32"), literal(0)),
Expand Down