diff --git a/cpp/src/arrow/compute/expression.cc b/cpp/src/arrow/compute/expression.cc
index 2563674a59c..9ac738aab75 100644
--- a/cpp/src/arrow/compute/expression.cc
+++ b/cpp/src/arrow/compute/expression.cc
@@ -918,6 +918,83 @@ std::vector<Expression> GuaranteeConjunctionMembers(
   return FlattenedAssociativeChain(guaranteed_true_predicate).fringe;
 }
 
+/// \brief Expand guarantees of the form or_(and_(...), is_null(x)) into multiple
+/// guarantees of the form or_(member, is_null(x)).
+///
+/// Parquet statistics for nullable columns generate guarantees like:
+///   or_(and_(field >= min, field <= max), is_null(field))
+///
+/// By expanding this into:
+///   [or_(field >= min, is_null(field)), or_(field <= max, is_null(field))]
+///
+/// each inequality can be extracted by Inequality::ExtractOne and used for
+/// simplification of comparisons, is_in, etc.
+///
+/// This expansion is logically valid because OR distributes over AND:
+///   (A AND B) OR C ≡ (A OR C) AND (B OR C)
+///
+/// so treating the expanded forms as separate guarantees (implicitly ANDed)
+/// is equivalent to the original.
+///
+/// The reversed form or_(is_null(x), and_(...)) is recognized as well; expanded
+/// members are always emitted as or_(member, is_null(x)). Members matching neither
+/// form are kept unchanged, and the relative order of members is preserved.
+///
+/// \param[in,out] conjunction_members members of the guarantee conjunction, replaced
+/// in place by the expanded list; must not be null
+///
+/// See: https://github.com/apache/arrow/issues/36283
+void ExpandNullableRangeGuarantees(std::vector<Expression>* conjunction_members) {
+  // Whether `expr` is a call to the function called `name`.
+  auto is_call_to = [](const Expression& expr, const char* name) {
+    const auto* call = expr.call();
+    return call != nullptr && call->function_name == name;
+  };
+
+  std::vector<Expression> expanded;
+  // Every input member yields at least one output member.
+  expanded.reserve(conjunction_members->size());
+
+  for (auto& member : *conjunction_members) {
+    // Operands of a matching or_kleene; both stay null if `member` does not match.
+    const Expression* and_expr = nullptr;
+    const Expression* null_expr = nullptr;
+
+    const auto* or_call = member.call();
+    if (or_call != nullptr && or_call->function_name == "or_kleene" &&
+        or_call->arguments.size() == 2) {
+      const Expression& lhs = or_call->arguments[0];
+      const Expression& rhs = or_call->arguments[1];
+      if (is_call_to(lhs, "and_kleene") && is_call_to(rhs, "is_null")) {
+        // or_(and_(...), is_null(x))
+        and_expr = &lhs;
+        null_expr = &rhs;
+      } else if (is_call_to(lhs, "is_null") && is_call_to(rhs, "and_kleene")) {
+        // or_(is_null(x), and_(...))
+        and_expr = &rhs;
+        null_expr = &lhs;
+      }
+    }
+
+    if (and_expr == nullptr) {
+      // Not a nullable range guarantee. The input vector is overwritten below, so
+      // the member can be moved instead of copied.
+      expanded.push_back(std::move(member));
+      continue;
+    }
+
+    // Emit or_(conjunct, is_null(x)) for each conjunct of the and_kleene. The
+    // flattened chain is a named local so that iterating its fringe does not rely
+    // on lifetime extension of a temporary.
+    const auto conjunction = FlattenedAssociativeChain(*and_expr);
+    for (const auto& conjunct : conjunction.fringe) {
+      expanded.push_back(or_(conjunct, *null_expr));
+    }
+  }
+
+  *conjunction_members = std::move(expanded);
+}
+
 /// \brief Extract an equality from an expression.
 ///
 /// Recognizes expressions of the form:
@@ -1436,6 +1513,11 @@ Result<Expression> SimplifyWithGuarantee(Expression expr,
 
   RETURN_NOT_OK(ExtractKnownFieldValues(&conjunction_members, &known_values));
 
+  // Expand or_(and_(...), is_null(x)) guarantees into one or_(ineq, is_null(x)) per
+  // conjunct. This enables pushdown for nullable columns with range statistics.
+  // See: https://github.com/apache/arrow/issues/36283
+  ExpandNullableRangeGuarantees(&conjunction_members);
+
   ARROW_ASSIGN_OR_RAISE(expr,
                         ReplaceFieldsWithKnownValues(known_values, std::move(expr)));
 
diff --git a/cpp/src/arrow/compute/expression_test.cc b/cpp/src/arrow/compute/expression_test.cc
index 5e1f3c093ee..2ce6a07b94a 100644
--- a/cpp/src/arrow/compute/expression_test.cc
+++ b/cpp/src/arrow/compute/expression_test.cc
@@ -2003,6 +2003,117 @@ TEST(Expression, SimplifyIsIn) {
       .ExpectUnchanged();
 }
 
+// Guarantees of the form or_(and_(x >= min, x <= max), is_null(x)) are generated
+// from parquet statistics for nullable columns. Check that comparisons against a
+// literal are simplified under such a guarantee.
+// See: https://github.com/apache/arrow/issues/36283
+TEST(Expression, SimplifyWithNullableRangeGuarantee) {
+  const Expression i32 = field_ref("i32");
+
+  // The range [100, 200], with the is_null disjunct in last position (the form
+  // parquet generates) and in first position (the reversed form).
+  const Expression in_range =
+      and_(greater_equal(i32, literal(100)), less_equal(i32, literal(200)));
+  const Expression guarantee = or_(in_range, is_null(i32));
+  const Expression reversed_guarantee = or_(is_null(i32), in_range);
+
+  // With a nullable guarantee, "definitely true" is true_unless_null(x) and
+  // "definitely false" is not_(true_unless_null(x)); the latter is equivalent to
+  // false for non-null rows and null for null rows.
+  const Expression always_true = true_unless_null(i32);
+  const Expression always_false = not_(true_unless_null(i32));
+
+  // equal: literal outside the range
+  Simplify{equal(i32, literal(50))}.WithGuarantee(guarantee).Expect(always_false);
+
+  // equal: literal inside the range (cannot simplify)
+  Simplify{equal(i32, literal(150))}.WithGuarantee(guarantee).ExpectUnchanged();
+
+  // less: max < literal (all rows satisfy)
+  Simplify{less(i32, literal(300))}.WithGuarantee(guarantee).Expect(always_true);
+
+  // less: literal <= min (no rows satisfy)
+  Simplify{less(i32, literal(50))}.WithGuarantee(guarantee).Expect(always_false);
+
+  // greater: min > literal (all rows satisfy)
+  Simplify{greater(i32, literal(50))}.WithGuarantee(guarantee).Expect(always_true);
+
+  // greater: literal >= max (no rows satisfy)
+  Simplify{greater(i32, literal(250))}.WithGuarantee(guarantee).Expect(always_false);
+
+  // less_equal: max <= literal (all rows satisfy)
+  Simplify{less_equal(i32, literal(200))}.WithGuarantee(guarantee).Expect(always_true);
+
+  // greater_equal: min >= literal (all rows satisfy)
+  Simplify{greater_equal(i32, literal(100))}
+      .WithGuarantee(guarantee)
+      .Expect(always_true);
+
+  // Reversed guarantee form: or_(is_null(x), and_(...))
+  Simplify{equal(i32, literal(50))}
+      .WithGuarantee(reversed_guarantee)
+      .Expect(always_false);
+
+  Simplify{greater(i32, literal(250))}
+      .WithGuarantee(reversed_guarantee)
+      .Expect(always_false);
+}
+
+// is_in under a nullable range guarantee: check that value set entries outside
+// the guaranteed range are dropped from the simplified expression.
+TEST(Expression, SimplifyIsInWithNullableRangeGuarantee) {
+  using NullMatching = SetLookupOptions::NullMatchingBehavior;
+
+  const Expression i32 = field_ref("i32");
+  const Expression in_range =
+      and_(greater_equal(i32, literal(100)), less_equal(i32, literal(200)));
+  // The nullable range guarantee pattern
+  const Expression guarantee = or_(in_range, is_null(i32));
+
+  // is_in(i32, <json_array as int32 value set>) with the given null matching
+  auto is_in_i32 = [&i32](std::string json_array, NullMatching null_matching) {
+    SetLookupOptions options{ArrayFromJSON(int32(), json_array), null_matching};
+    return call("is_in", {i32}, options);
+  };
+
+  for (NullMatching null_matching : {SetLookupOptions::MATCH, SetLookupOptions::SKIP}) {
+    // All values outside range -> should simplify to false
+    Simplify{is_in_i32("[1,2,3]", null_matching)}
+        .WithGuarantee(guarantee)
+        .Expect(false);
+
+    // All values below range
+    Simplify{is_in_i32("[10,20,30]", null_matching)}
+        .WithGuarantee(guarantee)
+        .Expect(false);
+
+    // All values above range
+    Simplify{is_in_i32("[300,400,500]", null_matching)}
+        .WithGuarantee(guarantee)
+        .Expect(false);
+
+    // Some values inside range, some outside -> keep only values in range
+    Simplify{is_in_i32("[50,150,250]", null_matching)}
+        .WithGuarantee(guarantee)
+        .Expect(is_in_i32("[150]", null_matching));
+
+    // Values at boundaries
+    Simplify{is_in_i32("[99,100,200,201]", null_matching)}
+        .WithGuarantee(guarantee)
+        .Expect(is_in_i32("[100,200]", null_matching));
+
+    // All values inside range -> unchanged
+    Simplify{is_in_i32("[120,150,180]", null_matching)}
+        .WithGuarantee(guarantee)
+        .ExpectUnchanged();
+  }
+
+  // Reversed guarantee form: or_(is_null(x), and_(...))
+  Simplify{is_in_i32("[1,2,3]", SetLookupOptions::MATCH)}
+      .WithGuarantee(or_(is_null(i32), in_range))
+      .Expect(false);
+}
+
 TEST(Expression, SimplifyThenExecute) {
   auto filter =
       or_({equal(field_ref("f32"), literal(0)),