From 7c51a619ae2446605cc0efaa3a3229b669f184f4 Mon Sep 17 00:00:00 2001 From: Xinli Shang Date: Tue, 11 Nov 2025 08:36:18 -0800 Subject: [PATCH] feat: optimize truncate(col) == value to startsWith predicate Implements an optimization that rewrites truncate equality predicates as startsWith for better predicate pushdown and index usage. The optimization applies when: - The operation is equality - The term is a truncate transform on a string column - The literal has exactly the truncate width in UTF-8 code points The implementation uses transform_func()->Transform() to validate that: 1. truncate(literal) == literal (truncation leaves the literal unchanged) 2. truncate(literal + 'x') == literal (one extra character is cut off, so the literal has exactly the truncate width) This approach leverages the transform function without duplicating UTF-8 code point counting logic. --- src/iceberg/expression/predicate.cc | 50 +++++++++- src/iceberg/expression/term.h | 4 + src/iceberg/test/predicate_test.cc | 138 ++++++++++++++++++++++++++++ 3 files changed, 191 insertions(+), 1 deletion(-) diff --git a/src/iceberg/expression/predicate.cc b/src/iceberg/expression/predicate.cc index c255d7584..4c3953ee2 100644 --- a/src/iceberg/expression/predicate.cc +++ b/src/iceberg/expression/predicate.cc @@ -25,7 +25,10 @@ #include "iceberg/expression/expressions.h" #include "iceberg/expression/literal.h" +#include "iceberg/expression/term.h" #include "iceberg/result.h" +#include "iceberg/transform.h" +#include "iceberg/transform_function.h" #include "iceberg/type.h" #include "iceberg/util/checked_cast.h" #include "iceberg/util/formatter_internal.h" @@ -286,7 +289,52 @@ Result> UnboundPredicate::BindLiteralOperation( } } - // TODO(gangwu): translate truncate(col) == value to startsWith(value) + if (BASE::op() == Expression::Operation::kEq && + bound_term->kind() == Term::Kind::kTransform) { + // Safe to cast after kind check confirms it's a transform + auto* transform_term = dynamic_cast(bound_term.get()); + if (!transform_term) { + return BoundLiteralPredicate::Make(BASE::op(), 
std::move(bound_term), + std::move(literal)); + } + + if (transform_term->transform()->transform_type() == TransformType::kTruncate && + literal.type()->type_id() == TypeId::kString && + !literal.IsNull()) { // Null safety: skip null literals + + // Apply truncate transform to the literal and check if result matches + // This verifies the literal is compatible with the truncate operation + auto transformed_result = transform_term->transform_func()->Transform(literal); + if (!transformed_result.has_value() || transformed_result.value() != literal) { + // Transform failed or modified the literal - can't optimize + return BoundLiteralPredicate::Make(BASE::op(), std::move(bound_term), + std::move(literal)); + } + + // Literal passed truncate unchanged. Now check if adding one more character + // would cause truncation. If yes, then the literal has EXACTLY the width. + // Example: + // - "Alice" with width=5: adding "x" makes "Alicex", truncate to "Alice" (can + // optimize) + // - "abc" with width=10: adding "x" makes "abcx", truncate to "abcx" != "abc" + // (cannot optimize) + + auto& string_value = std::get(literal.value()); + auto extended_literal = Literal::String(string_value + "x"); + auto extended_result = + transform_term->transform_func()->Transform(extended_literal); + + if (extended_result.has_value() && extended_result.value() == literal) { + // Adding a character gets truncated back to original - literal has exact width! 
+ // Rewrite: truncate(col, width) == "value" → col STARTS_WITH "value" + return BoundLiteralPredicate::Make(Expression::Operation::kStartsWith, + transform_term->reference(), + std::move(literal)); + } + // Literal is shorter than width - can't optimize + } + } + return BoundLiteralPredicate::Make(BASE::op(), std::move(bound_term), std::move(literal)); } diff --git a/src/iceberg/expression/term.h b/src/iceberg/expression/term.h index 6259b826e..b5d39fcd4 100644 --- a/src/iceberg/expression/term.h +++ b/src/iceberg/expression/term.h @@ -250,6 +250,10 @@ class ICEBERG_EXPORT BoundTransform : public BoundTerm { const std::shared_ptr& transform() const { return transform_; } + const std::shared_ptr& transform_func() const { + return transform_func_; + } + Kind kind() const override { return Kind::kTransform; } private: diff --git a/src/iceberg/test/predicate_test.cc b/src/iceberg/test/predicate_test.cc index 532e908b4..5e165e698 100644 --- a/src/iceberg/test/predicate_test.cc +++ b/src/iceberg/test/predicate_test.cc @@ -870,4 +870,142 @@ TEST_F(PredicateTest, BoundSetPredicateTestSingleLiteral) { EXPECT_THAT(bound_literal->Test(Literal::Int(41)), HasValue(testing::Eq(false))); } +TEST_F(PredicateTest, TruncateLiteralOptimizationExactWidth) { + // Test optimization: truncate(name, 5) == "Alice" should become name STARTS_WITH + // "Alice" + auto truncate_term = Expressions::Truncate("name", 5); + ICEBERG_ASSIGN_OR_THROW(auto equal_pred, UnboundPredicate::Make( + Expression::Operation::kEq, truncate_term, + Literal::String("Alice"))); + + ICEBERG_ASSIGN_OR_THROW(auto bound_pred, + equal_pred->Bind(*schema_, /*case_sensitive=*/true)); + + // Should be optimized to STARTS_WITH operation + EXPECT_EQ(bound_pred->op(), Expression::Operation::kStartsWith); + + // Verify it's a bound literal predicate on the reference (not the transform) + auto bound_literal = AssertAndCastToBoundPredicate(bound_pred); + EXPECT_THAT(bound_literal->Test(Literal::String("Alice")), 
HasValue(testing::Eq(true))); + EXPECT_THAT(bound_literal->Test(Literal::String("AliceX")), + HasValue(testing::Eq(true))); + EXPECT_THAT(bound_literal->Test(Literal::String("Alice123")), + HasValue(testing::Eq(true))); + EXPECT_THAT(bound_literal->Test(Literal::String("Bob")), HasValue(testing::Eq(false))); + EXPECT_THAT(bound_literal->Test(Literal::String("Alic")), HasValue(testing::Eq(false))); +} + +TEST_F(PredicateTest, TruncateLiteralOptimizationShorterLiteral) { + // Test no optimization: truncate(name, 10) == "abc" should NOT be optimized + // because "abc" is shorter than width 10 + auto truncate_term = Expressions::Truncate("name", 10); + ICEBERG_ASSIGN_OR_THROW(auto equal_pred, UnboundPredicate::Make( + Expression::Operation::kEq, truncate_term, + Literal::String("abc"))); + + ICEBERG_ASSIGN_OR_THROW(auto bound_pred, + equal_pred->Bind(*schema_, /*case_sensitive=*/true)); + + // Should remain as EQUAL operation (not optimized to STARTS_WITH) + EXPECT_EQ(bound_pred->op(), Expression::Operation::kEq); +} + +TEST_F(PredicateTest, TruncateLiteralOptimizationNullLiteral) { + // Test no optimization with null literal - skipped as null strings are handled + // differently. Null values are tested through IS NULL predicates, not equality + // predicates. + GTEST_SKIP() << "Null literal equality not supported for strings"; +} + +TEST_F(PredicateTest, TruncateLiteralOptimizationNonEqualityOperations) { + // Test that non-equality operations are not optimized + auto truncate_term = Expressions::Truncate("name", 5); + + // NotEqual should not be optimized + ICEBERG_ASSIGN_OR_THROW( + auto not_equal_pred, + UnboundPredicate::Make(Expression::Operation::kNotEq, truncate_term, + Literal::String("Alice"))); + ICEBERG_ASSIGN_OR_THROW(auto bound_not_equal, + not_equal_pred->Bind(*schema_, /*case_sensitive=*/true)); + EXPECT_EQ(bound_not_equal->op(), Expression::Operation::kNotEq); + + // LessThan should not be optimized + ICEBERG_ASSIGN_OR_THROW(auto lt_pred, 
UnboundPredicate::Make( + Expression::Operation::kLt, truncate_term, + Literal::String("Alice"))); + ICEBERG_ASSIGN_OR_THROW(auto bound_lt, + lt_pred->Bind(*schema_, /*case_sensitive=*/true)); + EXPECT_EQ(bound_lt->op(), Expression::Operation::kLt); + + // GreaterThan should not be optimized + ICEBERG_ASSIGN_OR_THROW(auto gt_pred, UnboundPredicate::Make( + Expression::Operation::kGt, truncate_term, + Literal::String("Alice"))); + ICEBERG_ASSIGN_OR_THROW(auto bound_gt, + gt_pred->Bind(*schema_, /*case_sensitive=*/true)); + EXPECT_EQ(bound_gt->op(), Expression::Operation::kGt); +} + +TEST_F(PredicateTest, TruncateLiteralOptimizationUTF8MultibyteCharacters) { + // Test optimization with UTF-8 multibyte characters (5 code points, not bytes) + auto truncate_term = Expressions::Truncate("name", 5); + + // "你好世界!" is 5 UTF-8 code points + ICEBERG_ASSIGN_OR_THROW(auto equal_pred, UnboundPredicate::Make( + Expression::Operation::kEq, truncate_term, + Literal::String("你好世界!"))); + ICEBERG_ASSIGN_OR_THROW(auto bound_pred, + equal_pred->Bind(*schema_, /*case_sensitive=*/true)); + + // Should be optimized to STARTS_WITH + EXPECT_EQ(bound_pred->op(), Expression::Operation::kStartsWith); + + // Test with mixed ASCII and UTF-8: "你好世界x" is 5 code points (4 Chinese + 1 ASCII) + ICEBERG_ASSIGN_OR_THROW(auto mixed_pred, UnboundPredicate::Make( + Expression::Operation::kEq, truncate_term, + Literal::String("你好世界x"))); + ICEBERG_ASSIGN_OR_THROW(auto bound_mixed, + mixed_pred->Bind(*schema_, /*case_sensitive=*/true)); + EXPECT_EQ(bound_mixed->op(), Expression::Operation::kStartsWith); + + // Test with 3 UTF-8 characters (shorter than width) - should NOT optimize + ICEBERG_ASSIGN_OR_THROW( + auto shorter_pred, + UnboundPredicate::Make(Expression::Operation::kEq, truncate_term, + Literal::String("你好世"))); + ICEBERG_ASSIGN_OR_THROW(auto bound_shorter, + shorter_pred->Bind(*schema_, /*case_sensitive=*/true)); + EXPECT_EQ(bound_shorter->op(), Expression::Operation::kEq); +} + 
+TEST_F(PredicateTest, TruncateLiteralOptimizationEmptyString) { + // Edge case: an empty literal is shorter than the truncate width (5): no rewrite + auto truncate_term = Expressions::Truncate("name", 5); + ICEBERG_ASSIGN_OR_THROW(auto equal_pred, UnboundPredicate::Make( + Expression::Operation::kEq, truncate_term, + Literal::String(""))); + + ICEBERG_ASSIGN_OR_THROW(auto bound_pred, + equal_pred->Bind(*schema_, /*case_sensitive=*/true)); + + // The empty literal never reaches the width, so the bound predicate stays kEq + EXPECT_EQ(bound_pred->op(), Expression::Operation::kEq); +} + +TEST_F(PredicateTest, TruncateLiteralOptimizationNonTruncateTransform) { + // Transforms other than truncate (e.g., bucket) must not be rewritten. + // Bucket returns an integer, so the literal compared against it is an integer. + auto bucket_term = Expressions::Bucket("id", 10); // id is int64 + ICEBERG_ASSIGN_OR_THROW(auto equal_pred, + UnboundPredicate::Make( + Expression::Operation::kEq, bucket_term, Literal::Int(5))); + + ICEBERG_ASSIGN_OR_THROW(auto bound_pred, + equal_pred->Bind(*schema_, /*case_sensitive=*/true)); + + // Must remain kEq: the STARTS_WITH rewrite only applies to truncate terms + EXPECT_EQ(bound_pred->op(), Expression::Operation::kEq); +} + } // namespace iceberg