Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 49 additions & 1 deletion src/iceberg/expression/predicate.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,10 @@

#include "iceberg/expression/expressions.h"
#include "iceberg/expression/literal.h"
#include "iceberg/expression/term.h"
#include "iceberg/result.h"
#include "iceberg/transform.h"
#include "iceberg/transform_function.h"
#include "iceberg/type.h"
#include "iceberg/util/checked_cast.h"
#include "iceberg/util/formatter_internal.h"
Expand Down Expand Up @@ -286,7 +289,52 @@ Result<std::shared_ptr<Expression>> UnboundPredicate<B>::BindLiteralOperation(
}
}

// TODO(gangwu): translate truncate(col) == value to startsWith(value)
if (BASE::op() == Expression::Operation::kEq &&
bound_term->kind() == Term::Kind::kTransform) {
// Safe to cast after kind check confirms it's a transform
auto* transform_term = dynamic_cast<BoundTransform*>(bound_term.get());
if (!transform_term) {
return BoundLiteralPredicate::Make(BASE::op(), std::move(bound_term),
std::move(literal));
}

if (transform_term->transform()->transform_type() == TransformType::kTruncate &&
literal.type()->type_id() == TypeId::kString &&
!literal.IsNull()) { // Null safety: skip null literals

// Apply truncate transform to the literal and check if result matches
// This verifies the literal is compatible with the truncate operation
auto transformed_result = transform_term->transform_func()->Transform(literal);
if (!transformed_result.has_value() || transformed_result.value() != literal) {
// Transform failed or modified the literal - can't optimize
return BoundLiteralPredicate::Make(BASE::op(), std::move(bound_term),
std::move(literal));
}

// Literal passed truncate unchanged. Now check if adding one more character
// would cause truncation. If yes, then the literal has EXACTLY the width.
// Example:
// - "Alice" with width=5: adding "x" makes "Alicex", truncate to "Alice" (can
// optimize)
// - "abc" with width=10: adding "x" makes "abcx", truncate to "abcx" != "abc"
// (cannot optimize)

auto& string_value = std::get<std::string>(literal.value());
auto extended_literal = Literal::String(string_value + "x");
auto extended_result =
transform_term->transform_func()->Transform(extended_literal);

if (extended_result.has_value() && extended_result.value() == literal) {
// Adding a character gets truncated back to original - literal has exact width!
// Rewrite: truncate(col, width) == "value" → col STARTS_WITH "value"
return BoundLiteralPredicate::Make(Expression::Operation::kStartsWith,
transform_term->reference(),
std::move(literal));
}
// Literal is shorter than width - can't optimize
}
}

return BoundLiteralPredicate::Make(BASE::op(), std::move(bound_term),
std::move(literal));
}
Expand Down
4 changes: 4 additions & 0 deletions src/iceberg/expression/term.h
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,10 @@ class ICEBERG_EXPORT BoundTransform : public BoundTerm {

/// \brief Returns the transform stored in this term (the `transform_` member).
const std::shared_ptr<Transform>& transform() const { return transform_; }

/// \brief Returns the transform function stored in this term (the
/// `transform_func_` member).
///
/// The returned reference aliases the member, so it should not be kept beyond
/// the lifetime of this object without copying the shared pointer.
const std::shared_ptr<TransformFunction>& transform_func() const {
return transform_func_;
}

/// \brief Identifies this term as a transform term; always `Kind::kTransform`.
Kind kind() const override { return Kind::kTransform; }

private:
Expand Down
138 changes: 138 additions & 0 deletions src/iceberg/test/predicate_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -870,4 +870,142 @@ TEST_F(PredicateTest, BoundSetPredicateTestSingleLiteral) {
EXPECT_THAT(bound_literal->Test(Literal::Int(41)), HasValue(testing::Eq(false)));
}

TEST_F(PredicateTest, TruncateLiteralOptimizationExactWidth) {
  // Binding truncate(name, 5) == "Alice" is expected to be rewritten into
  // name STARTS_WITH "Alice", because the literal fills the truncation width.
  auto truncate_term = Expressions::Truncate("name", 5);
  ICEBERG_ASSIGN_OR_THROW(auto equal_pred, UnboundPredicate<BoundTransform>::Make(
                                               Expression::Operation::kEq, truncate_term,
                                               Literal::String("Alice")));

  ICEBERG_ASSIGN_OR_THROW(auto bound_pred,
                          equal_pred->Bind(*schema_, /*case_sensitive=*/true));

  // The operation must have been rewritten to STARTS_WITH.
  EXPECT_EQ(bound_pred->op(), Expression::Operation::kStartsWith);

  auto bound_literal = AssertAndCastToBoundPredicate(bound_pred);
  // Stop here instead of dereferencing a null pointer if the cast failed.
  ASSERT_NE(bound_literal, nullptr);

  // The rewritten predicate must no longer be expressed over the truncate
  // transform; Test() below only compares literals and would not notice if the
  // transform term had been kept.
  // NOTE(review): relies on the bound predicate exposing term() - confirm.
  ASSERT_NE(bound_literal->term(), nullptr);
  EXPECT_NE(bound_literal->term()->kind(), Term::Kind::kTransform);

  // Values that start with "Alice" match.
  EXPECT_THAT(bound_literal->Test(Literal::String("Alice")), HasValue(testing::Eq(true)));
  EXPECT_THAT(bound_literal->Test(Literal::String("AliceX")),
              HasValue(testing::Eq(true)));
  EXPECT_THAT(bound_literal->Test(Literal::String("Alice123")),
              HasValue(testing::Eq(true)));

  // Values with a different prefix, or shorter than the prefix, do not match.
  EXPECT_THAT(bound_literal->Test(Literal::String("Bob")), HasValue(testing::Eq(false)));
  EXPECT_THAT(bound_literal->Test(Literal::String("Alic")), HasValue(testing::Eq(false)));
}

TEST_F(PredicateTest, TruncateLiteralOptimizationShorterLiteral) {
  // "abc" has fewer characters than the truncation width (10), so
  // truncate(name, 10) == "abc" must be left alone by the binder.
  constexpr auto kOriginalOp = Expression::Operation::kEq;

  auto wide_truncate = Expressions::Truncate("name", 10);
  ICEBERG_ASSIGN_OR_THROW(
      auto unbound,
      UnboundPredicate<BoundTransform>::Make(kOriginalOp, wide_truncate,
                                             Literal::String("abc")));
  ICEBERG_ASSIGN_OR_THROW(auto bound, unbound->Bind(*schema_, /*case_sensitive=*/true));

  // No rewrite to STARTS_WITH: the operation is still equality.
  EXPECT_EQ(bound->op(), kOriginalOp);
}

TEST_F(PredicateTest, TruncateLiteralOptimizationNullLiteral) {
// Placeholder for the null-literal case of the truncate rewrite.
// The test is skipped unconditionally: per the skip message below, equality
// against a null string literal is not supported, and null values are
// exercised through IS NULL predicates rather than equality predicates.
GTEST_SKIP() << "Null literal equality not supported for strings";
}

TEST_F(PredicateTest, TruncateLiteralOptimizationNonEqualityOperations) {
  // Only equality is eligible for the truncate -> STARTS_WITH rewrite. Every
  // other comparison must come out of Bind() with its operation unchanged,
  // even when the literal fills the truncation width exactly.
  struct OpCase {
    const char* label;
    Expression::Operation op;
  };
  const OpCase kCases[] = {
      {"NotEqual", Expression::Operation::kNotEq},
      {"LessThan", Expression::Operation::kLt},
      {"GreaterThan", Expression::Operation::kGt},
  };

  auto truncate_term = Expressions::Truncate("name", 5);

  for (const auto& test_case : kCases) {
    SCOPED_TRACE(test_case.label);

    ICEBERG_ASSIGN_OR_THROW(
        auto unbound,
        UnboundPredicate<BoundTransform>::Make(test_case.op, truncate_term,
                                               Literal::String("Alice")));
    ICEBERG_ASSIGN_OR_THROW(auto bound,
                            unbound->Bind(*schema_, /*case_sensitive=*/true));

    EXPECT_EQ(bound->op(), test_case.op);
  }
}

TEST_F(PredicateTest, TruncateLiteralOptimizationUTF8MultibyteCharacters) {
  // The truncation width is counted in UTF-8 code points, not bytes, so the
  // rewrite decision must follow the code point count of the literal.
  struct Utf8Case {
    const char* text;
    Expression::Operation expected_op;
  };
  const Utf8Case kCases[] = {
      // Five multibyte code points: fills the width, rewritten.
      {"你好世界!", Expression::Operation::kStartsWith},
      // Four multibyte code points plus one ASCII character: also five.
      {"你好世界x", Expression::Operation::kStartsWith},
      // Three code points: shorter than the width, left as equality.
      {"你好世", Expression::Operation::kEq},
  };

  auto truncate_term = Expressions::Truncate("name", 5);

  for (const auto& test_case : kCases) {
    SCOPED_TRACE(test_case.text);

    ICEBERG_ASSIGN_OR_THROW(
        auto unbound,
        UnboundPredicate<BoundTransform>::Make(Expression::Operation::kEq, truncate_term,
                                               Literal::String(test_case.text)));
    ICEBERG_ASSIGN_OR_THROW(auto bound,
                            unbound->Bind(*schema_, /*case_sensitive=*/true));

    EXPECT_EQ(bound->op(), test_case.expected_op);
  }
}

TEST_F(PredicateTest, TruncateLiteralOptimizationEmptyString) {
  // Edge case: an empty literal never fills a positive truncation width, so
  // truncate(name, 5) == "" must stay an equality predicate.
  constexpr auto kOriginalOp = Expression::Operation::kEq;

  auto truncate_term = Expressions::Truncate("name", 5);
  ICEBERG_ASSIGN_OR_THROW(
      auto unbound,
      UnboundPredicate<BoundTransform>::Make(kOriginalOp, truncate_term,
                                             Literal::String("")));
  ICEBERG_ASSIGN_OR_THROW(auto bound, unbound->Bind(*schema_, /*case_sensitive=*/true));

  EXPECT_EQ(bound->op(), kOriginalOp);
}

TEST_F(PredicateTest, TruncateLiteralOptimizationNonTruncateTransform) {
  // The rewrite applies to truncate only. An equality predicate over another
  // transform (bucket here, compared against an integer literal) must keep
  // its operation after binding.
  constexpr auto kOriginalOp = Expression::Operation::kEq;

  auto bucket_term = Expressions::Bucket("id", 10);
  ICEBERG_ASSIGN_OR_THROW(
      auto unbound,
      UnboundPredicate<BoundTransform>::Make(kOriginalOp, bucket_term, Literal::Int(5)));
  ICEBERG_ASSIGN_OR_THROW(auto bound, unbound->Bind(*schema_, /*case_sensitive=*/true));

  EXPECT_EQ(bound->op(), kOriginalOp);
}

} // namespace iceberg
Loading