From b5770f9431247e07a6b9f2f090eaa52bde756589 Mon Sep 17 00:00:00 2001 From: linfeng <33561138+lyne7-sc@users.noreply.github.com> Date: Wed, 3 Jun 2026 10:01:20 +0800 Subject: [PATCH 1/6] fix regex --- .../src/simplify_expressions/regex.rs | 31 +++++++++++++------ 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/regex.rs b/datafusion/optimizer/src/simplify_expressions/regex.rs index b341c328e992a..ddfaefc591beb 100644 --- a/datafusion/optimizer/src/simplify_expressions/regex.rs +++ b/datafusion/optimizer/src/simplify_expressions/regex.rs @@ -283,20 +283,23 @@ fn partial_anchored_literal_to_like(v: &[Hir]) -> Option { /// Extracts a string literal expression assuming that [`is_anchored_literal`] /// returned true. -fn anchored_literal_to_expr(v: &[Hir]) -> Option { +fn anchored_literal_to_expr(v: &[Hir], string_scalar: &StringScalar) -> Option { match v.len() { - 2 => Some(lit("")), + 2 => Some(string_scalar.to_expr("")), 3 => { let HirKind::Literal(l) = v[1].kind() else { return None; }; - like_str_from_literal(l).map(lit) + like_str_from_literal(l).map(|s| string_scalar.to_expr(s)) } _ => None, } } -fn anchored_alternation_to_exprs(v: &[Hir]) -> Option> { +fn anchored_alternation_to_exprs( + v: &[Hir], + string_scalar: &StringScalar, +) -> Option> { if 3 != v.len() { return None; } @@ -308,7 +311,8 @@ fn anchored_alternation_to_exprs(v: &[Hir]) -> Option> { for hir in alters { let mut is_safe = false; if let HirKind::Literal(l) = hir.kind() - && let Some(safe_literal) = str_from_literal(l).map(lit) + && let Some(safe_literal) = + str_from_literal(l).map(|s| string_scalar.to_expr(s)) { literals.push(safe_literal); is_safe = true; @@ -321,7 +325,8 @@ fn anchored_alternation_to_exprs(v: &[Hir]) -> Option> { return Some(literals); } else if let HirKind::Literal(l) = sub.kind() { - if let Some(safe_literal) = str_from_literal(l).map(lit) { + if let Some(safe_literal) = str_from_literal(l).map(|s| string_scalar.to_expr(s)) + { return Some(vec![safe_literal]); } return None; @@ -351,12 +356,18 @@ fn lower_simple( )); } HirKind::Concat(inner) if is_anchored_literal(inner) => { - return anchored_literal_to_expr(inner).map(|right| { - mode.expr_matches_literal(Box::new(left.clone()), Box::new(right)) + return anchored_literal_to_expr(inner, string_scalar).map(|right| { + if mode.i { + // Case-insensitive: use ILIKE for exact match (no wildcards) + mode.expr(Box::new(left.clone()), Box::new(right)) + } else { + // Case-sensitive: use Eq / NotEq + mode.expr_matches_literal(Box::new(left.clone()), Box::new(right)) + } }); } - HirKind::Concat(inner) if is_anchored_capture(inner) => { - return anchored_alternation_to_exprs(inner) + HirKind::Concat(inner) if !mode.i && is_anchored_capture(inner) => { + return anchored_alternation_to_exprs(inner, string_scalar) .map(|right| left.clone().in_list(right, mode.not)); } HirKind::Concat(inner) => { From 2631cf6787bfa4df4a0ac782409c7bfd914fbc7d Mon Sep 17 00:00:00 2001 From: linfeng <33561138+lyne7-sc@users.noreply.github.com> Date: Wed, 3 Jun 2026 10:02:59 +0800 Subject: [PATCH 2/6] add slt --- .../sqllogictest/test_files/predicates.slt | 39 +++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/datafusion/sqllogictest/test_files/predicates.slt b/datafusion/sqllogictest/test_files/predicates.slt index 5e68aba1f46ad..a79e916b148f6 100644 --- a/datafusion/sqllogictest/test_files/predicates.slt +++ b/datafusion/sqllogictest/test_files/predicates.slt @@ -204,12 +204,51 @@ SELECT * FROM test WHERE column1 ~ 'z' ---- Bazzz +query T +SELECT * FROM test WHERE column1 ~ '^Bazzz$' +---- +Bazzz + +query T +SELECT * FROM test WHERE column1 ~ '^(foo|Bazzz)$' +---- +foo +Bazzz + +statement ok +CREATE TABLE test_regex_utf8view(s VARCHAR) AS VALUES ('foo'), ('Bazzz'); + +query T +SELECT * FROM test_regex_utf8view WHERE s ~ '^Bazzz$' +---- +Bazzz + +query T +SELECT * FROM test_regex_utf8view WHERE s ~ '^(foo|Bazzz)$' +---- +foo +Bazzz + +statement ok +DROP TABLE test_regex_utf8view; + query T SELECT * FROM test WHERE column1 ~* 'z' ---- Bazzz ZZZZZ +query T +SELECT * FROM test WHERE column1 ~* '^barrr$' +---- +Barrr + +query T +SELECT * FROM test WHERE column1 ~* '^(barrr|bazzz)$' +---- +Barrr +Bazzz + query T SELECT * FROM test WHERE column1 !~ 'z' ---- From ae2d6b807562b489f37d8701bd0218d25ec7cb62 Mon Sep 17 00:00:00 2001 From: linfeng <33561138+lyne7-sc@users.noreply.github.com> Date: Wed, 3 Jun 2026 10:10:31 +0800 Subject: [PATCH 3/6] fmt --- datafusion/optimizer/src/simplify_expressions/regex.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/datafusion/optimizer/src/simplify_expressions/regex.rs b/datafusion/optimizer/src/simplify_expressions/regex.rs index ddfaefc591beb..df4c344b2e407 100644 --- a/datafusion/optimizer/src/simplify_expressions/regex.rs +++ b/datafusion/optimizer/src/simplify_expressions/regex.rs @@ -325,7 +325,8 @@ fn anchored_alternation_to_exprs( return Some(literals); } else if let HirKind::Literal(l) = sub.kind() { - if let Some(safe_literal) = str_from_literal(l).map(|s| string_scalar.to_expr(s)) + if let Some(safe_literal) = + str_from_literal(l).map(|s| string_scalar.to_expr(s)) { return Some(vec![safe_literal]); } From 6f659f2aed6d95dc91d175e7347b18d9ae1b73bb Mon Sep 17 00:00:00 2001 From: linfeng <33561138+lyne7-sc@users.noreply.github.com> Date: Wed, 3 Jun 2026 10:19:39 +0800 Subject: [PATCH 4/6] enhance regex test cases --- datafusion/sqllogictest/test_files/predicates.slt | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/datafusion/sqllogictest/test_files/predicates.slt b/datafusion/sqllogictest/test_files/predicates.slt index a79e916b148f6..f30a350e3270f 100644 --- a/datafusion/sqllogictest/test_files/predicates.slt +++ b/datafusion/sqllogictest/test_files/predicates.slt @@ -229,6 +229,20 @@ SELECT * FROM test_regex_utf8view WHERE s ~ '^(foo|Bazzz)$' foo Bazzz +# Case-insensitive anchored match over Utf8View: must be simplified to ILIKE +# (not a case-sensitive Eq) and must keep operand types as Utf8View. +query T +SELECT * FROM test_regex_utf8view WHERE s ~* '^bazzz$' +---- +Bazzz + +# Case-insensitive anchored alternation over Utf8View +query T rowsort +SELECT * FROM test_regex_utf8view WHERE s ~* '^(foo|bazzz)$' +---- +Bazzz +foo + statement ok DROP TABLE test_regex_utf8view; From 2d3d4025b46a1aed451e1889f6a872da8f86c663 Mon Sep 17 00:00:00 2001 From: linfeng <33561138+lyne7-sc@users.noreply.github.com> Date: Fri, 5 Jun 2026 11:04:40 +0800 Subject: [PATCH 5/6] add explain and negation+regex slt --- .../sqllogictest/test_files/predicates.slt | 92 +++++++++++++++++++ 1 file changed, 92 insertions(+) diff --git a/datafusion/sqllogictest/test_files/predicates.slt b/datafusion/sqllogictest/test_files/predicates.slt index f30a350e3270f..59503f33842db 100644 --- a/datafusion/sqllogictest/test_files/predicates.slt +++ b/datafusion/sqllogictest/test_files/predicates.slt @@ -218,6 +218,79 @@ Bazzz statement ok CREATE TABLE test_regex_utf8view(s VARCHAR) AS VALUES ('foo'), ('Bazzz'); +statement ok +set datafusion.explain.logical_plan_only = true + +# `~` anchored literal -> `= Utf8View(..)` +query TT +EXPLAIN SELECT * FROM test_regex_utf8view WHERE s ~ '^Bazzz$' +---- +logical_plan +01)Filter: test_regex_utf8view.s = Utf8View("Bazzz") +02)--TableScan: test_regex_utf8view projection=[s] + +# `~*` anchored literal -> `ILIKE Utf8View(..)` +query TT +EXPLAIN SELECT * FROM test_regex_utf8view WHERE s ~* '^bazzz$' +---- +logical_plan +01)Filter: test_regex_utf8view.s ILIKE Utf8View("bazzz") +02)--TableScan: test_regex_utf8view projection=[s] + +# `~` anchored alternation -> OR of `= Utf8View(..)` comparisons. +query TT +EXPLAIN SELECT * FROM test_regex_utf8view WHERE s ~ '^(foo|Bazzz)$' +---- +logical_plan +01)Filter: test_regex_utf8view.s = Utf8View("foo") OR test_regex_utf8view.s = Utf8View("Bazzz") +02)--TableScan: test_regex_utf8view projection=[s] + +# `~*` anchored alternation -> NOT simplified: it falls back to a regex match, +# because `IN`/`=` cannot express case-insensitive matching. +query TT +EXPLAIN SELECT * FROM test_regex_utf8view WHERE s ~* '^(foo|bazzz)$' +---- +logical_plan +01)Filter: test_regex_utf8view.s ~* Utf8View("^(foo|bazzz)$") +02)--TableScan: test_regex_utf8view projection=[s] + +# `!~` -> `!= Utf8View(..)` +query TT +EXPLAIN SELECT * FROM test_regex_utf8view WHERE s !~ '^Bazzz$' +---- +logical_plan +01)Filter: test_regex_utf8view.s != Utf8View("Bazzz") +02)--TableScan: test_regex_utf8view projection=[s] + +# `!~*` -> `NOT ILIKE Utf8View(..)` +query TT +EXPLAIN SELECT * FROM test_regex_utf8view WHERE s !~* '^bazzz$' +---- +logical_plan +01)Filter: test_regex_utf8view.s NOT ILIKE Utf8View("bazzz") +02)--TableScan: test_regex_utf8view projection=[s] + +# `!~` anchored alternation -> AND of `!= Utf8View(..)` comparisons. +query TT +EXPLAIN SELECT * FROM test_regex_utf8view WHERE s !~ '^(foo|Bazzz)$' +---- +logical_plan +01)Filter: test_regex_utf8view.s != Utf8View("foo") AND test_regex_utf8view.s != Utf8View("Bazzz") +02)--TableScan: test_regex_utf8view projection=[s] + +# `!~*` anchored alternation -> NOT simplified: it falls back to a regex match, +# same reason as the `~*` alternation above. +query TT +EXPLAIN SELECT * FROM test_regex_utf8view WHERE s !~* '^(foo|bazzz)$' +---- +logical_plan +01)Filter: test_regex_utf8view.s !~* Utf8View("^(foo|bazzz)$") +02)--TableScan: test_regex_utf8view projection=[s] + +statement ok +set datafusion.explain.logical_plan_only = false + +# Result assertions query T SELECT * FROM test_regex_utf8view WHERE s ~ '^Bazzz$' ---- @@ -243,6 +316,25 @@ SELECT * FROM test_regex_utf8view WHERE s ~* '^(foo|bazzz)$' Bazzz foo +query T rowsort +SELECT * FROM test_regex_utf8view WHERE s !~ '^Bazzz$' +---- +foo + +query T rowsort +SELECT * FROM test_regex_utf8view WHERE s !~* '^bazzz$' +---- +foo + +# Both rows match the alternation, so the negated forms return nothing. +query T rowsort +SELECT * FROM test_regex_utf8view WHERE s !~ '^(foo|Bazzz)$' +---- + +query T rowsort +SELECT * FROM test_regex_utf8view WHERE s !~* '^(foo|bazzz)$' +---- + statement ok DROP TABLE test_regex_utf8view; From 0d10a3d7dd39b0edb4753fb77326f9a82e7fc292 Mon Sep 17 00:00:00 2001 From: linfeng <33561138+lyne7-sc@users.noreply.github.com> Date: Fri, 5 Jun 2026 11:26:06 +0800 Subject: [PATCH 6/6] add explain and negation+regex slt --- datafusion/sqllogictest/test_files/predicates.slt | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/datafusion/sqllogictest/test_files/predicates.slt b/datafusion/sqllogictest/test_files/predicates.slt index 59503f33842db..b4482a3af1beb 100644 --- a/datafusion/sqllogictest/test_files/predicates.slt +++ b/datafusion/sqllogictest/test_files/predicates.slt @@ -355,6 +355,20 @@ SELECT * FROM test WHERE column1 ~* '^(barrr|bazzz)$' Barrr Bazzz +query T rowsort +SELECT * FROM test WHERE column1 !~ '^Bazzz$' +---- +Barrr +ZZZZZ +foo + +query T rowsort +SELECT * FROM test WHERE column1 !~* '^barrr$' +---- +Bazzz +ZZZZZ +foo + query T SELECT * FROM test WHERE column1 !~ 'z' ----