From 6c88105faf5347aed6441809e0ae930ed9a3a82d Mon Sep 17 00:00:00 2001 From: Robin Date: Wed, 5 Mar 2025 12:42:43 +0100 Subject: [PATCH 01/66] Add lookaround expressions to HIR This is the first step to supporting captureless lookbehind assertions --- regex-automata/src/meta/reverse_inner.rs | 2 + regex-automata/src/nfa/thompson/compiler.rs | 1 + regex-syntax/src/hir/literal.rs | 4 +- regex-syntax/src/hir/mod.rs | 56 +++++++++++++++++++++ regex-syntax/src/hir/print.rs | 9 +++- 5 files changed, 70 insertions(+), 2 deletions(-) diff --git a/regex-automata/src/meta/reverse_inner.rs b/regex-automata/src/meta/reverse_inner.rs index 3d78779f6..b236cf2e1 100644 --- a/regex-automata/src/meta/reverse_inner.rs +++ b/regex-automata/src/meta/reverse_inner.rs @@ -170,6 +170,7 @@ fn top_concat(mut hir: &Hir) -> Option> { | HirKind::Literal(_) | HirKind::Class(_) | HirKind::Look(_) + | HirKind::Lookaround(_) | HirKind::Repetition(_) | HirKind::Alternation(_) => return None, HirKind::Capture(hir::Capture { ref sub, .. }) => sub, @@ -206,6 +207,7 @@ fn flatten(hir: &Hir) -> Hir { HirKind::Literal(hir::Literal(ref x)) => Hir::literal(x.clone()), HirKind::Class(ref x) => Hir::class(x.clone()), HirKind::Look(ref x) => Hir::look(x.clone()), + HirKind::Lookaround(ref x) => Hir::lookaround(x.clone()), HirKind::Repetition(ref x) => Hir::repetition(x.with(flatten(&x.sub))), // This is the interesting case. We just drop the group information // entirely and use the child HIR itself. 
diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index ced17719d..2fcd907e9 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -1003,6 +1003,7 @@ impl Compiler { Class(Class::Bytes(ref c)) => self.c_byte_class(c), Class(Class::Unicode(ref c)) => self.c_unicode_class(c), Look(ref look) => self.c_look(look), + Lookaround(_) => todo!("implement lookaround NFA compilation"), Repetition(ref rep) => self.c_repetition(rep), Capture(ref c) => self.c_cap(c.index, c.name.as_deref(), &c.sub), Concat(ref es) => self.c_concat(es.iter().map(|e| self.c(e))), diff --git a/regex-syntax/src/hir/literal.rs b/regex-syntax/src/hir/literal.rs index a5a3737f6..d506c8172 100644 --- a/regex-syntax/src/hir/literal.rs +++ b/regex-syntax/src/hir/literal.rs @@ -172,7 +172,9 @@ impl Extractor { use crate::hir::HirKind::*; match *hir.kind() { - Empty | Look(_) => Seq::singleton(self::Literal::exact(vec![])), + Empty | Look(_) | Lookaround(_) => { + Seq::singleton(self::Literal::exact(vec![])) + } Literal(hir::Literal(ref bytes)) => { let mut seq = Seq::singleton(self::Literal::exact(bytes.to_vec())); diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 5db784388..fe893d83e 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -373,6 +373,13 @@ impl Hir { Hir { kind: HirKind::Look(look), props } } + /// Creates a look-around subexpression HIR expression. + #[inline] + pub fn lookaround(lookaround: Lookaround) -> Hir { + let props = Properties::lookaround(&lookaround); + Hir { kind: HirKind::Lookaround(lookaround), props } + } + /// Creates a repetition HIR expression. #[inline] pub fn repetition(mut rep: Repetition) -> Hir { @@ -728,6 +735,8 @@ pub enum HirKind { Class(Class), /// A look-around assertion. A look-around match always has zero length. 
Look(Look), + /// A look-around subexpression + Lookaround(Lookaround), /// A repetition operation applied to a sub-expression. Repetition(Repetition), /// A capturing group, which contains a sub-expression. @@ -761,6 +770,7 @@ impl HirKind { | HirKind::Literal(_) | HirKind::Class(_) | HirKind::Look(_) => &[], + HirKind::Lookaround(ref lookaround) => from_ref(lookaround.sub()), HirKind::Repetition(Repetition { ref sub, .. }) => from_ref(sub), HirKind::Capture(Capture { ref sub, .. }) => from_ref(sub), HirKind::Concat(ref subs) => subs, @@ -1786,6 +1796,37 @@ impl Look { } } +/// Represents a general lookaround assertion +/// +/// Currently, only lookbehind assertions are supported. +/// Furthermore, capture groups inside assertions are not supported. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum Lookaround { + /// A positive lookbehind assertion + PositiveLookBehind(Box), + /// A negative lookbehind assertion + NegativeLookBehind(Box), +} + +impl Lookaround { + /// Returns a reference to the inner expression that must match for this + /// lookaround assertion to hold. + pub fn sub(&self) -> &Hir { + match self { + Lookaround::PositiveLookBehind(sub) + | Lookaround::NegativeLookBehind(sub) => sub, + } + } + + /// Returns a mutable reference to the inner expression + pub fn sub_mut(&mut self) -> &mut Hir { + match self { + Lookaround::PositiveLookBehind(sub) + | Lookaround::NegativeLookBehind(sub) => sub, + } + } +} + /// The high-level intermediate representation for a capturing group. /// /// A capturing group always has an index and a child expression. 
It may @@ -1935,6 +1976,9 @@ impl Drop for Hir { | HirKind::Literal(_) | HirKind::Class(_) | HirKind::Look(_) => {} + HirKind::Lookaround(ref mut x) => { + stack.push(mem::replace(x.sub_mut(), Hir::empty())); + } HirKind::Capture(ref mut x) => { stack.push(mem::replace(&mut x.sub, Hir::empty())); } @@ -2499,6 +2543,18 @@ impl Properties { Properties(Box::new(inner)) } + fn lookaround(lookaround: &Lookaround) -> Properties { + let sub_p = lookaround.sub().properties(); + let inner = PropertiesI { + minimum_len: Some(0), + maximum_len: Some(0), + literal: false, + alternation_literal: false, + ..*sub_p.0.clone() + }; + Properties(Box::new(inner)) + } + /// Create a new set of HIR properties for a repetition. fn repetition(rep: &Repetition) -> Properties { let p = rep.sub.properties(); diff --git a/regex-syntax/src/hir/print.rs b/regex-syntax/src/hir/print.rs index dfa6d4032..547e579e9 100644 --- a/regex-syntax/src/hir/print.rs +++ b/regex-syntax/src/hir/print.rs @@ -227,6 +227,12 @@ impl Visitor for Writer { self.wtr.write_str(r"\b{end-half}")?; } }, + HirKind::Lookaround(hir::Lookaround::PositiveLookBehind(_)) => { + self.wtr.write_str(r"(?<=)")?; + } + HirKind::Lookaround(hir::Lookaround::NegativeLookBehind(_)) => { + self.wtr.write_str(r"(? { self.wtr.write_str("(")?; if let Some(ref name) = *name { @@ -293,7 +299,8 @@ impl Visitor for Writer { } HirKind::Capture(_) | HirKind::Concat(_) - | HirKind::Alternation(_) => { + | HirKind::Alternation(_) + | HirKind::Lookaround(_) => { self.wtr.write_str(r")")?; } } From 3c54714d4819ce166d9cb8e2d0e5a963f5afdc2b Mon Sep 17 00:00:00 2001 From: shilangyu Date: Sat, 8 Mar 2025 20:48:09 +0100 Subject: [PATCH 02/66] Change how flatten works on hir::Lookaround The lack of recursing into the inner expression of a lookaround is correct under the current assumption that lookarounds cannot have capture groups. But once the restriction is lifted, this wrong implementation can be very subtle to find. 
Instead, we can already do the filtering and accept it being a no-op for now. --- regex-automata/src/meta/reverse_inner.rs | 4 +++- regex-syntax/src/hir/mod.rs | 29 ++++++++++++++++++------ 2 files changed, 25 insertions(+), 8 deletions(-) diff --git a/regex-automata/src/meta/reverse_inner.rs b/regex-automata/src/meta/reverse_inner.rs index b236cf2e1..8d9099600 100644 --- a/regex-automata/src/meta/reverse_inner.rs +++ b/regex-automata/src/meta/reverse_inner.rs @@ -207,7 +207,9 @@ fn flatten(hir: &Hir) -> Hir { HirKind::Literal(hir::Literal(ref x)) => Hir::literal(x.clone()), HirKind::Class(ref x) => Hir::class(x.clone()), HirKind::Look(ref x) => Hir::look(x.clone()), - HirKind::Lookaround(ref x) => Hir::lookaround(x.clone()), + HirKind::Lookaround(ref x) => { + Hir::lookaround(x.with(flatten(x.sub()))) + } HirKind::Repetition(ref x) => Hir::repetition(x.with(flatten(&x.sub))), // This is the interesting case. We just drop the group information // entirely and use the child HIR itself. diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index fe893d83e..df5384b76 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -1796,15 +1796,15 @@ impl Look { } } -/// Represents a general lookaround assertion +/// Represents a general lookaround assertion. /// /// Currently, only lookbehind assertions are supported. /// Furthermore, capture groups inside assertions are not supported. #[derive(Clone, Debug, Eq, PartialEq)] pub enum Lookaround { - /// A positive lookbehind assertion + /// A positive lookbehind assertion. PositiveLookBehind(Box), - /// A negative lookbehind assertion + /// A negative lookbehind assertion. NegativeLookBehind(Box), } @@ -1813,16 +1813,31 @@ impl Lookaround { /// lookaround assertion to hold. 
pub fn sub(&self) -> &Hir { match self { - Lookaround::PositiveLookBehind(sub) - | Lookaround::NegativeLookBehind(sub) => sub, + Self::PositiveLookBehind(sub) | Self::NegativeLookBehind(sub) => { + sub + } } } /// Returns a mutable reference to the inner expression pub fn sub_mut(&mut self) -> &mut Hir { match self { - Lookaround::PositiveLookBehind(sub) - | Lookaround::NegativeLookBehind(sub) => sub, + Self::PositiveLookBehind(sub) | Self::NegativeLookBehind(sub) => { + sub + } + } + } + + /// Returns a new lookaround of the same kind, but with its + /// sub-expression replaced with the one given. + pub fn with(&self, sub: Hir) -> Lookaround { + match self { + Self::PositiveLookBehind(_) => { + Self::PositiveLookBehind(Box::new(sub)) + } + Self::NegativeLookBehind(_) => { + Self::NegativeLookBehind(Box::new(sub)) + } } } } From 4eb42869a8497d3bafdb49baec8a2af2a74a258c Mon Sep 17 00:00:00 2001 From: shilangyu Date: Sat, 8 Mar 2025 21:08:52 +0100 Subject: [PATCH 03/66] Add hir::Lookaround to the visitor --- regex-syntax/src/hir/visitor.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/regex-syntax/src/hir/visitor.rs b/regex-syntax/src/hir/visitor.rs index f30f0a163..a946d9ddb 100644 --- a/regex-syntax/src/hir/visitor.rs +++ b/regex-syntax/src/hir/visitor.rs @@ -83,6 +83,9 @@ enum Frame<'a> { /// A stack frame allocated just before descending into a capture's child /// node. Capture(&'a hir::Capture), + /// A stack frame allocated just before descending into a look-around's + /// child node. + LookAround(&'a hir::Lookaround), /// The stack frame used while visiting every child node of a concatenation /// of expressions. 
Concat { @@ -162,6 +165,7 @@ impl<'a> HeapVisitor<'a> { match *hir.kind() { HirKind::Repetition(ref x) => Some(Frame::Repetition(x)), HirKind::Capture(ref x) => Some(Frame::Capture(x)), + HirKind::Lookaround(ref x) => Some(Frame::LookAround(x)), HirKind::Concat(ref x) if x.is_empty() => None, HirKind::Concat(ref x) => { Some(Frame::Concat { head: &x[0], tail: &x[1..] }) @@ -180,6 +184,7 @@ impl<'a> HeapVisitor<'a> { match induct { Frame::Repetition(_) => None, Frame::Capture(_) => None, + Frame::LookAround(_) => None, Frame::Concat { tail, .. } => { if tail.is_empty() { None @@ -208,6 +213,7 @@ impl<'a> Frame<'a> { match *self { Frame::Repetition(rep) => &rep.sub, Frame::Capture(capture) => &capture.sub, + Frame::LookAround(lookaround) => &lookaround.sub(), Frame::Concat { head, .. } => head, Frame::Alternation { head, .. } => head, } From bdee6b2fc535350ef2b6f26fe03b568a5cf3defd Mon Sep 17 00:00:00 2001 From: shilangyu Date: Sat, 8 Mar 2025 21:09:12 +0100 Subject: [PATCH 04/66] Fix hir::Lookaround printing and add test --- regex-syntax/src/hir/print.rs | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/regex-syntax/src/hir/print.rs b/regex-syntax/src/hir/print.rs index 547e579e9..8ff5c85e2 100644 --- a/regex-syntax/src/hir/print.rs +++ b/regex-syntax/src/hir/print.rs @@ -228,10 +228,10 @@ impl Visitor for Writer { } }, HirKind::Lookaround(hir::Lookaround::PositiveLookBehind(_)) => { - self.wtr.write_str(r"(?<=)")?; + self.wtr.write_str(r"(?<=")?; } HirKind::Lookaround(hir::Lookaround::NegativeLookBehind(_)) => { - self.wtr.write_str(r"(? { self.wtr.write_str("(")?; @@ -484,6 +484,18 @@ mod tests { roundtrip("((((a))))", "((((a))))"); } + #[test] + #[ignore = "Missing parser support for lookaround"] + fn print_look_around() { + roundtrip("(?<=)", "(?<=(?:))"); + roundtrip("(? 
Date: Sat, 8 Mar 2025 21:12:58 +0100 Subject: [PATCH 05/66] Remove useless ref --- regex-syntax/src/hir/visitor.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-syntax/src/hir/visitor.rs b/regex-syntax/src/hir/visitor.rs index a946d9ddb..8ba304683 100644 --- a/regex-syntax/src/hir/visitor.rs +++ b/regex-syntax/src/hir/visitor.rs @@ -213,7 +213,7 @@ impl<'a> Frame<'a> { match *self { Frame::Repetition(rep) => &rep.sub, Frame::Capture(capture) => &capture.sub, - Frame::LookAround(lookaround) => &lookaround.sub(), + Frame::LookAround(lookaround) => lookaround.sub(), Frame::Concat { head, .. } => head, Frame::Alternation { head, .. } => head, } From 8a5d895e21a05214202d555a11d5eb080b9a8d65 Mon Sep 17 00:00:00 2001 From: shilangyu Date: Sat, 8 Mar 2025 21:21:08 +0100 Subject: [PATCH 06/66] Add missing drop case for hir::Lookaround --- regex-syntax/src/hir/mod.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index df5384b76..35017477f 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -1976,6 +1976,9 @@ impl Drop for Hir { | HirKind::Class(_) | HirKind::Look(_) => return, HirKind::Capture(ref x) if x.sub.kind.subs().is_empty() => return, + HirKind::Lookaround(ref x) if x.sub().kind.subs().is_empty() => { + return + } HirKind::Repetition(ref x) if x.sub.kind.subs().is_empty() => { return } From 951408c0c413f9cf9ca86ae5b0aae67bda4d232e Mon Sep 17 00:00:00 2001 From: shilangyu Date: Sat, 8 Mar 2025 21:25:23 +0100 Subject: [PATCH 07/66] Rename Lookaround to LookAround This makes it consistent with parser's ErrorKind::UnsupportedLookAround. 
--- regex-automata/src/meta/reverse_inner.rs | 4 ++-- regex-automata/src/nfa/thompson/compiler.rs | 2 +- regex-syntax/src/hir/literal.rs | 2 +- regex-syntax/src/hir/mod.rs | 20 ++++++++++---------- regex-syntax/src/hir/print.rs | 6 +++--- regex-syntax/src/hir/visitor.rs | 4 ++-- 6 files changed, 19 insertions(+), 19 deletions(-) diff --git a/regex-automata/src/meta/reverse_inner.rs b/regex-automata/src/meta/reverse_inner.rs index 8d9099600..14e260a1e 100644 --- a/regex-automata/src/meta/reverse_inner.rs +++ b/regex-automata/src/meta/reverse_inner.rs @@ -170,7 +170,7 @@ fn top_concat(mut hir: &Hir) -> Option> { | HirKind::Literal(_) | HirKind::Class(_) | HirKind::Look(_) - | HirKind::Lookaround(_) + | HirKind::LookAround(_) | HirKind::Repetition(_) | HirKind::Alternation(_) => return None, HirKind::Capture(hir::Capture { ref sub, .. }) => sub, @@ -207,7 +207,7 @@ fn flatten(hir: &Hir) -> Hir { HirKind::Literal(hir::Literal(ref x)) => Hir::literal(x.clone()), HirKind::Class(ref x) => Hir::class(x.clone()), HirKind::Look(ref x) => Hir::look(x.clone()), - HirKind::Lookaround(ref x) => { + HirKind::LookAround(ref x) => { Hir::lookaround(x.with(flatten(x.sub()))) } HirKind::Repetition(ref x) => Hir::repetition(x.with(flatten(&x.sub))), diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index 2fcd907e9..3964e5af6 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -1003,7 +1003,7 @@ impl Compiler { Class(Class::Bytes(ref c)) => self.c_byte_class(c), Class(Class::Unicode(ref c)) => self.c_unicode_class(c), Look(ref look) => self.c_look(look), - Lookaround(_) => todo!("implement lookaround NFA compilation"), + LookAround(_) => todo!("implement lookaround NFA compilation"), Repetition(ref rep) => self.c_repetition(rep), Capture(ref c) => self.c_cap(c.index, c.name.as_deref(), &c.sub), Concat(ref es) => self.c_concat(es.iter().map(|e| self.c(e))), diff --git 
a/regex-syntax/src/hir/literal.rs b/regex-syntax/src/hir/literal.rs index d506c8172..5a5ba3b82 100644 --- a/regex-syntax/src/hir/literal.rs +++ b/regex-syntax/src/hir/literal.rs @@ -172,7 +172,7 @@ impl Extractor { use crate::hir::HirKind::*; match *hir.kind() { - Empty | Look(_) | Lookaround(_) => { + Empty | Look(_) | LookAround(_) => { Seq::singleton(self::Literal::exact(vec![])) } Literal(hir::Literal(ref bytes)) => { diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 35017477f..87ec2d4b1 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -375,9 +375,9 @@ impl Hir { /// Creates a look-around subexpression HIR expression. #[inline] - pub fn lookaround(lookaround: Lookaround) -> Hir { + pub fn lookaround(lookaround: LookAround) -> Hir { let props = Properties::lookaround(&lookaround); - Hir { kind: HirKind::Lookaround(lookaround), props } + Hir { kind: HirKind::LookAround(lookaround), props } } /// Creates a repetition HIR expression. @@ -736,7 +736,7 @@ pub enum HirKind { /// A look-around assertion. A look-around match always has zero length. Look(Look), /// A look-around subexpression - Lookaround(Lookaround), + LookAround(LookAround), /// A repetition operation applied to a sub-expression. Repetition(Repetition), /// A capturing group, which contains a sub-expression. @@ -770,7 +770,7 @@ impl HirKind { | HirKind::Literal(_) | HirKind::Class(_) | HirKind::Look(_) => &[], - HirKind::Lookaround(ref lookaround) => from_ref(lookaround.sub()), + HirKind::LookAround(ref lookaround) => from_ref(lookaround.sub()), HirKind::Repetition(Repetition { ref sub, .. }) => from_ref(sub), HirKind::Capture(Capture { ref sub, .. }) => from_ref(sub), HirKind::Concat(ref subs) => subs, @@ -1801,14 +1801,14 @@ impl Look { /// Currently, only lookbehind assertions are supported. /// Furthermore, capture groups inside assertions are not supported. 
#[derive(Clone, Debug, Eq, PartialEq)] -pub enum Lookaround { +pub enum LookAround { /// A positive lookbehind assertion. PositiveLookBehind(Box), /// A negative lookbehind assertion. NegativeLookBehind(Box), } -impl Lookaround { +impl LookAround { /// Returns a reference to the inner expression that must match for this /// lookaround assertion to hold. pub fn sub(&self) -> &Hir { @@ -1830,7 +1830,7 @@ impl Lookaround { /// Returns a new lookaround of the same kind, but with its /// sub-expression replaced with the one given. - pub fn with(&self, sub: Hir) -> Lookaround { + pub fn with(&self, sub: Hir) -> LookAround { match self { Self::PositiveLookBehind(_) => { Self::PositiveLookBehind(Box::new(sub)) @@ -1976,7 +1976,7 @@ impl Drop for Hir { | HirKind::Class(_) | HirKind::Look(_) => return, HirKind::Capture(ref x) if x.sub.kind.subs().is_empty() => return, - HirKind::Lookaround(ref x) if x.sub().kind.subs().is_empty() => { + HirKind::LookAround(ref x) if x.sub().kind.subs().is_empty() => { return } HirKind::Repetition(ref x) if x.sub.kind.subs().is_empty() => { @@ -1994,7 +1994,7 @@ impl Drop for Hir { | HirKind::Literal(_) | HirKind::Class(_) | HirKind::Look(_) => {} - HirKind::Lookaround(ref mut x) => { + HirKind::LookAround(ref mut x) => { stack.push(mem::replace(x.sub_mut(), Hir::empty())); } HirKind::Capture(ref mut x) => { @@ -2561,7 +2561,7 @@ impl Properties { Properties(Box::new(inner)) } - fn lookaround(lookaround: &Lookaround) -> Properties { + fn lookaround(lookaround: &LookAround) -> Properties { let sub_p = lookaround.sub().properties(); let inner = PropertiesI { minimum_len: Some(0), diff --git a/regex-syntax/src/hir/print.rs b/regex-syntax/src/hir/print.rs index 8ff5c85e2..86e0018c6 100644 --- a/regex-syntax/src/hir/print.rs +++ b/regex-syntax/src/hir/print.rs @@ -227,10 +227,10 @@ impl Visitor for Writer { self.wtr.write_str(r"\b{end-half}")?; } }, - HirKind::Lookaround(hir::Lookaround::PositiveLookBehind(_)) => { + 
HirKind::LookAround(hir::LookAround::PositiveLookBehind(_)) => { self.wtr.write_str(r"(?<=")?; } - HirKind::Lookaround(hir::Lookaround::NegativeLookBehind(_)) => { + HirKind::LookAround(hir::LookAround::NegativeLookBehind(_)) => { self.wtr.write_str(r"(? { @@ -300,7 +300,7 @@ impl Visitor for Writer { HirKind::Capture(_) | HirKind::Concat(_) | HirKind::Alternation(_) - | HirKind::Lookaround(_) => { + | HirKind::LookAround(_) => { self.wtr.write_str(r")")?; } } diff --git a/regex-syntax/src/hir/visitor.rs b/regex-syntax/src/hir/visitor.rs index 8ba304683..0af0aeca1 100644 --- a/regex-syntax/src/hir/visitor.rs +++ b/regex-syntax/src/hir/visitor.rs @@ -85,7 +85,7 @@ enum Frame<'a> { Capture(&'a hir::Capture), /// A stack frame allocated just before descending into a look-around's /// child node. - LookAround(&'a hir::Lookaround), + LookAround(&'a hir::LookAround), /// The stack frame used while visiting every child node of a concatenation /// of expressions. Concat { @@ -165,7 +165,7 @@ impl<'a> HeapVisitor<'a> { match *hir.kind() { HirKind::Repetition(ref x) => Some(Frame::Repetition(x)), HirKind::Capture(ref x) => Some(Frame::Capture(x)), - HirKind::Lookaround(ref x) => Some(Frame::LookAround(x)), + HirKind::LookAround(ref x) => Some(Frame::LookAround(x)), HirKind::Concat(ref x) if x.is_empty() => None, HirKind::Concat(ref x) => { Some(Frame::Concat { head: &x[0], tail: &x[1..] }) From f01c5a2a52f9f332027e4df96dd9d3956a8e649b Mon Sep 17 00:00:00 2001 From: shilangyu Date: Sat, 8 Mar 2025 21:37:48 +0100 Subject: [PATCH 08/66] Fix properties of LookArounds --- regex-syntax/src/hir/mod.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 87ec2d4b1..3ba35e8d8 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -735,7 +735,7 @@ pub enum HirKind { Class(Class), /// A look-around assertion. A look-around match always has zero length. 
Look(Look), - /// A look-around subexpression + /// A look-around subexpression. LookAround(LookAround), /// A repetition operation applied to a sub-expression. Repetition(Repetition), @@ -2561,6 +2561,7 @@ impl Properties { Properties(Box::new(inner)) } + /// Create a new set of HIR properties for a look-around. fn lookaround(lookaround: &LookAround) -> Properties { let sub_p = lookaround.sub().properties(); let inner = PropertiesI { @@ -2568,6 +2569,8 @@ impl Properties { maximum_len: Some(0), literal: false, alternation_literal: false, + explicit_captures_len: sub_p.explicit_captures_len(), + static_explicit_captures_len: sub_p.static_explicit_captures_len(), ..*sub_p.0.clone() }; Properties(Box::new(inner)) From f14c47ceb70c9cae6459b82f7f22883b0ef1d90f Mon Sep 17 00:00:00 2001 From: shilangyu Date: Sat, 8 Mar 2025 22:32:43 +0100 Subject: [PATCH 09/66] Add missing literal lookaround test --- regex-syntax/src/hir/literal.rs | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/regex-syntax/src/hir/literal.rs b/regex-syntax/src/hir/literal.rs index 5a5ba3b82..4c35d8d4e 100644 --- a/regex-syntax/src/hir/literal.rs +++ b/regex-syntax/src/hir/literal.rs @@ -2455,6 +2455,22 @@ mod tests { assert_eq!(expected, e(r"^aZ*b")); } + #[test] + #[ignore = "Missing parser support for lookaround"] + fn lookaround() { + assert_eq!(exact(["ab"]), e(r"a(?<=qwe)b")); + assert_eq!(exact(["ab"]), e(r"a(? 
Date: Tue, 11 Mar 2025 15:26:20 +0100 Subject: [PATCH 10/66] Fix literal test and useless property computation --- regex-syntax/src/hir/literal.rs | 4 ++-- regex-syntax/src/hir/mod.rs | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/regex-syntax/src/hir/literal.rs b/regex-syntax/src/hir/literal.rs index 4c35d8d4e..fe5bc8c88 100644 --- a/regex-syntax/src/hir/literal.rs +++ b/regex-syntax/src/hir/literal.rs @@ -2458,13 +2458,13 @@ mod tests { #[test] #[ignore = "Missing parser support for lookaround"] fn lookaround() { - assert_eq!(exact(["ab"]), e(r"a(?<=qwe)b")); + assert_eq!(exact(["ab"]), e(r"a(?<=qwa)b")); assert_eq!(exact(["ab"]), e(r"a(? Date: Sat, 8 Mar 2025 21:54:56 +0100 Subject: [PATCH 11/66] Adjust parsing errors for lookarounds --- regex-syntax/src/ast/mod.rs | 20 +++++++++++--------- regex-syntax/src/ast/parse.rs | 24 ++++++++++++++---------- 2 files changed, 25 insertions(+), 19 deletions(-) diff --git a/regex-syntax/src/ast/mod.rs b/regex-syntax/src/ast/mod.rs index ce79a89ab..d217bf836 100644 --- a/regex-syntax/src/ast/mod.rs +++ b/regex-syntax/src/ast/mod.rs @@ -181,12 +181,15 @@ pub enum ErrorKind { /// escape is used. The octal escape is assumed to be an invocation of /// a backreference, which is the common case. UnsupportedBackreference, - /// When syntax similar to PCRE's look-around is used, this error is + /// When syntax similar to PCRE's look-ahead is used, this error is /// returned. Some example syntaxes that are rejected include, but are - /// not necessarily limited to, `(?=re)`, `(?!re)`, `(?<=re)` and - /// `(? 
{ write!(f, "backreferences are not supported") } - UnsupportedLookAround => write!( - f, - "look-around, including look-ahead and look-behind, \ - is not supported" - ), + UnsupportedLookAhead => write!(f, "look-aheads are not supported"), + UsupportedCaptureInLookBehind => { + write!(f, "capture groups are not supported in look-behinds") + } } } } diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs index 0c2a35265..fb6876f21 100644 --- a/regex-syntax/src/ast/parse.rs +++ b/regex-syntax/src/ast/parse.rs @@ -1232,7 +1232,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { if self.is_lookaround_prefix() { return Err(self.error( Span::new(open_span.start, self.span().end), - ast::ErrorKind::UnsupportedLookAround, + ast::ErrorKind::UnsupportedLookAhead, )); } let inner_span = self.span(); @@ -3736,33 +3736,37 @@ bar } #[test] - fn parse_unsupported_lookaround() { + fn parse_unsupported_lookahead() { assert_eq!( parser(r"(?=a)").parse().unwrap_err(), TestError { span: span(0..3), - kind: ast::ErrorKind::UnsupportedLookAround, + kind: ast::ErrorKind::UnsupportedLookAhead, } ); assert_eq!( parser(r"(?!a)").parse().unwrap_err(), TestError { span: span(0..3), - kind: ast::ErrorKind::UnsupportedLookAround, + kind: ast::ErrorKind::UnsupportedLookAhead, } ); + } + + #[test] + fn parse_unsupported_capture_in_lookbehind() { assert_eq!( - parser(r"(?<=a)").parse().unwrap_err(), + parser(r"(?<=(?<=(a)))").parse().unwrap_err(), TestError { - span: span(0..4), - kind: ast::ErrorKind::UnsupportedLookAround, + span: span(8..10), + kind: ast::ErrorKind::UsupportedCaptureInLookBehind, } ); assert_eq!( - parser(r"(? 
Date: Sat, 8 Mar 2025 22:27:04 +0100 Subject: [PATCH 12/66] Add LookAround to Ast --- regex-syntax/src/ast/mod.rs | 35 +++++++++++++++++++++++++++++++ regex-syntax/src/ast/parse.rs | 7 +++++++ regex-syntax/src/ast/print.rs | 20 ++++++++++++++++++ regex-syntax/src/ast/visitor.rs | 6 ++++++ regex-syntax/src/hir/translate.rs | 6 ++++++ 5 files changed, 74 insertions(+) diff --git a/regex-syntax/src/ast/mod.rs b/regex-syntax/src/ast/mod.rs index d217bf836..a2daef197 100644 --- a/regex-syntax/src/ast/mod.rs +++ b/regex-syntax/src/ast/mod.rs @@ -479,6 +479,8 @@ pub enum Ast { Dot(Box), /// A single zero-width assertion. Assertion(Box), + /// A single look-around regular expression. + LookAround(Box), /// A single Unicode character class, e.g., `\pL` or `\p{Greek}`. ClassUnicode(Box), /// A single perl character class, e.g., `\d` or `\W`. @@ -523,6 +525,11 @@ impl Ast { Ast::Assertion(Box::new(e)) } + /// Create a "look-around" AST item. + pub fn look_around(e: LookAround) -> Ast { + Ast::LookAround(Box::new(e)) + } + /// Create a "Unicode class" AST item. pub fn class_unicode(e: ClassUnicode) -> Ast { Ast::ClassUnicode(Box::new(e)) @@ -566,6 +573,7 @@ impl Ast { Ast::Literal(ref x) => &x.span, Ast::Dot(ref span) => span, Ast::Assertion(ref x) => &x.span, + Ast::LookAround(ref x) => &x.span, Ast::ClassUnicode(ref x) => &x.span, Ast::ClassPerl(ref x) => &x.span, Ast::ClassBracketed(ref x) => &x.span, @@ -598,6 +606,7 @@ impl Ast { Ast::ClassBracketed(_) | Ast::Repetition(_) | Ast::Group(_) + | Ast::LookAround(_) | Ast::Alternation(_) | Ast::Concat(_) => true, } @@ -1344,6 +1353,28 @@ pub enum AssertionKind { WordBoundaryEndHalf, } +/// A single zero-width look-around. +#[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub struct LookAround { + /// The span of this look-around. + pub span: Span, + /// The look-around kind, e.g. negative/positive look-behind. 
+ pub kind: LookAroundKind, + /// The regular expression inside the look-around. + pub ast: Box, +} + +/// A look-around kind. +#[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub enum LookAroundKind { + /// `(?<=...)` + PositiveLookBehind, + /// `(? return, Ast::Repetition(ref x) if !x.ast.has_subexprs() => return, Ast::Group(ref x) if !x.ast.has_subexprs() => return, + Ast::LookAround(ref x) if !x.ast.has_subexprs() => return, Ast::Alternation(ref x) if x.asts.is_empty() => return, Ast::Concat(ref x) if x.asts.is_empty() => return, _ => {} @@ -1675,6 +1707,9 @@ impl Drop for Ast { Ast::Group(ref mut x) => { stack.push(mem::replace(&mut x.ast, empty_ast())); } + Ast::LookAround(ref mut x) => { + stack.push(mem::replace(&mut x.ast, empty_ast())); + } Ast::Alternation(ref mut x) => { stack.extend(x.asts.drain(..)); } diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs index fb6876f21..13975919d 100644 --- a/regex-syntax/src/ast/parse.rs +++ b/regex-syntax/src/ast/parse.rs @@ -2328,6 +2328,7 @@ impl<'p, 's, P: Borrow> ast::Visitor for NestLimiter<'p, 's, P> { Ast::ClassBracketed(ref x) => &x.span, Ast::Repetition(ref x) => &x.span, Ast::Group(ref x) => &x.span, + Ast::LookAround(ref x) => &x.span, Ast::Alternation(ref x) => &x.span, Ast::Concat(ref x) => &x.span, }; @@ -2349,6 +2350,7 @@ impl<'p, 's, P: Borrow> ast::Visitor for NestLimiter<'p, 's, P> { Ast::ClassBracketed(_) | Ast::Repetition(_) | Ast::Group(_) + | Ast::LookAround(_) | Ast::Alternation(_) | Ast::Concat(_) => { self.decrement_depth(); @@ -3753,6 +3755,11 @@ bar ); } + #[test] + fn parse_lookbehinds() { + todo!() + } + #[test] fn parse_unsupported_capture_in_lookbehind() { assert_eq!( diff --git a/regex-syntax/src/ast/print.rs b/regex-syntax/src/ast/print.rs index 1ceb3c7fa..69be55064 100644 --- a/regex-syntax/src/ast/print.rs +++ b/regex-syntax/src/ast/print.rs @@ -80,6 +80,7 @@ impl Visitor for Writer { fn visit_pre(&mut 
self, ast: &Ast) -> fmt::Result { match *ast { Ast::Group(ref x) => self.fmt_group_pre(x), + Ast::LookAround(ref x) => self.fmt_lookaround_pre(x), Ast::ClassBracketed(ref x) => self.fmt_class_bracketed_pre(x), _ => Ok(()), } @@ -92,6 +93,7 @@ impl Visitor for Writer { Ast::Literal(ref x) => self.fmt_literal(x), Ast::Dot(_) => self.wtr.write_str("."), Ast::Assertion(ref x) => self.fmt_assertion(x), + Ast::LookAround(ref x) => self.fmt_lookaround_post(x), Ast::ClassPerl(ref x) => self.fmt_class_perl(x), Ast::ClassUnicode(ref x) => self.fmt_class_unicode(x), Ast::ClassBracketed(ref x) => self.fmt_class_bracketed_post(x), @@ -174,6 +176,18 @@ impl Writer { self.wtr.write_str(")") } + fn fmt_lookaround_pre(&mut self, ast: &ast::LookAround) -> fmt::Result { + use crate::ast::LookAroundKind::*; + match ast.kind { + PositiveLookBehind => self.wtr.write_str("(?<="), + NegativeLookBehind => self.wtr.write_str("(? fmt::Result { + self.wtr.write_str(")") + } + fn fmt_repetition(&mut self, ast: &ast::Repetition) -> fmt::Result { use crate::ast::RepetitionKind::*; match ast.op.kind { @@ -511,6 +525,12 @@ mod tests { roundtrip("(a)"); } + #[test] + fn print_lookaround() { + roundtrip("(? { /// A stack frame allocated just before descending into a group's child /// node. Group(&'a ast::Group), + /// A stack frame allocated just before descending into a look-around's + /// child node. + LookAround(&'a ast::LookAround), /// The stack frame used while visiting every child node of a concatenation /// of expressions. Concat { @@ -270,6 +273,7 @@ impl<'a> HeapVisitor<'a> { } Ast::Repetition(ref x) => Some(Frame::Repetition(x)), Ast::Group(ref x) => Some(Frame::Group(x)), + Ast::LookAround(ref x) => Some(Frame::LookAround(x)), Ast::Concat(ref x) if x.asts.is_empty() => None, Ast::Concat(ref x) => { Some(Frame::Concat { head: &x.asts[0], tail: &x.asts[1..] 
}) @@ -289,6 +293,7 @@ impl<'a> HeapVisitor<'a> { match induct { Frame::Repetition(_) => None, Frame::Group(_) => None, + Frame::LookAround(_) => None, Frame::Concat { tail, .. } => { if tail.is_empty() { None @@ -444,6 +449,7 @@ impl<'a> Frame<'a> { match *self { Frame::Repetition(rep) => &rep.ast, Frame::Group(group) => &group.ast, + Frame::LookAround(look) => &look.ast, Frame::Concat { head, .. } => head, Frame::Alternation { head, .. } => head, } diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index e8e5a8812..a24446456 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -354,6 +354,9 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { .unwrap_or_else(|| self.flags()); self.push(HirFrame::Group { old_flags }); } + Ast::LookAround(ref x) => { + todo!("translation from AST to HIR"); + } Ast::Concat(_) => { self.push(HirFrame::Concat); } @@ -446,6 +449,9 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { self.trans().flags.set(old_flags); self.push(HirFrame::Expr(self.hir_capture(x, expr))); } + Ast::LookAround(_) => { + todo!("translation from AST to HIR"); + } Ast::Concat(_) => { let mut exprs = vec![]; while let Some(expr) = self.pop_concat_expr() { From b1c7e5d53888b24e7d93fc22b5c5e5fecc483be9 Mon Sep 17 00:00:00 2001 From: shilangyu Date: Sat, 8 Mar 2025 22:42:58 +0100 Subject: [PATCH 13/66] Disable failing tests --- regex-syntax/src/ast/parse.rs | 4 +++- regex-syntax/src/ast/print.rs | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs index 13975919d..6090485a7 100644 --- a/regex-syntax/src/ast/parse.rs +++ b/regex-syntax/src/ast/parse.rs @@ -3756,11 +3756,13 @@ bar } #[test] + #[ignore = "Missing parser support for lookaround"] fn parse_lookbehinds() { - todo!() + todo!("write tests for lookbehinds"); } #[test] + #[ignore = "Missing parser support for lookaround"] fn parse_unsupported_capture_in_lookbehind() { 
assert_eq!( parser(r"(?<=(?<=(a)))").parse().unwrap_err(), diff --git a/regex-syntax/src/ast/print.rs b/regex-syntax/src/ast/print.rs index 69be55064..112c0bda1 100644 --- a/regex-syntax/src/ast/print.rs +++ b/regex-syntax/src/ast/print.rs @@ -526,6 +526,7 @@ mod tests { } #[test] + #[ignore = "Missing parser support for lookaround"] fn print_lookaround() { roundtrip("(? Date: Sat, 8 Mar 2025 22:44:24 +0100 Subject: [PATCH 14/66] Fix UnsupportedCaptureInLookBehind typo --- regex-syntax/src/ast/mod.rs | 4 ++-- regex-syntax/src/ast/parse.rs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/regex-syntax/src/ast/mod.rs b/regex-syntax/src/ast/mod.rs index a2daef197..0eca1d4db 100644 --- a/regex-syntax/src/ast/mod.rs +++ b/regex-syntax/src/ast/mod.rs @@ -189,7 +189,7 @@ pub enum ErrorKind { UnsupportedLookAhead, /// When a capture group is used in a look-behind assertion, this error is /// returned. Look-behind assertions do not support capturing groups. - UsupportedCaptureInLookBehind, + UnsupportedCaptureInLookBehind, } #[cfg(feature = "std")] @@ -305,7 +305,7 @@ impl core::fmt::Display for ErrorKind { write!(f, "backreferences are not supported") } UnsupportedLookAhead => write!(f, "look-aheads are not supported"), - UsupportedCaptureInLookBehind => { + UnsupportedCaptureInLookBehind => { write!(f, "capture groups are not supported in look-behinds") } } diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs index 6090485a7..012185537 100644 --- a/regex-syntax/src/ast/parse.rs +++ b/regex-syntax/src/ast/parse.rs @@ -3768,14 +3768,14 @@ bar parser(r"(?<=(?<=(a)))").parse().unwrap_err(), TestError { span: span(8..10), - kind: ast::ErrorKind::UsupportedCaptureInLookBehind, + kind: ast::ErrorKind::UnsupportedCaptureInLookBehind, } ); assert_eq!( parser(r"(? 
Date: Sun, 9 Mar 2025 10:51:09 +0100 Subject: [PATCH 15/66] Add unclosed lookaround error --- regex-syntax/src/ast/mod.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/regex-syntax/src/ast/mod.rs b/regex-syntax/src/ast/mod.rs index 0eca1d4db..25f3b9280 100644 --- a/regex-syntax/src/ast/mod.rs +++ b/regex-syntax/src/ast/mod.rs @@ -144,6 +144,10 @@ pub enum ErrorKind { /// /// The span of this error corresponds to the unclosed parenthesis. GroupUnclosed, + /// An unclosed look-around, e.g., `(? write!(f, "invalid capture group character"), GroupNameUnexpectedEof => write!(f, "unclosed capture group name"), GroupUnclosed => write!(f, "unclosed group"), + LookAroundUnclosed => write!(f, "unclosed look-around"), GroupUnopened => write!(f, "unopened group"), NestLimitExceeded(limit) => write!( f, @@ -526,7 +531,7 @@ impl Ast { } /// Create a "look-around" AST item. - pub fn look_around(e: LookAround) -> Ast { + pub fn lookaround(e: LookAround) -> Ast { Ast::LookAround(Box::new(e)) } From 145ec420d6dd460f73d06caf7bb5887010adf7a2 Mon Sep 17 00:00:00 2001 From: shilangyu Date: Sun, 9 Mar 2025 10:53:32 +0100 Subject: [PATCH 16/66] Rename lookaround to look-around --- regex-syntax/src/hir/mod.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 1a2de4e47..83729fe27 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -1796,7 +1796,7 @@ impl Look { } } -/// Represents a general lookaround assertion. +/// Represents a general look-around assertion. /// /// Currently, only lookbehind assertions are supported. /// Furthermore, capture groups inside assertions are not supported. @@ -1810,7 +1810,7 @@ pub enum LookAround { impl LookAround { /// Returns a reference to the inner expression that must match for this - /// lookaround assertion to hold. + /// look-around assertion to hold. 
pub fn sub(&self) -> &Hir { match self { Self::PositiveLookBehind(sub) | Self::NegativeLookBehind(sub) => { @@ -1828,7 +1828,7 @@ impl LookAround { } } - /// Returns a new lookaround of the same kind, but with its + /// Returns a new look-around of the same kind, but with its /// sub-expression replaced with the one given. pub fn with(&self, sub: Hir) -> LookAround { match self { From 6af9719254ba96f53f82699a09a8bd5e368c9aef Mon Sep 17 00:00:00 2001 From: shilangyu Date: Sun, 9 Mar 2025 11:00:13 +0100 Subject: [PATCH 17/66] Support parsing of look-behinds --- regex-syntax/src/ast/parse.rs | 204 +++++++++++++++++++++++----------- regex-syntax/src/ast/print.rs | 3 +- 2 files changed, 141 insertions(+), 66 deletions(-) diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs index 012185537..026e26883 100644 --- a/regex-syntax/src/ast/parse.rs +++ b/regex-syntax/src/ast/parse.rs @@ -16,7 +16,7 @@ use alloc::{ }; use crate::{ - ast::{self, Ast, Position, Span}, + ast::{self, Ast, LookAroundKind, Position, Span}, either::Either, is_escapeable_character, is_meta_character, }; @@ -299,9 +299,9 @@ struct ParserI<'s, P> { pattern: &'s str, } -/// GroupState represents a single stack frame while parsing nested groups -/// and alternations. Each frame records the state up to an opening parenthesis -/// or a alternating bracket `|`. +/// GroupState represents a single stack frame while parsing nested groups, +/// look-arounds and alternations. Each frame records the state up to an opening +/// parenthesis or a alternating bracket `|`. #[derive(Clone, Debug)] enum GroupState { /// This state is pushed whenever an opening group is found. @@ -313,6 +313,13 @@ enum GroupState { /// Whether this group has the `x` flag enabled or not. ignore_whitespace: bool, }, + /// This state is pushed whenever an opening look-around is found. + LookAround { + /// The concatenation immediately preceding the opening look-around. 
+ concat: ast::Concat, + /// The look-around that has been opened. Its sub-AST is always empty. + lookaround: ast::LookAround, + }, /// This state is pushed whenever a new alternation branch is found. If /// an alternation branch is found and this state is at the top of the /// stack, then this state should be modified to include the new @@ -521,18 +528,15 @@ impl<'s, P: Borrow> ParserI<'s, P> { } } - /// Returns true if and only if the parser is positioned at a look-around + /// Returns true if and only if the parser is positioned at a look-ahead /// prefix. The conditions under which this returns true must always /// correspond to a regular expression that would otherwise be consider /// invalid. /// /// This should only be called immediately after parsing the opening of /// a group or a set of flags. - fn is_lookaround_prefix(&self) -> bool { - self.bump_if("?=") - || self.bump_if("?!") - || self.bump_if("?<=") - || self.bump_if("? bool { + self.bump_if("?=") || self.bump_if("?!") } /// Bump the parser, and if the `x` flag is enabled, bump through any @@ -686,9 +690,9 @@ impl<'s, P: Borrow> ParserI<'s, P> { })); } - /// Parse and push a group AST (and its parent concatenation) on to the - /// parser's internal stack. Return a fresh concatenation corresponding - /// to the group's sub-AST. + /// Parse and push a group or look-around AST (and its parent + /// concatenation) on to the parser's internal stack. Return a fresh + /// concatenation corresponding to the grouping's sub-AST. /// /// If a set of flags was found (with no group), then the concatenation /// is returned with that set of flags added. @@ -697,12 +701,12 @@ impl<'s, P: Borrow> ParserI<'s, P> { /// parenthesis. It advances the parser to the character at the start /// of the sub-expression (or adjoining expression). /// - /// If there was a problem parsing the start of the group, then an error - /// is returned. 
+ /// If there was a problem parsing the start of the grouping, then an + /// error is returned. #[inline(never)] - fn push_group(&self, mut concat: ast::Concat) -> Result { + fn push_grouping(&self, mut concat: ast::Concat) -> Result { assert_eq!(self.char(), '('); - match self.parse_group()? { + match self.parse_grouping()? { Either::Left(set) => { let ignore = set.flags.flag_state(ast::Flag::IgnoreWhitespace); if let Some(v) = ignore { @@ -712,7 +716,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { concat.asts.push(Ast::flags(set)); Ok(concat) } - Either::Right(group) => { + Either::Right(Either::Left(group)) => { let old_ignore_whitespace = self.ignore_whitespace(); let new_ignore_whitespace = group .flags() @@ -728,61 +732,105 @@ impl<'s, P: Borrow> ParserI<'s, P> { self.parser().ignore_whitespace.set(new_ignore_whitespace); Ok(ast::Concat { span: self.span(), asts: vec![] }) } + Either::Right(Either::Right(lookaround)) => { + self.parser() + .stack_group + .borrow_mut() + .push(GroupState::LookAround { concat, lookaround }); + Ok(ast::Concat { span: self.span(), asts: vec![] }) + } } } - /// Pop a group AST from the parser's internal stack and set the group's - /// AST to the given concatenation. Return the concatenation containing - /// the group. + /// Pop a group or look-around AST from the parser's internal stack and + /// set the grouping's AST to the given concatenation. Return the + /// concatenation containing the grouping. /// /// This assumes that the parser is currently positioned on the closing /// parenthesis and advances the parser to the character following the `)`. /// - /// If no such group could be popped, then an unopened group error is + /// If no such grouping could be popped, then an unopened group error is /// returned. 
#[inline(never)] - fn pop_group(&self, mut group_concat: ast::Concat) -> Result { + fn pop_grouping( + &self, + mut grouping_concat: ast::Concat, + ) -> Result { use self::GroupState::*; assert_eq!(self.char(), ')'); let mut stack = self.parser().stack_group.borrow_mut(); - let (mut prior_concat, mut group, ignore_whitespace, alt) = match stack - .pop() - { - Some(Group { concat, group, ignore_whitespace }) => { - (concat, group, ignore_whitespace, None) - } - Some(Alternation(alt)) => match stack.pop() { + let (mut prior_concat, mut grouping, ignore_whitespace, alt) = + match stack.pop() { Some(Group { concat, group, ignore_whitespace }) => { - (concat, group, ignore_whitespace, Some(alt)) + (concat, Either::Left(group), ignore_whitespace, None) } - None | Some(Alternation(_)) => { + Some(LookAround { concat, lookaround }) => ( + concat, + Either::Right(lookaround), + self.parser().ignore_whitespace.get(), + None, + ), + Some(Alternation(alt)) => match stack.pop() { + Some(Group { concat, group, ignore_whitespace }) => ( + concat, + Either::Left(group), + ignore_whitespace, + Some(alt), + ), + Some(LookAround { concat, lookaround }) => ( + concat, + Either::Right(lookaround), + self.parser().ignore_whitespace.get(), + Some(alt), + ), + None | Some(Alternation(_)) => { + return Err(self.error( + self.span_char(), + ast::ErrorKind::GroupUnopened, + )); + } + }, + None => { return Err(self.error( self.span_char(), ast::ErrorKind::GroupUnopened, )); } - }, - None => { - return Err(self - .error(self.span_char(), ast::ErrorKind::GroupUnopened)); - } - }; + }; self.parser().ignore_whitespace.set(ignore_whitespace); - group_concat.span.end = self.pos(); + grouping_concat.span.end = self.pos(); self.bump(); - group.span.end = self.pos(); + match &mut grouping { + Either::Left(group) => group.span.end = self.pos(), + Either::Right(lookaround) => lookaround.span.end = self.pos(), + } match alt { Some(mut alt) => { - alt.span.end = group_concat.span.end; - 
alt.asts.push(group_concat.into_ast()); - group.ast = Box::new(alt.into_ast()); - } - None => { - group.ast = Box::new(group_concat.into_ast()); + alt.span.end = grouping_concat.span.end; + alt.asts.push(grouping_concat.into_ast()); + match &mut grouping { + Either::Left(group) => { + group.ast = Box::new(alt.into_ast()) + } + Either::Right(lookaround) => { + lookaround.ast = Box::new(alt.into_ast()) + } + } } + None => match &mut grouping { + Either::Left(group) => { + group.ast = Box::new(grouping_concat.into_ast()) + } + Either::Right(lookaround) => { + lookaround.ast = Box::new(grouping_concat.into_ast()) + } + }, } - prior_concat.asts.push(Ast::group(group)); + prior_concat.asts.push(match grouping { + Either::Left(group) => Ast::group(group), + Either::Right(lookaround) => Ast::lookaround(lookaround), + }); Ok(prior_concat) } @@ -793,7 +841,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { /// /// This assumes that the parser has advanced to the end. #[inline(never)] - fn pop_group_end(&self, mut concat: ast::Concat) -> Result { + fn pop_grouping_end(&self, mut concat: ast::Concat) -> Result { concat.span.end = self.pos(); let mut stack = self.parser().stack_group.borrow_mut(); let ast = match stack.pop() { @@ -808,6 +856,12 @@ impl<'s, P: Borrow> ParserI<'s, P> { self.error(group.span, ast::ErrorKind::GroupUnclosed) ); } + Some(GroupState::LookAround { lookaround, .. }) => { + return Err(self.error( + lookaround.span, + ast::ErrorKind::LookAroundUnclosed, + )); + } }; // If we try to pop again, there should be nothing. match stack.pop() { @@ -824,6 +878,8 @@ impl<'s, P: Borrow> ParserI<'s, P> { Some(GroupState::Group { group, .. }) => { Err(self.error(group.span, ast::ErrorKind::GroupUnclosed)) } + Some(GroupState::LookAround { lookaround, .. 
}) => Err(self + .error(lookaround.span, ast::ErrorKind::LookAroundUnclosed)), } } @@ -989,8 +1045,8 @@ impl<'s, P: Borrow> ParserI<'s, P> { break; } match self.char() { - '(' => concat = self.push_group(concat)?, - ')' => concat = self.pop_group(concat)?, + '(' => concat = self.push_grouping(concat)?, + ')' => concat = self.pop_grouping(concat)?, '|' => concat = self.push_alternate(concat)?, '[' => { let class = self.parse_set_class()?; @@ -1020,7 +1076,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { _ => concat.asts.push(self.parse_primitive()?.into_ast()), } } - let ast = self.pop_group_end(concat)?; + let ast = self.pop_grouping_end(concat)?; NestLimiter::new(self).check(&ast)?; Ok(ast::WithComments { ast, @@ -1205,16 +1261,17 @@ impl<'s, P: Borrow> ParserI<'s, P> { Ok(concat) } - /// Parse a group (which contains a sub-expression) or a set of flags. + /// Parse a group or look-around (which contain a sub-expression), or a + /// set of flags. /// - /// If a group was found, then it is returned with an empty AST. If a set - /// of flags is found, then that set is returned. + /// If a group or look-around was found, then it is returned with an + /// empty AST. If a set of flags is found, then that set is returned. /// /// The parser should be positioned at the opening parenthesis. /// /// This advances the parser to the character before the start of the - /// sub-expression (in the case of a group) or to the closing parenthesis - /// immediately following the set of flags. + /// sub-expression (in the case of a group or look-around) or to the + /// closing parenthesis immediately following the set of flags. /// /// # Errors /// @@ -1223,19 +1280,38 @@ impl<'s, P: Borrow> ParserI<'s, P> { /// /// If a capture name is given and it is incorrectly specified, then a /// corresponding error is returned. + /// + /// If a look-ahead is given (which is currently unsupported), then an + /// error is returned. 
#[inline(never)] - fn parse_group(&self) -> Result> { + fn parse_grouping( + &self, + ) -> Result>> + { assert_eq!(self.char(), '('); let open_span = self.span_char(); self.bump(); self.bump_space(); - if self.is_lookaround_prefix() { + if self.is_lookahead_prefix() { return Err(self.error( Span::new(open_span.start, self.span().end), ast::ErrorKind::UnsupportedLookAhead, )); } let inner_span = self.span(); + + let mut lookaround_kind = LookAroundKind::PositiveLookBehind; + if self.bump_if("?<=") || { + lookaround_kind = LookAroundKind::NegativeLookBehind; + self.bump_if("?> ParserI<'s, P> { } { let capture_index = self.next_capture_index(open_span)?; let name = self.parse_capture_name(capture_index)?; - Ok(Either::Right(ast::Group { + Ok(Either::Right(Either::Left(ast::Group { span: open_span, kind: ast::GroupKind::CaptureName { starts_with_p, name }, ast: Box::new(Ast::empty(self.span())), - })) + }))) } else if self.bump_if("?") { if self.is_eof() { return Err( @@ -1272,19 +1348,19 @@ impl<'s, P: Borrow> ParserI<'s, P> { })) } else { assert_eq!(char_end, ':'); - Ok(Either::Right(ast::Group { + Ok(Either::Right(Either::Left(ast::Group { span: open_span, kind: ast::GroupKind::NonCapturing(flags), ast: Box::new(Ast::empty(self.span())), - })) + }))) } } else { let capture_index = self.next_capture_index(open_span)?; - Ok(Either::Right(ast::Group { + Ok(Either::Right(Either::Left(ast::Group { span: open_span, kind: ast::GroupKind::CaptureIndex(capture_index), ast: Box::new(Ast::empty(self.span())), - })) + }))) } } diff --git a/regex-syntax/src/ast/print.rs b/regex-syntax/src/ast/print.rs index 112c0bda1..0e87599d2 100644 --- a/regex-syntax/src/ast/print.rs +++ b/regex-syntax/src/ast/print.rs @@ -526,9 +526,8 @@ mod tests { } #[test] - #[ignore = "Missing parser support for lookaround"] fn print_lookaround() { - roundtrip("(? 
Date: Sun, 9 Mar 2025 11:40:51 +0100 Subject: [PATCH 18/66] Reject lookbehinds with capture groups --- regex-syntax/src/ast/parse.rs | 76 +++++++++++++++++++++++++++++++++-- 1 file changed, 72 insertions(+), 4 deletions(-) diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs index 026e26883..e8c65eadb 100644 --- a/regex-syntax/src/ast/parse.rs +++ b/regex-syntax/src/ast/parse.rs @@ -751,6 +751,8 @@ impl<'s, P: Borrow> ParserI<'s, P> { /// /// If no such grouping could be popped, then an unopened group error is /// returned. + /// + /// If a look-behind contains a capture group, then an error is returned. #[inline(never)] fn pop_grouping( &self, @@ -829,7 +831,16 @@ impl<'s, P: Borrow> ParserI<'s, P> { } prior_concat.asts.push(match grouping { Either::Left(group) => Ast::group(group), - Either::Right(lookaround) => Ast::lookaround(lookaround), + Either::Right(lookaround) => { + if let Some(span) = first_capture_group_span(&lookaround.ast) { + return Err(self.error( + span, + ast::ErrorKind::UnsupportedCaptureInLookBehind, + )); + } + + Ast::lookaround(lookaround) + } }); Ok(prior_concat) } @@ -2511,6 +2522,29 @@ fn specialize_err( } } +/// Returns the span of the first capture group found. Returns None in case there are no capture groups. 
+fn first_capture_group_span(ast: &Ast) -> Option { + struct CaptureGroupSearcher; + + impl ast::Visitor for CaptureGroupSearcher { + type Output = (); + type Err = Span; + + fn finish(self) -> core::result::Result { + Ok(()) + } + + fn visit_pre(&mut self, ast: &Ast) -> std::result::Result<(), Span> { + match ast { + Ast::Group(group) => Err(group.span), + _ => Ok(()), + } + } + } + + ast::visit(ast, CaptureGroupSearcher).err() +} + #[cfg(test)] mod tests { use core::ops::Range; @@ -3838,19 +3872,53 @@ bar } #[test] - #[ignore = "Missing parser support for lookaround"] fn parse_unsupported_capture_in_lookbehind() { assert_eq!( parser(r"(?<=(?<=(a)))").parse().unwrap_err(), TestError { - span: span(8..10), + span: span(8..11), kind: ast::ErrorKind::UnsupportedCaptureInLookBehind, } ); assert_eq!( parser(r"(?a))").parse().unwrap_err(), + TestError { + span: span(4..14), + kind: ast::ErrorKind::UnsupportedCaptureInLookBehind, + } + ); + assert_eq!( + parser(r"(?a)|b)").parse().unwrap_err(), + TestError { + span: span(6..16), kind: ast::ErrorKind::UnsupportedCaptureInLookBehind, } ); From 9fedb23f399d632dd1ea3fb90ddd24a5320a5d96 Mon Sep 17 00:00:00 2001 From: shilangyu Date: Sun, 9 Mar 2025 11:56:22 +0100 Subject: [PATCH 19/66] Add tests for parsing lookbehinds --- regex-syntax/src/ast/parse.rs | 83 ++++++++++++++++++++++++++++++++++- 1 file changed, 81 insertions(+), 2 deletions(-) diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs index e8c65eadb..0713ce948 100644 --- a/regex-syntax/src/ast/parse.rs +++ b/regex-syntax/src/ast/parse.rs @@ -3866,9 +3866,88 @@ bar } #[test] - #[ignore = "Missing parser support for lookaround"] fn parse_lookbehinds() { - todo!("write tests for lookbehinds"); + assert_eq!( + parser(r"(?<=)").parse(), + Ok(Ast::lookaround(ast::LookAround { + span: span(0..5), + ast: Box::new(Ast::empty(span(4..4))), + kind: ast::LookAroundKind::PositiveLookBehind + })) + ); + assert_eq!( + parser(r"(?<=a)").parse(), + 
Ok(Ast::lookaround(ast::LookAround { + span: span(0..6), + ast: Box::new(lit('a', 4)), + kind: ast::LookAroundKind::PositiveLookBehind + })) + ); + assert_eq!( + parser(r"(? Date: Sun, 9 Mar 2025 12:35:32 +0100 Subject: [PATCH 20/66] Add AST -> HIR translation for lookarounds --- regex-syntax/src/hir/literal.rs | 1 - regex-syntax/src/hir/print.rs | 26 ++++++++++++++++++++------ regex-syntax/src/hir/translate.rs | 15 ++++++++++----- 3 files changed, 30 insertions(+), 12 deletions(-) diff --git a/regex-syntax/src/hir/literal.rs b/regex-syntax/src/hir/literal.rs index fe5bc8c88..c08c2b007 100644 --- a/regex-syntax/src/hir/literal.rs +++ b/regex-syntax/src/hir/literal.rs @@ -2456,7 +2456,6 @@ mod tests { } #[test] - #[ignore = "Missing parser support for lookaround"] fn lookaround() { assert_eq!(exact(["ab"]), e(r"a(?<=qwa)b")); assert_eq!(exact(["ab"]), e(r"a(? Visitor for TranslatorI<'t, 'p> { .unwrap_or_else(|| self.flags()); self.push(HirFrame::Group { old_flags }); } - Ast::LookAround(ref x) => { - todo!("translation from AST to HIR"); - } Ast::Concat(_) => { self.push(HirFrame::Concat); } @@ -449,8 +446,16 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { self.trans().flags.set(old_flags); self.push(HirFrame::Expr(self.hir_capture(x, expr))); } - Ast::LookAround(_) => { - todo!("translation from AST to HIR"); + Ast::LookAround(ref x) => { + let expr = Box::new(self.pop().unwrap().unwrap_expr()); + self.push(HirFrame::Expr(Hir::lookaround(match x.kind { + ast::LookAroundKind::PositiveLookBehind => { + hir::LookAround::PositiveLookBehind(expr) + } + ast::LookAroundKind::NegativeLookBehind => { + hir::LookAround::NegativeLookBehind(expr) + } + }))); } Ast::Concat(_) => { let mut exprs = vec![]; From ab3419403689e2615fe6cda0ef172940793830a4 Mon Sep 17 00:00:00 2001 From: shilangyu Date: Sun, 9 Mar 2025 12:38:26 +0100 Subject: [PATCH 21/66] Fix typo --- regex-syntax/src/hir/print.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/regex-syntax/src/hir/print.rs b/regex-syntax/src/hir/print.rs index 90587a605..e32e222c6 100644 --- a/regex-syntax/src/hir/print.rs +++ b/regex-syntax/src/hir/print.rs @@ -486,8 +486,8 @@ mod tests { #[test] fn print_look_around() { - // we do not want to do a roundtrip: printed lookarounds are not - // can contain capture groups which are unsupported by the parser. + // we do not want to do a roundtrip: printed lookarounds can + // contain capture groups which are unsupported by the parser. // TODO(shilangyu): is this a problem that some regexes are not // roundtrippable? fn test(given: &str, expected: &str) { From a3a0f051f6db78a3bce7b1cc68de300e9b774adc Mon Sep 17 00:00:00 2001 From: shilangyu Date: Tue, 11 Mar 2025 21:46:06 +0100 Subject: [PATCH 22/66] Allow for non-capturing groups in lookbehinds --- regex-syntax/src/ast/parse.rs | 38 ++++++++++++++++++++++++++++++++++- regex-syntax/src/hir/print.rs | 25 +++++------------------ 2 files changed, 42 insertions(+), 21 deletions(-) diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs index 0713ce948..171f01683 100644 --- a/regex-syntax/src/ast/parse.rs +++ b/regex-syntax/src/ast/parse.rs @@ -2536,7 +2536,14 @@ fn first_capture_group_span(ast: &Ast) -> Option { fn visit_pre(&mut self, ast: &Ast) -> std::result::Result<(), Span> { match ast { - Ast::Group(group) => Err(group.span), + Ast::Group(group) + if !matches!( + group.kind, + ast::GroupKind::NonCapturing(_) + ) => + { + Err(group.span) + } _ => Ok(()), } } @@ -3883,6 +3890,21 @@ bar kind: ast::LookAroundKind::PositiveLookBehind })) ); + assert_eq!( + parser(r"(?<=(?:a))").parse(), + Ok(Ast::lookaround(ast::LookAround { + span: span(0..10), + ast: Box::new(Ast::group(ast::Group { + span: span(4..9), + kind: ast::GroupKind::NonCapturing(ast::Flags { + span: span(6..6), + items: vec![], + }), + ast: Box::new(lit('a', 7)), + })), + kind: ast::LookAroundKind::PositiveLookBehind + })) + ); assert_eq!( parser(r"(? 
Date: Tue, 11 Mar 2025 23:18:07 +0100 Subject: [PATCH 23/66] Fix missing LookAround in regex-cli --- regex-cli/cmd/generate/fowler.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/regex-cli/cmd/generate/fowler.rs b/regex-cli/cmd/generate/fowler.rs index 404c47721..052d59ef8 100644 --- a/regex-cli/cmd/generate/fowler.rs +++ b/regex-cli/cmd/generate/fowler.rs @@ -412,6 +412,9 @@ fn count_capturing_groups_ast(ast: ®ex_syntax::ast::Ast) -> usize { let this = if group.is_capturing() { 1 } else { 0 }; this + count_capturing_groups_ast(&*group.ast) } + Ast::LookAround(ref lookaround) => { + count_capturing_groups_ast(&lookaround.ast) + } Ast::Alternation(ref alt) => { alt.asts.iter().map(count_capturing_groups_ast).sum() } From a133b6fe15d6f7be6c6e7d484af0542579c7f003 Mon Sep 17 00:00:00 2001 From: shilangyu Date: Tue, 11 Mar 2025 23:27:33 +0100 Subject: [PATCH 24/66] Detect capture groups in lookarounds for cheaper --- regex-syntax/src/ast/parse.rs | 78 +++++++++++++++++++---------------- 1 file changed, 42 insertions(+), 36 deletions(-) diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs index 171f01683..138be2905 100644 --- a/regex-syntax/src/ast/parse.rs +++ b/regex-syntax/src/ast/parse.rs @@ -159,6 +159,7 @@ impl ParserBuilder { stack_class: RefCell::new(vec![]), capture_names: RefCell::new(vec![]), scratch: RefCell::new(String::new()), + lookaround_depth: Cell::new(0), } } @@ -280,6 +281,9 @@ pub struct Parser { /// A scratch buffer used in various places. Mostly this is used to /// accumulate relevant characters from parts of a pattern. scratch: RefCell, + /// Whether the parser is currently in a look-around. This is used to + /// detect capture groups within look-arounds, which are not supported. + lookaround_depth: Cell, } /// ParserI is the internal parser implementation. 
@@ -392,6 +396,7 @@ impl Parser { self.comments.borrow_mut().clear(); self.stack_group.borrow_mut().clear(); self.stack_class.borrow_mut().clear(); + self.lookaround_depth.set(0); } } @@ -477,6 +482,11 @@ impl<'s, P: Borrow> ParserI<'s, P> { self.parser().ignore_whitespace.get() } + /// Return whether the parser is currently in a look-around. + fn in_lookaround(&self) -> bool { + self.parser().lookaround_depth.get() != 0 + } + /// Return the character at the current position of the parser. /// /// This panics if the current position does not point to a valid char. @@ -737,6 +747,9 @@ impl<'s, P: Borrow> ParserI<'s, P> { .stack_group .borrow_mut() .push(GroupState::LookAround { concat, lookaround }); + self.parser() + .lookaround_depth + .set(self.parser().lookaround_depth.get() + 1); Ok(ast::Concat { span: self.span(), asts: vec![] }) } } @@ -770,7 +783,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { Some(LookAround { concat, lookaround }) => ( concat, Either::Right(lookaround), - self.parser().ignore_whitespace.get(), + self.ignore_whitespace(), None, ), Some(Alternation(alt)) => match stack.pop() { @@ -783,7 +796,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { Some(LookAround { concat, lookaround }) => ( concat, Either::Right(lookaround), - self.parser().ignore_whitespace.get(), + self.ignore_whitespace(), Some(alt), ), None | Some(Alternation(_)) => { @@ -830,15 +843,20 @@ impl<'s, P: Borrow> ParserI<'s, P> { }, } prior_concat.asts.push(match grouping { - Either::Left(group) => Ast::group(group), - Either::Right(lookaround) => { - if let Some(span) = first_capture_group_span(&lookaround.ast) { + Either::Left(group) => { + if group.is_capturing() && self.in_lookaround() { return Err(self.error( - span, + group.span, ast::ErrorKind::UnsupportedCaptureInLookBehind, )); } + Ast::group(group) + } + Either::Right(lookaround) => { + self.parser() + .lookaround_depth + .set(self.parser().lookaround_depth.get() - 1); Ast::lookaround(lookaround) } }); @@ -2522,36 +2540,6 @@ fn 
specialize_err( } } -/// Returns the span of the first capture group found. Returns None in case there are no capture groups. -fn first_capture_group_span(ast: &Ast) -> Option { - struct CaptureGroupSearcher; - - impl ast::Visitor for CaptureGroupSearcher { - type Output = (); - type Err = Span; - - fn finish(self) -> core::result::Result { - Ok(()) - } - - fn visit_pre(&mut self, ast: &Ast) -> std::result::Result<(), Span> { - match ast { - Ast::Group(group) - if !matches!( - group.kind, - ast::GroupKind::NonCapturing(_) - ) => - { - Err(group.span) - } - _ => Ok(()), - } - } - } - - ast::visit(ast, CaptureGroupSearcher).err() -} - #[cfg(test)] mod tests { use core::ops::Range; @@ -3882,6 +3870,24 @@ bar kind: ast::LookAroundKind::PositiveLookBehind })) ); + assert_eq!( + parser(r"(?<=(?<=))(a)").parse(), + Ok(concat( + 0..13, + vec![ + Ast::lookaround(ast::LookAround { + span: span(0..10), + ast: Box::new(Ast::lookaround(ast::LookAround { + span: span(4..9), + ast: Box::new(Ast::empty(span(8..8))), + kind: ast::LookAroundKind::PositiveLookBehind + })), + kind: ast::LookAroundKind::PositiveLookBehind + }), + group(10..13, 1, lit('a', 11)), + ] + )) + ); assert_eq!( parser(r"(?<=a)").parse(), Ok(Ast::lookaround(ast::LookAround { From f87b5c099e0764cfc97022059cf2f3664433f858 Mon Sep 17 00:00:00 2001 From: shilangyu Date: Tue, 11 Mar 2025 23:35:27 +0100 Subject: [PATCH 25/66] Remove accidental import --- regex-syntax/src/ast/parse.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs index 138be2905..5883a0dd4 100644 --- a/regex-syntax/src/ast/parse.rs +++ b/regex-syntax/src/ast/parse.rs @@ -16,7 +16,7 @@ use alloc::{ }; use crate::{ - ast::{self, Ast, LookAroundKind, Position, Span}, + ast::{self, Ast, Position, Span}, either::Either, is_escapeable_character, is_meta_character, }; @@ -1329,9 +1329,9 @@ impl<'s, P: Borrow> ParserI<'s, P> { } let inner_span = self.span(); - let mut 
lookaround_kind = LookAroundKind::PositiveLookBehind; + let mut lookaround_kind = ast::LookAroundKind::PositiveLookBehind; if self.bump_if("?<=") || { - lookaround_kind = LookAroundKind::NegativeLookBehind; + lookaround_kind = ast::LookAroundKind::NegativeLookBehind; self.bump_if("? Date: Thu, 6 Mar 2025 09:21:38 +0100 Subject: [PATCH 26/66] Add new instructions to NFA We require two vm instructions 'CheckLookaround' and 'WriteLookaround' to be able to track the state of lookaround expressions at the current position in the haystack. Both instructions access a new 'lookaround' vector of booleans, which contains one entry per lookaround expression in the regex. --- regex-automata/src/dfa/onepass.rs | 4 + regex-automata/src/nfa/thompson/backtrack.rs | 4 + regex-automata/src/nfa/thompson/nfa.rs | 66 ++++++++++- regex-automata/src/nfa/thompson/pikevm.rs | 113 ++++++++++++++++--- regex-automata/src/util/determinize/mod.rs | 12 ++ 5 files changed, 179 insertions(+), 20 deletions(-) diff --git a/regex-automata/src/dfa/onepass.rs b/regex-automata/src/dfa/onepass.rs index 01e45309c..022305a5a 100644 --- a/regex-automata/src/dfa/onepass.rs +++ b/regex-automata/src/dfa/onepass.rs @@ -638,6 +638,10 @@ impl<'a> InternalBuilder<'a> { self.stack_push(nfa_id, Epsilons::empty())?; while let Some((id, epsilons)) = self.stack.pop() { match *self.nfa.state(id) { + thompson::State::WriteLookaround { .. } + | thompson::State::CheckLookaround { .. } => { + todo!("check how to handle") + } thompson::State::ByteRange { ref trans } => { self.compile_transition(dfa_id, trans, epsilons)?; } diff --git a/regex-automata/src/nfa/thompson/backtrack.rs b/regex-automata/src/nfa/thompson/backtrack.rs index df99e456d..b63a47fd5 100644 --- a/regex-automata/src/nfa/thompson/backtrack.rs +++ b/regex-automata/src/nfa/thompson/backtrack.rs @@ -1519,6 +1519,10 @@ impl BoundedBacktracker { } sid = next; } + State::WriteLookaround { .. } + | State::CheckLookaround { .. 
} => { + todo!("check how to handle") + } State::Union { ref alternates } => { sid = match alternates.get(0) { None => return None, diff --git a/regex-automata/src/nfa/thompson/nfa.rs b/regex-automata/src/nfa/thompson/nfa.rs index 1f57f8ebd..f445ac8e4 100644 --- a/regex-automata/src/nfa/thompson/nfa.rs +++ b/regex-automata/src/nfa/thompson/nfa.rs @@ -1100,6 +1100,12 @@ impl NFA { self.0.look_set_prefix_any } + /// Returns how many lookaround sub-expressions this nfa contains + #[inline] + pub fn look_count(&self) -> usize { + self.0.look_count + } + // FIXME: The `look_set_prefix_all` computation was not correct, and it // seemed a little tricky to fix it. Since I wasn't actually using it for // anything, I just decided to remove it in the run up to the regex 1.9 @@ -1260,6 +1266,7 @@ pub(super) struct Inner { /// zero-length prefix for any of the patterns in this NFA. look_set_prefix_all: LookSet, */ + look_count: usize, /// Heap memory used indirectly by NFA states and other things (like the /// various capturing group representations above). Since each state /// might use a different amount of heap, we need to keep track of this @@ -1288,7 +1295,11 @@ impl Inner { match self.states[sid] { State::ByteRange { .. } | State::Dense { .. } - | State::Fail => continue, + | State::Fail + | State::WriteLookaround { .. } => continue, + State::CheckLookaround { next, .. } => { + stack.push(next); + } State::Sparse(_) => { // This snippet below will rewrite this sparse state // as a dense state. By doing it here, we apply this @@ -1371,6 +1382,10 @@ impl Inner { State::Capture { .. } => { self.has_capture = true; } + State::CheckLookaround { look_idx, .. } + | State::WriteLookaround { look_idx } => { + self.look_count = self.look_count.max(look_idx); + } State::Union { .. } | State::BinaryUnion { .. } | State::Fail @@ -1545,6 +1560,25 @@ pub enum State { /// satisfied. 
next: StateID, }, + /// This is like a match state but for a lookaround expression + /// executing this state will write a `true` into the lookaround oracle at + /// index `look_idx` + WriteLookaround { + /// The index of the lookaround expression that matches + look_idx: usize, + }, + /// This indicates that we need to check whether lookaround expression with + /// index `look_idx` holds at the current position in the haystack + /// If `positive` is false, then the lookaround expression is negative and + /// hence must NOT hold. + CheckLookaround { + /// The index of the lookaround expression that must be satisfied + look_idx: usize, + /// Whether this is a positive lookaround expression + positive: bool, + /// The next state to transition if the lookaround assertion is satisfied + next: StateID, + }, /// An alternation such that there exists an epsilon transition to all /// states in `alternates`, where matches found via earlier transitions /// are preferred over later transitions. @@ -1658,11 +1692,13 @@ impl State { | State::Sparse { .. } | State::Dense { .. } | State::Fail - | State::Match { .. } => false, + | State::Match { .. } + | State::WriteLookaround { .. } => false, State::Look { .. } | State::Union { .. } | State::BinaryUnion { .. } - | State::Capture { .. } => true, + | State::Capture { .. } + | State::CheckLookaround { .. } => true, } } @@ -1674,7 +1710,9 @@ impl State { | State::BinaryUnion { .. } | State::Capture { .. } | State::Match { .. } - | State::Fail => 0, + | State::Fail + | State::WriteLookaround { .. } + | State::CheckLookaround { .. } => 0, State::Sparse(SparseTransitions { ref transitions }) => { transitions.len() * mem::size_of::() } @@ -1707,6 +1745,9 @@ impl State { } } State::Look { ref mut next, .. } => *next = remap[*next], + State::CheckLookaround { ref mut next, .. 
} => { + *next = remap[*next] + } State::Union { ref mut alternates } => { for alt in alternates.iter_mut() { *alt = remap[*alt]; @@ -1717,8 +1758,9 @@ impl State { *alt2 = remap[*alt2]; } State::Capture { ref mut next, .. } => *next = remap[*next], - State::Fail => {} - State::Match { .. } => {} + State::Fail + | State::Match { .. } + | State::WriteLookaround { .. } => {} } } } @@ -1748,6 +1790,18 @@ impl fmt::Debug for State { State::Look { ref look, next } => { write!(f, "{:?} => {:?}", look, next.as_usize()) } + State::WriteLookaround { look_idx } => { + write!(f, "Write Lookaround: {}", look_idx) + } + State::CheckLookaround { look_idx, positive, next } => { + write!( + f, + "Check Lookaround {} is {} => {}", + look_idx, + positive, + next.as_usize() + ) + } State::Union { ref alternates } => { let alts = alternates .iter() diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index 0128c151a..d061023d5 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ b/regex-automata/src/nfa/thompson/pikevm.rs @@ -1216,6 +1216,10 @@ impl PikeVM { } impl PikeVM { + fn look_count(&self) -> usize { + self.nfa.look_count() + } + /// The implementation of standard leftmost search. /// /// Capturing group spans are written to `slots`, but only if requested. @@ -1254,7 +1258,12 @@ impl PikeVM { let pre = if anchored { None } else { self.get_config().get_prefilter() }; - let Cache { ref mut stack, ref mut curr, ref mut next } = cache; + let Cache { + ref mut stack, + ref mut curr, + ref mut next, + ref mut lookaround, + } = cache; let mut hm = None; // Yes, our search doesn't end at input.end(), but includes it. This // is necessary because matches are delayed by one byte, just like @@ -1361,9 +1370,12 @@ impl PikeVM { // transitions, and thus must be able to write offsets to the // slots given which are later copied to slot values in 'curr'. 
let slots = next.slot_table.all_absent(); - self.epsilon_closure(stack, slots, curr, input, at, start_id); + self.epsilon_closure( + stack, slots, curr, lookaround, input, at, start_id, + ); } - if let Some(pid) = self.nexts(stack, curr, next, input, at, slots) + if let Some(pid) = + self.nexts(stack, curr, next, lookaround, input, at, slots) { hm = Some(HalfMatch::new(pid, at)); } @@ -1425,7 +1437,12 @@ impl PikeVM { Some(config) => config, }; - let Cache { ref mut stack, ref mut curr, ref mut next } = cache; + let Cache { + ref mut stack, + ref mut curr, + ref mut next, + ref mut lookaround, + } = cache; for at in input.start()..=input.end() { let any_matches = !patset.is_empty(); if curr.set.is_empty() { @@ -1438,9 +1455,13 @@ impl PikeVM { } if !any_matches || allmatches { let slots = &mut []; - self.epsilon_closure(stack, slots, curr, input, at, start_id); + self.epsilon_closure( + stack, slots, curr, lookaround, input, at, start_id, + ); } - self.nexts_overlapping(stack, curr, next, input, at, patset); + self.nexts_overlapping( + stack, curr, next, lookaround, input, at, patset, + ); // If we found a match and filled our set, then there is no more // additional info that we can provide. Thus, we can quit. 
We also // quit if the caller asked us to stop at the earliest point that @@ -1469,6 +1490,7 @@ impl PikeVM { stack: &mut Vec, curr: &mut ActiveStates, next: &mut ActiveStates, + lookarounds: &mut Vec, input: &Input<'_>, at: usize, slots: &mut [Option], @@ -1477,7 +1499,15 @@ impl PikeVM { let mut pid = None; let ActiveStates { ref set, ref mut slot_table } = *curr; for sid in set.iter() { - pid = match self.next(stack, slot_table, next, input, at, sid) { + pid = match self.next( + stack, + slot_table, + next, + lookarounds, + input, + at, + sid, + ) { None => continue, Some(pid) => Some(pid), }; @@ -1497,6 +1527,7 @@ impl PikeVM { stack: &mut Vec, curr: &mut ActiveStates, next: &mut ActiveStates, + lookarounds: &mut Vec, input: &Input<'_>, at: usize, patset: &mut PatternSet, @@ -1505,8 +1536,15 @@ impl PikeVM { let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); let ActiveStates { ref set, ref mut slot_table } = *curr; for sid in set.iter() { - let pid = match self.next(stack, slot_table, next, input, at, sid) - { + let pid = match self.next( + stack, + slot_table, + next, + lookarounds, + input, + at, + sid, + ) { None => continue, Some(pid) => pid, }; @@ -1543,6 +1581,7 @@ impl PikeVM { stack: &mut Vec, curr_slot_table: &mut SlotTable, next: &mut ActiveStates, + lookarounds: &mut Vec, input: &Input<'_>, at: usize, sid: StateID, @@ -1553,7 +1592,9 @@ impl PikeVM { | State::Look { .. } | State::Union { .. } | State::BinaryUnion { .. } - | State::Capture { .. } => None, + | State::Capture { .. } + | State::WriteLookaround { .. } + | State::CheckLookaround { .. } => None, State::ByteRange { ref trans } => { if trans.matches(input.haystack(), at) { let slots = curr_slot_table.for_state(sid); @@ -1561,7 +1602,13 @@ impl PikeVM { // adding 1 will never wrap. 
let at = at.wrapping_add(1); self.epsilon_closure( - stack, slots, next, input, at, trans.next, + stack, + slots, + next, + lookarounds, + input, + at, + trans.next, ); } None @@ -1573,7 +1620,13 @@ impl PikeVM { // adding 1 will never wrap. let at = at.wrapping_add(1); self.epsilon_closure( - stack, slots, next, input, at, next_sid, + stack, + slots, + next, + lookarounds, + input, + at, + next_sid, ); } None @@ -1585,7 +1638,13 @@ impl PikeVM { // adding 1 will never wrap. let at = at.wrapping_add(1); self.epsilon_closure( - stack, slots, next, input, at, next_sid, + stack, + slots, + next, + lookarounds, + input, + at, + next_sid, ); } None @@ -1613,6 +1672,7 @@ impl PikeVM { stack: &mut Vec, curr_slots: &mut [Option], next: &mut ActiveStates, + lookarounds: &mut Vec, input: &Input<'_>, at: usize, sid: StateID, @@ -1629,7 +1689,13 @@ impl PikeVM { } FollowEpsilon::Explore(sid) => { self.epsilon_closure_explore( - stack, curr_slots, next, input, at, sid, + stack, + curr_slots, + next, + lookarounds, + input, + at, + sid, ); } } @@ -1666,6 +1732,7 @@ impl PikeVM { stack: &mut Vec, curr_slots: &mut [Option], next: &mut ActiveStates, + lookarounds: &mut Vec, input: &Input<'_>, at: usize, mut sid: StateID, @@ -1705,6 +1772,16 @@ impl PikeVM { } sid = next; } + State::WriteLookaround { look_idx } => { + lookarounds[look_idx] = true; + return; + } + State::CheckLookaround { look_idx, positive, next } => { + if lookarounds[look_idx] != positive { + return; + } + sid = next; + } State::Union { ref alternates } => { sid = match alternates.get(0) { None => return, @@ -1886,6 +1963,9 @@ pub struct Cache { /// The next set of states we're building that will be explored for the /// next byte in the haystack. 
next: ActiveStates, + /// This answers the question: "Does lookaround assertion x hold at the + /// current position in the haystack" + lookaround: Vec, } impl Cache { @@ -1902,6 +1982,11 @@ impl Cache { stack: vec![], curr: ActiveStates::new(re), next: ActiveStates::new(re), + lookaround: { + let mut res = Vec::new(); + res.resize(re.look_count(), false); + res + }, } } diff --git a/regex-automata/src/util/determinize/mod.rs b/regex-automata/src/util/determinize/mod.rs index ba32991d0..9778bf1af 100644 --- a/regex-automata/src/util/determinize/mod.rs +++ b/regex-automata/src/util/determinize/mod.rs @@ -251,6 +251,10 @@ pub(crate) fn next( | thompson::State::Fail | thompson::State::Look { .. } | thompson::State::Capture { .. } => {} + thompson::State::CheckLookaround { .. } + | thompson::State::WriteLookaround { .. } => { + todo!("check how to handle") + } thompson::State::Match { pattern_id } => { // Notice here that we are calling the NEW state a match // state if the OLD state we are transitioning from @@ -399,6 +403,10 @@ pub(crate) fn epsilon_closure( | thompson::State::Dense { .. } | thompson::State::Fail | thompson::State::Match { .. } => break, + thompson::State::WriteLookaround { .. } + | thompson::State::CheckLookaround { .. } => { + todo!("check how to handle") + } thompson::State::Look { look, next } => { if !look_have.contains(look) { break; @@ -465,6 +473,10 @@ pub(crate) fn add_nfa_states( builder.add_nfa_state_id(nfa_id); builder.set_look_need(|need| need.insert(look)); } + thompson::State::CheckLookaround { .. } + | thompson::State::WriteLookaround { .. } => { + todo!("check how to handle") + } thompson::State::Union { .. } | thompson::State::BinaryUnion { .. 
} => { // Pure epsilon transitions don't need to be tracked as part From 59f9d03d75cb0979bb01af3f9f77383b3b67643f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Robin=20H=C3=A4nni?= Date: Thu, 6 Mar 2025 16:38:13 +0100 Subject: [PATCH 27/66] Implement lookaround compilation These changes implement the compilation of lookaround assertions from HIR to NFA. Subexpressions of lookaround assertions are patched to a top level reverse union. This is necessary so that the NFA will explore the innermost subexpression first and thereby make sure that all subexpression results are available when they need to be checked. I.e. any `WriteLookaround` state must be visited before any `CheckLookaround` state with the same index. --- regex-automata/src/nfa/thompson/builder.rs | 81 +++++++++++++++++++-- regex-automata/src/nfa/thompson/compiler.rs | 63 +++++++++++++++- regex-cli/cmd/generate/fowler.rs | 1 + 3 files changed, 134 insertions(+), 11 deletions(-) diff --git a/regex-automata/src/nfa/thompson/builder.rs b/regex-automata/src/nfa/thompson/builder.rs index 6b69e8784..8c6eb0e85 100644 --- a/regex-automata/src/nfa/thompson/builder.rs +++ b/regex-automata/src/nfa/thompson/builder.rs @@ -41,7 +41,9 @@ enum State { }, /// A state that only transitions to another state if the current input /// byte is in a particular range of bytes. - ByteRange { trans: Transition }, + ByteRange { + trans: Transition, + }, /// A state with possibly many transitions, represented in a sparse /// fashion. Transitions must be ordered lexicographically by input range /// and be non-overlapping. As such, this may only be used when every @@ -55,10 +57,15 @@ enum State { /// that `Sparse` is used for via `Union`. But this creates a more bloated /// NFA with more epsilon transitions than is necessary in the special case /// of character classes. - Sparse { transitions: Vec }, + Sparse { + transitions: Vec, + }, /// A conditional epsilon transition satisfied via some sort of /// look-around. 
- Look { look: Look, next: StateID }, + Look { + look: Look, + next: StateID, + }, /// An empty state that records the start of a capture location. This is an /// unconditional epsilon transition like `Empty`, except it can be used to /// record position information for a capture group when using the NFA for @@ -91,10 +98,20 @@ enum State { /// The next state that this state should transition to. next: StateID, }, + WriteLookaround { + lookaround_index: usize, + }, + CheckLookaround { + lookaround_index: usize, + positive: bool, + next: StateID, + }, /// An alternation such that there exists an epsilon transition to all /// states in `alternates`, where matches found via earlier transitions /// are preferred over later transitions. - Union { alternates: Vec }, + Union { + alternates: Vec, + }, /// An alternation such that there exists an epsilon transition to all /// states in `alternates`, where matches found via later transitions are /// preferred over earlier transitions. @@ -110,7 +127,9 @@ enum State { /// to be amortized constant time. But if we used a `Union`, we'd need to /// prepend the state, which takes O(n) time. There are other approaches we /// could use to solve this, but this seems simple enough. - UnionReverse { alternates: Vec }, + UnionReverse { + alternates: Vec, + }, /// A state that cannot be transitioned out of. This is useful for cases /// where you want to prevent matching from occurring. For example, if your /// regex parser permits empty character classes, then one could choose a @@ -124,7 +143,9 @@ enum State { /// /// `pattern_id` refers to the ID of the pattern itself, which corresponds /// to the pattern's index (starting at 0). - Match { pattern_id: PatternID }, + Match { + pattern_id: PatternID, + }, } impl State { @@ -154,7 +175,9 @@ impl State { | State::CaptureStart { .. } | State::CaptureEnd { .. } | State::Fail - | State::Match { .. } => 0, + | State::Match { .. } + | State::CheckLookaround { .. 
} + | State::WriteLookaround { .. } => 0, State::Sparse { ref transitions } => { transitions.len() * mem::size_of::() } @@ -470,6 +493,22 @@ impl Builder { State::Look { look, next } => { remap[sid] = nfa.add(nfa::State::Look { look, next }); } + State::WriteLookaround { lookaround_index } => { + remap[sid] = nfa.add(nfa::State::WriteLookaround { + look_idx: lookaround_index, + }); + } + State::CheckLookaround { + lookaround_index, + positive, + next, + } => { + remap[sid] = nfa.add(nfa::State::CheckLookaround { + look_idx: lookaround_index, + positive, + next, + }); + } State::CaptureStart { pattern_id, group_index, next } => { // We can't remove this empty state because of the side // effect of capturing an offset for this capture slot. @@ -693,6 +732,30 @@ impl Builder { self.add(State::Empty { next: StateID::ZERO }) } + /// Add a state which will record that the lookaround with the given index + /// is satisfied at the current position. + pub fn add_write_lookaround( + &mut self, + index: usize, + ) -> Result { + self.add(State::WriteLookaround { lookaround_index: index }) + } + + /// Add a state which will check whether the lookaround with the given + /// index is satisfied at the current position. + pub fn add_check_lookaround( + &mut self, + index: usize, + positive: bool, + next: StateID, + ) -> Result { + self.add(State::CheckLookaround { + lookaround_index: index, + positive, + next, + }) + } + /// Add a "union" NFA state. /// /// A "union" NFA state that contains zero or more unconditional epsilon @@ -1159,6 +1222,9 @@ impl Builder { State::Look { ref mut next, .. } => { *next = to; } + State::CheckLookaround { ref mut next, .. } => { + *next = to; + } State::Union { ref mut alternates } => { alternates.push(to); self.memory_states += mem::size_of::(); @@ -1173,6 +1239,7 @@ impl Builder { State::CaptureEnd { ref mut next, .. } => { *next = to; } + State::WriteLookaround { .. } => {} State::Fail => {} State::Match { .. 
} => {} } diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index 3964e5af6..96dc82a24 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -3,7 +3,7 @@ use core::{borrow::Borrow, cell::RefCell}; use alloc::{sync::Arc, vec, vec::Vec}; use regex_syntax::{ - hir::{self, Hir}, + hir::{self, Hir, LookAround}, utf8::{Utf8Range, Utf8Sequences}, ParserBuilder, }; @@ -711,6 +711,7 @@ pub struct Compiler { /// State used for caching common suffixes when compiling reverse UTF-8 /// automata (for Unicode character classes). utf8_suffix: RefCell, + lookaround_alt: RefCell>, } impl Compiler { @@ -723,6 +724,7 @@ impl Compiler { utf8_state: RefCell::new(Utf8State::new()), trie_state: RefCell::new(RangeTrie::new()), utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)), + lookaround_alt: RefCell::new(None), } } @@ -977,11 +979,20 @@ impl Compiler { let compiled = self.c_alt_iter(exprs.iter().map(|e| { let _ = self.start_pattern()?; + let lookaround_prefix = + self.c_at_least(&Hir::dot(hir::Dot::AnyByte), false, 0)?; + let lookaround_alt = self.add_union_reverse()?; + self.patch(lookaround_prefix.end, lookaround_alt)?; + let top_level_alt = self.add_union()?; + self.patch(top_level_alt, lookaround_prefix.start)?; + self.lookaround_alt.borrow_mut().replace(lookaround_alt); let one = self.c_cap(0, None, e.borrow())?; let match_state_id = self.add_match()?; self.patch(one.end, match_state_id)?; - let _ = self.finish_pattern(one.start)?; - Ok(ThompsonRef { start: one.start, end: match_state_id }) + self.patch(top_level_alt, one.start)?; + let _ = self.finish_pattern(top_level_alt)?; + self.lookaround_alt.borrow_mut().take(); + Ok(ThompsonRef { start: top_level_alt, end: match_state_id }) }))?; self.patch(unanchored_prefix.end, compiled.start)?; let nfa = self @@ -1003,7 +1014,7 @@ impl Compiler { Class(Class::Bytes(ref c)) => self.c_byte_class(c), Class(Class::Unicode(ref c)) => 
self.c_unicode_class(c), Look(ref look) => self.c_look(look), - LookAround(_) => todo!("implement lookaround NFA compilation"), + LookAround(ref lookaround) => self.c_lookaround(lookaround), Repetition(ref rep) => self.c_repetition(rep), Capture(ref c) => self.c_cap(c.index, c.name.as_deref(), &c.sub), Concat(ref es) => self.c_concat(es.iter().map(|e| self.c(e))), @@ -1011,6 +1022,31 @@ impl Compiler { } } + fn c_lookaround( + &self, + lookaround: &LookAround, + ) -> Result { + let sub = match lookaround { + LookAround::NegativeLookBehind(ref sub) + | LookAround::PositiveLookBehind(ref sub) => self.c(sub)?, + }; + let pos = match lookaround { + LookAround::NegativeLookBehind(_) => false, + LookAround::PositiveLookBehind(_) => true, + }; + let idx = todo!("get index"); + let check = self.add_check_lookaround(idx, pos)?; + let write = self.add_write_lookaround(idx)?; + self.patch(sub.end, write)?; + self.patch( + self.lookaround_alt + .borrow() + .expect("Cannot compile lookaround outside pattern"), + sub.start, + )?; + Ok(ThompsonRef { start: check, end: check }) + } + /// Compile a concatenation of the sub-expressions yielded by the given /// iterator. If the iterator yields no elements, then this compiles down /// to an "empty" state that always matches. 
@@ -1631,6 +1667,25 @@ impl Compiler { self.builder.borrow_mut().add_empty() } + fn add_write_lookaround( + &self, + index: usize, + ) -> Result { + self.builder.borrow_mut().add_write_lookaround(index) + } + + fn add_check_lookaround( + &self, + index: usize, + positive: bool, + ) -> Result { + self.builder.borrow_mut().add_check_lookaround( + index, + positive, + StateID::ZERO, + ) + } + fn add_range(&self, start: u8, end: u8) -> Result { self.builder.borrow_mut().add_range(Transition { start, diff --git a/regex-cli/cmd/generate/fowler.rs b/regex-cli/cmd/generate/fowler.rs index 052d59ef8..70db71fb0 100644 --- a/regex-cli/cmd/generate/fowler.rs +++ b/regex-cli/cmd/generate/fowler.rs @@ -421,5 +421,6 @@ fn count_capturing_groups_ast(ast: ®ex_syntax::ast::Ast) -> usize { Ast::Concat(ref concat) => { concat.asts.iter().map(count_capturing_groups_ast).sum() } + Ast::LookAround(_) => todo!(), } } From bb375f5a9935516745d4f663b0e7840367ef0547 Mon Sep 17 00:00:00 2001 From: Robin Date: Tue, 11 Mar 2025 17:31:52 +0100 Subject: [PATCH 28/66] Restore compilation behaviour for regexes without lookarounds The machinery necessary to perform the parallel lookbehind checking should only be compiled in when there is actually a lookbehind expression in the regex. This restores compilation to the expected outputs for regexes without lookbehind expressions. 
--- regex-automata/src/nfa/thompson/compiler.rs | 28 +++++++++++++++------ regex-syntax/src/hir/mod.rs | 26 +++++++++++++++++++ 2 files changed, 46 insertions(+), 8 deletions(-) diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index 96dc82a24..49744d2ee 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -979,17 +979,29 @@ impl Compiler { let compiled = self.c_alt_iter(exprs.iter().map(|e| { let _ = self.start_pattern()?; - let lookaround_prefix = - self.c_at_least(&Hir::dot(hir::Dot::AnyByte), false, 0)?; - let lookaround_alt = self.add_union_reverse()?; - self.patch(lookaround_prefix.end, lookaround_alt)?; - let top_level_alt = self.add_union()?; - self.patch(top_level_alt, lookaround_prefix.start)?; - self.lookaround_alt.borrow_mut().replace(lookaround_alt); + let has_lookarounds = + (e.borrow() as &Hir).properties().contains_lookaround_expr(); + let mut top_level_alt = if has_lookarounds { + self.add_union()? 
+ } else { + StateID::ZERO + }; + if has_lookarounds { + let lookaround_prefix = + self.c_at_least(&Hir::dot(hir::Dot::AnyByte), false, 0)?; + let lookaround_alt = self.add_union_reverse()?; + self.patch(lookaround_prefix.end, lookaround_alt)?; + self.patch(top_level_alt, lookaround_prefix.start)?; + self.lookaround_alt.borrow_mut().replace(lookaround_alt); + } let one = self.c_cap(0, None, e.borrow())?; let match_state_id = self.add_match()?; self.patch(one.end, match_state_id)?; - self.patch(top_level_alt, one.start)?; + if has_lookarounds { + self.patch(top_level_alt, one.start)?; + } else { + top_level_alt = one.start; + } let _ = self.finish_pattern(top_level_alt)?; self.lookaround_alt.borrow_mut().take(); Ok(ThompsonRef { start: top_level_alt, end: match_state_id }) diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 83729fe27..1d460ad00 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -2041,6 +2041,7 @@ struct PropertiesI { look_set_suffix: LookSet, look_set_prefix_any: LookSet, look_set_suffix_any: LookSet, + contains_lookaround_expr: bool, utf8: bool, explicit_captures_len: usize, static_explicit_captures_len: Option, @@ -2134,6 +2135,17 @@ impl Properties { self.0.look_set_suffix_any } + /// Returns whether there are any look-around expressions in this HIR value. + /// + /// Only returns true for [`HirKind::LookAround`] and not for + /// [`HirKind::Look`], which can be queried by [`look_set`] instead. + /// Currently, only lookbehind assertions without capture groups are + /// supported. + #[inline] + pub fn contains_lookaround_expr(&self) -> bool { + self.0.contains_lookaround_expr + } + /// Return true if and only if the corresponding HIR will always match /// valid UTF-8. 
/// @@ -2403,6 +2415,7 @@ impl Properties { look_set_suffix: fix, look_set_prefix_any: LookSet::empty(), look_set_suffix_any: LookSet::empty(), + contains_lookaround_expr: false, utf8: true, explicit_captures_len: 0, static_explicit_captures_len, @@ -2418,6 +2431,8 @@ impl Properties { props.look_set_suffix.set_intersect(p.look_set_suffix()); props.look_set_prefix_any.set_union(p.look_set_prefix_any()); props.look_set_suffix_any.set_union(p.look_set_suffix_any()); + props.contains_lookaround_expr = + props.contains_lookaround_expr || p.contains_lookaround_expr(); props.utf8 = props.utf8 && p.is_utf8(); props.explicit_captures_len = props .explicit_captures_len @@ -2465,6 +2480,7 @@ impl Properties { look_set_suffix: LookSet::empty(), look_set_prefix_any: LookSet::empty(), look_set_suffix_any: LookSet::empty(), + contains_lookaround_expr: false, // It is debatable whether an empty regex always matches at valid // UTF-8 boundaries. Strictly speaking, at a byte oriented view, // it is clearly false. There are, for example, many empty strings @@ -2501,6 +2517,7 @@ impl Properties { look_set_suffix: LookSet::empty(), look_set_prefix_any: LookSet::empty(), look_set_suffix_any: LookSet::empty(), + contains_lookaround_expr: false, utf8: core::str::from_utf8(&lit.0).is_ok(), explicit_captures_len: 0, static_explicit_captures_len: Some(0), @@ -2520,6 +2537,7 @@ impl Properties { look_set_suffix: LookSet::empty(), look_set_prefix_any: LookSet::empty(), look_set_suffix_any: LookSet::empty(), + contains_lookaround_expr: false, utf8: class.is_utf8(), explicit_captures_len: 0, static_explicit_captures_len: Some(0), @@ -2539,6 +2557,9 @@ impl Properties { look_set_suffix: LookSet::singleton(look), look_set_prefix_any: LookSet::singleton(look), look_set_suffix_any: LookSet::singleton(look), + // Note, this field represents _general_ lookarounds (ones using + // LookAround) and not simple ones (using Look). + contains_lookaround_expr: false, // This requires a little explanation. 
Basically, we don't consider // matching an empty string to be equivalent to matching invalid // UTF-8, even though technically matching every empty string will @@ -2569,6 +2590,7 @@ impl Properties { maximum_len: Some(0), literal: false, alternation_literal: false, + contains_lookaround_expr: true, ..*sub_p.0.clone() }; Properties(Box::new(inner)) @@ -2595,6 +2617,7 @@ impl Properties { look_set_suffix: LookSet::empty(), look_set_prefix_any: p.look_set_prefix_any(), look_set_suffix_any: p.look_set_suffix_any(), + contains_lookaround_expr: p.contains_lookaround_expr(), utf8: p.is_utf8(), explicit_captures_len: p.explicit_captures_len(), static_explicit_captures_len: p.static_explicit_captures_len(), @@ -2656,6 +2679,7 @@ impl Properties { look_set_suffix: LookSet::empty(), look_set_prefix_any: LookSet::empty(), look_set_suffix_any: LookSet::empty(), + contains_lookaround_expr: false, utf8: true, explicit_captures_len: 0, static_explicit_captures_len: Some(0), @@ -2667,6 +2691,8 @@ impl Properties { let p = x.properties(); props.look_set.set_union(p.look_set()); props.utf8 = props.utf8 && p.is_utf8(); + props.contains_lookaround_expr = + props.contains_lookaround_expr || p.contains_lookaround_expr(); props.explicit_captures_len = props .explicit_captures_len .saturating_add(p.explicit_captures_len()); From 27e90face2af441c28300985743ed912c395d8af Mon Sep 17 00:00:00 2001 From: Robin Date: Tue, 11 Mar 2025 18:06:11 +0100 Subject: [PATCH 29/66] Address review comments --- regex-automata/src/nfa/thompson/builder.rs | 40 ++++++++------------- regex-automata/src/nfa/thompson/compiler.rs | 6 ++-- regex-automata/src/nfa/thompson/nfa.rs | 19 +++++----- regex-automata/src/nfa/thompson/pikevm.rs | 6 ++-- 4 files changed, 32 insertions(+), 39 deletions(-) diff --git a/regex-automata/src/nfa/thompson/builder.rs b/regex-automata/src/nfa/thompson/builder.rs index 8c6eb0e85..f9119f537 100644 --- a/regex-automata/src/nfa/thompson/builder.rs +++ 
b/regex-automata/src/nfa/thompson/builder.rs @@ -41,9 +41,7 @@ enum State { }, /// A state that only transitions to another state if the current input /// byte is in a particular range of bytes. - ByteRange { - trans: Transition, - }, + ByteRange { trans: Transition }, /// A state with possibly many transitions, represented in a sparse /// fashion. Transitions must be ordered lexicographically by input range /// and be non-overlapping. As such, this may only be used when every @@ -57,15 +55,10 @@ enum State { /// that `Sparse` is used for via `Union`. But this creates a more bloated /// NFA with more epsilon transitions than is necessary in the special case /// of character classes. - Sparse { - transitions: Vec, - }, + Sparse { transitions: Vec }, /// A conditional epsilon transition satisfied via some sort of /// look-around. - Look { - look: Look, - next: StateID, - }, + Look { look: Look, next: StateID }, /// An empty state that records the start of a capture location. This is an /// unconditional epsilon transition like `Empty`, except it can be used to /// record position information for a capture group when using the NFA for @@ -98,20 +91,21 @@ enum State { /// The next state that this state should transition to. next: StateID, }, - WriteLookaround { - lookaround_index: usize, - }, + /// An empty state that behaves analogously to a `Match` state but for + /// the look-around sub-expression with the given index. + WriteLookaround { lookaround_index: SmallIndex }, + /// A conditional epsilon transition that will only be taken if the + /// look-around sub-expression with the given index evaluates to `positive` + /// at the current position in the haystack. CheckLookaround { - lookaround_index: usize, + lookaround_index: SmallIndex, positive: bool, next: StateID, }, /// An alternation such that there exists an epsilon transition to all /// states in `alternates`, where matches found via earlier transitions /// are preferred over later transitions. 
- Union { - alternates: Vec, - }, + Union { alternates: Vec }, /// An alternation such that there exists an epsilon transition to all /// states in `alternates`, where matches found via later transitions are /// preferred over earlier transitions. @@ -127,9 +121,7 @@ enum State { /// to be amortized constant time. But if we used a `Union`, we'd need to /// prepend the state, which takes O(n) time. There are other approaches we /// could use to solve this, but this seems simple enough. - UnionReverse { - alternates: Vec, - }, + UnionReverse { alternates: Vec }, /// A state that cannot be transitioned out of. This is useful for cases /// where you want to prevent matching from occurring. For example, if your /// regex parser permits empty character classes, then one could choose a @@ -143,9 +135,7 @@ enum State { /// /// `pattern_id` refers to the ID of the pattern itself, which corresponds /// to the pattern's index (starting at 0). - Match { - pattern_id: PatternID, - }, + Match { pattern_id: PatternID }, } impl State { @@ -736,7 +726,7 @@ impl Builder { /// is satisfied at the current position. pub fn add_write_lookaround( &mut self, - index: usize, + index: SmallIndex, ) -> Result { self.add(State::WriteLookaround { lookaround_index: index }) } @@ -745,7 +735,7 @@ impl Builder { /// index is satisfied at the current position. 
pub fn add_check_lookaround( &mut self, - index: usize, + index: SmallIndex, positive: bool, next: StateID, ) -> Result { diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index 49744d2ee..313df2821 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -19,7 +19,7 @@ use crate::{ }, util::{ look::{Look, LookMatcher}, - primitives::{PatternID, StateID}, + primitives::{PatternID, SmallIndex, StateID}, }, }; @@ -1681,14 +1681,14 @@ impl Compiler { fn add_write_lookaround( &self, - index: usize, + index: SmallIndex, ) -> Result { self.builder.borrow_mut().add_write_lookaround(index) } fn add_check_lookaround( &self, - index: usize, + index: SmallIndex, positive: bool, ) -> Result { self.builder.borrow_mut().add_check_lookaround( diff --git a/regex-automata/src/nfa/thompson/nfa.rs b/regex-automata/src/nfa/thompson/nfa.rs index f445ac8e4..39c533592 100644 --- a/regex-automata/src/nfa/thompson/nfa.rs +++ b/regex-automata/src/nfa/thompson/nfa.rs @@ -1102,8 +1102,8 @@ impl NFA { /// Returns how many lookaround sub-expressions this nfa contains #[inline] - pub fn look_count(&self) -> usize { - self.0.look_count + pub fn lookaround_count(&self) -> SmallIndex { + self.0.lookaround_count } // FIXME: The `look_set_prefix_all` computation was not correct, and it @@ -1266,7 +1266,10 @@ pub(super) struct Inner { /// zero-length prefix for any of the patterns in this NFA. look_set_prefix_all: LookSet, */ - look_count: usize, + /// How many look-around expression this NFA contains. + /// This is needed to initialize the table for storing the result of + /// look-around evaluation + lookaround_count: SmallIndex, /// Heap memory used indirectly by NFA states and other things (like the /// various capturing group representations above). 
Since each state /// might use a different amount of heap, we need to keep track of this @@ -1384,7 +1387,7 @@ impl Inner { } State::CheckLookaround { look_idx, .. } | State::WriteLookaround { look_idx } => { - self.look_count = self.look_count.max(look_idx); + self.lookaround_count = self.lookaround_count.max(look_idx); } State::Union { .. } | State::BinaryUnion { .. } @@ -1565,7 +1568,7 @@ pub enum State { /// index `look_idx` WriteLookaround { /// The index of the lookaround expression that matches - look_idx: usize, + look_idx: SmallIndex, }, /// This indicates that we need to check whether lookaround expression with /// index `look_idx` holds at the current position in the haystack @@ -1573,7 +1576,7 @@ pub enum State { /// hence must NOT hold. CheckLookaround { /// The index of the lookaround expression that must be satisfied - look_idx: usize, + look_idx: SmallIndex, /// Whether this is a positive lookaround expression positive: bool, /// The next state to transition if the lookaround assertion is satisfied @@ -1791,13 +1794,13 @@ impl fmt::Debug for State { write!(f, "{:?} => {:?}", look, next.as_usize()) } State::WriteLookaround { look_idx } => { - write!(f, "Write Lookaround: {}", look_idx) + write!(f, "Write Lookaround: {}", look_idx.as_u32()) } State::CheckLookaround { look_idx, positive, next } => { write!( f, "Check Lookaround {} is {} => {}", - look_idx, + look_idx.as_u32(), positive, next.as_usize() ) diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index d061023d5..d6848cafe 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ b/regex-automata/src/nfa/thompson/pikevm.rs @@ -1216,8 +1216,8 @@ impl PikeVM { } impl PikeVM { - fn look_count(&self) -> usize { - self.nfa.look_count() + fn lookaround_count(&self) -> SmallIndex { + self.nfa.lookaround_count() } /// The implementation of standard leftmost search. 
@@ -1984,7 +1984,7 @@ impl Cache { next: ActiveStates::new(re), lookaround: { let mut res = Vec::new(); - res.resize(re.look_count(), false); + res.resize(re.lookaround_count().as_usize(), false); res }, } From c126820e3f0ae3072e805300ac247e7510bda5f4 Mon Sep 17 00:00:00 2001 From: Robin Date: Tue, 11 Mar 2025 18:34:17 +0100 Subject: [PATCH 30/66] Implement look-around index generation --- regex-automata/src/nfa/thompson/compiler.rs | 13 +++++++++++- regex-automata/src/nfa/thompson/error.rs | 23 ++++++++++++++++++++- 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index 313df2821..8ffc68959 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -711,7 +711,13 @@ pub struct Compiler { /// State used for caching common suffixes when compiling reverse UTF-8 /// automata (for Unicode character classes). utf8_suffix: RefCell, + /// Top level alternation state which is used to run all look-around + /// assertion checks in lockstep with the main expression. Each look-around + /// expression is compiled to a set of states that is patched into this + /// state, and this state is updated on each new pattern being compiled. lookaround_alt: RefCell>, + /// The next index to use for a look-around expression. 
+ lookaround_index: RefCell, } impl Compiler { @@ -725,6 +731,7 @@ impl Compiler { trie_state: RefCell::new(RangeTrie::new()), utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)), lookaround_alt: RefCell::new(None), + lookaround_index: RefCell::new(SmallIndex::ZERO), } } @@ -1046,7 +1053,11 @@ impl Compiler { LookAround::NegativeLookBehind(_) => false, LookAround::PositiveLookBehind(_) => true, }; - let idx = todo!("get index"); + let idx = *self.lookaround_index.borrow(); + *self.lookaround_index.borrow_mut() = SmallIndex::new(idx.one_more()) + .map_err(|e| { + BuildError::too_many_lookarounds(e.attempted() as usize) + })?; let check = self.add_check_lookaround(idx, pos)?; let write = self.add_write_lookaround(idx)?; self.patch(sub.end, write)?; diff --git a/regex-automata/src/nfa/thompson/error.rs b/regex-automata/src/nfa/thompson/error.rs index 3c2fa8a21..a1f5aed5c 100644 --- a/regex-automata/src/nfa/thompson/error.rs +++ b/regex-automata/src/nfa/thompson/error.rs @@ -1,6 +1,6 @@ use crate::util::{ captures, look, - primitives::{PatternID, StateID}, + primitives::{PatternID, SmallIndex, StateID}, }; /// An error that can occurred during the construction of a thompson NFA. @@ -55,6 +55,14 @@ enum BuildErrorKind { /// The limit on the number of states. limit: usize, }, + /// An error that occurs if too many indices need to be generated for + /// look-around sub-expressions while building an NFA. + TooManyLookArounds { + /// The number of sub-expressions that exceeded the limit. + given: usize, + /// The limit on the number of sub-expressions. + limit: usize, + }, /// An error that occurs when NFA compilation exceeds a configured heap /// limit. 
ExceededSizeLimit { @@ -115,6 +123,13 @@ impl BuildError { BuildError { kind: BuildErrorKind::TooManyStates { given, limit } } } + pub(crate) fn too_many_lookarounds(given: usize) -> BuildError { + let limit = SmallIndex::LIMIT; + BuildError { + kind: BuildErrorKind::TooManyLookArounds { given, limit }, + } + } + pub(crate) fn exceeded_size_limit(limit: usize) -> BuildError { BuildError { kind: BuildErrorKind::ExceededSizeLimit { limit } } } @@ -164,6 +179,12 @@ impl core::fmt::Display for BuildError { which exceeds the limit of {}", given, limit, ), + BuildErrorKind::TooManyLookArounds { given, limit } => write!( + f, + "attempted to compile {} look-around expressions, \ + which exceeds the limit of {}", + given, limit, + ), BuildErrorKind::ExceededSizeLimit { limit } => write!( f, "heap usage during NFA compilation exceeded limit of {}", From bfd8087ba1a518ecc6835cc22cd8234025f854ef Mon Sep 17 00:00:00 2001 From: Robin Date: Tue, 11 Mar 2025 18:52:59 +0100 Subject: [PATCH 31/66] Change tracking of look-around state to index This makes it so we don't need to reset the lookaround state on each character advancement. 
--- regex-automata/src/nfa/thompson/pikevm.rs | 28 ++++++++++++++--------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index d6848cafe..259768604 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ b/regex-automata/src/nfa/thompson/pikevm.rs @@ -1490,7 +1490,7 @@ impl PikeVM { stack: &mut Vec, curr: &mut ActiveStates, next: &mut ActiveStates, - lookarounds: &mut Vec, + lookarounds: &mut Vec>, input: &Input<'_>, at: usize, slots: &mut [Option], @@ -1527,7 +1527,7 @@ impl PikeVM { stack: &mut Vec, curr: &mut ActiveStates, next: &mut ActiveStates, - lookarounds: &mut Vec, + lookarounds: &mut Vec>, input: &Input<'_>, at: usize, patset: &mut PatternSet, @@ -1581,7 +1581,7 @@ impl PikeVM { stack: &mut Vec, curr_slot_table: &mut SlotTable, next: &mut ActiveStates, - lookarounds: &mut Vec, + lookarounds: &mut Vec>, input: &Input<'_>, at: usize, sid: StateID, @@ -1672,7 +1672,7 @@ impl PikeVM { stack: &mut Vec, curr_slots: &mut [Option], next: &mut ActiveStates, - lookarounds: &mut Vec, + lookarounds: &mut Vec>, input: &Input<'_>, at: usize, sid: StateID, @@ -1732,7 +1732,7 @@ impl PikeVM { stack: &mut Vec, curr_slots: &mut [Option], next: &mut ActiveStates, - lookarounds: &mut Vec, + lookarounds: &mut Vec>, input: &Input<'_>, at: usize, mut sid: StateID, @@ -1773,11 +1773,16 @@ impl PikeVM { sid = next; } State::WriteLookaround { look_idx } => { - lookarounds[look_idx] = true; + // This is ok since `at` is always less than `usize::MAX`. 
+ lookarounds[look_idx] = NonMaxUsize::new(at); return; } State::CheckLookaround { look_idx, positive, next } => { - if lookarounds[look_idx] != positive { + let state = match lookarounds[look_idx] { + None => usize::MAX, + Some(pos) => pos.get(), + }; + if (state == at) != positive { return; } sid = next; @@ -1963,9 +1968,10 @@ pub struct Cache { /// The next set of states we're building that will be explored for the /// next byte in the haystack. next: ActiveStates, - /// This answers the question: "Does lookaround assertion x hold at the - /// current position in the haystack" - lookaround: Vec, + /// This answers the question: "What is the maximum position in the + /// haystack at which lookaround assertion x holds and which is <= to the + /// current position" + lookaround: Vec>, } impl Cache { @@ -1984,7 +1990,7 @@ impl Cache { next: ActiveStates::new(re), lookaround: { let mut res = Vec::new(); - res.resize(re.lookaround_count().as_usize(), false); + res.resize(re.lookaround_count().as_usize(), None); res }, } From 183da7ae1b25c7c9b92fa986435e86a75bfa2fdc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Robin=20H=C3=A4nni?= Date: Thu, 13 Mar 2025 11:19:21 +0100 Subject: [PATCH 32/66] Fix cli tool and AST->HIR translation --- regex-cli/cmd/generate/fowler.rs | 1 - regex-syntax/src/hir/translate.rs | 74 +++++++++++++++++++++++++++++++ 2 files changed, 74 insertions(+), 1 deletion(-) diff --git a/regex-cli/cmd/generate/fowler.rs b/regex-cli/cmd/generate/fowler.rs index 70db71fb0..052d59ef8 100644 --- a/regex-cli/cmd/generate/fowler.rs +++ b/regex-cli/cmd/generate/fowler.rs @@ -421,6 +421,5 @@ fn count_capturing_groups_ast(ast: ®ex_syntax::ast::Ast) -> usize { Ast::Concat(ref concat) => { concat.asts.iter().map(count_capturing_groups_ast).sum() } - Ast::LookAround(_) => todo!(), } } diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index 20c62400b..d24ffb6d9 100644 --- a/regex-syntax/src/hir/translate.rs +++ 
b/regex-syntax/src/hir/translate.rs @@ -212,6 +212,13 @@ enum HirFrame { /// This sentinel only exists to stop other things (like flattening /// literals) from reaching across repetition operators. Repetition, + /// This is pushed whenever a look-around expression is observed. After + /// visiting the sub-expression in the look-around, the translator's stack + /// is expected to have this sentinel at the top. + /// + /// This sentinel only exists to stop other things (like flattening + /// literals) from reaching across look-around operators. + LookAround, /// This is pushed on to the stack upon first seeing any kind of capture, /// indicated by parentheses (including non-capturing groups). It is popped /// upon leaving a group. @@ -298,6 +305,18 @@ impl HirFrame { } } + fn unwrap_lookaround(self) { + match self { + HirFrame::LookAround => {} + _ => { + panic!( + "tried to unwrap look-around from HirFrame, got: {:?}", + self + ) + } + } + } + /// Assert that the current stack frame is a group indicator and return /// its corresponding flags (the flags that were active at the time the /// group was entered). 
@@ -363,6 +382,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { self.push(HirFrame::AlternationBranch); } } + Ast::LookAround(_) => self.push(HirFrame::LookAround), _ => {} } Ok(()) @@ -448,6 +468,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { } Ast::LookAround(ref x) => { let expr = Box::new(self.pop().unwrap().unwrap_expr()); + self.pop().unwrap().unwrap_lookaround(); self.push(HirFrame::Expr(Hir::lookaround(match x.kind { ast::LookAroundKind::PositiveLookBehind => { hir::LookAround::PositiveLookBehind(expr) @@ -770,6 +791,9 @@ impl<'t, 'p> TranslatorI<'t, 'p> { HirFrame::AlternationBranch => { unreachable!("expected expr or concat, got alt branch marker") } + HirFrame::LookAround => { + unreachable!("expected expr or concat, got look-around") + } } } @@ -801,6 +825,9 @@ impl<'t, 'p> TranslatorI<'t, 'p> { HirFrame::AlternationBranch => { unreachable!("expected expr or alt, got alt branch marker") } + HirFrame::LookAround => { + unreachable!("expected expr or alt, got look-around") + } } } @@ -1612,6 +1639,15 @@ mod tests { Hir::look(look) } + fn hir_lookbehind(expr: Hir, positive: bool) -> Hir { + let lookaround = if positive { + hir::LookAround::PositiveLookBehind(Box::new(expr)) + } else { + hir::LookAround::NegativeLookBehind(Box::new(expr)) + }; + Hir::lookaround(lookaround) + } + #[test] fn empty() { assert_eq!(t(""), Hir::empty()); @@ -1835,6 +1871,44 @@ mod tests { assert_eq!(t(r"(?-u)\B"), hir_look(hir::Look::WordAsciiNegate)); } + #[test] + fn lookarounds() { + assert_eq!(t("(?<=a)"), hir_lookbehind(hir_lit("a"), true)); + assert_eq!(t("(? 
Date: Thu, 13 Mar 2025 11:52:48 +0100 Subject: [PATCH 33/66] Fix lookaround union order --- regex-automata/src/nfa/thompson/compiler.rs | 50 ++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index 8ffc68959..2309b7538 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -996,7 +996,7 @@ impl Compiler { if has_lookarounds { let lookaround_prefix = self.c_at_least(&Hir::dot(hir::Dot::AnyByte), false, 0)?; - let lookaround_alt = self.add_union_reverse()?; + let lookaround_alt = self.add_union()?; self.patch(lookaround_prefix.end, lookaround_alt)?; self.patch(top_level_alt, lookaround_prefix.start)?; self.lookaround_alt.borrow_mut().replace(lookaround_alt); @@ -2037,6 +2037,22 @@ mod tests { } } + fn s_write_lookaround(id: usize) -> State { + State::WriteLookaround { + look_idx: SmallIndex::new(id) + .expect("look-around index too large"), + } + } + + fn s_check_lookaround(id: usize, positive: bool, next: usize) -> State { + State::CheckLookaround { + look_idx: SmallIndex::new(id) + .expect("look-around index too large"), + positive, + next: sid(next), + } + } + fn s_fail() -> State { State::Fail } @@ -2262,6 +2278,38 @@ mod tests { ); } + #[test] + fn compile_lookbehind() { + assert_eq!( + build(r"(?<=a)").states(), + &[ + s_bin_union(1, 4), + s_bin_union(3, 2), + s_range(b'\x00', b'\xFF', 1), + s_byte(b'a', 5), + s_check_lookaround(0, true, 6), + s_write_lookaround(0), + s_match(0) + ] + ); + assert_eq!( + build(r"(?<=a(? Date: Tue, 18 Mar 2025 13:17:59 +0100 Subject: [PATCH 34/66] Address review comments Rename certain enums to be consistent with rest of codebase. 
--- regex-automata/src/dfa/onepass.rs | 4 +- regex-automata/src/nfa/thompson/backtrack.rs | 4 +- regex-automata/src/nfa/thompson/builder.rs | 28 ++++----- regex-automata/src/nfa/thompson/compiler.rs | 15 ++--- regex-automata/src/nfa/thompson/nfa.rs | 61 +++++++++++--------- regex-automata/src/nfa/thompson/pikevm.rs | 18 +++--- regex-automata/src/util/determinize/mod.rs | 12 ++-- regex-syntax/src/hir/mod.rs | 2 - 8 files changed, 72 insertions(+), 72 deletions(-) diff --git a/regex-automata/src/dfa/onepass.rs b/regex-automata/src/dfa/onepass.rs index 022305a5a..30e4daf06 100644 --- a/regex-automata/src/dfa/onepass.rs +++ b/regex-automata/src/dfa/onepass.rs @@ -638,8 +638,8 @@ impl<'a> InternalBuilder<'a> { self.stack_push(nfa_id, Epsilons::empty())?; while let Some((id, epsilons)) = self.stack.pop() { match *self.nfa.state(id) { - thompson::State::WriteLookaround { .. } - | thompson::State::CheckLookaround { .. } => { + thompson::State::WriteLookAround { .. } + | thompson::State::CheckLookAround { .. } => { todo!("check how to handle") } thompson::State::ByteRange { ref trans } => { diff --git a/regex-automata/src/nfa/thompson/backtrack.rs b/regex-automata/src/nfa/thompson/backtrack.rs index b63a47fd5..be0cbcfbd 100644 --- a/regex-automata/src/nfa/thompson/backtrack.rs +++ b/regex-automata/src/nfa/thompson/backtrack.rs @@ -1519,8 +1519,8 @@ impl BoundedBacktracker { } sid = next; } - State::WriteLookaround { .. } - | State::CheckLookaround { .. } => { + State::WriteLookAround { .. } + | State::CheckLookAround { .. 
} => { todo!("check how to handle") } State::Union { ref alternates } => { diff --git a/regex-automata/src/nfa/thompson/builder.rs b/regex-automata/src/nfa/thompson/builder.rs index f9119f537..748d1d01c 100644 --- a/regex-automata/src/nfa/thompson/builder.rs +++ b/regex-automata/src/nfa/thompson/builder.rs @@ -93,11 +93,11 @@ enum State { }, /// An empty state that behaves analogously to a `Match` state but for /// the look-around sub-expression with the given index. - WriteLookaround { lookaround_index: SmallIndex }, + WriteLookAround { lookaround_index: SmallIndex }, /// A conditional epsilon transition that will only be taken if the /// look-around sub-expression with the given index evaluates to `positive` /// at the current position in the haystack. - CheckLookaround { + CheckLookAround { lookaround_index: SmallIndex, positive: bool, next: StateID, @@ -166,8 +166,8 @@ impl State { | State::CaptureEnd { .. } | State::Fail | State::Match { .. } - | State::CheckLookaround { .. } - | State::WriteLookaround { .. } => 0, + | State::CheckLookAround { .. } + | State::WriteLookAround { .. 
} => 0, State::Sparse { ref transitions } => { transitions.len() * mem::size_of::() } @@ -483,18 +483,18 @@ impl Builder { State::Look { look, next } => { remap[sid] = nfa.add(nfa::State::Look { look, next }); } - State::WriteLookaround { lookaround_index } => { - remap[sid] = nfa.add(nfa::State::WriteLookaround { - look_idx: lookaround_index, + State::WriteLookAround { lookaround_index } => { + remap[sid] = nfa.add(nfa::State::WriteLookAround { + lookaround_idx: lookaround_index, }); } - State::CheckLookaround { + State::CheckLookAround { lookaround_index, positive, next, } => { - remap[sid] = nfa.add(nfa::State::CheckLookaround { - look_idx: lookaround_index, + remap[sid] = nfa.add(nfa::State::CheckLookAround { + lookaround_idx: lookaround_index, positive, next, }); @@ -728,7 +728,7 @@ impl Builder { &mut self, index: SmallIndex, ) -> Result { - self.add(State::WriteLookaround { lookaround_index: index }) + self.add(State::WriteLookAround { lookaround_index: index }) } /// Add a state which will check whether the lookaround with the given @@ -739,7 +739,7 @@ impl Builder { positive: bool, next: StateID, ) -> Result { - self.add(State::CheckLookaround { + self.add(State::CheckLookAround { lookaround_index: index, positive, next, @@ -1212,7 +1212,7 @@ impl Builder { State::Look { ref mut next, .. } => { *next = to; } - State::CheckLookaround { ref mut next, .. } => { + State::CheckLookAround { ref mut next, .. } => { *next = to; } State::Union { ref mut alternates } => { @@ -1229,7 +1229,7 @@ impl Builder { State::CaptureEnd { ref mut next, .. } => { *next = to; } - State::WriteLookaround { .. } => {} + State::WriteLookAround { .. } => {} State::Fail => {} State::Match { .. 
} => {} } diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index 2309b7538..9a3161cc8 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -1045,10 +1045,7 @@ impl Compiler { &self, lookaround: &LookAround, ) -> Result { - let sub = match lookaround { - LookAround::NegativeLookBehind(ref sub) - | LookAround::PositiveLookBehind(ref sub) => self.c(sub)?, - }; + let sub = self.c(lookaround.sub()); let pos = match lookaround { LookAround::NegativeLookBehind(_) => false, LookAround::PositiveLookBehind(_) => true, @@ -1064,7 +1061,7 @@ impl Compiler { self.patch( self.lookaround_alt .borrow() - .expect("Cannot compile lookaround outside pattern"), + .expect("Cannot compile look-around outside pattern"), sub.start, )?; Ok(ThompsonRef { start: check, end: check }) @@ -2038,15 +2035,15 @@ mod tests { } fn s_write_lookaround(id: usize) -> State { - State::WriteLookaround { - look_idx: SmallIndex::new(id) + State::WriteLookAround { + lookaround_idx: SmallIndex::new(id) .expect("look-around index too large"), } } fn s_check_lookaround(id: usize, positive: bool, next: usize) -> State { - State::CheckLookaround { - look_idx: SmallIndex::new(id) + State::CheckLookAround { + lookaround_idx: SmallIndex::new(id) .expect("look-around index too large"), positive, next: sid(next), diff --git a/regex-automata/src/nfa/thompson/nfa.rs b/regex-automata/src/nfa/thompson/nfa.rs index 39c533592..76b72f6cd 100644 --- a/regex-automata/src/nfa/thompson/nfa.rs +++ b/regex-automata/src/nfa/thompson/nfa.rs @@ -1100,7 +1100,7 @@ impl NFA { self.0.look_set_prefix_any } - /// Returns how many lookaround sub-expressions this nfa contains + /// Returns how many look-around sub-expressions this nfa contains #[inline] pub fn lookaround_count(&self) -> SmallIndex { self.0.lookaround_count @@ -1299,8 +1299,8 @@ impl Inner { State::ByteRange { .. } | State::Dense { .. 
} | State::Fail - | State::WriteLookaround { .. } => continue, - State::CheckLookaround { next, .. } => { + | State::WriteLookAround { .. } => continue, + State::CheckLookAround { next, .. } => { stack.push(next); } State::Sparse(_) => { @@ -1385,8 +1385,8 @@ impl Inner { State::Capture { .. } => { self.has_capture = true; } - State::CheckLookaround { look_idx, .. } - | State::WriteLookaround { look_idx } => { + State::CheckLookAround { lookaround_idx: look_idx, .. } + | State::WriteLookAround { lookaround_idx: look_idx } => { self.lookaround_count = self.lookaround_count.max(look_idx); } State::Union { .. } @@ -1563,23 +1563,24 @@ pub enum State { /// satisfied. next: StateID, }, - /// This is like a match state but for a lookaround expression - /// executing this state will write a `true` into the lookaround oracle at - /// index `look_idx` - WriteLookaround { - /// The index of the lookaround expression that matches - look_idx: SmallIndex, + /// This is like a match state but for a look-around expression. + /// Executing this state will write the current haystack offset into the + /// look-around oracle at index `lookaround_idx`. + WriteLookAround { + /// The index of the look-around expression that matches. + lookaround_idx: SmallIndex, }, /// This indicates that we need to check whether lookaround expression with - /// index `look_idx` holds at the current position in the haystack + /// index `lookaround_idx` holds at the current position in the haystack /// If `positive` is false, then the lookaround expression is negative and /// hence must NOT hold. - CheckLookaround { - /// The index of the lookaround expression that must be satisfied - look_idx: SmallIndex, - /// Whether this is a positive lookaround expression + CheckLookAround { + /// The index of the look-around expression that must be satisfied. + lookaround_idx: SmallIndex, + /// Whether this is a positive lookaround expression. 
positive: bool, - /// The next state to transition if the lookaround assertion is satisfied + /// The next state to transition if the look-around assertion is + /// satisfied. next: StateID, }, /// An alternation such that there exists an epsilon transition to all @@ -1696,12 +1697,12 @@ impl State { | State::Dense { .. } | State::Fail | State::Match { .. } - | State::WriteLookaround { .. } => false, + | State::WriteLookAround { .. } => false, State::Look { .. } | State::Union { .. } | State::BinaryUnion { .. } | State::Capture { .. } - | State::CheckLookaround { .. } => true, + | State::CheckLookAround { .. } => true, } } @@ -1714,8 +1715,8 @@ impl State { | State::Capture { .. } | State::Match { .. } | State::Fail - | State::WriteLookaround { .. } - | State::CheckLookaround { .. } => 0, + | State::WriteLookAround { .. } + | State::CheckLookAround { .. } => 0, State::Sparse(SparseTransitions { ref transitions }) => { transitions.len() * mem::size_of::() } @@ -1748,7 +1749,7 @@ impl State { } } State::Look { ref mut next, .. } => *next = remap[*next], - State::CheckLookaround { ref mut next, .. } => { + State::CheckLookAround { ref mut next, .. } => { *next = remap[*next] } State::Union { ref mut alternates } => { @@ -1763,7 +1764,7 @@ impl State { State::Capture { ref mut next, .. } => *next = remap[*next], State::Fail | State::Match { .. } - | State::WriteLookaround { .. } => {} + | State::WriteLookAround { .. 
} => {} } } } @@ -1793,15 +1794,19 @@ impl fmt::Debug for State { State::Look { ref look, next } => { write!(f, "{:?} => {:?}", look, next.as_usize()) } - State::WriteLookaround { look_idx } => { - write!(f, "Write Lookaround: {}", look_idx.as_u32()) + State::WriteLookAround { lookaround_idx: look_idx } => { + write!(f, "write-look-around({})", look_idx.as_u32()) } - State::CheckLookaround { look_idx, positive, next } => { + State::CheckLookAround { + lookaround_idx: look_idx, + positive, + next, + } => { write!( f, - "Check Lookaround {} is {} => {}", + "check-look-around({} is {}) => {}", look_idx.as_u32(), - positive, + if positive { "matched" } else { "not matched" }, next.as_usize() ) } diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index 259768604..bb4899965 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ b/regex-automata/src/nfa/thompson/pikevm.rs @@ -1593,8 +1593,8 @@ impl PikeVM { | State::Union { .. } | State::BinaryUnion { .. } | State::Capture { .. } - | State::WriteLookaround { .. } - | State::CheckLookaround { .. } => None, + | State::WriteLookAround { .. } + | State::CheckLookAround { .. } => None, State::ByteRange { ref trans } => { if trans.matches(input.haystack(), at) { let slots = curr_slot_table.for_state(sid); @@ -1772,12 +1772,16 @@ impl PikeVM { } sid = next; } - State::WriteLookaround { look_idx } => { + State::WriteLookAround { lookaround_idx: look_idx } => { // This is ok since `at` is always less than `usize::MAX`. 
lookarounds[look_idx] = NonMaxUsize::new(at); return; } - State::CheckLookaround { look_idx, positive, next } => { + State::CheckLookAround { + lookaround_idx: look_idx, + positive, + next, + } => { let state = match lookarounds[look_idx] { None => usize::MAX, Some(pos) => pos.get(), @@ -1988,11 +1992,7 @@ impl Cache { stack: vec![], curr: ActiveStates::new(re), next: ActiveStates::new(re), - lookaround: { - let mut res = Vec::new(); - res.resize(re.lookaround_count().as_usize(), None); - res - }, + lookaround: vec![None; re.lookaround_count().as_usize()], } } diff --git a/regex-automata/src/util/determinize/mod.rs b/regex-automata/src/util/determinize/mod.rs index 9778bf1af..80f57bbe6 100644 --- a/regex-automata/src/util/determinize/mod.rs +++ b/regex-automata/src/util/determinize/mod.rs @@ -251,8 +251,8 @@ pub(crate) fn next( | thompson::State::Fail | thompson::State::Look { .. } | thompson::State::Capture { .. } => {} - thompson::State::CheckLookaround { .. } - | thompson::State::WriteLookaround { .. } => { + thompson::State::CheckLookAround { .. } + | thompson::State::WriteLookAround { .. } => { todo!("check how to handle") } thompson::State::Match { pattern_id } => { @@ -403,8 +403,8 @@ pub(crate) fn epsilon_closure( | thompson::State::Dense { .. } | thompson::State::Fail | thompson::State::Match { .. } => break, - thompson::State::WriteLookaround { .. } - | thompson::State::CheckLookaround { .. } => { + thompson::State::WriteLookAround { .. } + | thompson::State::CheckLookAround { .. } => { todo!("check how to handle") } thompson::State::Look { look, next } => { @@ -473,8 +473,8 @@ pub(crate) fn add_nfa_states( builder.add_nfa_state_id(nfa_id); builder.set_look_need(|need| need.insert(look)); } - thompson::State::CheckLookaround { .. } - | thompson::State::WriteLookaround { .. } => { + thompson::State::CheckLookAround { .. } + | thompson::State::WriteLookAround { .. } => { todo!("check how to handle") } thompson::State::Union { .. 
} diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 1d460ad00..d879c3b88 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -2139,8 +2139,6 @@ impl Properties { /// /// Only returns true for [`HirKind::LookAround`] and not for /// [`HirKind::Look`], which can be queried by [`look_set`] instead. - /// Currently, only lookbehind assertions without capture groups are - /// supported. #[inline] pub fn contains_lookaround_expr(&self) -> bool { self.0.contains_lookaround_expr From d435d2a82b0eddfbb8ef8ebafdb2162dd3de7ffc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Robin=20H=C3=A4nni?= Date: Tue, 18 Mar 2025 15:48:44 +0100 Subject: [PATCH 35/66] Fix look-around indexing --- regex-automata/src/nfa/thompson/compiler.rs | 2 +- regex-automata/src/nfa/thompson/nfa.rs | 7 ++++--- regex-automata/src/nfa/thompson/pikevm.rs | 4 ++-- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index 9a3161cc8..7848699ed 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -1045,7 +1045,7 @@ impl Compiler { &self, lookaround: &LookAround, ) -> Result { - let sub = self.c(lookaround.sub()); + let sub = self.c(lookaround.sub())?; let pos = match lookaround { LookAround::NegativeLookBehind(_) => false, LookAround::PositiveLookBehind(_) => true, diff --git a/regex-automata/src/nfa/thompson/nfa.rs b/regex-automata/src/nfa/thompson/nfa.rs index 76b72f6cd..4e499466f 100644 --- a/regex-automata/src/nfa/thompson/nfa.rs +++ b/regex-automata/src/nfa/thompson/nfa.rs @@ -1102,7 +1102,7 @@ impl NFA { /// Returns how many look-around sub-expressions this nfa contains #[inline] - pub fn lookaround_count(&self) -> SmallIndex { + pub fn lookaround_count(&self) -> usize { self.0.lookaround_count } @@ -1269,7 +1269,7 @@ pub(super) struct Inner { /// How many look-around expression this NFA contains. 
/// This is needed to initialize the table for storing the result of /// look-around evaluation - lookaround_count: SmallIndex, + lookaround_count: usize, /// Heap memory used indirectly by NFA states and other things (like the /// various capturing group representations above). Since each state /// might use a different amount of heap, we need to keep track of this @@ -1387,7 +1387,8 @@ impl Inner { } State::CheckLookAround { lookaround_idx: look_idx, .. } | State::WriteLookAround { lookaround_idx: look_idx } => { - self.lookaround_count = self.lookaround_count.max(look_idx); + self.lookaround_count = + self.lookaround_count.max(look_idx.as_usize() + 1); } State::Union { .. } | State::BinaryUnion { .. } diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index bb4899965..cf667940e 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ b/regex-automata/src/nfa/thompson/pikevm.rs @@ -1216,7 +1216,7 @@ impl PikeVM { } impl PikeVM { - fn lookaround_count(&self) -> SmallIndex { + fn lookaround_count(&self) -> usize { self.nfa.lookaround_count() } @@ -1992,7 +1992,7 @@ impl Cache { stack: vec![], curr: ActiveStates::new(re), next: ActiveStates::new(re), - lookaround: vec![None; re.lookaround_count().as_usize()], + lookaround: vec![None; re.lookaround_count()], } } From ee10459d08e32c3796068f52890b4faf0c847e0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Robin=20H=C3=A4nni?= Date: Tue, 18 Mar 2025 18:55:52 +0100 Subject: [PATCH 36/66] Add error messages and fix pre-filter We need to disable pre-filters when a regex contains lookarounds. This is because the relevant information for a lookbehind can be before the start of the match. 
--- regex-automata/src/dfa/dense.rs | 6 ++++++ regex-automata/src/dfa/determinize.rs | 4 ++++ regex-automata/src/hybrid/dfa.rs | 3 +++ regex-automata/src/hybrid/error.rs | 6 ++++++ regex-automata/src/meta/wrappers.rs | 2 ++ regex-syntax/src/hir/literal.rs | 5 ++--- 6 files changed, 23 insertions(+), 3 deletions(-) diff --git a/regex-automata/src/dfa/dense.rs b/regex-automata/src/dfa/dense.rs index fdae99fa6..8b41d0ae5 100644 --- a/regex-automata/src/dfa/dense.rs +++ b/regex-automata/src/dfa/dense.rs @@ -5083,6 +5083,12 @@ impl BuildError { BuildError { kind: BuildErrorKind::Unsupported(msg) } } + pub(crate) fn unsupported_lookaround() -> BuildError { + let msg = "cannot build DFAs for regexes with look-around\ + sub-expressions; use a different regex engine"; + BuildError { kind: BuildErrorKind::Unsupported(msg) } + } + pub(crate) fn too_many_states() -> BuildError { BuildError { kind: BuildErrorKind::TooManyStates } } diff --git a/regex-automata/src/dfa/determinize.rs b/regex-automata/src/dfa/determinize.rs index 19f99f5d6..3b048081e 100644 --- a/regex-automata/src/dfa/determinize.rs +++ b/regex-automata/src/dfa/determinize.rs @@ -219,6 +219,10 @@ impl<'a> Runner<'a> { return Err(BuildError::unsupported_dfa_word_boundary_unicode()); } + if self.nfa.lookaround_count() > 0 { + return Err(BuildError::unsupported_lookaround()); + } + // A sequence of "representative" bytes drawn from each equivalence // class. These representative bytes are fed to the NFA to compute // state transitions. 
This allows us to avoid re-computing state diff --git a/regex-automata/src/hybrid/dfa.rs b/regex-automata/src/hybrid/dfa.rs index bd9179b19..5c1978f8d 100644 --- a/regex-automata/src/hybrid/dfa.rs +++ b/regex-automata/src/hybrid/dfa.rs @@ -4056,6 +4056,9 @@ impl Builder { &self, nfa: thompson::NFA, ) -> Result { + if nfa.lookaround_count() > 0 { + return Err(BuildError::unsupported_lookaround()); + } let quitset = self.config.quit_set_from_nfa(&nfa)?; let classes = self.config.byte_classes_from_nfa(&nfa, &quitset); // Check that we can fit at least a few states into our cache, diff --git a/regex-automata/src/hybrid/error.rs b/regex-automata/src/hybrid/error.rs index d134e7ec9..ae3ae6c53 100644 --- a/regex-automata/src/hybrid/error.rs +++ b/regex-automata/src/hybrid/error.rs @@ -61,6 +61,12 @@ impl BuildError { different regex engine"; BuildError { kind: BuildErrorKind::Unsupported(msg) } } + + pub(crate) fn unsupported_lookaround() -> BuildError { + let msg = "cannot build DFAs for regexes with look-around\ + sub-expressions; use a different regex engine"; + BuildError { kind: BuildErrorKind::Unsupported(msg) } + } } #[cfg(feature = "std")] diff --git a/regex-automata/src/meta/wrappers.rs b/regex-automata/src/meta/wrappers.rs index 6cb19ba0d..f7c5c1096 100644 --- a/regex-automata/src/meta/wrappers.rs +++ b/regex-automata/src/meta/wrappers.rs @@ -204,6 +204,8 @@ impl BoundedBacktrackerEngine { { if !info.config().get_backtrack() || info.config().get_match_kind() != MatchKind::LeftmostFirst + // TODO: remove once look-around support is added. 
+ || nfa.lookaround_count() > 0 { return Ok(None); } diff --git a/regex-syntax/src/hir/literal.rs b/regex-syntax/src/hir/literal.rs index c08c2b007..584c2893b 100644 --- a/regex-syntax/src/hir/literal.rs +++ b/regex-syntax/src/hir/literal.rs @@ -172,9 +172,8 @@ impl Extractor { use crate::hir::HirKind::*; match *hir.kind() { - Empty | Look(_) | LookAround(_) => { - Seq::singleton(self::Literal::exact(vec![])) - } + Empty | Look(_) => Seq::singleton(self::Literal::exact(vec![])), + LookAround(_) => Seq::infinite(), Literal(hir::Literal(ref bytes)) => { let mut seq = Seq::singleton(self::Literal::exact(bytes.to_vec())); From 0b51fc5f823cf31013fcc38a0eacf0aafcc19c09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Robin=20H=C3=A4nni?= Date: Tue, 18 Mar 2025 18:58:11 +0100 Subject: [PATCH 37/66] Add unit tests for look-behind assertions --- regex-automata/tests/dfa/onepass/suite.rs | 5 +- regex-automata/tests/dfa/suite.rs | 12 +++- regex-automata/tests/hybrid/suite.rs | 11 +++- regex-automata/tests/lib.rs | 1 + .../tests/nfa/thompson/backtrack/suite.rs | 8 +++ testdata/lookaround.toml | 59 +++++++++++++++++++ 6 files changed, 93 insertions(+), 3 deletions(-) create mode 100644 testdata/lookaround.toml diff --git a/regex-automata/tests/dfa/onepass/suite.rs b/regex-automata/tests/dfa/onepass/suite.rs index 20bd6965c..4c7682f7f 100644 --- a/regex-automata/tests/dfa/onepass/suite.rs +++ b/regex-automata/tests/dfa/onepass/suite.rs @@ -79,7 +79,10 @@ fn compiler( // Since our error types are all generally opaque, we just // look for an error string. Not great, but not the end of the // world. 
- if test.compiles() && msg.contains("not one-pass") { + if test.compiles() + && (msg.contains("not one-pass") + || msg.contains("look-around")) + { return Ok(CompiledRegex::skip()); } return Err(err.into()); diff --git a/regex-automata/tests/dfa/suite.rs b/regex-automata/tests/dfa/suite.rs index 8ed6dd007..febded611 100644 --- a/regex-automata/tests/dfa/suite.rs +++ b/regex-automata/tests/dfa/suite.rs @@ -292,7 +292,17 @@ fn compiler( if !configure_regex_builder(test, &mut builder) { return Ok(CompiledRegex::skip()); } - create_matcher(&builder, pre, builder.build_many(&regexes)?) + let re = match builder.build_many(regexes) { + Ok(re) => re, + Err(err) + if test.compiles() + && format!("{err}").contains("look-around") => + { + return Ok(CompiledRegex::skip()); + } + Err(err) => return Err(err.into()), + }; + create_matcher(&builder, pre, re) } } diff --git a/regex-automata/tests/hybrid/suite.rs b/regex-automata/tests/hybrid/suite.rs index 4aaca6698..ee81aca8d 100644 --- a/regex-automata/tests/hybrid/suite.rs +++ b/regex-automata/tests/hybrid/suite.rs @@ -183,7 +183,16 @@ fn compiler( if !configure_regex_builder(test, &mut builder) { return Ok(CompiledRegex::skip()); } - let re = builder.build_many(&regexes)?; + let re = match builder.build_many(regexes) { + Ok(re) => re, + Err(err) + if test.compiles() + && format!("{err}").contains("look-around") => + { + return Ok(CompiledRegex::skip()); + } + Err(err) => return Err(err.into()), + }; let mut cache = re.create_cache(); Ok(CompiledRegex::compiled(move |test| -> TestResult { run_test(&re, &mut cache, test) diff --git a/regex-automata/tests/lib.rs b/regex-automata/tests/lib.rs index 67c979aa8..1ba08fe87 100644 --- a/regex-automata/tests/lib.rs +++ b/regex-automata/tests/lib.rs @@ -65,6 +65,7 @@ fn suite() -> anyhow::Result { load!("fowler/basic"); load!("fowler/nullsubexpr"); load!("fowler/repetition"); + load!("lookaround"); Ok(tests) } diff --git a/regex-automata/tests/nfa/thompson/backtrack/suite.rs
b/regex-automata/tests/nfa/thompson/backtrack/suite.rs index bce0eef40..674ce5039 100644 --- a/regex-automata/tests/nfa/thompson/backtrack/suite.rs +++ b/regex-automata/tests/nfa/thompson/backtrack/suite.rs @@ -74,6 +74,10 @@ fn min_visited_capacity() -> Result<()> { .configure(config_thompson(test)) .syntax(config_syntax(test)) .build_many(&regexes)?; + // TODO: remove once look-around is supported. + if nfa.lookaround_count() > 0 { + return Ok(CompiledRegex::skip()); + } let mut builder = BoundedBacktracker::builder(); if !configure_backtrack_builder(test, &mut builder) { return Ok(CompiledRegex::skip()); @@ -105,6 +109,10 @@ fn compiler( return Ok(CompiledRegex::skip()); } let re = builder.build_many(&regexes)?; + // TODO: remove once look-around is supported. + if re.get_nfa().lookaround_count() > 0 { + return Ok(CompiledRegex::skip()); + } let mut cache = re.create_cache(); Ok(CompiledRegex::compiled(move |test| -> TestResult { run_test(&re, &mut cache, test) diff --git a/testdata/lookaround.toml b/testdata/lookaround.toml new file mode 100644 index 000000000..ecbd76d48 --- /dev/null +++ b/testdata/lookaround.toml @@ -0,0 +1,59 @@ +[[test]] +name = "basic lookbehind positive" +regex = "(?<=b)a" +haystack = "ba" +matches = [[1, 2]] + +[[test]] +name = "basic lookbehind negative" +regex = "(? Date: Tue, 25 Mar 2025 11:17:49 +0100 Subject: [PATCH 38/66] Bump version numbers --- Cargo.toml | 6 +++--- regex-automata/Cargo.toml | 4 ++-- regex-cli/Cargo.toml | 4 ++-- regex-syntax/Cargo.toml | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 60be5b9d4..0f8b99f74 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex" -version = "1.11.1" #:version +version = "1.12.0" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] license = "MIT OR Apache-2.0" readme = "README.md" @@ -176,14 +176,14 @@ default-features = false # For the actual regex engines.
[dependencies.regex-automata] path = "regex-automata" -version = "0.4.8" +version = "0.5.0" default-features = false features = ["alloc", "syntax", "meta", "nfa-pikevm"] # For parsing regular expressions. [dependencies.regex-syntax] path = "regex-syntax" -version = "0.8.5" +version = "0.9.0" default-features = false [dev-dependencies] diff --git a/regex-automata/Cargo.toml b/regex-automata/Cargo.toml index 19d9dc229..2c4069899 100644 --- a/regex-automata/Cargo.toml +++ b/regex-automata/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-automata" -version = "0.4.9" #:version +version = "0.5.0" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] description = "Automata construction and matching using regular expressions." documentation = "https://docs.rs/regex-automata" @@ -86,7 +86,7 @@ internal-instrument-pikevm = ["logging", "std"] aho-corasick = { version = "1.0.0", optional = true, default-features = false } log = { version = "0.4.14", optional = true } memchr = { version = "2.6.0", optional = true, default-features = false } -regex-syntax = { path = "../regex-syntax", version = "0.8.5", optional = true, default-features = false } +regex-syntax = { path = "../regex-syntax", version = "0.9.0", optional = true, default-features = false } [dev-dependencies] anyhow = "1.0.69" diff --git a/regex-cli/Cargo.toml b/regex-cli/Cargo.toml index d7fd44b7b..4284091ea 100644 --- a/regex-cli/Cargo.toml +++ b/regex-cli/Cargo.toml @@ -29,8 +29,8 @@ lexopt = "0.3.0" log = { version = "0.4.17", features = ["std"] } memmap2 = "0.9.4" regex = { version = "1.9.0", path = ".." 
} -regex-automata = { version = "0.4.8", path = "../regex-automata", features = ["logging"] } +regex-automata = { version = "0.5.0", path = "../regex-automata", features = ["logging"] } regex-lite = { version = "0.1.0", path = "../regex-lite" } -regex-syntax = { version = "0.8.5", path = "../regex-syntax" } +regex-syntax = { version = "0.9.0", path = "../regex-syntax" } tabwriter = { version = "1.2.1", features = ["ansi_formatting"] } textwrap = { version = "0.16.0", default-features = false } diff --git a/regex-syntax/Cargo.toml b/regex-syntax/Cargo.toml index 0cbcde5e7..f6a443546 100644 --- a/regex-syntax/Cargo.toml +++ b/regex-syntax/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-syntax" -version = "0.8.5" #:version +version = "0.9.0" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] license = "MIT OR Apache-2.0" repository = "https://github.com/rust-lang/regex/tree/master/regex-syntax" From 4fa91049c0afeecbec6e61571632ecca9ff90f0d Mon Sep 17 00:00:00 2001 From: shilangyu Date: Wed, 2 Apr 2025 22:17:07 +0200 Subject: [PATCH 39/66] Adjust some docs --- regex-automata/src/nfa/thompson/backtrack.rs | 2 +- regex-automata/src/nfa/thompson/builder.rs | 13 +++++----- regex-automata/src/nfa/thompson/compiler.rs | 4 +-- regex-automata/src/nfa/thompson/nfa.rs | 26 ++++++++++---------- regex-automata/src/nfa/thompson/pikevm.rs | 8 +++--- regex-syntax/src/hir/mod.rs | 4 +-- 6 files changed, 28 insertions(+), 29 deletions(-) diff --git a/regex-automata/src/nfa/thompson/backtrack.rs b/regex-automata/src/nfa/thompson/backtrack.rs index be0cbcfbd..98a5b5c1e 100644 --- a/regex-automata/src/nfa/thompson/backtrack.rs +++ b/regex-automata/src/nfa/thompson/backtrack.rs @@ -1453,7 +1453,7 @@ impl BoundedBacktracker { /// Execute a "step" in the backtracing algorithm. /// /// A "step" is somewhat of a misnomer, because this routine keeps going - /// until it either runs out of things to try or fins a match. 
In the + /// until it either runs out of things to try or finds a match. In the /// former case, it may have pushed some things on to the backtracking /// stack, in which case, those will be tried next as part of the /// 'backtrack' routine above. diff --git a/regex-automata/src/nfa/thompson/builder.rs b/regex-automata/src/nfa/thompson/builder.rs index 748d1d01c..c769fda23 100644 --- a/regex-automata/src/nfa/thompson/builder.rs +++ b/regex-automata/src/nfa/thompson/builder.rs @@ -92,7 +92,7 @@ enum State { next: StateID, }, /// An empty state that behaves analogously to a `Match` state but for - /// the look-around sub-expression with the given index. + /// the look-around sub-expression with the given look-around index. WriteLookAround { lookaround_index: SmallIndex }, /// A conditional epsilon transition that will only be taken if the /// look-around sub-expression with the given index evaluates to `positive` @@ -484,9 +484,8 @@ impl Builder { remap[sid] = nfa.add(nfa::State::Look { look, next }); } State::WriteLookAround { lookaround_index } => { - remap[sid] = nfa.add(nfa::State::WriteLookAround { - lookaround_idx: lookaround_index, - }); + remap[sid] = nfa + .add(nfa::State::WriteLookAround { lookaround_index }); } State::CheckLookAround { lookaround_index, @@ -494,7 +493,7 @@ impl Builder { next, } => { remap[sid] = nfa.add(nfa::State::CheckLookAround { - lookaround_idx: lookaround_index, + lookaround_index, positive, next, }); @@ -722,7 +721,7 @@ impl Builder { self.add(State::Empty { next: StateID::ZERO }) } - /// Add a state which will record that the lookaround with the given index + /// Add a state which will record that the look-around with the given index /// is satisfied at the current position. 
pub fn add_write_lookaround( &mut self, @@ -731,7 +730,7 @@ impl Builder { self.add(State::WriteLookAround { lookaround_index: index }) } - /// Add a state which will check whether the lookaround with the given + /// Add a state which will check whether the look-around with the given /// index is satisfied at the current position. pub fn add_check_lookaround( &mut self, diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index 7848699ed..f4ecd4a09 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -2036,14 +2036,14 @@ mod tests { fn s_write_lookaround(id: usize) -> State { State::WriteLookAround { - lookaround_idx: SmallIndex::new(id) + lookaround_index: SmallIndex::new(id) .expect("look-around index too large"), } } fn s_check_lookaround(id: usize, positive: bool, next: usize) -> State { State::CheckLookAround { - lookaround_idx: SmallIndex::new(id) + lookaround_index: SmallIndex::new(id) .expect("look-around index too large"), positive, next: sid(next), diff --git a/regex-automata/src/nfa/thompson/nfa.rs b/regex-automata/src/nfa/thompson/nfa.rs index 4e499466f..461fae9f5 100644 --- a/regex-automata/src/nfa/thompson/nfa.rs +++ b/regex-automata/src/nfa/thompson/nfa.rs @@ -1100,7 +1100,7 @@ impl NFA { self.0.look_set_prefix_any } - /// Returns how many look-around sub-expressions this nfa contains + /// Returns how many look-around sub-expressions this nfa contains. #[inline] pub fn lookaround_count(&self) -> usize { self.0.lookaround_count @@ -1268,7 +1268,7 @@ pub(super) struct Inner { */ /// How many look-around expression this NFA contains. /// This is needed to initialize the table for storing the result of - /// look-around evaluation + /// look-around evaluation. lookaround_count: usize, /// Heap memory used indirectly by NFA states and other things (like the /// various capturing group representations above). 
Since each state @@ -1385,8 +1385,8 @@ impl Inner { State::Capture { .. } => { self.has_capture = true; } - State::CheckLookAround { lookaround_idx: look_idx, .. } - | State::WriteLookAround { lookaround_idx: look_idx } => { + State::CheckLookAround { lookaround_index: look_idx, .. } + | State::WriteLookAround { lookaround_index: look_idx } => { self.lookaround_count = self.lookaround_count.max(look_idx.as_usize() + 1); } @@ -1566,19 +1566,19 @@ pub enum State { }, /// This is like a match state but for a look-around expression. /// Executing this state will write the current haystack offset into the - /// look-around oracle at index `lookaround_idx`. + /// look-around oracle at index `lookaround_index`. WriteLookAround { /// The index of the look-around expression that matches. - lookaround_idx: SmallIndex, + lookaround_index: SmallIndex, }, - /// This indicates that we need to check whether lookaround expression with - /// index `lookaround_idx` holds at the current position in the haystack - /// If `positive` is false, then the lookaround expression is negative and + /// This indicates that we need to check whether look-around expression with + /// index `lookaround_index` holds at the current position in the haystack. + /// If `positive` is false, then the look-around expression is negative and /// hence must NOT hold. CheckLookAround { /// The index of the look-around expression that must be satisfied. - lookaround_idx: SmallIndex, - /// Whether this is a positive lookaround expression. + lookaround_index: SmallIndex, + /// Whether this is a positive look-around expression. positive: bool, /// The next state to transition if the look-around assertion is /// satisfied. 
@@ -1795,11 +1795,11 @@ impl fmt::Debug for State { State::Look { ref look, next } => { write!(f, "{:?} => {:?}", look, next.as_usize()) } - State::WriteLookAround { lookaround_idx: look_idx } => { + State::WriteLookAround { lookaround_index: look_idx } => { write!(f, "write-look-around({})", look_idx.as_u32()) } State::CheckLookAround { - lookaround_idx: look_idx, + lookaround_index: look_idx, positive, next, } => { diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index cf667940e..6d75d2859 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ b/regex-automata/src/nfa/thompson/pikevm.rs @@ -1772,13 +1772,13 @@ impl PikeVM { } sid = next; } - State::WriteLookAround { lookaround_idx: look_idx } => { + State::WriteLookAround { lookaround_index: look_idx } => { // This is ok since `at` is always less than `usize::MAX`. lookarounds[look_idx] = NonMaxUsize::new(at); return; } State::CheckLookAround { - lookaround_idx: look_idx, + lookaround_index: look_idx, positive, next, } => { @@ -1973,8 +1973,8 @@ pub struct Cache { /// next byte in the haystack. next: ActiveStates, /// This answers the question: "What is the maximum position in the - /// haystack at which lookaround assertion x holds and which is <= to the - /// current position" + /// haystack at which look-around indexed x holds and which is <= to the + /// current position". lookaround: Vec>, } diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index d879c3b88..c47ce041e 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -1819,7 +1819,7 @@ impl LookAround { } } - /// Returns a mutable reference to the inner expression + /// Returns a mutable reference to the inner expression. 
pub fn sub_mut(&mut self) -> &mut Hir { match self { Self::PositiveLookBehind(sub) | Self::NegativeLookBehind(sub) => { @@ -2556,7 +2556,7 @@ impl Properties { look_set_prefix_any: LookSet::singleton(look), look_set_suffix_any: LookSet::singleton(look), // Note, this field represents _general_ lookarounds (ones using - // LookAround) and not simple ones (using Look). + // LookAround) and not assertions (using Look). contains_lookaround_expr: false, // This requires a little explanation. Basically, we don't consider // matching an empty string to be equivalent to matching invalid From 998f705090e0d9c6e6bd893702c86ce3e715a257 Mon Sep 17 00:00:00 2001 From: shilangyu Date: Thu, 3 Apr 2025 20:53:53 +0200 Subject: [PATCH 40/66] Add lookbehind with capture group test --- testdata/lookaround.toml | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/testdata/lookaround.toml b/testdata/lookaround.toml index ecbd76d48..9b34be47b 100644 --- a/testdata/lookaround.toml +++ b/testdata/lookaround.toml @@ -26,34 +26,43 @@ matches = [] name = "lookbehind in quantifier non-repeating" regex = "(?:(?<=c)a)+" haystack = "badacacaea" -matches = [[5,6], [7,8]] +matches = [[5, 6], [7, 8]] [[test]] name = "lookbehind in quantifier repeating" regex = "(?:(?<=a)a)+" haystack = "babaabaaabaaaac" -matches = [[4,5], [7,9], [11,14]] +matches = [[4, 5], [7, 9], [11, 14]] [[test]] name = "lookbehind with quantifier" regex = "(?<=cb+)a" haystack = "acabacbacbbaea" -matches = [[7,8], [11,12]] +matches = [[7, 8], [11, 12]] [[test]] name = "nested lookbehind" regex = "(?<=c[def]+(? 
Date: Thu, 3 Apr 2025 20:59:27 +0200 Subject: [PATCH 41/66] Change how test suite filters tests --- regex-automata/tests/dfa/suite.rs | 18 +++++++----------- regex-automata/tests/hybrid/suite.rs | 17 +++++++---------- .../tests/nfa/thompson/backtrack/suite.rs | 4 ++-- 3 files changed, 16 insertions(+), 23 deletions(-) diff --git a/regex-automata/tests/dfa/suite.rs b/regex-automata/tests/dfa/suite.rs index febded611..aa43cc7e6 100644 --- a/regex-automata/tests/dfa/suite.rs +++ b/regex-automata/tests/dfa/suite.rs @@ -289,20 +289,16 @@ fn compiler( } } } + // Or look-around expressions. + for hir in hirs.iter() { + if hir.properties().contains_lookaround_expr() { + return Ok(CompiledRegex::skip()); + } + } if !configure_regex_builder(test, &mut builder) { return Ok(CompiledRegex::skip()); } - let re = match builder.build_many(regexes) { - Ok(re) => re, - Err(err) - if test.compiles() - && format!("{err}").contains("look-around") => - { - return Ok(CompiledRegex::skip()); - } - Err(err) => return Err(err.into()), - }; - create_matcher(&builder, pre, re) + create_matcher(&builder, pre, builder.build_many(regexes)?) } } diff --git a/regex-automata/tests/hybrid/suite.rs b/regex-automata/tests/hybrid/suite.rs index ee81aca8d..65769f001 100644 --- a/regex-automata/tests/hybrid/suite.rs +++ b/regex-automata/tests/hybrid/suite.rs @@ -180,19 +180,16 @@ fn compiler( } } } + // Or look-around expressions. 
+ for hir in hirs.iter() { + if hir.properties().contains_lookaround_expr() { + return Ok(CompiledRegex::skip()); + } + } if !configure_regex_builder(test, &mut builder) { return Ok(CompiledRegex::skip()); } - let re = match builder.build_many(regexes) { - Ok(re) => re, - Err(err) - if test.compiles() - && format!("{err}").contains("look-around") => - { - return Ok(CompiledRegex::skip()); - } - Err(err) => return Err(err.into()), - }; + let re = builder.build_many(&regexes)?; let mut cache = re.create_cache(); Ok(CompiledRegex::compiled(move |test| -> TestResult { run_test(&re, &mut cache, test) diff --git a/regex-automata/tests/nfa/thompson/backtrack/suite.rs b/regex-automata/tests/nfa/thompson/backtrack/suite.rs index 674ce5039..2dd9d1f1b 100644 --- a/regex-automata/tests/nfa/thompson/backtrack/suite.rs +++ b/regex-automata/tests/nfa/thompson/backtrack/suite.rs @@ -74,7 +74,7 @@ fn min_visited_capacity() -> Result<()> { .configure(config_thompson(test)) .syntax(config_syntax(test)) .build_many(&regexes)?; - // TODO: remove once look-around is supported. + // The backtracker doesn't support lookarounds, so skip if there are any. if nfa.lookaround_count() > 0 { return Ok(CompiledRegex::skip()); } @@ -109,7 +109,7 @@ fn compiler( return Ok(CompiledRegex::skip()); } let re = builder.build_many(&regexes)?; - // TODO: remove once look-around is supported. + // The backtracker doesn't support lookarounds, so skip if there are any.
if re.get_nfa().lookaround_count() > 0 { return Ok(CompiledRegex::skip()); } From f79fbab4a6eab6dbc261ba0fe430d9c19c75b4c9 Mon Sep 17 00:00:00 2001 From: shilangyu Date: Fri, 4 Apr 2025 21:43:09 +0200 Subject: [PATCH 42/66] Change engine fallbacks --- regex-automata/src/dfa/dense.rs | 2 +- regex-automata/src/dfa/onepass.rs | 13 ++- regex-automata/src/hybrid/error.rs | 2 +- regex-automata/src/meta/strategy.rs | 85 ++++++++++--------- regex-automata/src/nfa/thompson/backtrack.rs | 7 +- regex-automata/src/nfa/thompson/compiler.rs | 7 ++ regex-automata/src/nfa/thompson/error.rs | 16 ++++ regex-automata/src/util/determinize/mod.rs | 6 +- .../tests/nfa/thompson/backtrack/suite.rs | 12 ++- 9 files changed, 101 insertions(+), 49 deletions(-) diff --git a/regex-automata/src/dfa/dense.rs b/regex-automata/src/dfa/dense.rs index 8b41d0ae5..43973a92a 100644 --- a/regex-automata/src/dfa/dense.rs +++ b/regex-automata/src/dfa/dense.rs @@ -5084,7 +5084,7 @@ impl BuildError { } pub(crate) fn unsupported_lookaround() -> BuildError { - let msg = "cannot build DFAs for regexes with look-around\ + let msg = "cannot build DFAs for regexes with look-around \ sub-expressions; use a different regex engine"; BuildError { kind: BuildErrorKind::Unsupported(msg) } } diff --git a/regex-automata/src/dfa/onepass.rs b/regex-automata/src/dfa/onepass.rs index 30e4daf06..b75feac45 100644 --- a/regex-automata/src/dfa/onepass.rs +++ b/regex-automata/src/dfa/onepass.rs @@ -602,6 +602,9 @@ impl<'a> InternalBuilder<'a> { )); } assert_eq!(DEAD, self.add_empty_state()?); + if self.nfa.lookaround_count() > 0 { + return Err(BuildError::unsupported_lookaround()); + } // This is where the explicit slots start. We care about this because // we only need to track explicit slots. The implicit slots---two for @@ -640,7 +643,7 @@ impl<'a> InternalBuilder<'a> { match *self.nfa.state(id) { thompson::State::WriteLookAround { .. } | thompson::State::CheckLookAround { .. 
} => { - todo!("check how to handle") + return Err(BuildError::unsupported_lookaround()); } thompson::State::ByteRange { ref trans } => { self.compile_transition(dfa_id, trans, epsilons)?; @@ -3000,6 +3003,7 @@ enum BuildErrorKind { UnsupportedLook { look: Look }, ExceededSizeLimit { limit: usize }, NotOnePass { msg: &'static str }, + UnsupportedLookAround, } impl BuildError { @@ -3030,6 +3034,10 @@ impl BuildError { fn not_one_pass(msg: &'static str) -> BuildError { BuildError { kind: BuildErrorKind::NotOnePass { msg } } } + + fn unsupported_lookaround() -> BuildError { + BuildError { kind: BuildErrorKind::UnsupportedLookAround } + } } #[cfg(feature = "std")] @@ -3078,6 +3086,9 @@ impl core::fmt::Display for BuildError { pattern is not one-pass: {}", msg, ), + UnsupportedLookAround => { + write!(f, "one-pass DFA does not support look-arounds") + } } } } diff --git a/regex-automata/src/hybrid/error.rs b/regex-automata/src/hybrid/error.rs index ae3ae6c53..062b9ac62 100644 --- a/regex-automata/src/hybrid/error.rs +++ b/regex-automata/src/hybrid/error.rs @@ -63,7 +63,7 @@ impl BuildError { } pub(crate) fn unsupported_lookaround() -> BuildError { - let msg = "cannot build DFAs for regexes with look-around\ + let msg = "cannot build DFAs for regexes with look-around \ sub-expressions; use a different regex engine"; BuildError { kind: BuildErrorKind::Unsupported(msg) } } diff --git a/regex-automata/src/meta/strategy.rs b/regex-automata/src/meta/strategy.rs index 04f2ba3c3..0ac830b9d 100644 --- a/regex-automata/src/meta/strategy.rs +++ b/regex-automata/src/meta/strategy.rs @@ -490,49 +490,52 @@ impl Core { // we know we aren't going to use the lazy DFA. So we do a config check // up front, which is in practice the only way we won't try to use the // DFA. 
- let (nfarev, hybrid, dfa) = - if !info.config().get_hybrid() && !info.config().get_dfa() { - (None, wrappers::Hybrid::none(), wrappers::DFA::none()) + let (nfarev, hybrid, dfa) = if !info.config().get_hybrid() + && !info.config().get_dfa() + // With look-arounds, the lazy DFA and dense DFA would fail to build + || nfa.lookaround_count() > 0 + { + (None, wrappers::Hybrid::none(), wrappers::DFA::none()) + } else { + // FIXME: Technically, we don't quite yet KNOW that we need + // a reverse NFA. It's possible for the DFAs below to both + // fail to build just based on the forward NFA. In which case, + // building the reverse NFA was totally wasted work. But... + // fixing this requires breaking DFA construction apart into + // two pieces: one for the forward part and another for the + // reverse part. Quite annoying. Making it worse, when building + // both DFAs fails, it's quite likely that the NFA is large and + // that it will take quite some time to build the reverse NFA + // too. So... it's really probably worth it to do this! + let nfarev = thompson::Compiler::new() + // Currently, reverse NFAs don't support capturing groups, + // so we MUST disable them. But even if we didn't have to, + // we would, because nothing in this crate does anything + // useful with capturing groups in reverse. And of course, + // the lazy DFA ignores capturing groups in all cases. + .configure( + thompson_config + .clone() + .which_captures(WhichCaptures::None) + .reverse(true), + ) + .build_many_from_hir(hirs) + .map_err(BuildError::nfa)?; + let dfa = if !info.config().get_dfa() { + wrappers::DFA::none() } else { - // FIXME: Technically, we don't quite yet KNOW that we need - // a reverse NFA. It's possible for the DFAs below to both - // fail to build just based on the forward NFA. In which case, - // building the reverse NFA was totally wasted work. But... 
- // fixing this requires breaking DFA construction apart into - // two pieces: one for the forward part and another for the - // reverse part. Quite annoying. Making it worse, when building - // both DFAs fails, it's quite likely that the NFA is large and - // that it will take quite some time to build the reverse NFA - // too. So... it's really probably worth it to do this! - let nfarev = thompson::Compiler::new() - // Currently, reverse NFAs don't support capturing groups, - // so we MUST disable them. But even if we didn't have to, - // we would, because nothing in this crate does anything - // useful with capturing groups in reverse. And of course, - // the lazy DFA ignores capturing groups in all cases. - .configure( - thompson_config - .clone() - .which_captures(WhichCaptures::None) - .reverse(true), - ) - .build_many_from_hir(hirs) - .map_err(BuildError::nfa)?; - let dfa = if !info.config().get_dfa() { - wrappers::DFA::none() - } else { - wrappers::DFA::new(&info, pre.clone(), &nfa, &nfarev) - }; - let hybrid = if !info.config().get_hybrid() { - wrappers::Hybrid::none() - } else if dfa.is_some() { - debug!("skipping lazy DFA because we have a full DFA"); - wrappers::Hybrid::none() - } else { - wrappers::Hybrid::new(&info, pre.clone(), &nfa, &nfarev) - }; - (Some(nfarev), hybrid, dfa) + wrappers::DFA::new(&info, pre.clone(), &nfa, &nfarev) }; + let hybrid = if !info.config().get_hybrid() { + wrappers::Hybrid::none() + } else if dfa.is_some() { + debug!("skipping lazy DFA because we have a full DFA"); + wrappers::Hybrid::none() + } else { + wrappers::Hybrid::new(&info, pre.clone(), &nfa, &nfarev) + }; + (Some(nfarev), hybrid, dfa) + }; Ok(Core { info, pre, diff --git a/regex-automata/src/nfa/thompson/backtrack.rs b/regex-automata/src/nfa/thompson/backtrack.rs index 98a5b5c1e..eb36d1829 100644 --- a/regex-automata/src/nfa/thompson/backtrack.rs +++ b/regex-automata/src/nfa/thompson/backtrack.rs @@ -301,6 +301,9 @@ impl Builder { nfa: NFA, ) -> Result { 
nfa.look_set_any().available().map_err(BuildError::word)?; + if nfa.lookaround_count() > 0 { + return Err(BuildError::unsupported_lookarounds()); + } Ok(BoundedBacktracker { config: self.config.clone(), nfa }) } @@ -1521,7 +1524,9 @@ impl BoundedBacktracker { } State::WriteLookAround { .. } | State::CheckLookAround { .. } => { - todo!("check how to handle") + unimplemented!( + "backtracking engine does not support look-arounds" + ); } State::Union { ref alternates } => { sid = match alternates.get(0) { diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index f4ecd4a09..7526ab467 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -954,6 +954,13 @@ impl Compiler { { return Err(BuildError::unsupported_captures()); } + if self.config.get_reverse() + && exprs.iter().any(|e| { + (e.borrow() as &Hir).properties().contains_lookaround_expr() + }) + { + return Err(BuildError::unsupported_lookarounds()); + } self.builder.borrow_mut().clear(); self.builder.borrow_mut().set_utf8(self.config.get_utf8()); diff --git a/regex-automata/src/nfa/thompson/error.rs b/regex-automata/src/nfa/thompson/error.rs index a1f5aed5c..d2b8c796c 100644 --- a/regex-automata/src/nfa/thompson/error.rs +++ b/regex-automata/src/nfa/thompson/error.rs @@ -81,6 +81,13 @@ enum BuildErrorKind { /// should support it at some point. #[cfg(feature = "syntax")] UnsupportedCaptures, + /// An error that occurs when one tries to build a reverse NFA with + /// look-around sub-expressions. Currently, this isn't supported, but we + /// probably should support it at some point. + /// + /// This is also emitted by the backtracking engine which does not + /// support look-around sub-expressions. 
+ UnsupportedLookArounds, } impl BuildError { @@ -142,6 +149,10 @@ impl BuildError { pub(crate) fn unsupported_captures() -> BuildError { BuildError { kind: BuildErrorKind::UnsupportedCaptures } } + + pub(crate) fn unsupported_lookarounds() -> BuildError { + BuildError { kind: BuildErrorKind::UnsupportedLookArounds } + } } #[cfg(feature = "std")] @@ -201,6 +212,11 @@ impl core::fmt::Display for BuildError { "currently captures must be disabled when compiling \ a reverse NFA", ), + BuildErrorKind::UnsupportedLookArounds => write!( + f, + "currently look-around sub-expressions cannot be in the pattern \ + when compiling a reverse NFA or using the backtracking engine", + ), } } } diff --git a/regex-automata/src/util/determinize/mod.rs b/regex-automata/src/util/determinize/mod.rs index 80f57bbe6..bdcb4e025 100644 --- a/regex-automata/src/util/determinize/mod.rs +++ b/regex-automata/src/util/determinize/mod.rs @@ -253,7 +253,7 @@ pub(crate) fn next( | thompson::State::Capture { .. } => {} thompson::State::CheckLookAround { .. } | thompson::State::WriteLookAround { .. } => { - todo!("check how to handle") + unimplemented!("look-around support in DFA") } thompson::State::Match { pattern_id } => { // Notice here that we are calling the NEW state a match @@ -405,7 +405,7 @@ pub(crate) fn epsilon_closure( | thompson::State::Match { .. } => break, thompson::State::WriteLookAround { .. } | thompson::State::CheckLookAround { .. } => { - todo!("check how to handle") + unimplemented!("look-around support in DFA") } thompson::State::Look { look, next } => { if !look_have.contains(look) { @@ -475,7 +475,7 @@ pub(crate) fn add_nfa_states( } thompson::State::CheckLookAround { .. } | thompson::State::WriteLookAround { .. } => { - todo!("check how to handle") + unimplemented!("look-around support in DFA") } thompson::State::Union { .. } | thompson::State::BinaryUnion { .. 
} => { diff --git a/regex-automata/tests/nfa/thompson/backtrack/suite.rs b/regex-automata/tests/nfa/thompson/backtrack/suite.rs index 2dd9d1f1b..7be175f04 100644 --- a/regex-automata/tests/nfa/thompson/backtrack/suite.rs +++ b/regex-automata/tests/nfa/thompson/backtrack/suite.rs @@ -108,7 +108,17 @@ fn compiler( if !configure_backtrack_builder(test, &mut builder) { return Ok(CompiledRegex::skip()); } - let re = builder.build_many(&regexes)?; + let re = match builder.build_many(&regexes) { + Ok(re) => re, + // Due to errors being opaque, we need to check the error message to skip tests with look-arounds + Err(err) => { + if test.compiles() && err.to_string().contains("look-around") { + return Ok(CompiledRegex::skip()); + } + + return Err(err.into()); + } + }; // The backtracker doesn't support lookarounds, so skip if there are any. if re.get_nfa().lookaround_count() > 0 { return Ok(CompiledRegex::skip()); From 77637b4c8f54c6e20216f280fbdf971b017a9e2a Mon Sep 17 00:00:00 2001 From: shilangyu Date: Fri, 4 Apr 2025 21:46:39 +0200 Subject: [PATCH 43/66] Rename lookaround_index --- regex-automata/src/nfa/thompson/nfa.rs | 18 +++++++----------- regex-automata/src/nfa/thompson/pikevm.rs | 8 ++++---- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/regex-automata/src/nfa/thompson/nfa.rs b/regex-automata/src/nfa/thompson/nfa.rs index 461fae9f5..2657540cb 100644 --- a/regex-automata/src/nfa/thompson/nfa.rs +++ b/regex-automata/src/nfa/thompson/nfa.rs @@ -1385,10 +1385,10 @@ impl Inner { State::Capture { .. } => { self.has_capture = true; } - State::CheckLookAround { lookaround_index: look_idx, .. } - | State::WriteLookAround { lookaround_index: look_idx } => { + State::CheckLookAround { lookaround_index, .. } + | State::WriteLookAround { lookaround_index } => { self.lookaround_count = - self.lookaround_count.max(look_idx.as_usize() + 1); + self.lookaround_count.max(lookaround_index.as_usize() + 1); } State::Union { .. } | State::BinaryUnion { .. 
} @@ -1795,18 +1795,14 @@ impl fmt::Debug for State { State::Look { ref look, next } => { write!(f, "{:?} => {:?}", look, next.as_usize()) } - State::WriteLookAround { lookaround_index: look_idx } => { - write!(f, "write-look-around({})", look_idx.as_u32()) + State::WriteLookAround { lookaround_index } => { + write!(f, "write-look-around({})", lookaround_index.as_u32()) } - State::CheckLookAround { - lookaround_index: look_idx, - positive, - next, - } => { + State::CheckLookAround { lookaround_index, positive, next } => { write!( f, "check-look-around({} is {}) => {}", - look_idx.as_u32(), + lookaround_index.as_u32(), if positive { "matched" } else { "not matched" }, next.as_usize() ) diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index 6d75d2859..eb40bf1a9 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ b/regex-automata/src/nfa/thompson/pikevm.rs @@ -1772,17 +1772,17 @@ impl PikeVM { } sid = next; } - State::WriteLookAround { lookaround_index: look_idx } => { + State::WriteLookAround { lookaround_index } => { // This is ok since `at` is always less than `usize::MAX`. 
- lookarounds[look_idx] = NonMaxUsize::new(at); + lookarounds[lookaround_index] = NonMaxUsize::new(at); return; } State::CheckLookAround { - lookaround_index: look_idx, + lookaround_index, positive, next, } => { - let state = match lookarounds[look_idx] { + let state = match lookarounds[lookaround_index] { None => usize::MAX, Some(pos) => pos.get(), }; From ee95174d4b158961da52cefbc78b55f341ce5e13 Mon Sep 17 00:00:00 2001 From: shilangyu Date: Fri, 4 Apr 2025 21:56:46 +0200 Subject: [PATCH 44/66] Fix literals tests --- regex-syntax/src/hir/literal.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/regex-syntax/src/hir/literal.rs b/regex-syntax/src/hir/literal.rs index 584c2893b..f419dd70e 100644 --- a/regex-syntax/src/hir/literal.rs +++ b/regex-syntax/src/hir/literal.rs @@ -2456,16 +2456,16 @@ mod tests { #[test] fn lookaround() { - assert_eq!(exact(["ab"]), e(r"a(?<=qwa)b")); - assert_eq!(exact(["ab"]), e(r"a(? Date: Sat, 5 Apr 2025 08:00:55 +0200 Subject: [PATCH 45/66] Fix anchors in lookarounds --- regex-automata/src/nfa/thompson/compiler.rs | 23 +++++++++++++++++++++ regex-syntax/src/hir/mod.rs | 2 ++ testdata/lookaround.toml | 6 ++++++ 3 files changed, 31 insertions(+) diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index 7526ab467..7a9393d1e 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -2158,6 +2158,29 @@ mod tests { ); } + #[test] + fn compile_yes_unanchored_prefix_with_start_anchor_in_lookaround() { + let nfa = NFA::compiler() + .configure(NFA::config().which_captures(WhichCaptures::None)) + .build(r"(?<=^)a") + .unwrap(); + assert_eq!( + nfa.states(), + &[ + s_bin_union(2, 1), + s_range(0, 255, 0), + s_bin_union(3, 6), + s_bin_union(5, 4), + s_range(0, 255, 3), + s_look(Look::Start, 7), + s_check_lookaround(0, true, 8), + s_write_lookaround(0), + s_byte(b'a', 9), + s_match(0) + ] + ); + } + #[test] fn 
compile_empty() { assert_eq!(build("").states(), &[s_match(0),]); diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index c47ce041e..4d1bf66e1 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -2589,6 +2589,8 @@ impl Properties { literal: false, alternation_literal: false, contains_lookaround_expr: true, + look_set_prefix: LookSet::empty(), + look_set_suffix: LookSet::empty(), ..*sub_p.0.clone() }; Properties(Box::new(inner)) diff --git a/testdata/lookaround.toml b/testdata/lookaround.toml index 9b34be47b..8818a8f1a 100644 --- a/testdata/lookaround.toml +++ b/testdata/lookaround.toml @@ -58,6 +58,12 @@ regex = "(?<=c+)a|(?<=d+)a" haystack = "aabacadaccaddaea" matches = [[5, 6], [7, 8], [10, 11], [13, 14]] +[[test]] +name = "lookbehind with anchor" +regex = "(?<=^c)a" +haystack = "cacacaasdacabasdqwe" +matches = [[1, 2]] + [[test]] name = "lookbehind next to capture group" regex = "(?<=c)(a|b)(b|a)" From 0a882cdc5373c7cd6898f980d505aca3707cc91d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Robin=20H=C3=A4nni?= Date: Thu, 10 Apr 2025 16:00:08 +0200 Subject: [PATCH 46/66] Fix broken doc link --- regex-syntax/src/hir/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 4d1bf66e1..a1528c88f 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -2138,7 +2138,7 @@ impl Properties { /// Returns whether there are any look-around expressions in this HIR value. /// /// Only returns true for [`HirKind::LookAround`] and not for - /// [`HirKind::Look`], which can be queried by [`look_set`] instead. + /// [`HirKind::Look`], which can be queried by [`look_set`](Properties::look_set) instead. 
#[inline] pub fn contains_lookaround_expr(&self) -> bool { self.0.contains_lookaround_expr From 6fee5f66ef01f554e6415eadb030ed6220e76a41 Mon Sep 17 00:00:00 2001 From: shilangyu Date: Thu, 17 Apr 2025 10:47:46 +0200 Subject: [PATCH 47/66] Remove unneeded if condition --- regex-automata/tests/nfa/thompson/backtrack/suite.rs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/regex-automata/tests/nfa/thompson/backtrack/suite.rs b/regex-automata/tests/nfa/thompson/backtrack/suite.rs index 7be175f04..b0aa0fc6c 100644 --- a/regex-automata/tests/nfa/thompson/backtrack/suite.rs +++ b/regex-automata/tests/nfa/thompson/backtrack/suite.rs @@ -119,10 +119,6 @@ fn compiler( return Err(err.into()); } }; - // The backtracker doesn't support lookarounds, so skip if there are any. - if re.get_nfa().lookaround_count() > 0 { - return Ok(CompiledRegex::skip()); - } let mut cache = re.create_cache(); Ok(CompiledRegex::compiled(move |test| -> TestResult { run_test(&re, &mut cache, test) From 70cd9a054778b5ddfc22a67a385b751145ca0206 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Robin=20H=C3=A4nni?= Date: Thu, 1 May 2025 11:43:13 +0200 Subject: [PATCH 48/66] Explain use of empty look-set --- regex-syntax/src/hir/mod.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index a1528c88f..b812554f5 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -2589,6 +2589,8 @@ impl Properties { literal: false, alternation_literal: false, contains_lookaround_expr: true, + // We do not want look-around subexpressions to influence matching + // of the main expression when they contain anchors, so we clear the set. 
look_set_prefix: LookSet::empty(), look_set_suffix: LookSet::empty(), ..*sub_p.0.clone() From eebbcaceb1b7274a6e0f4815fea3c7c575d10c67 Mon Sep 17 00:00:00 2001 From: Robin Date: Wed, 16 Apr 2025 11:42:58 +0200 Subject: [PATCH 49/66] Add regression tests --- testdata/lookaround.toml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/testdata/lookaround.toml b/testdata/lookaround.toml index 8818a8f1a..14a303d7c 100644 --- a/testdata/lookaround.toml +++ b/testdata/lookaround.toml @@ -46,6 +46,18 @@ regex = "(?<=c[def]+(? Date: Wed, 16 Apr 2025 16:52:11 +0200 Subject: [PATCH 50/66] Change compilation to disconnected components --- regex-automata/src/nfa/thompson/builder.rs | 9 +++ regex-automata/src/nfa/thompson/compiler.rs | 79 +++++++-------------- regex-automata/src/nfa/thompson/nfa.rs | 9 +++ 3 files changed, 43 insertions(+), 54 deletions(-) diff --git a/regex-automata/src/nfa/thompson/builder.rs b/regex-automata/src/nfa/thompson/builder.rs index c769fda23..e4b6ff665 100644 --- a/regex-automata/src/nfa/thompson/builder.rs +++ b/regex-automata/src/nfa/thompson/builder.rs @@ -340,6 +340,8 @@ pub struct Builder { /// contains a single regex, then `start_pattern[0]` and `start_anchored` /// are always equivalent. start_pattern: Vec, + /// The starting states for each individual look-behind sub-expression. + start_look_behind: Vec, /// A map from pattern ID to capture group index to name. (If no name /// exists, then a None entry is present. Thus, all capturing groups are /// present in this mapping.) @@ -449,6 +451,7 @@ impl Builder { remap.resize(self.states.len(), StateID::ZERO); nfa.set_starts(start_anchored, start_unanchored, &self.start_pattern); + nfa.set_look_behind_starts(self.start_look_behind.as_slice()); nfa.set_captures(&self.captures).map_err(BuildError::captures)?; // The idea here is to convert our intermediate states to their final // form. 
The only real complexity here is the process of converting @@ -706,6 +709,12 @@ impl Builder { self.start_pattern.len() } + /// Adds the [`start_id`] to the set of starting states that is used when + /// running look-behind expressions. + pub fn start_look_behind(&mut self, start_id: StateID) { + self.start_look_behind.push(start_id); + } + /// Add an "empty" NFA state. /// /// An "empty" NFA state is a state with a single unconditional epsilon diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index 7a9393d1e..5a7bccd72 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -711,11 +711,6 @@ pub struct Compiler { /// State used for caching common suffixes when compiling reverse UTF-8 /// automata (for Unicode character classes). utf8_suffix: RefCell, - /// Top level alternation state which is used to run all look-around - /// assertion checks in lockstep with the main expression. Each look-around - /// expression is compiled to a set of states that is patched into this - /// state, and this state is updated on each new pattern being compiled. - lookaround_alt: RefCell>, /// The next index to use for a look-around expression. lookaround_index: RefCell, } @@ -730,7 +725,6 @@ impl Compiler { utf8_state: RefCell::new(Utf8State::new()), trie_state: RefCell::new(RangeTrie::new()), utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)), - lookaround_alt: RefCell::new(None), lookaround_index: RefCell::new(SmallIndex::ZERO), } } @@ -993,32 +987,11 @@ impl Compiler { let compiled = self.c_alt_iter(exprs.iter().map(|e| { let _ = self.start_pattern()?; - let has_lookarounds = - (e.borrow() as &Hir).properties().contains_lookaround_expr(); - let mut top_level_alt = if has_lookarounds { - self.add_union()? 
- } else { - StateID::ZERO - }; - if has_lookarounds { - let lookaround_prefix = - self.c_at_least(&Hir::dot(hir::Dot::AnyByte), false, 0)?; - let lookaround_alt = self.add_union()?; - self.patch(lookaround_prefix.end, lookaround_alt)?; - self.patch(top_level_alt, lookaround_prefix.start)?; - self.lookaround_alt.borrow_mut().replace(lookaround_alt); - } let one = self.c_cap(0, None, e.borrow())?; let match_state_id = self.add_match()?; self.patch(one.end, match_state_id)?; - if has_lookarounds { - self.patch(top_level_alt, one.start)?; - } else { - top_level_alt = one.start; - } - let _ = self.finish_pattern(top_level_alt)?; - self.lookaround_alt.borrow_mut().take(); - Ok(ThompsonRef { start: top_level_alt, end: match_state_id }) + let _ = self.finish_pattern(one.start)?; + Ok(ThompsonRef { start: one.start, end: match_state_id }) }))?; self.patch(unanchored_prefix.end, compiled.start)?; let nfa = self @@ -1052,25 +1025,25 @@ impl Compiler { &self, lookaround: &LookAround, ) -> Result { - let sub = self.c(lookaround.sub())?; - let pos = match lookaround { - LookAround::NegativeLookBehind(_) => false, - LookAround::PositiveLookBehind(_) => true, - }; let idx = *self.lookaround_index.borrow(); *self.lookaround_index.borrow_mut() = SmallIndex::new(idx.one_more()) .map_err(|e| { BuildError::too_many_lookarounds(e.attempted() as usize) })?; + let pos = match lookaround { + LookAround::NegativeLookBehind(_) => false, + LookAround::PositiveLookBehind(_) => true, + }; let check = self.add_check_lookaround(idx, pos)?; + + let unanchored = + self.c_at_least(&Hir::dot(hir::Dot::AnyByte), false, 0)?; + + let sub = self.c(lookaround.sub())?; let write = self.add_write_lookaround(idx)?; + self.patch(unanchored.end, sub.start)?; self.patch(sub.end, write)?; - self.patch( - self.lookaround_alt - .borrow() - .expect("Cannot compile look-around outside pattern"), - sub.start, - )?; + self.builder.borrow_mut().start_look_behind(unanchored.start); Ok(ThompsonRef { start: check, end: 
check }) } @@ -2169,13 +2142,12 @@ mod tests { &[ s_bin_union(2, 1), s_range(0, 255, 0), - s_bin_union(3, 6), + s_check_lookaround(0, true, 7), s_bin_union(5, 4), s_range(0, 255, 3), - s_look(Look::Start, 7), - s_check_lookaround(0, true, 8), + s_look(Look::Start, 6), s_write_lookaround(0), - s_byte(b'a', 9), + s_byte(b'a', 8), s_match(0) ] ); @@ -2310,11 +2282,10 @@ mod tests { assert_eq!( build(r"(?<=a)").states(), &[ - s_bin_union(1, 4), + s_check_lookaround(0, true, 5), s_bin_union(3, 2), s_range(b'\x00', b'\xFF', 1), - s_byte(b'a', 5), - s_check_lookaround(0, true, 6), + s_byte(b'a', 4), s_write_lookaround(0), s_match(0) ] @@ -2322,16 +2293,16 @@ mod tests { assert_eq!( build(r"(?<=a(?, /// Heap memory used indirectly by NFA states and other things (like the /// various capturing group representations above). Since each state /// might use a different amount of heap, we need to keep track of this @@ -1419,6 +1421,13 @@ impl Inner { self.start_pattern = start_pattern.to_vec(); } + pub(super) fn set_look_behind_starts( + &mut self, + look_behind_starts: &[StateID], + ) { + self.start_look_behind = look_behind_starts.to_vec(); + } + /// Sets the UTF-8 mode of this NFA. 
pub(super) fn set_utf8(&mut self, yes: bool) { self.utf8 = yes; From a24979a1fe39c38121dce9025c8327360e57c6af Mon Sep 17 00:00:00 2001 From: Robin Date: Thu, 17 Apr 2025 14:53:49 +0200 Subject: [PATCH 51/66] Implement look-behind state processing --- regex-automata/src/nfa/thompson/builder.rs | 1 + regex-automata/src/nfa/thompson/nfa.rs | 9 ++ regex-automata/src/nfa/thompson/pikevm.rs | 103 +++++++++++++++++++++ 3 files changed, 113 insertions(+) diff --git a/regex-automata/src/nfa/thompson/builder.rs b/regex-automata/src/nfa/thompson/builder.rs index e4b6ff665..e2f8bf2ad 100644 --- a/regex-automata/src/nfa/thompson/builder.rs +++ b/regex-automata/src/nfa/thompson/builder.rs @@ -387,6 +387,7 @@ impl Builder { self.pattern_id = None; self.states.clear(); self.start_pattern.clear(); + self.start_look_behind.clear(); self.captures.clear(); self.memory_states = 0; } diff --git a/regex-automata/src/nfa/thompson/nfa.rs b/regex-automata/src/nfa/thompson/nfa.rs index 219dba657..42904d5f2 100644 --- a/regex-automata/src/nfa/thompson/nfa.rs +++ b/regex-automata/src/nfa/thompson/nfa.rs @@ -1106,6 +1106,12 @@ impl NFA { self.0.lookaround_count } + /// Returns the starting states for initializing look-behind evaluation + #[inline] + pub fn look_behind_starts(&self) -> &Vec { + &self.0.start_look_behind + } + // FIXME: The `look_set_prefix_all` computation was not correct, and it // seemed a little tricky to fix it. 
Since I wasn't actually using it for // anything, I just decided to remove it in the run up to the regex 1.9 @@ -1481,6 +1487,9 @@ impl Inner { for id in self.start_pattern.iter_mut() { *id = old_to_new[*id]; } + for id in self.start_look_behind.iter_mut() { + *id = old_to_new[*id]; + } } } diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index eb40bf1a9..746086d08 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ b/regex-automata/src/nfa/thompson/pikevm.rs @@ -1263,7 +1263,46 @@ impl PikeVM { ref mut curr, ref mut next, ref mut lookaround, + ref mut curr_lookaround, + ref mut next_lookaround, } = cache; + + // This initializes the look-behind threads from the start of the input + // Note: since capture groups are not allowed inside look-behinds, + // there won't be any Capture epsilon transitions and hence it is ok to + // use &mut [] for the slots parameter. We need to add the start states + // in reverse because nested look-behinds have a higher index but must + // be executed first. + for look_behind_start in self.nfa.look_behind_starts() { + self.epsilon_closure( + stack, + &mut [], + curr_lookaround, + lookaround, + input, + 0, + *look_behind_start, + ); + } + + // This brings the look-behind threads into the state they must be for + // starting at input.start() instead of the beginning. This is + // necessary for lookbehinds to be able to match outside of the input + // span. + for lb_at in 0..input.start() { + self.nexts( + stack, + curr_lookaround, + next_lookaround, + lookaround, + input, + lb_at, + &mut [], + ); + core::mem::swap(curr_lookaround, next_lookaround); + next_lookaround.set.clear(); + } + let mut hm = None; // Yes, our search doesn't end at input.end(), but includes it. 
This // is necessary because matches are delayed by one byte, just like @@ -1374,6 +1413,17 @@ impl PikeVM { stack, slots, curr, lookaround, input, at, start_id, ); } + // The lookbehind states must be processed first, since their + // result must be available for the processing of the main states. + self.nexts( + stack, + curr_lookaround, + next_lookaround, + lookaround, + input, + at, + &mut [], + ); if let Some(pid) = self.nexts(stack, curr, next, lookaround, input, at, slots) { @@ -1387,7 +1437,9 @@ impl PikeVM { break; } core::mem::swap(curr, next); + core::mem::swap(curr_lookaround, next_lookaround); next.set.clear(); + next_lookaround.set.clear(); at += 1; } instrument!(|c| c.eprint(&self.nfa)); @@ -1442,7 +1494,34 @@ impl PikeVM { ref mut curr, ref mut next, ref mut lookaround, + ref mut curr_lookaround, + ref mut next_lookaround, } = cache; + + for look_behind_start in self.nfa.look_behind_starts() { + self.epsilon_closure( + stack, + &mut [], + curr_lookaround, + lookaround, + input, + 0, + *look_behind_start, + ); + } + for lb_at in 0..input.start() { + self.nexts( + stack, + curr_lookaround, + next_lookaround, + lookaround, + input, + lb_at, + &mut [], + ); + core::mem::swap(curr_lookaround, next_lookaround); + next_lookaround.set.clear(); + } for at in input.start()..=input.end() { let any_matches = !patset.is_empty(); if curr.set.is_empty() { @@ -1459,6 +1538,15 @@ impl PikeVM { stack, slots, curr, lookaround, input, at, start_id, ); } + self.nexts( + stack, + curr_lookaround, + next_lookaround, + lookaround, + input, + at, + &mut [], + ); self.nexts_overlapping( stack, curr, next, lookaround, input, at, patset, ); @@ -1470,7 +1558,9 @@ impl PikeVM { break; } core::mem::swap(curr, next); + core::mem::swap(curr_lookaround, next_lookaround); next.set.clear(); + next_lookaround.set.clear(); } instrument!(|c| c.eprint(&self.nfa)); } @@ -1976,6 +2066,10 @@ pub struct Cache { /// haystack at which look-around indexed x holds and which is <= to the /// 
current position". lookaround: Vec>, + /// The current active states for look-behind subexpressions + curr_lookaround: ActiveStates, + /// The next set of states to be explored for look-behind subexpressions + next_lookaround: ActiveStates, } impl Cache { @@ -1993,6 +2087,8 @@ impl Cache { curr: ActiveStates::new(re), next: ActiveStates::new(re), lookaround: vec![None; re.lookaround_count()], + curr_lookaround: ActiveStates::new(re), + next_lookaround: ActiveStates::new(re), } } @@ -2036,6 +2132,9 @@ impl Cache { pub fn reset(&mut self, re: &PikeVM) { self.curr.reset(re); self.next.reset(re); + self.curr_lookaround.reset(re); + self.next_lookaround.reset(re); + self.lookaround = vec![None; re.lookaround_count()]; } /// Returns the heap memory usage, in bytes, of this cache. @@ -2063,6 +2162,10 @@ impl Cache { self.stack.clear(); self.curr.setup_search(captures_slot_len); self.next.setup_search(captures_slot_len); + // capture groups are not allowed inside look-arounds, so we + // set the slot-length to zero. 
+ self.curr_lookaround.setup_search(0); + self.next_lookaround.setup_search(0); } } From b4cb71b08d9c2bae6ef9893a27ba0398d7523c5a Mon Sep 17 00:00:00 2001 From: Robin Date: Thu, 17 Apr 2025 14:56:11 +0200 Subject: [PATCH 52/66] Show look-behind starts in nfa debug print --- regex-automata/src/nfa/thompson/nfa.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/regex-automata/src/nfa/thompson/nfa.rs b/regex-automata/src/nfa/thompson/nfa.rs index 42904d5f2..2ac69c761 100644 --- a/regex-automata/src/nfa/thompson/nfa.rs +++ b/regex-automata/src/nfa/thompson/nfa.rs @@ -1501,6 +1501,8 @@ impl fmt::Debug for Inner { '^' } else if sid == self.start_unanchored { '>' + } else if self.start_look_behind.contains(&sid) { + '<' } else { ' ' }; From 26bcd72a797bcb7b0bc8ddb4c32cca3d2b4169a1 Mon Sep 17 00:00:00 2001 From: Robin Date: Thu, 17 Apr 2025 15:44:24 +0200 Subject: [PATCH 53/66] Fix doc-link --- regex-automata/src/nfa/thompson/builder.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-automata/src/nfa/thompson/builder.rs b/regex-automata/src/nfa/thompson/builder.rs index e2f8bf2ad..4f2f9af79 100644 --- a/regex-automata/src/nfa/thompson/builder.rs +++ b/regex-automata/src/nfa/thompson/builder.rs @@ -710,7 +710,7 @@ impl Builder { self.start_pattern.len() } - /// Adds the [`start_id`] to the set of starting states that is used when + /// Adds the `start_id` to the set of starting states that is used when /// running look-behind expressions. 
pub fn start_look_behind(&mut self, start_id: StateID) { self.start_look_behind.push(start_id); From 5f966e37359c86e1cdda29d1b7df34ebd5d520a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Robin=20H=C3=A4nni?= Date: Wed, 23 Apr 2025 10:27:11 +0200 Subject: [PATCH 54/66] Fix memory usage calculation --- regex-automata/src/nfa/thompson/pikevm.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index 746086d08..fb92e6e4d 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ b/regex-automata/src/nfa/thompson/pikevm.rs @@ -2146,6 +2146,8 @@ impl Cache { (self.stack.len() * size_of::()) + self.curr.memory_usage() + self.next.memory_usage() + + self.curr_lookaround.memory_usage() + + self.next_lookaround.memory_usage() } /// Clears this cache. This should be called at the start of every search From 51dd1a4a0d6e2795b0adeb7a17dfe2ebee474bed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Robin=20H=C3=A4nni?= Date: Thu, 1 May 2025 11:52:23 +0200 Subject: [PATCH 55/66] Fix spelling --- regex-automata/src/nfa/thompson/nfa.rs | 4 ++-- regex-automata/src/nfa/thompson/pikevm.rs | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/regex-automata/src/nfa/thompson/nfa.rs b/regex-automata/src/nfa/thompson/nfa.rs index 2ac69c761..1d63bd64a 100644 --- a/regex-automata/src/nfa/thompson/nfa.rs +++ b/regex-automata/src/nfa/thompson/nfa.rs @@ -1106,7 +1106,7 @@ impl NFA { self.0.lookaround_count } - /// Returns the starting states for initializing look-behind evaluation + /// Returns the starting states for initializing look-behind evaluation. #[inline] pub fn look_behind_starts(&self) -> &Vec { &self.0.start_look_behind @@ -1276,7 +1276,7 @@ pub(super) struct Inner { /// This is needed to initialize the table for storing the result of /// look-around evaluation. 
lookaround_count: usize, - /// Contains the start states for each of the look-behind subexpressions + /// Contains the start states for each of the look-behind subexpressions. start_look_behind: Vec, /// Heap memory used indirectly by NFA states and other things (like the /// various capturing group representations above). Since each state diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index fb92e6e4d..b3e6e45c9 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ b/regex-automata/src/nfa/thompson/pikevm.rs @@ -1287,7 +1287,7 @@ impl PikeVM { // This brings the look-behind threads into the state they must be for // starting at input.start() instead of the beginning. This is - // necessary for lookbehinds to be able to match outside of the input + // necessary for look-behinds to be able to match outside of the input // span. for lb_at in 0..input.start() { self.nexts( @@ -1413,7 +1413,7 @@ impl PikeVM { stack, slots, curr, lookaround, input, at, start_id, ); } - // The lookbehind states must be processed first, since their + // The look-behind states must be processed first, since their // result must be available for the processing of the main states. self.nexts( stack, @@ -2066,9 +2066,9 @@ pub struct Cache { /// haystack at which look-around indexed x holds and which is <= to the /// current position". lookaround: Vec>, - /// The current active states for look-behind subexpressions + /// The current active states for look-behind subexpressions. curr_lookaround: ActiveStates, - /// The next set of states to be explored for look-behind subexpressions + /// The next set of states to be explored for look-behind subexpressions. 
next_lookaround: ActiveStates, } From 2419b122c68a4474b32eb63b81e439e0da9404bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Robin=20H=C3=A4nni?= Date: Wed, 23 Apr 2025 10:27:48 +0200 Subject: [PATCH 56/66] Implement matchall performance improvement --- regex-automata/src/nfa/thompson/pikevm.rs | 121 +++++++++++++++------- 1 file changed, 86 insertions(+), 35 deletions(-) diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index b3e6e45c9..e57a54d7e 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ b/regex-automata/src/nfa/thompson/pikevm.rs @@ -891,6 +891,7 @@ impl PikeVM { cache: &'c mut Cache, input: I, ) -> FindMatches<'r, 'c, 'h> { + cache.keep_lookaround_state(true); let caps = Captures::matches(self.get_nfa().group_info().clone()); let it = iter::Searcher::new(input.into()); FindMatches { re: self, cache, caps, it } @@ -934,6 +935,7 @@ impl PikeVM { cache: &'c mut Cache, input: I, ) -> CapturesMatches<'r, 'c, 'h> { + cache.keep_lookaround_state(true); let caps = self.create_captures(); let it = iter::Searcher::new(input.into()); CapturesMatches { re: self, cache, caps, it } @@ -1265,42 +1267,48 @@ impl PikeVM { ref mut lookaround, ref mut curr_lookaround, ref mut next_lookaround, + ref mut match_lookaround, + ref keep_lookaround_state, } = cache; - // This initializes the look-behind threads from the start of the input - // Note: since capture groups are not allowed inside look-behinds, - // there won't be any Capture epsilon transitions and hence it is ok to - // use &mut [] for the slots parameter. We need to add the start states - // in reverse because nested look-behinds have a higher index but must - // be executed first. 
- for look_behind_start in self.nfa.look_behind_starts() { - self.epsilon_closure( - stack, - &mut [], - curr_lookaround, - lookaround, - input, - 0, - *look_behind_start, - ); - } + if let Some(active) = match_lookaround { + *curr_lookaround = active.clone(); + } else { + // This initializes the look-behind threads from the start of the input + // Note: since capture groups are not allowed inside look-behinds, + // there won't be any Capture epsilon transitions and hence it is ok to + // use &mut [] for the slots parameter. We need to add the start states + // in reverse because nested look-behinds have a higher index but must + // be executed first. + for look_behind_start in self.nfa.look_behind_starts() { + self.epsilon_closure( + stack, + &mut [], + curr_lookaround, + lookaround, + input, + 0, + *look_behind_start, + ); + } - // This brings the look-behind threads into the state they must be for - // starting at input.start() instead of the beginning. This is - // necessary for look-behinds to be able to match outside of the input - // span. - for lb_at in 0..input.start() { - self.nexts( - stack, - curr_lookaround, - next_lookaround, - lookaround, - input, - lb_at, - &mut [], - ); - core::mem::swap(curr_lookaround, next_lookaround); - next_lookaround.set.clear(); + // This brings the look-behind threads into the state they must be for + // starting at input.start() instead of the beginning. This is + // necessary for lookbehinds to be able to match outside of the input + // span. 
+ for lb_at in 0..input.start() { + self.nexts( + stack, + curr_lookaround, + next_lookaround, + lookaround, + input, + lb_at, + &mut [], + ); + core::mem::swap(curr_lookaround, next_lookaround); + next_lookaround.set.clear(); + } } let mut hm = None; @@ -1428,6 +1436,9 @@ impl PikeVM { self.nexts(stack, curr, next, lookaround, input, at, slots) { hm = Some(HalfMatch::new(pid, at)); + if *keep_lookaround_state { + *match_lookaround = Some(curr_lookaround.clone()); + } } // Unless the caller asked us to return early, we need to mush on // to see if we can extend our match. (But note that 'nexts' will @@ -1496,6 +1507,10 @@ impl PikeVM { ref mut lookaround, ref mut curr_lookaround, ref mut next_lookaround, + // It makes no sense to keep any look-behind state for this version of + // the search, since the caller receives no information about + // where the search ended. + .. } = cache; for look_behind_start in self.nfa.look_behind_starts() { @@ -1989,10 +2004,14 @@ impl<'r, 'c, 'h> Iterator for FindMatches<'r, 'c, 'h> { *self; // 'advance' converts errors into panics, which is OK here because // the PikeVM can never return an error. - it.advance(|input| { + let result = it.advance(|input| { re.search(cache, input, caps); Ok(caps.get_match()) - }) + }); + if result.is_none() { + cache.keep_lookaround_state(false); + } + result } } @@ -2034,6 +2053,7 @@ impl<'r, 'c, 'h> Iterator for CapturesMatches<'r, 'c, 'h> { if caps.is_match() { Some(caps.clone()) } else { + cache.keep_lookaround_state(false); None } } @@ -2070,6 +2090,12 @@ pub struct Cache { curr_lookaround: ActiveStates, /// The next set of states to be explored for look-behind subexpressions. next_lookaround: ActiveStates, + /// The active set of states when a match was found. This is needed + /// to resume a search without recomputing look-behind subexpressions. 
+ match_lookaround: Option, + /// When true, use the states of `match_lookaround` to initialize a search, + /// otherwise recompute from the beginning of the haystack. + keep_lookaround_state: bool, } impl Cache { @@ -2089,6 +2115,8 @@ impl Cache { lookaround: vec![None; re.lookaround_count()], curr_lookaround: ActiveStates::new(re), next_lookaround: ActiveStates::new(re), + match_lookaround: None, + keep_lookaround_state: false, } } @@ -2135,6 +2163,24 @@ impl Cache { self.curr_lookaround.reset(re); self.next_lookaround.reset(re); self.lookaround = vec![None; re.lookaround_count()]; + self.match_lookaround = None; + self.keep_lookaround_state = false; + } + + /// Set this cache to keep the state of look-behind assertions upon a + /// match being found. + /// + /// This must only be called with a value of `true` when a new search is + /// started at the end of a previously found match, otherwise the result + /// of any search after this call will most likely be wrong. + /// + /// Calling this function with a value of `false` will clear any previously + /// stored look-behind state. + pub fn keep_lookaround_state(&mut self, keep: bool) { + self.keep_lookaround_state = keep; + if !keep { + self.match_lookaround = None; + } } /// Returns the heap memory usage, in bytes, of this cache. @@ -2143,11 +2189,16 @@ impl Cache { /// compute that, use `std::mem::size_of::()`. pub fn memory_usage(&self) -> usize { use core::mem::size_of; + let match_lookaround_memory = match &self.match_lookaround { + Some(ml) => ml.memory_usage(), + None => 0, + }; (self.stack.len() * size_of::()) + self.curr.memory_usage() + self.next.memory_usage() + self.curr_lookaround.memory_usage() + self.next_lookaround.memory_usage() + + match_lookaround_memory } /// Clears this cache. 
This should be called at the start of every search From fd8a662de1b2033db3b210b8e56b3099cd591759 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Robin=20H=C3=A4nni?= Date: Wed, 23 Apr 2025 11:36:17 +0200 Subject: [PATCH 57/66] Implement matchall speedup for meta-engine --- regex-automata/src/meta/regex.rs | 26 +++++++++++++++++++++++--- regex-automata/src/meta/wrappers.rs | 6 ++++++ 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/regex-automata/src/meta/regex.rs b/regex-automata/src/meta/regex.rs index 8cfdecbec..6bc4bdc71 100644 --- a/regex-automata/src/meta/regex.rs +++ b/regex-automata/src/meta/regex.rs @@ -611,7 +611,8 @@ impl Regex { &'r self, input: I, ) -> FindMatches<'r, 'h> { - let cache = self.pool.get(); + let mut cache = self.pool.get(); + cache.keep_lookaround_state(true); let it = iter::Searcher::new(input.into()); FindMatches { re: self, cache, it } } @@ -652,7 +653,8 @@ impl Regex { &'r self, input: I, ) -> CapturesMatches<'r, 'h> { - let cache = self.pool.get(); + let mut cache = self.pool.get(); + cache.keep_lookaround_state(true); let caps = self.create_captures(); let it = iter::Searcher::new(input.into()); CapturesMatches { re: self, cache, caps, it } @@ -2076,7 +2078,11 @@ impl<'r, 'h> Iterator for FindMatches<'r, 'h> { #[inline] fn next(&mut self) -> Option { let FindMatches { re, ref mut cache, ref mut it } = *self; - it.advance(|input| Ok(re.search_with(cache, input))) + let result = it.advance(|input| Ok(re.search_with(cache, input))); + if result.is_none() { + cache.keep_lookaround_state(false); + } + result } #[inline] @@ -2149,6 +2155,7 @@ impl<'r, 'h> Iterator for CapturesMatches<'r, 'h> { if caps.is_match() { Some(caps.clone()) } else { + cache.keep_lookaround_state(false); None } } @@ -2385,6 +2392,19 @@ impl Cache { re.imp.strat.reset_cache(self) } + /// Set this cache to keep the state of look-behind assertions upon a + /// match being found. 
+ /// + /// This must only be called with a value of `true` when a new search is + /// started at the end of a previously found match, otherwise the result + /// of any search after this call will most likely be wrong. + /// + /// Calling this function with a value of `false` will clear any previously + /// stored look-behind state. + pub fn keep_lookaround_state(&mut self, keep: bool) { + self.pikevm.keep_lookaround_state(keep); + } + /// Returns the heap memory usage, in bytes, of this cache. /// /// This does **not** include the stack size used up by this cache. To diff --git a/regex-automata/src/meta/wrappers.rs b/regex-automata/src/meta/wrappers.rs index f7c5c1096..83f5c12ab 100644 --- a/regex-automata/src/meta/wrappers.rs +++ b/regex-automata/src/meta/wrappers.rs @@ -133,6 +133,12 @@ impl PikeVMCache { PikeVMCache(Some(builder.get().0.create_cache())) } + pub(crate) fn keep_lookaround_state(&mut self, keep: bool) { + if let Some(cache) = self.0.as_mut() { + cache.keep_lookaround_state(keep); + } + } + pub(crate) fn reset(&mut self, builder: &PikeVM) { self.0.as_mut().unwrap().reset(&builder.get().0); } From d6a7d6d61fef05dadf235344db3d472d9de4ddb2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Robin=20H=C3=A4nni?= Date: Thu, 1 May 2025 11:58:03 +0200 Subject: [PATCH 58/66] Replace catchall with explicit ignore --- regex-automata/src/nfa/thompson/pikevm.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index e57a54d7e..2183a7483 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ b/regex-automata/src/nfa/thompson/pikevm.rs @@ -1510,7 +1510,8 @@ impl PikeVM { // It makes no sense to keep any look-behind state for this version of // the search, since the caller receives no information about // where the search ended. - .. 
+ keep_lookaround_state: _, + match_lookaround: _, } = cache; for look_behind_start in self.nfa.look_behind_starts() { From 0af9347511fe1c24dd61ad25ab6ee475bc9c075b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Robin=20H=C3=A4nni?= Date: Thu, 1 May 2025 12:16:29 +0200 Subject: [PATCH 59/66] Rephrase doc and fix lb start state order --- regex-automata/src/nfa/thompson/compiler.rs | 2 +- regex-automata/src/nfa/thompson/pikevm.rs | 25 ++++++++++++--------- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index 5a7bccd72..42dd32127 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -1038,12 +1038,12 @@ impl Compiler { let unanchored = self.c_at_least(&Hir::dot(hir::Dot::AnyByte), false, 0)?; + self.builder.borrow_mut().start_look_behind(unanchored.start); let sub = self.c(lookaround.sub())?; let write = self.add_write_lookaround(idx)?; self.patch(unanchored.end, sub.start)?; self.patch(sub.end, write)?; - self.builder.borrow_mut().start_look_behind(unanchored.start); Ok(ThompsonRef { start: check, end: check }) } diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index 2183a7483..e7c84ba51 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ b/regex-automata/src/nfa/thompson/pikevm.rs @@ -1278,9 +1278,11 @@ impl PikeVM { // Note: since capture groups are not allowed inside look-behinds, // there won't be any Capture epsilon transitions and hence it is ok to // use &mut [] for the slots parameter. We need to add the start states - // in reverse because nested look-behinds have a higher index but must - // be executed first. - for look_behind_start in self.nfa.look_behind_starts() { + // in reverse because more deeply nested look-behinds have a higher index + // but must be executed first, so that the result is available for the + // outer expression. 
+ for look_behind_start in self.nfa.look_behind_starts().iter().rev() + { self.epsilon_closure( stack, &mut [], @@ -2091,8 +2093,10 @@ pub struct Cache { curr_lookaround: ActiveStates, /// The next set of states to be explored for look-behind subexpressions. next_lookaround: ActiveStates, - /// The active set of states when a match was found. This is needed - /// to resume a search without recomputing look-behind subexpressions. + /// The set of active threads, belonging to look-behind expressions, + /// when a match was found. This is needed to resume a search after a match + /// was found (to look for further matches), without having to re-scan the + /// beginning of the haystack. match_lookaround: Option, /// When true, use the states of `match_lookaround` to initialize a search, /// otherwise recompute from the beginning of the haystack. @@ -2168,12 +2172,13 @@ impl Cache { self.keep_lookaround_state = false; } - /// Set this cache to keep the state of look-behind assertions upon a - /// match being found. + /// Set this cache to store a copy of the active threads belonging + /// to look-behind assertions upon a match being found. /// - /// This must only be called with a value of `true` when a new search is - /// started at the end of a previously found match, otherwise the result - /// of any search after this call will most likely be wrong. + /// This is a performance optimization and must only be called with a + /// value of `true` when intending to start a new search at the end of + /// a previously found match. Otherwise, the result of look-behind + /// sub-expressions will be out of sync with the main regex. /// /// Calling this function with a value of `false` will clear any previously /// stored look-behind state. 
From a5d8a9e92b003f640e19afd8c785871990083d9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Robin=20H=C3=A4nni?= Date: Thu, 1 May 2025 14:50:16 +0200 Subject: [PATCH 60/66] Disable lookaround scanning when none present --- regex-automata/src/nfa/thompson/pikevm.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index e7c84ba51..813804884 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ b/regex-automata/src/nfa/thompson/pikevm.rs @@ -1273,7 +1273,7 @@ impl PikeVM { if let Some(active) = match_lookaround { *curr_lookaround = active.clone(); - } else { + } else if self.lookaround_count() > 0 { // This initializes the look-behind threads from the start of the input // Note: since capture groups are not allowed inside look-behinds, // there won't be any Capture epsilon transitions and hence it is ok to From 8d29519e14b4766b61d9e31881348b9e30cfd5b2 Mon Sep 17 00:00:00 2001 From: shilangyu Date: Wed, 7 May 2025 21:40:07 +0200 Subject: [PATCH 61/66] Fast forward look-around threads upon prefiltering --- regex-automata/src/nfa/thompson/pikevm.rs | 95 +++++++++++++++-------- 1 file changed, 63 insertions(+), 32 deletions(-) diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index 813804884..4763d2dbd 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ b/regex-automata/src/nfa/thompson/pikevm.rs @@ -1293,24 +1293,16 @@ impl PikeVM { *look_behind_start, ); } - - // This brings the look-behind threads into the state they must be for - // starting at input.start() instead of the beginning. This is - // necessary for lookbehinds to be able to match outside of the input - // span. 
- for lb_at in 0..input.start() { - self.nexts( - stack, - curr_lookaround, - next_lookaround, - lookaround, - input, - lb_at, - &mut [], - ); - core::mem::swap(curr_lookaround, next_lookaround); - next_lookaround.set.clear(); - } + // This is necessary for look-behinds to be able to match outside of the + // input span. + self.fast_forward_lookbehinds( + Span { start: 0, end: input.start() }, + input, + stack, + curr_lookaround, + next_lookaround, + lookaround, + ); } let mut hm = None; @@ -1352,7 +1344,21 @@ impl PikeVM { let span = Span::from(at..input.end()); match pre.find(input.haystack(), span) { None => break, - Some(ref span) => at = span.start, + Some(ref span) => { + if self.lookaround_count() > 0 { + // We are jumping ahead due to the pre-filter, thus we must bring + // the look-behind threads to the new position. + self.fast_forward_lookbehinds( + Span { start: at, end: span.start }, + input, + stack, + curr_lookaround, + next_lookaround, + lookaround, + ); + } + at = span.start + } } } } @@ -1459,6 +1465,36 @@ impl PikeVM { hm } + /// This brings the look-behind threads into the state they must be for + /// starting at [input.end]. The assumption is that they are currently + /// at [input.start]. + fn fast_forward_lookbehinds( + &self, + forward_span: Span, + input: &Input<'_>, + stack: &mut Vec, + curr_lookaround: &mut ActiveStates, + next_lookaround: &mut ActiveStates, + lookaround: &mut Vec>, + ) { + for lb_at in forward_span.start..forward_span.end { + self.nexts( + stack, + curr_lookaround, + next_lookaround, + lookaround, + input, + lb_at, + // Since capture groups are not allowed inside look-arounds, + // there won't be any Capture epsilon transitions and hence it is ok to + // use &mut [] for the slots parameter. + &mut [], + ); + core::mem::swap(curr_lookaround, next_lookaround); + next_lookaround.set.clear(); + } + } + /// The implementation for the 'which_overlapping_matches' API. 
Basically, /// we do a single scan through the entire haystack (unless our regex /// or search is anchored) and record every pattern that matched. In @@ -1527,19 +1563,14 @@ impl PikeVM { *look_behind_start, ); } - for lb_at in 0..input.start() { - self.nexts( - stack, - curr_lookaround, - next_lookaround, - lookaround, - input, - lb_at, - &mut [], - ); - core::mem::swap(curr_lookaround, next_lookaround); - next_lookaround.set.clear(); - } + self.fast_forward_lookbehinds( + Span { start: 0, end: input.start() }, + input, + stack, + curr_lookaround, + next_lookaround, + lookaround, + ); for at in input.start()..=input.end() { let any_matches = !patset.is_empty(); if curr.set.is_empty() { From 515ccffae645f04ce76ce3ae994c0f2a67246f6b Mon Sep 17 00:00:00 2001 From: shilangyu Date: Wed, 7 May 2025 21:45:58 +0200 Subject: [PATCH 62/66] Add small test for prefiltered regex with lookbehind --- testdata/lookaround.toml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/testdata/lookaround.toml b/testdata/lookaround.toml index 14a303d7c..91fab56a0 100644 --- a/testdata/lookaround.toml +++ b/testdata/lookaround.toml @@ -84,3 +84,9 @@ matches = [ [[1, 3], [1, 2], [2, 3]], [[5, 7], [5, 6], [6, 7]], ] + +[[test]] +name = "lookbehind matching before the prefiltered start position" +regex = "b(?<=ab)" +haystack = "ab" +matches = [[1, 2]] From 093df676f447ad757e36c46af10cf5d8a3774257 Mon Sep 17 00:00:00 2001 From: shilangyu Date: Thu, 8 May 2025 08:03:40 +0200 Subject: [PATCH 63/66] Change literal extraction for look-arounds --- regex-automata/src/meta/strategy.rs | 5 +++++ regex-syntax/src/hir/literal.rs | 5 +++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/regex-automata/src/meta/strategy.rs b/regex-automata/src/meta/strategy.rs index 0ac830b9d..19823b555 100644 --- a/regex-automata/src/meta/strategy.rs +++ b/regex-automata/src/meta/strategy.rs @@ -258,6 +258,11 @@ impl Pre<()> { if !info.props()[0].look_set().is_empty() { return None; } + // For a similar 
reason, we require that it has zero look-around + // expressions. + if info.props()[0].contains_lookaround_expr() { + return None; + } // Finally, currently, our prefilters are all oriented around // leftmost-first match semantics, so don't try to use them if the // caller asked for anything else. diff --git a/regex-syntax/src/hir/literal.rs b/regex-syntax/src/hir/literal.rs index f419dd70e..84c89c1ee 100644 --- a/regex-syntax/src/hir/literal.rs +++ b/regex-syntax/src/hir/literal.rs @@ -172,8 +172,9 @@ impl Extractor { use crate::hir::HirKind::*; match *hir.kind() { - Empty | Look(_) => Seq::singleton(self::Literal::exact(vec![])), - LookAround(_) => Seq::infinite(), + Empty | Look(_) | LookAround(_) => { + Seq::singleton(self::Literal::exact(vec![])) + } Literal(hir::Literal(ref bytes)) => { let mut seq = Seq::singleton(self::Literal::exact(bytes.to_vec())); From 2f761ec82d17a391b7aef2be8f3d7e0bbf1494ef Mon Sep 17 00:00:00 2001 From: shilangyu Date: Thu, 8 May 2025 10:30:10 +0200 Subject: [PATCH 64/66] Update wrong doc --- regex-automata/src/nfa/thompson/pikevm.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index 4763d2dbd..d976bfc12 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ b/regex-automata/src/nfa/thompson/pikevm.rs @@ -1466,8 +1466,8 @@ impl PikeVM { } /// This brings the look-behind threads into the state they must be for - /// starting at [input.end]. The assumption is that they are currently - /// at [input.start]. + /// starting at [forward_span.end]. The assumption is that they are currently + /// at [forward_span.start]. 
fn fast_forward_lookbehinds( &self, forward_span: Span, From c3c02fc05196de8175b4c6e13f391264739f2289 Mon Sep 17 00:00:00 2001 From: shilangyu Date: Thu, 8 May 2025 10:30:22 +0200 Subject: [PATCH 65/66] Fix literal extraction tests --- regex-syntax/src/hir/literal.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/regex-syntax/src/hir/literal.rs b/regex-syntax/src/hir/literal.rs index 84c89c1ee..e09879d81 100644 --- a/regex-syntax/src/hir/literal.rs +++ b/regex-syntax/src/hir/literal.rs @@ -2457,16 +2457,16 @@ mod tests { #[test] fn lookaround() { - assert_eq!(inexact([I("a")], [I("b")]), e(r"a(?<=qwa)b")); - assert_eq!(inexact([I("a")], [I("b")]), e(r"a(? Date: Thu, 8 May 2025 10:31:20 +0200 Subject: [PATCH 66/66] Reverse look_behind_starts --- regex-automata/src/nfa/thompson/pikevm.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index d976bfc12..b18101c53 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ b/regex-automata/src/nfa/thompson/pikevm.rs @@ -1552,7 +1552,7 @@ impl PikeVM { match_lookaround: _, } = cache; - for look_behind_start in self.nfa.look_behind_starts() { + for look_behind_start in self.nfa.look_behind_starts().iter().rev() { self.epsilon_closure( stack, &mut [],