diff --git a/rinja_parser/Cargo.toml b/rinja_parser/Cargo.toml index 82e75d25c..d4265dae3 100644 --- a/rinja_parser/Cargo.toml +++ b/rinja_parser/Cargo.toml @@ -14,6 +14,7 @@ edition = "2021" rust-version = "1.71" [dependencies] +memchr = "2" nom = { version = "7", default-features = false, features = ["alloc"] } [dev-dependencies] diff --git a/rinja_parser/src/lib.rs b/rinja_parser/src/lib.rs index 5f6078f69..ffb34067d 100644 --- a/rinja_parser/src/lib.rs +++ b/rinja_parser/src/lib.rs @@ -20,6 +20,7 @@ use nom::{error_position, AsChar, InputTakeAtPosition}; pub mod expr; pub use expr::{Expr, Filter}; +mod memchr_splitter; pub mod node; pub use node::Node; @@ -362,22 +363,22 @@ fn ws<'a, O>( /// Skips input until `end` was found, but does not consume it. /// Returns tuple that would be returned when parsing `end`. -fn skip_till<'a, O>( +fn skip_till<'a, 'b, O>( + candidate_finder: impl crate::memchr_splitter::Splitter, end: impl FnMut(&'a str) -> ParseResult<'a, O>, ) -> impl FnMut(&'a str) -> ParseResult<'a, (&'a str, O)> { - enum Next { - IsEnd(O), - NotEnd, - } - let mut next = alt((map(end, Next::IsEnd), map(anychar, |_| Next::NotEnd))); + let mut next = alt((map(end, Some), map(anychar, |_| None))); move |start: &'a str| { let mut i = start; loop { - let (j, is_end) = next(i)?; - match is_end { - Next::IsEnd(lookahead) => return Ok((i, (j, lookahead))), - Next::NotEnd => i = j, - } + i = match candidate_finder.split(i) { + Some((_, j)) => j, + None => return Err(nom::Err::Error(ErrorContext::new("`end` not found", i))), + }; + i = match next(i)? 
{ + (j, Some(lookahead)) => return Ok((i, (j, lookahead))), + (j, None) => j, + }; } } } diff --git a/rinja_parser/src/memchr_splitter.rs b/rinja_parser/src/memchr_splitter.rs new file mode 100644 index 000000000..0a149f89a --- /dev/null +++ b/rinja_parser/src/memchr_splitter.rs @@ -0,0 +1,101 @@ +pub(crate) trait Splitter: Copy { + /// If any of the needles was found in the haystack, then split the haystack at the first hit. + /// + /// Since only the first byte of a needle is inspected, be aware that there can be + /// false positives. Always check that the latter string of the output starts with the expected + /// prefix. + fn split<'a>(&self, haystack: &'a str) -> Option<(&'a str, &'a str)>; +} + +impl<T: Splitter + ?Sized> Splitter for &T { + #[inline] + fn split<'a>(&self, haystack: &'a str) -> Option<(&'a str, &'a str)> { + T::split(self, haystack) + } +} + +// define and implement a string splitter using memchr +macro_rules! new_memchr_type { + ($struct:ident $split_unchecked:ident $memchr:ident $($field:ident)*) => { + #[derive(Debug, Clone, Copy)] + pub(crate) struct $struct { + $($field: u8,)* + } + + impl $struct { + #[track_caller] + pub(crate) fn new($($field: &str),*) -> Self { + Self { + $($field: $field.as_bytes()[0],)* + } + } + + #[inline] + pub(crate) fn split<'a>(&self, haystack: &'a str) -> Option<(&'a str, &'a str)> { + // SAFETY: During the construction of `self` we used strings as inputs, and a + // string always starts with a byte at char boundary. 
+ unsafe { $split_unchecked($(self.$field,)* haystack) } + } + } + + impl Splitter for $struct { + #[inline] + fn split<'a>(&self, haystack: &'a str) -> Option<(&'a str, &'a str)> { + self.split(haystack) + } + } + + /// SAFETY: caller has to ensure that the needle is at a char boundary + pub(crate) unsafe fn $split_unchecked( + $($field: u8,)* + haystack: &str, + ) -> Option<(&str, &str)> { + let idx = memchr::$memchr($($field,)* haystack.as_bytes())?; + // SAFETY: The caller ensures that the needles are at char boundary. + // The found index `< haystack.len()`. + Some((haystack.get_unchecked(..idx), haystack.get_unchecked(idx..))) + } + }; +} + +new_memchr_type!(Splitter1 split1_unchecked memchr a); +new_memchr_type!(Splitter2 split2_unchecked memchr2 a b); +new_memchr_type!(Splitter3 split3_unchecked memchr3 a b c); + +#[test] +fn candidate_finder() { + assert_eq!( + Splitter1::new("test").split("abctefg"), + Some(("abc", "tefg")), + ); + assert_eq!(Splitter1::new("xyz").split("abctefg"), None); + + assert_eq!( + Splitter2::new("xyz", "foo").split("abctefg"), + Some(("abcte", "fg")), + ); + assert_eq!(Splitter2::new("oof", "xyz").split("abctefg"), None); + + assert_eq!( + Splitter3::new("oof", "apples", "xyz").split("abctefg"), + Some(("", "abctefg")), + ); + assert_eq!( + Splitter3::new("oof", "peaches", "xyz").split("abctefg"), + None + ); + + assert_eq!( + Splitter3::new("test", "test", "test").split("abctefg"), + Some(("abc", "tefg")), + ); + + assert_eq!( + Splitter3::new("🧚‍♀️Life", "😀Laugh", "😻Love") + .split("sed diam nonumy eirmod tempor 🧚‍♀️Life ut labore et dolore magna aliquyam"), + Some(( + "sed diam nonumy eirmod tempor ", + "🧚‍♀️Life ut labore et dolore magna aliquyam" + )), + ); +} diff --git a/rinja_parser/src/node.rs b/rinja_parser/src/node.rs index 4be2ec33a..5a822bb25 100644 --- a/rinja_parser/src/node.rs +++ b/rinja_parser/src/node.rs @@ -9,6 +9,7 @@ use nom::error_position; use nom::multi::{many0, many1, separated_list0}; use 
nom::sequence::{delimited, pair, preceded, tuple}; +use crate::memchr_splitter::{Splitter1, Splitter2, Splitter3}; use crate::{ filter, identifier, is_ws, keyword, not_ws, skip_till, str_lit, ws, ErrorContext, Expr, Filter, ParseResult, State, Target, WithSpan, @@ -755,14 +756,20 @@ pub struct Lit<'a> { impl<'a> Lit<'a> { fn parse(i: &'a str, s: &State<'_>) -> ParseResult<'a, WithSpan<'a, Self>> { let start = i; + let (i, _) = not(eof)(i)?; + + let candidate_finder = Splitter3::new( + s.syntax.block_start, + s.syntax.comment_start, + s.syntax.expr_start, + ); let p_start = alt(( tag(s.syntax.block_start), tag(s.syntax.comment_start), tag(s.syntax.expr_start), )); - let (i, _) = not(eof)(i)?; - let (i, content) = opt(recognize(skip_till(p_start)))(i)?; + let (i, content) = opt(recognize(skip_till(candidate_finder, p_start)))(i)?; let (i, content) = match content { Some("") => { // {block,comment,expr}_start follows immediately. @@ -810,7 +817,7 @@ impl<'a> Raw<'a> { cut(tuple(( opt(Whitespace::parse), |i| s.tag_block_end(i), - consumed(skip_till(endraw)), + consumed(skip_till(Splitter1::new(s.syntax.block_start), endraw)), ))), )); @@ -989,7 +996,8 @@ impl<'a> Comment<'a> { let mut depth = 0usize; loop { let start = i; - let (_, tag) = opt(skip_till(|i| tag(i, s)))(i)?; + let splitter = Splitter2::new(s.syntax.comment_start, s.syntax.comment_end); + let (_, tag) = opt(skip_till(splitter, |i| tag(i, s)))(i)?; let Some((j, tag)) = tag else { return Err( ErrorContext::unclosed("comment", s.syntax.comment_end, start).into(), diff --git a/rinja_parser/src/tests.rs b/rinja_parser/src/tests.rs index 2860ce692..e2e9f6c53 100644 --- a/rinja_parser/src/tests.rs +++ b/rinja_parser/src/tests.rs @@ -371,10 +371,36 @@ fn change_delimiters_parse_filter() { expr_end: "=}", ..Syntax::default() }; - Ast::from_str("{= strvar|e =}", None, &syntax).unwrap(); } +#[test] +fn unicode_delimiters_in_syntax() { + let syntax = Syntax { + expr_start: "🖎", // U+1F58E == b"\xf0\x9f\x96\x8e" + 
expr_end: "✍", // U+270D == b"\xe2\x9c\x8d" + ..Syntax::default() + }; + assert_eq!( + Ast::from_str("Here comes the expression: 🖎 e ✍.", None, &syntax) + .unwrap() + .nodes(), + [ + Node::Lit(WithSpan::no_span(Lit { + lws: "", + val: "Here comes the expression:", + rws: " ", + })), + Node::Expr(Ws(None, None), WithSpan::no_span(Expr::Var("e")),), + Node::Lit(WithSpan::no_span(Lit { + lws: "", + val: ".", + rws: "", + })), + ], + ); +} + #[test] fn test_precedence() { let syntax = Syntax::default();