Skip to content

Commit

Permalink
Merge pull request #90 from Kijewski/pr-faster-skip-till
Browse files Browse the repository at this point in the history
parser: use `memchr` to speed-up `skip_till()`
  • Loading branch information
GuillaumeGomez authored Jul 28, 2024
2 parents 732d6b4 + 7b99783 commit bf03e44
Show file tree
Hide file tree
Showing 5 changed files with 153 additions and 16 deletions.
1 change: 1 addition & 0 deletions rinja_parser/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ edition = "2021"
rust-version = "1.71"

[dependencies]
memchr = "2"
nom = { version = "7", default-features = false, features = ["alloc"] }

[dev-dependencies]
Expand Down
23 changes: 12 additions & 11 deletions rinja_parser/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ use nom::{error_position, AsChar, InputTakeAtPosition};

pub mod expr;
pub use expr::{Expr, Filter};
mod memchr_splitter;
pub mod node;
pub use node::Node;

Expand Down Expand Up @@ -362,22 +363,22 @@ fn ws<'a, O>(

/// Skips input until `end` was found, but does not consume it.
/// Returns tuple that would be returned when parsing `end`.
///
/// In the `candidate_finder` variant, the finder is asked to split the remaining input at the
/// next position where `end` might start. `end` is then tried at that position; if it does not
/// match, a single character is skipped and the search continues. When the finder reports no
/// further candidate, a recoverable `nom::Err::Error` is returned.
///
/// On success the result is `(rest, (after_end, output))`: `rest` still begins with the text
/// matched by `end`, `after_end` is the input following that match, and `output` is the value
/// produced by `end`.
// NOTE(review): this listing appears to be a rendered diff without +/- markers, so superseded
// lines (the first signature line, the `Next` enum, the first `let mut next`, and the first
// loop body) are shown next to their replacements.
fn skip_till<'a, O>(
// NOTE(review): the lifetime `'b` is declared on the next line but is not used anywhere in the
// visible signature or body.
fn skip_till<'a, 'b, O>(
candidate_finder: impl crate::memchr_splitter::Splitter,
end: impl FnMut(&'a str) -> ParseResult<'a, O>,
) -> impl FnMut(&'a str) -> ParseResult<'a, (&'a str, O)> {
enum Next<O> {
IsEnd(O),
NotEnd,
}
let mut next = alt((map(end, Next::IsEnd), map(anychar, |_| Next::NotEnd)));
// Try `end` first; otherwise consume exactly one character. `Some(_)` carries the output of
// `end`, `None` means "not the end yet".
let mut next = alt((map(end, Some), map(anychar, |_| None)));
move |start: &'a str| {
let mut i = start;
loop {
let (j, is_end) = next(i)?;
match is_end {
Next::IsEnd(lookahead) => return Ok((i, (j, lookahead))),
Next::NotEnd => i = j,
}
// Jump straight to the next candidate position instead of advancing char by char.
i = match candidate_finder.split(i) {
Some((_, j)) => j,
// NOTE(review): the message below has an unbalanced trailing backtick after `found`.
None => return Err(nom::Err::Error(ErrorContext::new("`end` not found`", i))),
};
// The candidate may be a false positive, so `end` still has to be parsed here.
i = match next(i)? {
// `i` is returned untouched, i.e. `end` is not consumed.
(j, Some(lookahead)) => return Ok((i, (j, lookahead))),
(j, None) => j,
};
}
}
}
Expand Down
101 changes: 101 additions & 0 deletions rinja_parser/src/memchr_splitter.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
/// A cheap, copyable search strategy that cuts a string in front of the next candidate match.
pub(crate) trait Splitter: Copy {
    /// Splits `haystack` at the first hit of any needle.
    ///
    /// Returns `Some((before, rest))`, where `rest` begins at the hit, or `None` if no needle
    /// was found.
    ///
    /// Implementations may inspect only the first byte of each needle, so a hit can be a
    /// false positive. Callers must still check that `rest` really starts with the expected
    /// prefix.
    fn split<'a>(&self, haystack: &'a str) -> Option<(&'a str, &'a str)>;
}

/// A shared reference to a splitter is itself a splitter and simply forwards to the referent.
impl<T: Splitter + ?Sized> Splitter for &T {
    #[inline]
    fn split<'a>(&self, haystack: &'a str) -> Option<(&'a str, &'a str)> {
        // `self` is `&&T`; peel off one reference and delegate to the inner implementation.
        <T as Splitter>::split(*self, haystack)
    }
}

// Defines and implements a string splitter backed by one of the `memchr` search functions.
//
// Macro arguments, in order: the name of the generated struct, the name of the generated free
// `unsafe` helper function, the name of the function in the `memchr` crate to call, and one
// field name per needle.
macro_rules! new_memchr_type {
    ($struct:ident $split_unchecked:ident $memchr:ident $($field:ident)*) => {
        /// Remembers the first byte of each needle and splits haystacks at the first hit.
        #[derive(Debug, Clone, Copy)]
        pub(crate) struct $struct {
            $($field: u8,)*
        }

        impl $struct {
            /// Builds a splitter from the first byte of every needle.
            ///
            /// # Panics
            ///
            /// Panics if any needle is the empty string, because there is no first byte to
            /// search for.
            #[track_caller]
            pub(crate) fn new($($field: &str),*) -> Self {
                Self {
                    // An explicit message instead of an anonymous index-out-of-bounds panic.
                    $($field: *$field
                        .as_bytes()
                        .first()
                        .expect("needle must not be empty"),)*
                }
            }

            /// Splits `haystack` in front of the first occurrence of any needle's first byte.
            ///
            /// Only that first byte is compared, so the second half of the result is merely
            /// a candidate and may not start with a full needle.
            #[inline]
            pub(crate) fn split<'a>(&self, haystack: &'a str) -> Option<(&'a str, &'a str)> {
                // SAFETY: During the construction of `self` we used strings as inputs, and a
                // string always starts with a byte at char boundary, i.e. never with a UTF-8
                // continuation byte.
                unsafe { $split_unchecked($(self.$field,)* haystack) }
            }
        }

        impl Splitter for $struct {
            #[inline]
            fn split<'a>(&self, haystack: &'a str) -> Option<(&'a str, &'a str)> {
                // Inherent methods take precedence, so this calls `$struct::split` above.
                self.split(haystack)
            }
        }

        /// Splits `haystack` in front of the first occurrence of any of the needle bytes.
        ///
        /// Returns `None` if none of the bytes occurs in `haystack`.
        ///
        /// # Safety
        ///
        /// No needle may be a UTF-8 continuation byte (`0x80..=0xBF`). Any other byte can
        /// only occur at a char boundary of a valid string, which is what makes the
        /// unchecked slicing sound.
        pub(crate) unsafe fn $split_unchecked(
            $($field: u8,)*
            haystack: &str,
        ) -> Option<(&str, &str)> {
            let idx = memchr::$memchr($($field,)* haystack.as_bytes())?;
            // Catches a violated contract in debug builds without costing anything in release.
            debug_assert!(haystack.is_char_boundary(idx));
            // SAFETY: The caller ensures that the needles are at char boundary.
            // The found index `< haystack.len()`.
            Some((haystack.get_unchecked(..idx), haystack.get_unchecked(idx..)))
        }
    };
}

// Splitters for one, two, and three needles. They are backed by `memchr::memchr`,
// `memchr::memchr2`, and `memchr::memchr3` respectively; the trailing identifiers name the
// per-needle fields of each generated struct.
new_memchr_type!(Splitter1 split1_unchecked memchr a);
new_memchr_type!(Splitter2 split2_unchecked memchr2 a b);
new_memchr_type!(Splitter3 split3_unchecked memchr3 a b c);

/// Checks that each splitter cuts the haystack in front of the first needle's first byte.
#[test]
fn candidate_finder() {
    const HAYSTACK: &str = "abctefg";

    // One needle: only the leading `t` of "test" is looked for.
    let single_hit = Splitter1::new("test").split(HAYSTACK);
    assert_eq!(single_hit, Some(("abc", "tefg")));
    let single_miss = Splitter1::new("xyz").split(HAYSTACK);
    assert_eq!(single_miss, None);

    // Two needles: the `f` of "foo" is the only first byte present in the haystack.
    let double_hit = Splitter2::new("xyz", "foo").split(HAYSTACK);
    assert_eq!(double_hit, Some(("abcte", "fg")));
    let double_miss = Splitter2::new("oof", "xyz").split(HAYSTACK);
    assert_eq!(double_miss, None);

    // Three needles: a hit at the very start yields an empty first half.
    let triple_hit = Splitter3::new("oof", "apples", "xyz").split(HAYSTACK);
    assert_eq!(triple_hit, Some(("", "abctefg")));
    let triple_miss = Splitter3::new("oof", "peaches", "xyz").split(HAYSTACK);
    assert_eq!(triple_miss, None);

    // Repeating the same needle is harmless.
    let repeated = Splitter3::new("test", "test", "test").split(HAYSTACK);
    assert_eq!(repeated, Some(("abc", "tefg")));

    // Multi-byte needles: the split has to land on a char boundary.
    let unicode = Splitter3::new("🧚‍♀️Life", "😀Laugh", "😻Love")
        .split("sed diam nonumy eirmod tempor 🧚‍♀️Life ut labore et dolore magna aliquyam");
    let expected = (
        "sed diam nonumy eirmod tempor ",
        "🧚‍♀️Life ut labore et dolore magna aliquyam",
    );
    assert_eq!(unicode, Some(expected));
}
16 changes: 12 additions & 4 deletions rinja_parser/src/node.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ use nom::error_position;
use nom::multi::{many0, many1, separated_list0};
use nom::sequence::{delimited, pair, preceded, tuple};

use crate::memchr_splitter::{Splitter1, Splitter2, Splitter3};
use crate::{
filter, identifier, is_ws, keyword, not_ws, skip_till, str_lit, ws, ErrorContext, Expr, Filter,
ParseResult, State, Target, WithSpan,
Expand Down Expand Up @@ -755,14 +756,20 @@ pub struct Lit<'a> {
impl<'a> Lit<'a> {
fn parse(i: &'a str, s: &State<'_>) -> ParseResult<'a, WithSpan<'a, Self>> {
let start = i;
let (i, _) = not(eof)(i)?;

let candidate_finder = Splitter3::new(
s.syntax.block_start,
s.syntax.comment_start,
s.syntax.expr_start,
);
let p_start = alt((
tag(s.syntax.block_start),
tag(s.syntax.comment_start),
tag(s.syntax.expr_start),
));

let (i, _) = not(eof)(i)?;
let (i, content) = opt(recognize(skip_till(p_start)))(i)?;
let (i, content) = opt(recognize(skip_till(candidate_finder, p_start)))(i)?;
let (i, content) = match content {
Some("") => {
// {block,comment,expr}_start follows immediately.
Expand Down Expand Up @@ -810,7 +817,7 @@ impl<'a> Raw<'a> {
cut(tuple((
opt(Whitespace::parse),
|i| s.tag_block_end(i),
consumed(skip_till(endraw)),
consumed(skip_till(Splitter1::new(s.syntax.block_start), endraw)),
))),
));

Expand Down Expand Up @@ -989,7 +996,8 @@ impl<'a> Comment<'a> {
let mut depth = 0usize;
loop {
let start = i;
let (_, tag) = opt(skip_till(|i| tag(i, s)))(i)?;
let splitter = Splitter2::new(s.syntax.comment_start, s.syntax.comment_end);
let (_, tag) = opt(skip_till(splitter, |i| tag(i, s)))(i)?;
let Some((j, tag)) = tag else {
return Err(
ErrorContext::unclosed("comment", s.syntax.comment_end, start).into(),
Expand Down
28 changes: 27 additions & 1 deletion rinja_parser/src/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -371,10 +371,36 @@ fn change_delimiters_parse_filter() {
expr_end: "=}",
..Syntax::default()
};

Ast::from_str("{= strvar|e =}", None, &syntax).unwrap();
}

/// Expression delimiters made of multi-byte UTF-8 characters must be found and parsed.
#[test]
fn unicode_delimiters_in_syntax() {
    let syntax = Syntax {
        expr_start: "🖎", // U+1F58E == b"\xf0\x9f\x96\x8e"
        expr_end: "✍", // U+270D == b"\xe2\x9c\x8d"
        ..Syntax::default()
    };

    let ast = Ast::from_str("Here comes the expression: 🖎 e ✍.", None, &syntax).unwrap();

    // Literal text, then the expression `e`, then the trailing literal ".".
    let expected = [
        Node::Lit(WithSpan::no_span(Lit {
            lws: "",
            val: "Here comes the expression:",
            rws: " ",
        })),
        Node::Expr(Ws(None, None), WithSpan::no_span(Expr::Var("e"))),
        Node::Lit(WithSpan::no_span(Lit {
            lws: "",
            val: ".",
            rws: "",
        })),
    ];
    assert_eq!(ast.nodes(), expected);
}

#[test]
fn test_precedence() {
let syntax = Syntax::default();
Expand Down

0 comments on commit bf03e44

Please sign in to comment.