Skip to content

Commit 2786fb4

Browse files
committed
Parse grammar without regexes
We'd been parsing the grammar with a combination of recursive descent and regular expression matchers. This combination has its merits, and it's done tastefully here, but it seems maybe more straightforward to do the parsing entirely with recursive descent. Among other things, doing it this way allows us to provide more precise error reporting on malformed inputs. The cost, in terms of lines of code, of doing this entirely with recursive descent is rather modest, and the result seems at least as clear -- there's some mental cost to code-switching between the two worlds. So let's make the switch and parse the grammar without regular expressions. We verified that the rendered output of the Reference is byte-identical before and after this change.
1 parent c703c8d commit 2786fb4

File tree

1 file changed

+68
-48
lines changed

1 file changed

+68
-48
lines changed

mdbook-spec/src/grammar/parser.rs

+68-48
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,9 @@
11
//! A parser of the EBNF-like grammar.
22
33
use super::{Characters, Expression, ExpressionKind, Grammar, Production};
4-
use regex::{Captures, Regex};
54
use std::fmt;
65
use std::fmt::Display;
76
use std::path::Path;
8-
use std::sync::LazyLock;
97

108
struct Parser<'a> {
119
input: &'a str,
@@ -76,18 +74,6 @@ impl Parser<'_> {
7674
&self.input[i..i + upper]
7775
}
7876

79-
/// If the input matches the given regex, it is returned and the head is moved forward.
80-
///
81-
/// Note that regexes must start with `^`.
82-
fn take_re(&mut self, re: &Regex) -> Option<Captures<'_>> {
83-
if let Some(cap) = re.captures(&self.input[self.index..]) {
84-
self.index += cap[0].len();
85-
Some(cap)
86-
} else {
87-
None
88-
}
89-
}
90-
9177
/// Returns whether or not the given string is next, and advances the head if it is.
9278
fn take_str(&mut self, s: &str) -> bool {
9379
if self.input[self.index..].starts_with(s) {
@@ -168,13 +154,12 @@ impl Parser<'_> {
168154
}
169155

170156
fn parse_expression(&mut self) -> Result<Option<Expression>> {
171-
static ALT_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^ *\| *").unwrap());
172-
173157
let mut es = Vec::new();
174158
loop {
175159
let Some(e) = self.parse_seq()? else { break };
176160
es.push(e);
177-
if self.take_re(&ALT_RE).is_none() {
161+
_ = self.space0();
162+
if !self.take_str("|") {
178163
break;
179164
}
180165
}
@@ -268,21 +253,28 @@ impl Parser<'_> {
268253
Some(ExpressionKind::Nt(nt))
269254
}
270255

256+
/// Parse terminal within backticks.
271257
fn parse_terminal(&mut self) -> Result<ExpressionKind> {
272-
static TERMINAL_RE: LazyLock<Regex> =
273-
LazyLock::new(|| Regex::new(r"^`([^`\n]+)`").unwrap());
274-
match self.take_re(&TERMINAL_RE) {
275-
Some(cap) => Ok(ExpressionKind::Terminal(cap[1].to_string())),
276-
None => bail!(self, "unterminated terminal, expected closing backtick"),
258+
Ok(ExpressionKind::Terminal(self.parse_terminal_str()?))
259+
}
260+
261+
/// Parse string within backticks.
262+
fn parse_terminal_str(&mut self) -> Result<String> {
263+
self.expect("`", "expected opening backtick")?;
264+
let term = self.take_while(&|x| !['\n', '`'].contains(&x)).to_string();
265+
if term.is_empty() {
266+
bail!(self, "expected terminal");
277267
}
268+
self.expect("`", "expected closing backtick")?;
269+
Ok(term)
278270
}
279271

280272
fn parse_charset(&mut self) -> Result<ExpressionKind> {
281273
self.expect("[", "expected opening [")?;
282274
let mut characters = Vec::new();
283275
loop {
284276
self.space0();
285-
let Some(ch) = self.parse_characters() else {
277+
let Some(ch) = self.parse_characters()? else {
286278
break;
287279
};
288280
characters.push(ch);
@@ -295,27 +287,48 @@ impl Parser<'_> {
295287
Ok(ExpressionKind::Charset(characters))
296288
}
297289

298-
fn parse_characters(&mut self) -> Option<Characters> {
299-
static RANGE_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^`(.)`-`(.)`").unwrap());
300-
static TERMINAL_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new("^`([^`\n]+)`").unwrap());
301-
if let Some(cap) = self.take_re(&RANGE_RE) {
302-
let a = cap[1].chars().next().unwrap();
303-
let b = cap[2].chars().next().unwrap();
304-
Some(Characters::Range(a, b))
305-
} else if let Some(cap) = self.take_re(&TERMINAL_RE) {
306-
Some(Characters::Terminal(cap[1].to_string()))
290+
/// Parse an element of a character class, e.g.
291+
/// `` `a`-`b` `` | `` `term` `` | `` NonTerminal ``.
292+
fn parse_characters(&mut self) -> Result<Option<Characters>> {
293+
if let Some(b'`') = self.peek() {
294+
let recov = self.index;
295+
let a = self.parse_terminal_str()?;
296+
if self.take_str("-") {
297+
//~^ Parse `` `a`-`b` `` character range.
298+
if a.len() > 1 {
299+
self.index = recov + 1;
300+
bail!(self, "invalid start terminal in range");
301+
}
302+
let recov = self.index;
303+
let b = self.parse_terminal_str()?;
304+
if b.len() > 1 {
305+
self.index = recov + 1;
306+
bail!(self, "invalid end terminal in range");
307+
}
308+
let a = a.chars().next().unwrap();
309+
let b = b.chars().next().unwrap();
310+
Ok(Some(Characters::Range(a, b)))
311+
} else {
312+
//~^ Parse terminal in backticks.
313+
Ok(Some(Characters::Terminal(a)))
314+
}
315+
} else if let Some(name) = self.parse_name() {
316+
//~^ Parse nonterminal identifier.
317+
Ok(Some(Characters::Named(name)))
307318
} else {
308-
let name = self.parse_name()?;
309-
Some(Characters::Named(name))
319+
Ok(None)
310320
}
311321
}
312322

323+
/// Parse e.g. `<prose text>`.
313324
fn parse_prose(&mut self) -> Result<ExpressionKind> {
314-
static PROSE_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^<([^>\n]+)>").unwrap());
315-
match self.take_re(&PROSE_RE) {
316-
Some(cap) => Ok(ExpressionKind::Prose(cap[1].to_string())),
317-
None => bail!(self, "unterminated prose, expected closing `>`"),
325+
self.expect("<", "expected opening `<`")?;
326+
let text = self.take_while(&|x| !['\n', '>'].contains(&x)).to_string();
327+
if text.is_empty() {
328+
bail!(self, "expected prose text");
318329
}
330+
self.expect(">", "expected closing `>`")?;
331+
Ok(ExpressionKind::Prose(text))
319332
}
320333

321334
fn parse_grouped(&mut self) -> Result<ExpressionKind> {
@@ -344,13 +357,19 @@ impl Parser<'_> {
344357
Ok(ExpressionKind::NegExpression(box_kind(kind)))
345358
}
346359

360+
/// Parse e.g. `F00F` after `U+`.
347361
fn parse_unicode(&mut self) -> Result<ExpressionKind> {
348-
static UNICODE_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^[A-Z0-9]{4}").unwrap());
349-
350-
match self.take_re(&UNICODE_RE) {
351-
Some(s) => Ok(ExpressionKind::Unicode(s[0].to_string())),
352-
None => bail!(self, "expected 4 hexadecimal uppercase digits after U+"),
362+
let mut xs = Vec::with_capacity(4);
363+
for _ in 0..4 {
364+
match self.peek() {
365+
Some(x @ (b'0'..=b'9' | b'A'..=b'F')) => {
366+
xs.push(x);
367+
self.index += 1;
368+
}
369+
_ => bail!(self, "expected 4 uppercase hexidecimal digits after `U+`"),
370+
}
353371
}
372+
Ok(ExpressionKind::Unicode(String::from_utf8(xs).unwrap()))
354373
}
355374

356375
/// Parse `?` after expression.
@@ -428,16 +447,17 @@ impl Parser<'_> {
428447
Ok(Some(self.input[start..self.index - 1].to_string()))
429448
}
430449

450+
/// Parse footnote reference, e.g. `[^id]`.
431451
fn parse_footnote(&mut self) -> Result<Option<String>> {
432-
static FOOTNOTE_RE: LazyLock<Regex> =
433-
LazyLock::new(|| Regex::new(r"^([^\]\n]+)]").unwrap());
434452
if !self.take_str("[^") {
435453
return Ok(None);
436454
}
437-
match self.take_re(&FOOTNOTE_RE) {
438-
Some(cap) => Ok(Some(cap[1].to_string())),
439-
None => bail!(self, "unterminated footnote, expected closing `]`"),
455+
let id = self.take_while(&|x| !['\n', ']'].contains(&x)).to_string();
456+
if id.is_empty() {
457+
bail!(self, "expected footnote id");
440458
}
459+
self.expect("]", "expected closing `]`")?;
460+
Ok(Some(id))
441461
}
442462
}
443463

0 commit comments

Comments
 (0)