Skip to content

Commit 9525e43

Browse files
committed
Parse grammar without regexes
We'd been parsing the grammar with a combination of recursive descent and regular expression matchers. This combination has its merits, and it's done tastefully here, but it seems maybe more straightforward to do the parsing entirely with recursive descent. Among other things, doing it this way allows us to provide more precise error reporting on malformed inputs. The cost, in terms of lines of code, of doing this entirely with recursive descent is rather modest, and the result seems at least as clear -- there's some mental cost to code switching between the two worlds. So let's make the switch and parse the grammar without regular expressions. We verified that the rendered output of the Reference is byte identical before and after this change.
1 parent c703c8d commit 9525e43

File tree

1 file changed

+70
-48
lines changed

1 file changed

+70
-48
lines changed

mdbook-spec/src/grammar/parser.rs

+70-48
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,16 @@
11
//! A parser of the EBNF-like grammar.
22
33
use super::{Characters, Expression, ExpressionKind, Grammar, Production};
4-
use regex::{Captures, Regex};
54
use std::fmt;
65
use std::fmt::Display;
76
use std::path::Path;
8-
use std::sync::LazyLock;
97

108
struct Parser<'a> {
119
input: &'a str,
1210
index: usize,
1311
}
1412

13+
#[derive(Debug)]
1514
pub struct Error {
1615
message: String,
1716
line: String,
@@ -76,18 +75,6 @@ impl Parser<'_> {
7675
&self.input[i..i + upper]
7776
}
7877

79-
/// If the input matches the given regex, it is returned and the head is moved forward.
80-
///
81-
/// Note that regexes must start with `^`.
82-
fn take_re(&mut self, re: &Regex) -> Option<Captures<'_>> {
83-
if let Some(cap) = re.captures(&self.input[self.index..]) {
84-
self.index += cap[0].len();
85-
Some(cap)
86-
} else {
87-
None
88-
}
89-
}
90-
9178
/// Returns whether or not the given string is next, and advances the head if it is.
9279
fn take_str(&mut self, s: &str) -> bool {
9380
if self.input[self.index..].starts_with(s) {
@@ -168,13 +155,12 @@ impl Parser<'_> {
168155
}
169156

170157
fn parse_expression(&mut self) -> Result<Option<Expression>> {
171-
static ALT_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^ *\| *").unwrap());
172-
173158
let mut es = Vec::new();
174159
loop {
175160
let Some(e) = self.parse_seq()? else { break };
176161
es.push(e);
177-
if self.take_re(&ALT_RE).is_none() {
162+
_ = self.space0();
163+
if !self.take_str("|") {
178164
break;
179165
}
180166
}
@@ -268,21 +254,28 @@ impl Parser<'_> {
268254
Some(ExpressionKind::Nt(nt))
269255
}
270256

257+
/// Parse terminal within backticks.
271258
fn parse_terminal(&mut self) -> Result<ExpressionKind> {
272-
static TERMINAL_RE: LazyLock<Regex> =
273-
LazyLock::new(|| Regex::new(r"^`([^`\n]+)`").unwrap());
274-
match self.take_re(&TERMINAL_RE) {
275-
Some(cap) => Ok(ExpressionKind::Terminal(cap[1].to_string())),
276-
None => bail!(self, "unterminated terminal, expected closing backtick"),
259+
Ok(ExpressionKind::Terminal(self.parse_terminal_str()?))
260+
}
261+
262+
/// Parse string within backticks.
263+
fn parse_terminal_str(&mut self) -> Result<String> {
264+
self.expect("`", "expected opening backtick")?;
265+
let term = self.take_while(&|x| !['\n', '`'].contains(&x)).to_string();
266+
if term.is_empty() {
267+
bail!(self, "expected terminal");
277268
}
269+
self.expect("`", "expected closing backtick")?;
270+
Ok(term)
278271
}
279272

280273
fn parse_charset(&mut self) -> Result<ExpressionKind> {
281274
self.expect("[", "expected opening [")?;
282275
let mut characters = Vec::new();
283276
loop {
284277
self.space0();
285-
let Some(ch) = self.parse_characters() else {
278+
let Some(ch) = self.parse_characters()? else {
286279
break;
287280
};
288281
characters.push(ch);
@@ -295,27 +288,49 @@ impl Parser<'_> {
295288
Ok(ExpressionKind::Charset(characters))
296289
}
297290

298-
fn parse_characters(&mut self) -> Option<Characters> {
299-
static RANGE_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^`(.)`-`(.)`").unwrap());
300-
static TERMINAL_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new("^`([^`\n]+)`").unwrap());
301-
if let Some(cap) = self.take_re(&RANGE_RE) {
302-
let a = cap[1].chars().next().unwrap();
303-
let b = cap[2].chars().next().unwrap();
304-
Some(Characters::Range(a, b))
305-
} else if let Some(cap) = self.take_re(&TERMINAL_RE) {
306-
Some(Characters::Terminal(cap[1].to_string()))
291+
/// Parse an element of a character class, e.g.
292+
/// `` `a`-`b` `` | `` `term` `` | `` NonTerminal ``.
293+
fn parse_characters(&mut self) -> Result<Option<Characters>> {
294+
if let Some(b'`') = self.peek() {
295+
let recov = self.index;
296+
let a = self.parse_terminal_str()?;
297+
if let Some(b'-') = self.peek() {
298+
//~^ Parse `` `a`-`b` `` character range.
299+
if a.len() > 1 {
300+
self.index = recov + 1;
301+
bail!(self, "invalid start terminal in range");
302+
}
303+
self.expect("-", "expected `-`").unwrap();
304+
let recov = self.index;
305+
let b = self.parse_terminal_str()?;
306+
if b.len() > 1 {
307+
self.index = recov + 1;
308+
bail!(self, "invalid end terminal in range");
309+
}
310+
let a = a.chars().next().unwrap();
311+
let b = b.chars().next().unwrap();
312+
Ok(Some(Characters::Range(a, b)))
313+
} else {
314+
//~^ Parse terminal in backticks.
315+
Ok(Some(Characters::Terminal(a)))
316+
}
317+
} else if let Some(name) = self.parse_name() {
318+
//~^ Parse nonterminal identifier.
319+
Ok(Some(Characters::Named(name)))
307320
} else {
308-
let name = self.parse_name()?;
309-
Some(Characters::Named(name))
321+
Ok(None)
310322
}
311323
}
312324

325+
/// Parse e.g. `<prose text>`.
313326
fn parse_prose(&mut self) -> Result<ExpressionKind> {
314-
static PROSE_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^<([^>\n]+)>").unwrap());
315-
match self.take_re(&PROSE_RE) {
316-
Some(cap) => Ok(ExpressionKind::Prose(cap[1].to_string())),
317-
None => bail!(self, "unterminated prose, expected closing `>`"),
327+
self.expect("<", "expected opening `<`")?;
328+
let text = self.take_while(&|x| !['\n', '>'].contains(&x)).to_string();
329+
if text.is_empty() {
330+
bail!(self, "expected prose text");
318331
}
332+
self.expect(">", "expected closing `>`")?;
333+
Ok(ExpressionKind::Prose(text))
319334
}
320335

321336
fn parse_grouped(&mut self) -> Result<ExpressionKind> {
@@ -344,13 +359,19 @@ impl Parser<'_> {
344359
Ok(ExpressionKind::NegExpression(box_kind(kind)))
345360
}
346361

362+
/// Parse e.g. `F00F` after `U+`.
347363
fn parse_unicode(&mut self) -> Result<ExpressionKind> {
348-
static UNICODE_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^[A-Z0-9]{4}").unwrap());
349-
350-
match self.take_re(&UNICODE_RE) {
351-
Some(s) => Ok(ExpressionKind::Unicode(s[0].to_string())),
352-
None => bail!(self, "expected 4 hexadecimal uppercase digits after U+"),
364+
let mut xs = Vec::with_capacity(4);
365+
for _ in 0..4 {
366+
match self.peek() {
367+
Some(x @ (b'0'..=b'9' | b'A'..=b'F')) => {
368+
xs.push(x);
369+
self.index += 1;
370+
}
371+
_ => bail!(self, "expected 4 uppercase hexidecimal digits after `U+`"),
372+
}
353373
}
374+
Ok(ExpressionKind::Unicode(String::from_utf8(xs).unwrap()))
354375
}
355376

356377
/// Parse `?` after expression.
@@ -428,16 +449,17 @@ impl Parser<'_> {
428449
Ok(Some(self.input[start..self.index - 1].to_string()))
429450
}
430451

452+
/// Parse footnote reference, e.g. `[^id]`.
431453
fn parse_footnote(&mut self) -> Result<Option<String>> {
432-
static FOOTNOTE_RE: LazyLock<Regex> =
433-
LazyLock::new(|| Regex::new(r"^([^\]\n]+)]").unwrap());
434454
if !self.take_str("[^") {
435455
return Ok(None);
436456
}
437-
match self.take_re(&FOOTNOTE_RE) {
438-
Some(cap) => Ok(Some(cap[1].to_string())),
439-
None => bail!(self, "unterminated footnote, expected closing `]`"),
457+
let id = self.take_while(&|x| !['\n', ']'].contains(&x)).to_string();
458+
if id.is_empty() {
459+
bail!(self, "expected footnote id");
440460
}
461+
self.expect("]", "expected closing `]`")?;
462+
Ok(Some(id))
441463
}
442464
}
443465

0 commit comments

Comments
 (0)