1
1
//! A parser of the EBNF-like grammar.
2
2
3
3
use super :: { Characters , Expression , ExpressionKind , Grammar , Production } ;
4
- use regex:: { Captures , Regex } ;
5
4
use std:: fmt;
6
5
use std:: fmt:: Display ;
7
6
use std:: path:: Path ;
8
- use std:: sync:: LazyLock ;
9
7
10
8
struct Parser < ' a > {
11
9
input : & ' a str ,
@@ -76,18 +74,6 @@ impl Parser<'_> {
76
74
& self . input [ i..i + upper]
77
75
}
78
76
79
- /// If the input matches the given regex, it is returned and the head is moved forward.
80
- ///
81
- /// Note that regexes must start with `^`.
82
- fn take_re ( & mut self , re : & Regex ) -> Option < Captures < ' _ > > {
83
- if let Some ( cap) = re. captures ( & self . input [ self . index ..] ) {
84
- self . index += cap[ 0 ] . len ( ) ;
85
- Some ( cap)
86
- } else {
87
- None
88
- }
89
- }
90
-
91
77
/// Returns whether or not the given string is next, and advances the head if it is.
92
78
fn take_str ( & mut self , s : & str ) -> bool {
93
79
if self . input [ self . index ..] . starts_with ( s) {
@@ -168,13 +154,12 @@ impl Parser<'_> {
168
154
}
169
155
170
156
fn parse_expression ( & mut self ) -> Result < Option < Expression > > {
171
- static ALT_RE : LazyLock < Regex > = LazyLock :: new ( || Regex :: new ( r"^ *\| *" ) . unwrap ( ) ) ;
172
-
173
157
let mut es = Vec :: new ( ) ;
174
158
loop {
175
159
let Some ( e) = self . parse_seq ( ) ? else { break } ;
176
160
es. push ( e) ;
177
- if self . take_re ( & ALT_RE ) . is_none ( ) {
161
+ _ = self . space0 ( ) ;
162
+ if !self . take_str ( "|" ) {
178
163
break ;
179
164
}
180
165
}
@@ -268,21 +253,28 @@ impl Parser<'_> {
268
253
Some ( ExpressionKind :: Nt ( nt) )
269
254
}
270
255
256
+ /// Parse terminal within backticks.
271
257
fn parse_terminal ( & mut self ) -> Result < ExpressionKind > {
272
- static TERMINAL_RE : LazyLock < Regex > =
273
- LazyLock :: new ( || Regex :: new ( r"^`([^`\n]+)`" ) . unwrap ( ) ) ;
274
- match self . take_re ( & TERMINAL_RE ) {
275
- Some ( cap) => Ok ( ExpressionKind :: Terminal ( cap[ 1 ] . to_string ( ) ) ) ,
276
- None => bail ! ( self , "unterminated terminal, expected closing backtick" ) ,
258
+ Ok ( ExpressionKind :: Terminal ( self . parse_terminal_str ( ) ?) )
259
+ }
260
+
261
+ /// Parse string within backticks.
262
+ fn parse_terminal_str ( & mut self ) -> Result < String > {
263
+ self . expect ( "`" , "expected opening backtick" ) ?;
264
+ let term = self . take_while ( & |x| ![ '\n' , '`' ] . contains ( & x) ) . to_string ( ) ;
265
+ if term. is_empty ( ) {
266
+ bail ! ( self , "expected terminal" ) ;
277
267
}
268
+ self . expect ( "`" , "expected closing backtick" ) ?;
269
+ Ok ( term)
278
270
}
279
271
280
272
fn parse_charset ( & mut self ) -> Result < ExpressionKind > {
281
273
self . expect ( "[" , "expected opening [" ) ?;
282
274
let mut characters = Vec :: new ( ) ;
283
275
loop {
284
276
self . space0 ( ) ;
285
- let Some ( ch) = self . parse_characters ( ) else {
277
+ let Some ( ch) = self . parse_characters ( ) ? else {
286
278
break ;
287
279
} ;
288
280
characters. push ( ch) ;
@@ -295,27 +287,48 @@ impl Parser<'_> {
295
287
Ok ( ExpressionKind :: Charset ( characters) )
296
288
}
297
289
298
- fn parse_characters ( & mut self ) -> Option < Characters > {
299
- static RANGE_RE : LazyLock < Regex > = LazyLock :: new ( || Regex :: new ( r"^`(.)`-`(.)`" ) . unwrap ( ) ) ;
300
- static TERMINAL_RE : LazyLock < Regex > = LazyLock :: new ( || Regex :: new ( "^`([^`\n ]+)`" ) . unwrap ( ) ) ;
301
- if let Some ( cap) = self . take_re ( & RANGE_RE ) {
302
- let a = cap[ 1 ] . chars ( ) . next ( ) . unwrap ( ) ;
303
- let b = cap[ 2 ] . chars ( ) . next ( ) . unwrap ( ) ;
304
- Some ( Characters :: Range ( a, b) )
305
- } else if let Some ( cap) = self . take_re ( & TERMINAL_RE ) {
306
- Some ( Characters :: Terminal ( cap[ 1 ] . to_string ( ) ) )
290
+ /// Parse an element of a character class, e.g.
291
+ /// `` `a`-`b` `` | `` `term` `` | `` NonTerminal ``.
292
+ fn parse_characters ( & mut self ) -> Result < Option < Characters > > {
293
+ if let Some ( b'`' ) = self . peek ( ) {
294
+ let recov = self . index ;
295
+ let a = self . parse_terminal_str ( ) ?;
296
+ if self . take_str ( "-" ) {
297
+ //~^ Parse `` `a`-`b` `` character range.
298
+ if a. len ( ) > 1 {
299
+ self . index = recov + 1 ;
300
+ bail ! ( self , "invalid start terminal in range" ) ;
301
+ }
302
+ let recov = self . index ;
303
+ let b = self . parse_terminal_str ( ) ?;
304
+ if b. len ( ) > 1 {
305
+ self . index = recov + 1 ;
306
+ bail ! ( self , "invalid end terminal in range" ) ;
307
+ }
308
+ let a = a. chars ( ) . next ( ) . unwrap ( ) ;
309
+ let b = b. chars ( ) . next ( ) . unwrap ( ) ;
310
+ Ok ( Some ( Characters :: Range ( a, b) ) )
311
+ } else {
312
+ //~^ Parse terminal in backticks.
313
+ Ok ( Some ( Characters :: Terminal ( a) ) )
314
+ }
315
+ } else if let Some ( name) = self . parse_name ( ) {
316
+ //~^ Parse nonterminal identifier.
317
+ Ok ( Some ( Characters :: Named ( name) ) )
307
318
} else {
308
- let name = self . parse_name ( ) ?;
309
- Some ( Characters :: Named ( name) )
319
+ Ok ( None )
310
320
}
311
321
}
312
322
323
+ /// Parse e.g. `<prose text>`.
313
324
fn parse_prose ( & mut self ) -> Result < ExpressionKind > {
314
- static PROSE_RE : LazyLock < Regex > = LazyLock :: new ( || Regex :: new ( r"^<([^>\n]+)>" ) . unwrap ( ) ) ;
315
- match self . take_re ( & PROSE_RE ) {
316
- Some ( cap ) => Ok ( ExpressionKind :: Prose ( cap [ 1 ] . to_string ( ) ) ) ,
317
- None => bail ! ( self , "unterminated prose, expected closing `>`" ) ,
325
+ self . expect ( "<" , "expected opening `<`" ) ? ;
326
+ let text = self . take_while ( & |x| ! [ '\n' , '>' ] . contains ( & x ) ) . to_string ( ) ;
327
+ if text . is_empty ( ) {
328
+ bail ! ( self , "expected prose text" ) ;
318
329
}
330
+ self . expect ( ">" , "expected closing `>`" ) ?;
331
+ Ok ( ExpressionKind :: Prose ( text) )
319
332
}
320
333
321
334
fn parse_grouped ( & mut self ) -> Result < ExpressionKind > {
@@ -344,13 +357,19 @@ impl Parser<'_> {
344
357
Ok ( ExpressionKind :: NegExpression ( box_kind ( kind) ) )
345
358
}
346
359
360
+ /// Parse e.g. `F00F` after `U+`.
347
361
fn parse_unicode ( & mut self ) -> Result < ExpressionKind > {
348
- static UNICODE_RE : LazyLock < Regex > = LazyLock :: new ( || Regex :: new ( r"^[A-Z0-9]{4}" ) . unwrap ( ) ) ;
349
-
350
- match self . take_re ( & UNICODE_RE ) {
351
- Some ( s) => Ok ( ExpressionKind :: Unicode ( s[ 0 ] . to_string ( ) ) ) ,
352
- None => bail ! ( self , "expected 4 hexadecimal uppercase digits after U+" ) ,
362
+ let mut xs = Vec :: with_capacity ( 4 ) ;
363
+ for _ in 0 ..4 {
364
+ match self . peek ( ) {
365
+ Some ( x @ ( b'0' ..=b'9' | b'A' ..=b'F' ) ) => {
366
+ xs. push ( x) ;
367
+ self . index += 1 ;
368
+ }
369
+ _ => bail ! ( self , "expected 4 uppercase hexidecimal digits after `U+`" ) ,
370
+ }
353
371
}
372
+ Ok ( ExpressionKind :: Unicode ( String :: from_utf8 ( xs) . unwrap ( ) ) )
354
373
}
355
374
356
375
/// Parse `?` after expression.
@@ -428,16 +447,17 @@ impl Parser<'_> {
428
447
Ok ( Some ( self . input [ start..self . index - 1 ] . to_string ( ) ) )
429
448
}
430
449
450
+ /// Parse footnote reference, e.g. `[^id]`.
431
451
fn parse_footnote ( & mut self ) -> Result < Option < String > > {
432
- static FOOTNOTE_RE : LazyLock < Regex > =
433
- LazyLock :: new ( || Regex :: new ( r"^([^\]\n]+)]" ) . unwrap ( ) ) ;
434
452
if !self . take_str ( "[^" ) {
435
453
return Ok ( None ) ;
436
454
}
437
- match self . take_re ( & FOOTNOTE_RE ) {
438
- Some ( cap ) => Ok ( Some ( cap [ 1 ] . to_string ( ) ) ) ,
439
- None => bail ! ( self , "unterminated footnote, expected closing `]`" ) ,
455
+ let id = self . take_while ( & |x| ! [ '\n' , ']' ] . contains ( & x ) ) . to_string ( ) ;
456
+ if id . is_empty ( ) {
457
+ bail ! ( self , "expected footnote id" ) ;
440
458
}
459
+ self . expect ( "]" , "expected closing `]`" ) ?;
460
+ Ok ( Some ( id) )
441
461
}
442
462
}
443
463
0 commit comments