Skip to content

Commit a66df89

Browse files
committed
Rewrite parser as part of new regex-syntax crate.
This commit introduces a new `regex-syntax` crate that provides a regular expression parser and an abstract syntax for regular expressions. As part of this effort, the parser has been rewritten and has grown a substantial number of tests.

The `regex` crate itself hasn't changed too much. I opted for the smallest possible delta to get it working with the new regex AST. In most cases, this simplified code because it no longer has to deal with unwieldy flags. (Instead, flag information is baked into the AST.)

Here is a list of public-facing non-breaking changes:

* A new `regex-syntax` crate with a parser, regex AST and lots of tests. This closes #29 and fixes #84.
* A new flag, `x`, has been added. This allows one to write regexes with insignificant whitespace and comments.
* Repetition operators can now be directly applied to zero-width matches. For example, `\b+` was previously not allowed but now works. Note that one could always write `(\b)+` previously. This change is mostly about lifting an arbitrary restriction.

And a list of breaking changes:

* A new `Regex::with_size_limit` constructor function that allows one to tweak the limit on the size of a compiled regex. This fixes #67. The new method itself isn't a breaking change, but regexes that exceed the size limit (set to 10MB by default) will no longer compile. To fix, simply call `Regex::with_size_limit` with a bigger limit.
* Capture group names cannot start with a number. This is a breaking change because regexes that previously compiled (e.g., `(?P<1a>.)`) will now return an error. This fixes #69.
* The `regex::Error` type has been changed to reflect the better error reporting in the `regex-syntax` crate, and a new error for limiting regexes to a certain size. This is a breaking change. Most folks just call `unwrap()` on `Regex::new`, so I expect this to have minimal impact.

Closes #29, #67, #69, #79, #84.

[breaking-change]
1 parent 3e26dc6 commit a66df89

File tree

15 files changed

+4268
-1503
lines changed

15 files changed

+4268
-1503
lines changed

.gitignore

+3
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,7 @@
22
/Cargo.lock
33
/regex_macros/target
44
/regex_macros/Cargo.lock
5+
/regex_syntax/target
6+
/regex_syntax/Cargo.lock
7+
/bench-log
58
.*.swp

Cargo.toml

+8
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,16 @@ path = "regex_macros/benches/bench_dynamic.rs"
2121
test = false
2222
bench = true
2323

24+
[dependencies.regex-syntax]
25+
path = "regex_syntax"
26+
version = "*"
27+
2428
[dev-dependencies]
2529
rand = "0.3"
2630

2731
[features]
2832
pattern = []
33+
34+
[profile.bench]
35+
opt-level = 3
36+
lto = true

regex_macros/src/lib.rs

+67-71
Original file line numberDiff line numberDiff line change
@@ -36,10 +36,7 @@ use rustc::plugin::Registry;
3636

3737
use regex::Regex;
3838
use regex::native::{
39-
OneChar, CharClass, Any, Save, Jump, Split,
40-
Match, EmptyBegin, EmptyEnd, EmptyWordBoundary,
41-
Program, Dynamic, ExDynamic, Native,
42-
FLAG_NOCASE, FLAG_MULTI, FLAG_DOTNL, FLAG_NEGATED,
39+
Inst, Program, Dynamic, ExDynamic, Native,
4340
simple_case_fold,
4441
};
4542

@@ -79,7 +76,9 @@ fn native(cx: &mut ExtCtxt, sp: codemap::Span, tts: &[ast::TokenTree])
7976
// error is logged in 'parse' with cx.span_err
8077
None => return DummyResult::any(sp),
8178
};
82-
let re = match Regex::new(&regex) {
79+
// We use the largest possible size limit because this is happening at
80+
// compile time. We trust the programmer.
81+
let re = match Regex::with_size_limit(::std::usize::MAX, &regex) {
8382
Ok(re) => re,
8483
Err(err) => {
8584
cx.span_err(sp, &err.to_string());
@@ -121,11 +120,10 @@ impl<'a> NfaGen<'a> {
121120
None => cx.expr_none(self.sp),
122121
}
123122
);
124-
let prefix_anchor =
125-
match self.prog.insts[1] {
126-
EmptyBegin(flags) if flags & FLAG_MULTI == 0 => true,
127-
_ => false,
128-
};
123+
let prefix_anchor = match self.prog.insts[1] {
124+
Inst::StartText => true,
125+
_ => false,
126+
};
129127
let init_groups = self.vec_expr(0..num_cap_locs,
130128
&mut |cx, _| cx.expr_none(self.sp));
131129

@@ -338,49 +336,55 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str,
338336
let arms = self.prog.insts.iter().enumerate().map(|(pc, inst)| {
339337
let nextpc = pc + 1;
340338
let body = match *inst {
341-
EmptyBegin(flags) => {
342-
let cond =
343-
if flags & FLAG_MULTI > 0 {
344-
quote_expr!(self.cx,
345-
self.chars.is_begin()
346-
|| self.chars.prev == Some('\n')
347-
)
348-
} else {
349-
quote_expr!(self.cx, self.chars.is_begin())
350-
};
339+
Inst::StartLine => {
351340
quote_expr!(self.cx, {
352341
nlist.add_empty($pc);
353-
if $cond { self.add(nlist, $nextpc, &mut *groups) }
342+
if self.chars.is_begin() || self.chars.prev == Some('\n') {
343+
self.add(nlist, $nextpc, &mut *groups)
344+
}
354345
})
355346
}
356-
EmptyEnd(flags) => {
357-
let cond =
358-
if flags & FLAG_MULTI > 0 {
359-
quote_expr!(self.cx,
360-
self.chars.is_end()
361-
|| self.chars.cur == Some('\n')
362-
)
363-
} else {
364-
quote_expr!(self.cx, self.chars.is_end())
365-
};
347+
Inst::StartText => {
366348
quote_expr!(self.cx, {
367349
nlist.add_empty($pc);
368-
if $cond { self.add(nlist, $nextpc, &mut *groups) }
350+
if self.chars.is_begin() {
351+
self.add(nlist, $nextpc, &mut *groups)
352+
}
369353
})
370354
}
371-
EmptyWordBoundary(flags) => {
372-
let cond =
373-
if flags & FLAG_NEGATED > 0 {
374-
quote_expr!(self.cx, !self.chars.is_word_boundary())
375-
} else {
376-
quote_expr!(self.cx, self.chars.is_word_boundary())
377-
};
355+
Inst::EndLine => {
356+
quote_expr!(self.cx, {
357+
nlist.add_empty($pc);
358+
if self.chars.is_end() || self.chars.cur == Some('\n') {
359+
self.add(nlist, $nextpc, &mut *groups)
360+
}
361+
})
362+
}
363+
Inst::EndText => {
364+
quote_expr!(self.cx, {
365+
nlist.add_empty($pc);
366+
if self.chars.is_end() {
367+
self.add(nlist, $nextpc, &mut *groups)
368+
}
369+
})
370+
}
371+
Inst::WordBoundary => {
378372
quote_expr!(self.cx, {
379373
nlist.add_empty($pc);
380-
if $cond { self.add(nlist, $nextpc, &mut *groups) }
374+
if self.chars.is_word_boundary() {
375+
self.add(nlist, $nextpc, &mut *groups)
376+
}
377+
})
378+
}
379+
Inst::NotWordBoundary => {
380+
quote_expr!(self.cx, {
381+
nlist.add_empty($pc);
382+
if !self.chars.is_word_boundary() {
383+
self.add(nlist, $nextpc, &mut *groups)
384+
}
381385
})
382386
}
383-
Save(slot) => {
387+
Inst::Save(slot) => {
384388
let save = quote_expr!(self.cx, {
385389
let old = groups[$slot];
386390
groups[$slot] = Some(self.ic);
@@ -411,20 +415,20 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str,
411415
})
412416
}
413417
}
414-
Jump(to) => {
418+
Inst::Jump(to) => {
415419
quote_expr!(self.cx, {
416420
nlist.add_empty($pc);
417421
self.add(nlist, $to, &mut *groups);
418422
})
419423
}
420-
Split(x, y) => {
424+
Inst::Split(x, y) => {
421425
quote_expr!(self.cx, {
422426
nlist.add_empty($pc);
423427
self.add(nlist, $x, &mut *groups);
424428
self.add(nlist, $y, &mut *groups);
425429
})
426430
}
427-
// For Match, OneChar, CharClass, Any
431+
// For Match, OneChar, CharClass, Any, AnyNoNL
428432
_ => quote_expr!(self.cx, nlist.add($pc, &*groups)),
429433
};
430434
self.arm_inst(pc, body)
@@ -439,7 +443,7 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str,
439443
let arms = self.prog.insts.iter().enumerate().map(|(pc, inst)| {
440444
let nextpc = pc + 1;
441445
let body = match *inst {
442-
Match => {
446+
Inst::Match => {
443447
quote_expr!(self.cx, {
444448
match self.which {
445449
Exists => {
@@ -459,8 +463,8 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str,
459463
}
460464
})
461465
}
462-
OneChar(c, flags) => {
463-
if flags & FLAG_NOCASE > 0 {
466+
Inst::OneChar { c, casei } => {
467+
if casei {
464468
let upc = simple_case_fold(c);
465469
quote_expr!(self.cx, {
466470
let upc = self.chars.prev.map(simple_case_fold);
@@ -476,45 +480,37 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str,
476480
})
477481
}
478482
}
479-
CharClass(ref ranges, flags) => {
480-
let negate = flags & FLAG_NEGATED > 0;
481-
let casei = flags & FLAG_NOCASE > 0;
483+
Inst::CharClass(ref cls) => {
484+
let ranges: Vec<(char, char)> =
485+
cls.iter().map(|r| (r.start, r.end)).collect();
486+
let mranges = self.match_class(&ranges);
482487
let get_char =
483-
if casei {
488+
if cls.is_case_insensitive() {
484489
quote_expr!(
485490
self.cx,
486491
simple_case_fold(self.chars.prev.unwrap()))
487492
} else {
488493
quote_expr!(self.cx, self.chars.prev.unwrap())
489494
};
490-
let negcond =
491-
if negate {
492-
quote_expr!(self.cx, !found)
493-
} else {
494-
quote_expr!(self.cx, found)
495-
};
496-
let mranges = self.match_class(&ranges);
497495
quote_expr!(self.cx, {
498496
if self.chars.prev.is_some() {
499497
let c = $get_char;
500-
let found = $mranges;
501-
if $negcond {
498+
if $mranges {
502499
self.add(nlist, $nextpc, caps);
503500
}
504501
}
505502
})
506503
}
507-
Any(flags) => {
508-
if flags & FLAG_DOTNL > 0 {
509-
quote_expr!(self.cx, self.add(nlist, $nextpc, caps))
510-
} else {
511-
quote_expr!(self.cx, {
512-
if self.chars.prev != Some('\n') {
513-
self.add(nlist, $nextpc, caps)
514-
}
515-
()
516-
})
517-
}
504+
Inst::Any => {
505+
quote_expr!(self.cx, self.add(nlist, $nextpc, caps))
506+
}
507+
Inst::AnyNoNL => {
508+
quote_expr!(self.cx, {
509+
if self.chars.prev != Some('\n') {
510+
self.add(nlist, $nextpc, caps);
511+
}
512+
()
513+
})
518514
}
519515
// StartLine, StartText, EndLine, EndText, WordBoundary, NotWordBoundary, Save, Jump, Split
520516
_ => self.empty_block(),

regex_macros/tests/tests.rs

+3-3
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,8 @@ replace!(rep_named, replace_all,
203203
"w1 w2 w3 w4", "$last $first$space", "w2 w1 w4 w3");
204204
replace!(rep_trim, replace_all, "^[ \t]+|[ \t]+$", " \t trim me\t \t",
205205
"", "trim me");
206+
replace!(rep_number_hypen, replace, r"(.)(.)", "ab", "$1-$2", "a-b");
207+
replace!(rep_number_underscore, replace, r"(.)(.)", "ab", "$1_$2", "a_b");
206208

207209
macro_rules! noparse(
208210
($name:ident, $re:expr) => (
@@ -219,7 +221,6 @@ macro_rules! noparse(
219221

220222
noparse!(fail_double_repeat, "a**");
221223
noparse!(fail_no_repeat_arg, "*");
222-
noparse!(fail_no_repeat_arg_begin, "^*");
223224
noparse!(fail_incomplete_escape, "\\");
224225
noparse!(fail_class_incomplete, "[A-");
225226
noparse!(fail_class_not_closed, "[A");
@@ -235,8 +236,7 @@ noparse!(fail_bad_capture_name, "(?P<na-me>)");
235236
noparse!(fail_bad_flag, "(?a)a");
236237
noparse!(fail_empty_alt_before, "|a");
237238
noparse!(fail_empty_alt_after, "a|");
238-
noparse!(fail_counted_big_exact, "a{1001}");
239-
noparse!(fail_counted_big_min, "a{1001,}");
239+
noparse!(fail_too_big, "a{10000000}");
240240
noparse!(fail_counted_no_close, "a{1001");
241241
noparse!(fail_unfinished_cap, "(?");
242242
noparse!(fail_unfinished_escape, "\\");

regex_syntax/Cargo.toml

+13
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
[package]
2+
name = "regex-syntax"
3+
version = "0.1.0"
4+
authors = ["The Rust Project Developers"]
5+
license = "MIT/Apache-2.0"
6+
repository = "https://github.com/rust-lang/regex"
7+
documentation = "http://doc.rust-lang.org/regex"
8+
homepage = "https://github.com/rust-lang/regex"
9+
description = "A regular expression parser (RE2 only)."
10+
11+
[dev-dependencies]
12+
quickcheck = "*"
13+
rand = "*"

0 commit comments

Comments
 (0)