Skip to content

Commit e063ab6

Browse files
committed
api: introduce new regex-lite crate
Closes #961
1 parent 78b865e commit e063ab6

29 files changed

+7381
-0
lines changed

.github/workflows/ci.yml

+2
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,8 @@ jobs:
122122
- name: Run subset of regex-automata tests
123123
if: matrix.build != 'win-gnu' # Just horrifically slow.
124124
run: ${{ env.CARGO }} test --verbose --manifest-path regex-automata/Cargo.toml $TARGET
125+
- name: Run regex-lite tests
126+
run: ${{ env.CARGO }} test --verbose --manifest-path regex-lite/Cargo.toml $TARGET
125127

126128
# This job runs a stripped down version of CI to test the MSRV. The specific
127129
# reason for doing this is that the regex crate's dev-dependencies tend to

Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ members = [
2222
"regex-automata",
2323
"regex-capi",
2424
"regex-cli",
25+
"regex-lite",
2526
"regex-syntax",
2627
"regex-test",
2728
]

regex-cli/Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ log = { version = "0.4.17", features = ["std"] }
3030
memmap2 = "0.5.10"
3131
regex = { path = ".." }
3232
regex-automata = { path = "../regex-automata", features = ["logging"] }
33+
regex-lite = { path = "../regex-lite" }
3334
regex-syntax = { path = "../regex-syntax" }
3435
tabwriter = { version = "1.2.1", features = ["ansi_formatting"] }
3536
textwrap = { version = "0.16.0", default-features = false }

regex-cli/args/lite.rs

+84
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
use {
2+
lexopt::{Arg, Parser},
3+
regex_automata::util::syntax,
4+
regex_lite::Regex,
5+
};
6+
7+
use crate::args::{self, Configurable, Usage};
8+
9+
/// Exposes the configuration for the top-level `Regex` API.
10+
#[derive(Debug, Default)]
11+
pub struct Config {
12+
size_limit: Option<usize>,
13+
}
14+
15+
impl Config {
16+
/// Builds a `Regex` from the given syntax configuration and sequence of
17+
/// patterns. This returns an error is `patterns.len() != 1`.
18+
///
19+
/// Note that this also returns an error if any syntax options are set
20+
/// that aren't supported by `regex-lite`.
21+
pub fn from_patterns(
22+
&self,
23+
syntax: &syntax::Config,
24+
patterns: &[String],
25+
) -> anyhow::Result<Regex> {
26+
anyhow::ensure!(
27+
patterns.len() == 1,
28+
"API-level regex requires exactly one pattern, \
29+
but {} were given",
30+
patterns.len(),
31+
);
32+
anyhow::ensure!(
33+
!syntax.get_octal(),
34+
"regex-lite does not support octal mode",
35+
);
36+
anyhow::ensure!(
37+
syntax.get_utf8(),
38+
"regex-lite does not support disabling UTF-8 mode",
39+
);
40+
anyhow::ensure!(
41+
syntax.get_unicode(),
42+
"regex-lite does not support disabling Unicode mode",
43+
);
44+
let mut b = regex_lite::RegexBuilder::new(&patterns[0]);
45+
b.case_insensitive(syntax.get_case_insensitive());
46+
b.multi_line(syntax.get_multi_line());
47+
b.crlf(syntax.get_crlf());
48+
b.dot_matches_new_line(syntax.get_dot_matches_new_line());
49+
b.swap_greed(syntax.get_swap_greed());
50+
b.ignore_whitespace(syntax.get_ignore_whitespace());
51+
b.nest_limit(syntax.get_nest_limit());
52+
b.size_limit(self.size_limit.unwrap_or(usize::MAX));
53+
b.build().map_err(anyhow::Error::from)
54+
}
55+
}
56+
57+
impl Configurable for Config {
58+
fn configure(
59+
&mut self,
60+
p: &mut Parser,
61+
arg: &mut Arg,
62+
) -> anyhow::Result<bool> {
63+
match *arg {
64+
Arg::Long("size-limit") => {
65+
self.size_limit = args::parse_maybe(p, "--size-limit")?;
66+
}
67+
_ => return Ok(false),
68+
}
69+
Ok(true)
70+
}
71+
72+
fn usage(&self) -> &[Usage] {
73+
const USAGES: &'static [Usage] = &[Usage::new(
74+
"--size-limit",
75+
"Set a limit on heap used by a regex.",
76+
r#"
77+
This sets a limit, in bytes, on the heap memory used by a regex.
78+
79+
The special value 'none' indicates that no size limit should be imposed.
80+
"#,
81+
)];
82+
USAGES
83+
}
84+
}

regex-cli/args/mod.rs

+1
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ pub mod flags;
1616
pub mod haystack;
1717
pub mod hybrid;
1818
pub mod input;
19+
pub mod lite;
1920
pub mod meta;
2021
pub mod onepass;
2122
pub mod overlapping;

regex-cli/cmd/find/capture/mod.rs

+103
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ USAGE:
3232
3333
ENGINES:
3434
backtrack Search with the bounded backtracker regex engine.
35+
lite Search with the regex-lite engine.
3536
meta Search with the meta regex engine.
3637
onepass Search with the one-pass DFA regex engine.
3738
pikevm Search with the PikeVM regex engine.
@@ -40,6 +41,7 @@ ENGINES:
4041
let cmd = args::next_as_command(USAGE, p)?;
4142
match &*cmd {
4243
"backtrack" => nfa::run_backtrack(p),
44+
"lite" => run_lite(p),
4345
"meta" => run_meta(p),
4446
"onepass" => dfa::run_onepass(p),
4547
"pikevm" => nfa::run_pikevm(p),
@@ -219,6 +221,107 @@ OPTIONS:
219221
Ok(())
220222
}
221223

224+
fn run_lite(p: &mut lexopt::Parser) -> anyhow::Result<()> {
225+
const USAGE: &'static str = "\
226+
Executes a search for full matches using the top-level regex-lite engine.
227+
228+
USAGE:
229+
regex-cli find capture lite [-p <pattern> ...] <haystack-path>
230+
regex-cli find capture lite [-p <pattern> ...] -y <haystack>
231+
232+
TIP:
233+
use -h for short docs and --help for long docs
234+
235+
OPTIONS:
236+
%options%
237+
";
238+
239+
let mut common = args::common::Config::default();
240+
let mut patterns = args::patterns::Config::only_flags();
241+
let mut haystack = args::haystack::Config::default();
242+
let mut syntax = args::syntax::Config::default();
243+
let mut lite = args::lite::Config::default();
244+
let mut find = super::Config::default();
245+
args::configure(
246+
p,
247+
USAGE,
248+
&mut [
249+
&mut common,
250+
&mut patterns,
251+
&mut haystack,
252+
&mut syntax,
253+
&mut lite,
254+
&mut find,
255+
],
256+
)?;
257+
258+
let pats = patterns.get()?;
259+
let syn = syntax.syntax()?;
260+
let mut table = Table::empty();
261+
let (re, time) = util::timeitr(|| lite.from_patterns(&syn, &pats))?;
262+
table.add("build regex time", time);
263+
264+
// Check that the haystack is valid UTF-8 since regex-lite doesn't support
265+
// searching arbitrary byte sequences. (At time of writing.)
266+
haystack.get()?.to_str()?;
267+
268+
// The top-level API doesn't support regex-automata's more granular Input
269+
// abstraction.
270+
let input = args::input::Config::default();
271+
// The top-level API also doesn't use 'Captures' from regex-automata
272+
// directly, but we can map between them with some annoyance.
273+
let group_info = GroupInfo::new([re.capture_names()])
274+
.context("could not build capture group info")?;
275+
let mut locs = re.capture_locations();
276+
let search = |input: &Input<'_>, caps: &mut Captures| {
277+
let haystack = input.haystack().to_str().unwrap();
278+
caps.set_pattern(None);
279+
if !re.captures_read_at(&mut locs, haystack, input.start()).is_some() {
280+
return Ok(());
281+
}
282+
caps.set_pattern(Some(PatternID::ZERO));
283+
for i in 0..locs.len() {
284+
use regex_automata::util::primitives::NonMaxUsize;
285+
286+
let slot_start = i * 2;
287+
let slot_end = slot_start + 1;
288+
match locs.get(i) {
289+
None => {
290+
caps.slots_mut()[slot_start] = None;
291+
caps.slots_mut()[slot_end] = None;
292+
}
293+
Some((start, end)) => {
294+
caps.slots_mut()[slot_start] = NonMaxUsize::new(start);
295+
caps.slots_mut()[slot_end] = NonMaxUsize::new(end);
296+
}
297+
}
298+
}
299+
Ok(())
300+
};
301+
if find.count {
302+
run_counts(
303+
&mut table,
304+
&common,
305+
&find,
306+
&input,
307+
&haystack,
308+
&group_info,
309+
search,
310+
)?;
311+
} else {
312+
run_search(
313+
&mut table,
314+
&common,
315+
&find,
316+
&input,
317+
&haystack,
318+
&group_info,
319+
search,
320+
)?;
321+
}
322+
Ok(())
323+
}
324+
222325
/// A function that takes in a bunch of configuration, runs the given search
223326
/// routine, and prints out a table of counts.
224327
fn run_counts(

regex-cli/cmd/find/match/mod.rs

+67
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ ENGINES:
2626
backtrack Search with the bounded backtracker regex engine.
2727
dense Search with the dense DFA regex engine.
2828
hybrid Search with the lazy DFA regex engine.
29+
lite Search with the regex-lite engine.
2930
meta Search with the meta regex engine.
3031
onepass Search with the one-pass DFA regex engine.
3132
pikevm Search with the PikeVM regex engine.
@@ -37,6 +38,7 @@ ENGINES:
3738
"backtrack" => nfa::run_backtrack(p),
3839
"dense" => dfa::run_dense(p),
3940
"hybrid" => dfa::run_hybrid(p),
41+
"lite" => run_lite(p),
4042
"meta" => run_meta(p),
4143
"onepass" => dfa::run_onepass(p),
4244
"pikevm" => nfa::run_pikevm(p),
@@ -164,6 +166,71 @@ OPTIONS:
164166
Ok(())
165167
}
166168

169+
fn run_lite(p: &mut lexopt::Parser) -> anyhow::Result<()> {
170+
const USAGE: &'static str = "\
171+
Executes a search for full matches using the top-level regex-lite engine.
172+
173+
Note that since the regex-lite crate doesn't have an API for search arbitrary
174+
byte slices, the haystack must be valid UTF-8. If it isn't, this command will
175+
report an error.
176+
177+
USAGE:
178+
regex-cli find match lite [-p <pattern> ...] <haystack-path>
179+
regex-cli find match lite [-p <pattern> ...] -y <haystack>
180+
181+
TIP:
182+
use -h for short docs and --help for long docs
183+
184+
OPTIONS:
185+
%options%
186+
";
187+
188+
let mut common = args::common::Config::default();
189+
let mut patterns = args::patterns::Config::only_flags();
190+
let mut haystack = args::haystack::Config::default();
191+
let mut syntax = args::syntax::Config::default();
192+
let mut lite = args::lite::Config::default();
193+
let mut find = super::Config::default();
194+
args::configure(
195+
p,
196+
USAGE,
197+
&mut [
198+
&mut common,
199+
&mut patterns,
200+
&mut haystack,
201+
&mut syntax,
202+
&mut lite,
203+
&mut find,
204+
],
205+
)?;
206+
207+
let pats = patterns.get()?;
208+
let syn = syntax.syntax()?;
209+
let mut table = Table::empty();
210+
let (re, time) = util::timeitr(|| lite.from_patterns(&syn, &pats))?;
211+
table.add("build regex time", time);
212+
213+
// Check that the haystack is valid UTF-8 since regex-lite doesn't support
214+
// searching arbitrary byte sequences. (At time of writing.)
215+
haystack.get()?.to_str()?;
216+
217+
// The top-level regex-lite API doesn't support regex-automata's more
218+
// granular Input abstraction.
219+
let input = args::input::Config::default();
220+
let search = |input: &Input<'_>| {
221+
let haystack = input.haystack().to_str().unwrap();
222+
Ok(re
223+
.find_at(haystack, input.start())
224+
.map(|m| Match::new(PatternID::ZERO, m.start()..m.end())))
225+
};
226+
if find.count {
227+
run_counts(&mut table, &common, &find, &input, &haystack, 1, search)?;
228+
} else {
229+
run_search(&mut table, &common, &find, &input, &haystack, search)?;
230+
}
231+
Ok(())
232+
}
233+
167234
/// A function that takes in a bunch of configuration, runs the given search
168235
/// routine, and prints out a table of counts.
169236
fn run_counts(

regex-lite/Cargo.toml

+36
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
[package]
2+
name = "regex-lite"
3+
version = "0.1.0" #:version
4+
authors = ["The Rust Project Developers", "Andrew Gallant <[email protected]>"]
5+
license = "MIT OR Apache-2.0"
6+
repository = "https://github.com/rust-lang/regex/tree/master/regex-lite"
7+
documentation = "https://docs.rs/regex-lite"
8+
description = """
9+
A lightweight regex engine that optimizes for binary size and compilation time.
10+
"""
11+
workspace = ".."
12+
edition = "2021"
13+
rust-version = "1.60.0"
14+
autotests = false
15+
16+
# Features are documented in the "Crate features" section of the crate docs:
17+
# https://docs.rs/regex-syntax/*/#crate-features
18+
[features]
19+
default = ["std"]
20+
std = []
21+
22+
[dev-dependencies]
23+
anyhow = "1.0.69"
24+
regex-test = { path = "../regex-test", version = "0.1.0" }
25+
26+
[[test]]
27+
path = "tests/lib.rs"
28+
name = "integration"
29+
30+
[package.metadata.docs.rs]
31+
# We want to document all features.
32+
all-features = true
33+
# To test this locally, run:
34+
#
35+
# RUSTDOCFLAGS="--cfg docsrs" cargo +nightly doc --all-features
36+
rustdoc-args = ["--cfg", "docsrs"]

0 commit comments

Comments
 (0)