Skip to content

Commit d29bf7b

Browse files
committed
fuzz: improve fuzz testing
It's still not as good as it could be, but we add fuzz targets for regex-lite and DFA deserialization in regex-automata.
1 parent e063ab6 commit d29bf7b

6 files changed

+136
-31
lines changed

.vim/coc-settings.json

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
{
2+
"rust-analyzer.linkedProjects": [
3+
"fuzz/Cargo.toml",
4+
"Cargo.toml"
5+
]
6+
}

fuzz/Cargo.toml

+16-4
Original file line numberDiff line numberDiff line change
@@ -3,16 +3,16 @@ name = "regex-fuzz"
33
version = "0.0.0"
44
authors = ["David Korczynski <[email protected]>"]
55
publish = false
6-
edition = "2018"
6+
edition = "2021"
77

88
[package.metadata]
99
cargo-fuzz = true
1010

1111
[dependencies]
1212
libfuzzer-sys = "0.4.1"
13-
14-
[dependencies.regex]
15-
path = ".."
13+
regex = { path = ".." }
14+
regex-automata = { path = "../regex-automata" }
15+
regex-lite = { path = "../regex-lite" }
1616

1717
# Prevent this from interfering with workspaces
1818
[workspace]
@@ -22,6 +22,18 @@ members = ["."]
2222
name = "fuzz_regex_match"
2323
path = "fuzz_targets/fuzz_regex_match.rs"
2424

25+
[[bin]]
26+
name = "fuzz_regex_lite_match"
27+
path = "fuzz_targets/fuzz_regex_lite_match.rs"
28+
29+
[[bin]]
30+
name = "fuzz_regex_automata_deserialize_dense_dfa"
31+
path = "fuzz_targets/fuzz_regex_automata_deserialize_dense_dfa.rs"
32+
33+
[[bin]]
34+
name = "fuzz_regex_automata_deserialize_sparse_dfa"
35+
path = "fuzz_targets/fuzz_regex_automata_deserialize_sparse_dfa.rs"
36+
2537
[profile.release]
2638
opt-level = 3
2739
debug = true
fuzz/fuzz_targets/fuzz_regex_automata_deserialize_dense_dfa.rs

+37
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
#![no_main]
2+
3+
use libfuzzer_sys::fuzz_target;
4+
5+
// Fuzzer entry point (via `libfuzzer_sys::fuzz_target`): hands the raw input
// bytes to `run`. The `Option` result is deliberately discarded, since `None`
// only means `run` bailed out early on this input.
fuzz_target!(|data: &[u8]| {
6+
let _ = run(data);
7+
});
8+
9+
fn run(given_data: &[u8]) -> Option<()> {
10+
use regex_automata::dfa::Automaton;
11+
12+
if given_data.len() < 2 {
13+
return None;
14+
}
15+
let haystack_len = usize::from(given_data[0]);
16+
let haystack = given_data.get(1..1 + haystack_len)?;
17+
let given_dfa_bytes = given_data.get(1 + haystack_len..)?;
18+
19+
// We help the fuzzer along by adding a preamble to the bytes that should
20+
// at least make these first parts valid. The preamble expects a very
21+
// specific sequence of bytes, so it makes sense to just force this.
22+
let label = "rust-regex-automata-dfa-dense\x00\x00\x00";
23+
assert_eq!(0, label.len() % 4);
24+
let endianness_check = 0xFEFFu32.to_ne_bytes().to_vec();
25+
let version_check = 2u32.to_ne_bytes().to_vec();
26+
let mut dfa_bytes: Vec<u8> = vec![];
27+
dfa_bytes.extend(label.as_bytes());
28+
dfa_bytes.extend(&endianness_check);
29+
dfa_bytes.extend(&version_check);
30+
dfa_bytes.extend(given_dfa_bytes);
31+
// This is the real test: checking that any input we give to
32+
// DFA::from_bytes will never result in a panic.
33+
let (dfa, _) =
34+
regex_automata::dfa::dense::DFA::from_bytes(&dfa_bytes).ok()?;
35+
let _ = dfa.try_search_fwd(&regex_automata::Input::new(haystack));
36+
Some(())
37+
}
fuzz/fuzz_targets/fuzz_regex_automata_deserialize_sparse_dfa.rs

+37
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
#![no_main]
2+
3+
use libfuzzer_sys::fuzz_target;
4+
5+
// Fuzzer entry point (via `libfuzzer_sys::fuzz_target`): hands the raw input
// bytes to `run`. The `Option` result is deliberately discarded, since `None`
// only means `run` bailed out early on this input.
fuzz_target!(|data: &[u8]| {
6+
let _ = run(data);
7+
});
8+
9+
fn run(given_data: &[u8]) -> Option<()> {
10+
use regex_automata::dfa::Automaton;
11+
12+
if given_data.len() < 2 {
13+
return None;
14+
}
15+
let haystack_len = usize::from(given_data[0]);
16+
let haystack = given_data.get(1..1 + haystack_len)?;
17+
let given_dfa_bytes = given_data.get(1 + haystack_len..)?;
18+
19+
// We help the fuzzer along by adding a preamble to the bytes that should
20+
// at least make these first parts valid. The preamble expects a very
21+
// specific sequence of bytes, so it makes sense to just force this.
22+
let label = "rust-regex-automata-dfa-sparse\x00\x00";
23+
assert_eq!(0, label.len() % 4);
24+
let endianness_check = 0xFEFFu32.to_ne_bytes().to_vec();
25+
let version_check = 2u32.to_ne_bytes().to_vec();
26+
let mut dfa_bytes: Vec<u8> = vec![];
27+
dfa_bytes.extend(label.as_bytes());
28+
dfa_bytes.extend(&endianness_check);
29+
dfa_bytes.extend(&version_check);
30+
dfa_bytes.extend(given_dfa_bytes);
31+
// This is the real test: checking that any input we give to
32+
// DFA::from_bytes will never result in a panic.
33+
let (dfa, _) =
34+
regex_automata::dfa::sparse::DFA::from_bytes(&dfa_bytes).ok()?;
35+
let _ = dfa.try_search_fwd(&regex_automata::Input::new(haystack));
36+
Some(())
37+
}
fuzz/fuzz_targets/fuzz_regex_lite_match.rs

+23
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
#![no_main]
2+
3+
use libfuzzer_sys::fuzz_target;
4+
5+
// Fuzzer entry point (via `libfuzzer_sys::fuzz_target`): hands the raw input
// bytes to `run`. The `Option` result is deliberately discarded, since `None`
// only means `run` bailed out early on this input.
fuzz_target!(|data: &[u8]| {
6+
let _ = run(data);
7+
});
8+
9+
fn run(data: &[u8]) -> Option<()> {
10+
if data.len() < 2 {
11+
return None;
12+
}
13+
let mut split_at = usize::from(data[0]);
14+
let data = std::str::from_utf8(&data[1..]).ok()?;
15+
// Split data into a regex and haystack to search.
16+
let len = usize::try_from(data.chars().count()).ok()?;
17+
split_at = std::cmp::max(split_at, 1) % len;
18+
let char_index = data.char_indices().nth(split_at)?.0;
19+
let (pattern, input) = data.split_at(char_index);
20+
let re = regex_lite::Regex::new(pattern).ok()?;
21+
re.is_match(input);
22+
Some(())
23+
}

fuzz/fuzz_targets/fuzz_regex_match.rs

+17-27
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,23 @@
11
#![no_main]
2+
23
use libfuzzer_sys::fuzz_target;
34

45
// Fuzzer entry point (via `libfuzzer_sys::fuzz_target`): hands the raw input
// bytes to `run`. The `Option` result is deliberately discarded, since `None`
// only means `run` bailed out early on this input.
fuzz_target!(|data: &[u8]| {
6+
let _ = run(data);
7+
});
8+
9+
fn run(data: &[u8]) -> Option<()> {
510
if data.len() < 2 {
6-
return;
11+
return None;
712
}
8-
let split_point = data[0] as usize;
9-
if let Ok(data) = std::str::from_utf8(&data[1..]) {
10-
use std::cmp::max;
11-
// split data into regular expression and actual input to search through
12-
let len = data.chars().count();
13-
let split_off_point = max(split_point, 1) % len as usize;
14-
let char_index = data.char_indices().nth(split_off_point);
15-
if let Some((char_index, _)) = char_index {
16-
let (pattern, input) = data.split_at(char_index);
17-
// If the haystack is big, don't use it. The issue is that
18-
// the fuzzer is compiled with sanitizer options and it makes
19-
// everything pretty slow. This was put in here as a result of
20-
// getting timeout errors from OSS-fuzz. There's really nothing to
21-
// be done about them. Unicode word boundaries in the PikeVM are
22-
// slow. It is what it is.
23-
if input.len() >= 8 * (1 << 10) {
24-
return;
25-
}
26-
let result =
27-
regex::RegexBuilder::new(pattern).size_limit(1 << 18).build();
28-
if let Ok(re) = result {
29-
re.is_match(input);
30-
}
31-
}
32-
}
33-
});
13+
let mut split_at = usize::from(data[0]);
14+
let data = std::str::from_utf8(&data[1..]).ok()?;
15+
// Split data into a regex and haystack to search.
16+
let len = usize::try_from(data.chars().count()).ok()?;
17+
split_at = std::cmp::max(split_at, 1) % len;
18+
let char_index = data.char_indices().nth(split_at)?.0;
19+
let (pattern, input) = data.split_at(char_index);
20+
let re = regex::Regex::new(pattern).ok()?;
21+
re.is_match(input);
22+
Some(())
23+
}

0 commit comments

Comments
 (0)