fuzz: improve fuzz testing

BurntSushi · BurntSushi · commit d29bf7b2335c · 2023-07-04T21:28:07.000-04:00
It's still not as good as it could be, but we add fuzz targets for
regex-lite and DFA deserialization in regex-automata.
diff --git a/.vim/coc-settings.json b/.vim/coc-settings.json
@@ -0,0 +1,6 @@
+{
+  "rust-analyzer.linkedProjects": [
+    "fuzz/Cargo.toml",
+    "Cargo.toml"
+  ]
+}
diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml
@@ -3,16 +3,16 @@ name = "regex-fuzz"
 version = "0.0.0"
 authors = ["David Korczynski <david@adalogics.com>"]
 publish = false
-edition = "2018"
+edition = "2021"
 
 [package.metadata]
 cargo-fuzz = true
 
 [dependencies]
 libfuzzer-sys = "0.4.1"
-
-[dependencies.regex]
-path = ".."
+regex = { path = ".." }
+regex-automata = { path = "../regex-automata" }
+regex-lite = { path = "../regex-lite" }
 
 # Prevent this from interfering with workspaces
 [workspace]
@@ -22,6 +22,18 @@ members = ["."]
 name = "fuzz_regex_match"
 path = "fuzz_targets/fuzz_regex_match.rs"
 
+[[bin]]
+name = "fuzz_regex_lite_match"
+path = "fuzz_targets/fuzz_regex_lite_match.rs"
+
+[[bin]]
+name = "fuzz_regex_automata_deserialize_dense_dfa"
+path = "fuzz_targets/fuzz_regex_automata_deserialize_dense_dfa.rs"
+
+[[bin]]
+name = "fuzz_regex_automata_deserialize_sparse_dfa"
+path = "fuzz_targets/fuzz_regex_automata_deserialize_sparse_dfa.rs"
+
 [profile.release]
 opt-level = 3
 debug = true
diff --git a/fuzz/fuzz_targets/fuzz_regex_automata_deserialize_dense_dfa.rs b/fuzz/fuzz_targets/fuzz_regex_automata_deserialize_dense_dfa.rs
@@ -0,0 +1,37 @@
+#![no_main]
+
+use libfuzzer_sys::fuzz_target;
+
+fuzz_target!(|data: &[u8]| {
+    let _ = run(data);
+});
+
+/// Deserializes a dense DFA from fuzzer bytes and runs one forward search.
+fn run(given_data: &[u8]) -> Option<()> {
+    use regex_automata::{dfa::{dense::DFA, Automaton}, Input};
+
+    // Byte 0 is the haystack length; at least one more byte must follow it.
+    let (&first, rest) = given_data.split_first()?;
+    if rest.is_empty() {
+        return None;
+    }
+    let split = usize::from(first);
+    let haystack = rest.get(..split)?;
+    let tail = rest.get(split..)?;
+
+    // A serialized DFA must open with a very specific label, endianness
+    // marker and version. Forcing those bytes lets the fuzzer spend its
+    // effort on the part of the input that follows them.
+    let label = "rust-regex-automata-dfa-dense\x00\x00\x00";
+    assert_eq!(0, label.len() % 4);
+    let mut serialized = label.as_bytes().to_vec();
+    serialized.extend_from_slice(&0xFEFFu32.to_ne_bytes());
+    serialized.extend_from_slice(&2u32.to_ne_bytes());
+    serialized.extend_from_slice(tail);
+
+    // The real test: nothing handed to DFA::from_bytes may cause a panic.
+    // A search is then attempted on any DFA that does deserialize.
+    let (dfa, _) = DFA::from_bytes(&serialized).ok()?;
+    let _ = dfa.try_search_fwd(&Input::new(haystack));
+    Some(())
+}
diff --git a/fuzz/fuzz_targets/fuzz_regex_automata_deserialize_sparse_dfa.rs b/fuzz/fuzz_targets/fuzz_regex_automata_deserialize_sparse_dfa.rs
@@ -0,0 +1,37 @@
+#![no_main]
+
+use libfuzzer_sys::fuzz_target;
+
+fuzz_target!(|data: &[u8]| {
+    let _ = run(data);
+});
+
+/// Deserializes a sparse DFA from fuzzer bytes and runs one forward search.
+fn run(given_data: &[u8]) -> Option<()> {
+    use regex_automata::{dfa::{sparse::DFA, Automaton}, Input};
+
+    // Byte 0 is the haystack length; at least one more byte must follow it.
+    let (&first, rest) = given_data.split_first()?;
+    if rest.is_empty() {
+        return None;
+    }
+    let split = usize::from(first);
+    let haystack = rest.get(..split)?;
+    let tail = rest.get(split..)?;
+
+    // A serialized DFA must open with a very specific label, endianness
+    // marker and version. Forcing those bytes lets the fuzzer spend its
+    // effort on the part of the input that follows them.
+    let label = "rust-regex-automata-dfa-sparse\x00\x00";
+    assert_eq!(0, label.len() % 4);
+    let mut serialized = label.as_bytes().to_vec();
+    serialized.extend_from_slice(&0xFEFFu32.to_ne_bytes());
+    serialized.extend_from_slice(&2u32.to_ne_bytes());
+    serialized.extend_from_slice(tail);
+
+    // The real test: nothing handed to DFA::from_bytes may cause a panic.
+    // A search is then attempted on any DFA that does deserialize.
+    let (dfa, _) = DFA::from_bytes(&serialized).ok()?;
+    let _ = dfa.try_search_fwd(&Input::new(haystack));
+    Some(())
+}
diff --git a/fuzz/fuzz_targets/fuzz_regex_lite_match.rs b/fuzz/fuzz_targets/fuzz_regex_lite_match.rs
@@ -0,0 +1,23 @@
+#![no_main]
+
+use libfuzzer_sys::fuzz_target;
+
+fuzz_target!(|data: &[u8]| {
+    let _ = run(data);
+});
+
+/// Compiles the first part of the input as a regex-lite pattern and searches
+/// the rest. Returns `None` whenever the input cannot be used.
+fn run(data: &[u8]) -> Option<()> {
+    // Byte 0 selects the split point; the rest must be non-empty UTF-8.
+    let (&selector, rest) = data.split_first()?;
+    let text = std::str::from_utf8(rest).ok().filter(|t| !t.is_empty())?;
+    // The char count is a non-zero `usize` here, so no conversion is needed.
+    let split_at = usize::from(selector).max(1) % text.chars().count();
+    let byte_offset = text.char_indices().nth(split_at)?.0;
+    let (pattern, haystack) = text.split_at(byte_offset);
+    // A pattern that fails to compile ends the run early with `None`.
+    let re = regex_lite::Regex::new(pattern).ok()?;
+    re.is_match(haystack);
+    Some(())
+}
diff --git a/fuzz/fuzz_targets/fuzz_regex_match.rs b/fuzz/fuzz_targets/fuzz_regex_match.rs
@@ -1,33 +1,23 @@
 #![no_main]
+
 use libfuzzer_sys::fuzz_target;
 
 fuzz_target!(|data: &[u8]| {
+    let _ = run(data);
+});
+
+fn run(data: &[u8]) -> Option<()> {
     if data.len() < 2 {
-        return;
+        return None;
     }
-    let split_point = data[0] as usize;
-    if let Ok(data) = std::str::from_utf8(&data[1..]) {
-        use std::cmp::max;
-        // split data into regular expression and actual input to search through
-        let len = data.chars().count();
-        let split_off_point = max(split_point, 1) % len as usize;
-        let char_index = data.char_indices().nth(split_off_point);
-        if let Some((char_index, _)) = char_index {
-            let (pattern, input) = data.split_at(char_index);
-            // If the haystack is big, don't use it. The issue is that
-            // the fuzzer is compiled with sanitizer options and it makes
-            // everything pretty slow. This was put in here as a result of
-            // getting timeout errors from OSS-fuzz. There's really nothing to
-            // be done about them. Unicode word boundaries in the PikeVM are
-            // slow. It is what it is.
-            if input.len() >= 8 * (1 << 10) {
-                return;
-            }
-            let result =
-                regex::RegexBuilder::new(pattern).size_limit(1 << 18).build();
-            if let Ok(re) = result {
-                re.is_match(input);
-            }
-        }
-    }
-});
+    let mut split_at = usize::from(data[0]);
+    let data = std::str::from_utf8(&data[1..]).ok()?;
+    // Split the text into a pattern and a haystack; byte 0 picks where.
+    let len = usize::try_from(data.chars().count()).ok()?;
+    split_at = std::cmp::max(split_at, 1) % len;
+    let char_index = data.char_indices().nth(split_at)?.0;
+    let (pattern, input) = data.split_at(char_index);
+    let re = regex::Regex::new(pattern).ok()?;
+    re.is_match(input);
+    Some(())
+}