-
Hello, I am a regex noob, but I’d like to have a function that returns all matches from a DFA regex (because I have lookahead in some regexes). From the documentation about the use regex_automata::{
Input, MatchKind,
hybrid::dfa::{DFA, OverlappingState},
}; // 0.4.9
// Pattern for an e-mail-like token: a local part, `@` followed by a domain label, then a
// dot-separated suffix of one or two labels whose last character is an ASCII letter.
// The trailing `(?-u:\b)` is a word boundary with the Unicode flag switched off.
const EMAIL_REGEX: &str = r"[a-zA-Z0-9-_.]*[a-zA-Z0-9_](@[a-zA-Z0-9_]+)(\.[a-zA-Z0-9_]+(\.[a-zA-Z0-9_]+)?[a-zA-Z])(?-u:\b)";
// Pattern for a run of 7 to 28 hexadecimal digits with a `(?-u:\b)` word boundary on each side.
const SHA_REGEX: &str = r"(?-u:\b)[0-9a-fA-F]{7,28}(?-u:\b)";
/// A complete match: the pattern that matched plus the start and end offsets of the match.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
struct Match {
    /// Identifier of the pattern that produced this match.
    pattern_id: usize,
    /// Offset in the haystack at which the match starts.
    offset_start: usize,
    /// Offset in the haystack at which the match ends.
    offset_end: usize,
}
/// Runs `find_all_matches` on a sample haystack and asserts that exactly two matches come
/// back: pattern 0 spanning 0..18 and pattern 1 spanning 6..14.
fn main() {
    // NOTE(review): this literal is only 17 bytes long, so an `offset_end` of 18 cannot be
    // produced from it. It looks like an e-mail obfuscation placeholder that replaced the
    // real input; confirm the intended haystack.
    let haystack = "[email protected]";
    let found = find_all_matches(haystack);
    let expected = [
        Match {
            pattern_id: 0,
            offset_start: 0,
            offset_end: 18,
        },
        Match {
            pattern_id: 1,
            offset_start: 6,
            offset_end: 14,
        },
    ];
    assert_eq!(found, expected);
}
/// Intended to return every match of `EMAIL_REGEX` and `SHA_REGEX` found in `haystack`.
///
/// This version is unfinished: it only obtains the half match from the forward overlapping
/// search and then hits `todo!`, so it panics as soon as a match is found. Nothing is ever
/// pushed into `matches`; if the search errors or finds nothing, an empty vector is returned.
fn find_all_matches(haystack: &str) -> Vec<Match> {
// Lazy DFA over both patterns, configured with `MatchKind::All`.
let dfa = DFA::builder()
.configure(DFA::config().match_kind(MatchKind::All))
.build_many(&[EMAIL_REGEX, SHA_REGEX])
.unwrap();
let mut cache = dfa.create_cache();
// The overlapping state is carried across calls so each call can continue from the last one.
let mut state = OverlappingState::start();
let mut matches = Vec::<Match>::new();
loop {
// A search error ends the loop silently rather than being reported to the caller.
if dfa
.try_search_overlapping_fwd(&mut cache, &Input::new(haystack), &mut state)
.is_err()
{
break;
}
// No further match recorded in the state: the search is finished.
let Some(half_match) = state.get_match() else {
break;
};
// half_match reports the end of the first match
todo!("Combine this half match with a reverse search to complete the match")
}
matches
} But now that I successfully have a
I know this is asking for a bit of hand-holding, but I really can’t figure out how to close the loop there. Thanks for the library anyway, Gerry
Beta Was this translation helpful? Give feedback.
Replies: 2 comments 5 replies
-
Does this code snippet and comment help? I think it's what you're trying to do: regex/regex-automata/tests/dfa/suite.rs Lines 393 to 443 in 1a069b9 |
Beta Was this translation helpful? Give feedback.
-
So, yes! That was exactly what I was looking for! But! https://play.rust-lang.org/?version=stable&mode=debug&edition=2024&gist=621360e40dfdea8605f610ea4d7b06c4 use regex_automata::{
Anchored, Input, MatchKind,
hybrid::dfa::{DFA, OverlappingState},
hybrid::regex::{Cache, Regex},
}; // 0.4.9
// Pattern for an e-mail-like token: a local part, `@` followed by a domain label, then a
// dot-separated suffix of one or two labels whose last character is an ASCII letter.
// The trailing `(?-u:\b)` is a word boundary with the Unicode flag switched off.
const EMAIL_REGEX: &str = r"[a-zA-Z0-9-_.]*[a-zA-Z0-9_](@[a-zA-Z0-9_]+)(\.[a-zA-Z0-9_]+(\.[a-zA-Z0-9_]+)?[a-zA-Z])(?-u:\b)";
// Pattern for a run of 7 to 28 hexadecimal digits with a `(?-u:\b)` word boundary on each side.
const SHA_REGEX: &str = r"(?-u:\b)[0-9a-fA-F]{7,28}(?-u:\b)";
/// A complete match: the pattern that matched plus the start and end offsets of the match.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
struct Match {
    /// Identifier of the pattern that produced this match.
    pattern_id: usize,
    /// Offset in the haystack at which the match starts.
    offset_start: usize,
    /// Offset in the haystack at which the match ends.
    offset_end: usize,
}
/// Runs `find_all_matches` on a sample haystack and asserts that exactly two matches come
/// back: pattern 0 spanning 0..18 and pattern 1 spanning 6..14.
fn main() {
    // NOTE(review): this literal is only 17 bytes long, so an `offset_end` of 18 cannot be
    // produced from it. It looks like an e-mail obfuscation placeholder that replaced the
    // real input; confirm the intended haystack.
    let haystack = "[email protected]";
    let found = find_all_matches(haystack);
    let expected = [
        Match {
            pattern_id: 0,
            offset_start: 0,
            offset_end: 18,
        },
        Match {
            pattern_id: 1,
            offset_start: 6,
            offset_end: 14,
        },
    ];
    assert_eq!(found, expected);
}
fn find_all_matches(haystack: &str) -> Vec<Match> {
let re = Regex::new_many(&[EMAIL_REGEX, SHA_REGEX]).unwrap();
let mut cache = re.create_cache();
try_search_overlapping(&re, &mut cache, &Input::new(haystack))
}
/// Collects the overlapping matches of `re` in `input` as full `Match` values.
///
/// Each half match reported by the forward DFA supplies an end offset and a pattern. A
/// reverse overlapping search, restricted to `input.start()..end.offset()` and anchored to
/// that same pattern, then supplies the start offsets; one `Match` is pushed per start found.
///
/// Panics (via `unwrap`) if either the forward or the reverse search returns an error.
// NOTE(review): `Anchored::Pattern` presumably requires the DFAs to be built with
// per-pattern start states (`starts_for_each_pattern`) - confirm against the builder used
// by the caller.
fn try_search_overlapping(re: &Regex, cache: &mut Cache, input: &Input<'_>) -> Vec<Match> {
let mut matches = vec![];
let mut fwd_state = OverlappingState::start();
let (fwd_dfa, rev_dfa) = (re.forward(), re.reverse());
// Split the combined cache so both DFAs can be driven inside the nested loops.
let (fwd_cache, rev_cache) = cache.as_parts_mut();
// The block in the condition advances the forward search, then reads the match it recorded;
// the loop ends once the state holds no match.
while let Some(end) = {
fwd_dfa.try_search_overlapping_fwd(fwd_cache, input, &mut fwd_state).unwrap();
fwd_state.get_match()
} {
// Reverse search input: same haystack, limited to the text before the reported end,
// anchored to the pattern that matched in the forward direction.
let revsearch = input
.clone()
.range(input.start()..end.offset())
.anchored(Anchored::Pattern(end.pattern()))
.earliest(false);
// Fresh reverse state for every forward match.
let mut rev_state = OverlappingState::start();
while let Some(start) = {
rev_dfa.try_search_overlapping_rev(rev_cache, &revsearch, &mut rev_state).unwrap();
rev_state.get_match()
} {
// Pair each start found by the reverse search with the end from the forward search.
let mat = Match {
pattern_id: end.pattern().as_usize(),
offset_start: start.offset(),
offset_end: end.offset(),
};
matches.push(mat);
}
}
matches
} leads to At least I’m a lot closer now, so anyway I can try to debug myself out a bit |
Beta Was this translation helpful? Give feedback.
Yeah I just figured that out!! Thanks
Final code to mark the answer: