From 64073eb5fe0ce95cdadd4f5cb1192d7ae5c6847d Mon Sep 17 00:00:00 2001 From: Jeffrey Crocker Date: Wed, 30 Nov 2022 16:46:52 -0500 Subject: [PATCH] Detect and auto complete "nullable" productions (#113) * improve `AppendOnlyVec` to use it in more places * separate "term matching" from "state" * rename `State` to `Traversal` because it was confusing me (this may be controversial! please let's talk about it!) * detect "nullable" productions, and autocomplete them during parsing * added "tracing" ! --- .github/actions-rs/grcov.yml | 2 + Cargo.toml | 11 +- benches/README.md | 17 +- benches/bnf.rs | 24 ++ src/append_vec.rs | 104 ++++++ src/earley.rs | 681 ----------------------------------- src/earley/grammar.rs | 192 ++++++++++ src/earley/input_range.rs | 79 ++++ src/earley/mod.rs | 613 +++++++++++++++++++++++++++++++ src/earley/traversal.rs | 214 +++++++++++ src/grammar.rs | 2 +- src/lib.rs | 2 + src/tracing.rs | 27 ++ 13 files changed, 1282 insertions(+), 686 deletions(-) create mode 100644 src/append_vec.rs delete mode 100644 src/earley.rs create mode 100644 src/earley/grammar.rs create mode 100644 src/earley/input_range.rs create mode 100644 src/earley/mod.rs create mode 100644 src/earley/traversal.rs create mode 100644 src/tracing.rs diff --git a/.github/actions-rs/grcov.yml b/.github/actions-rs/grcov.yml index 018c79c..54832c9 100644 --- a/.github/actions-rs/grcov.yml +++ b/.github/actions-rs/grcov.yml @@ -5,3 +5,5 @@ filter: covered output-type: lcov output-path: ./lcov.info prefix-dir: /home/user/build/ +ignore: + - "../*" diff --git a/Cargo.toml b/Cargo.toml index 2b94d99..278b3ff 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,10 +14,10 @@ repository = "https://github.com/shnewto/bnf" license = "MIT" -[features] -unstable = [] - [dependencies] +tracing = { version = "0.1.37", optional = true } +tracing-subscriber = { version = "0.3.16", optional = true } +tracing-flame = { version = "0.2.0", optional = true } [dependencies.stacker] version = "0.1.2" @@ -41,6 
+41,11 @@ version = "1.0.2" [dev-dependencies] criterion = "0.3.5" +[features] +default = [] +unstable = [] +tracing = ["dep:tracing", "dep:tracing-subscriber", "dep:tracing-flame"] + [[bench]] name = "bnf" harness = false diff --git a/benches/README.md b/benches/README.md index 8a653f9..7a42082 100644 --- a/benches/README.md +++ b/benches/README.md @@ -3,6 +3,7 @@ Benchmarking numbers will vary across tests, specific grammars, rust versions, and hardware. With so many sources of noise, it is important to remember that "faster" is not always easy to define. With that in mind, BNF's benchmarking has the following goals: + * identify statistically significant performance regressions * validate performance hypothesis @@ -29,4 +30,18 @@ These benchmarks are not run during continuous integration testing. But if a dev > cargo criterion [criterion]: https://crates.io/crates/criterion -[cargo-criterion]: https://github.com/bheisler/cargo-criterion \ No newline at end of file +[cargo-criterion]: https://github.com/bheisler/cargo-criterion + +#### Flamegraph + +> CARGO_PROFILE_BENCH_DEBUG=true cargo flamegraph --bench bnf -- --bench + +`sudo` may be required for `dtrace` on macOS + +#### Tracing + +BNF has an optional "tracing" feature which will provide tracing spans during parsing. + +The benchmarks are enabled to write these tracing spans to `tracing.folded`. This data can then be parsed to provide a flamegraph. 
+ +> RUST_LOG=TRACE cargo criterion --features "tracing" && cat tracing.folded | inferno-flamegraph > flamegraph.svg diff --git a/benches/bnf.rs b/benches/bnf.rs index 01a2bda..46d3712 100644 --- a/benches/bnf.rs +++ b/benches/bnf.rs @@ -2,7 +2,31 @@ use bnf::Grammar; use criterion::{criterion_group, criterion_main, Criterion}; use rand::seq::SliceRandom; +#[cfg(feature = "tracing")] +fn init_tracing() -> impl Drop { + use tracing_flame::FlameLayer; + use tracing_subscriber::{fmt, prelude::*}; + let fmt_layer = fmt::Layer::default(); + + let (flame_layer, _guard) = FlameLayer::with_file("./tracing.folded").unwrap(); + + tracing_subscriber::registry() + .with(fmt_layer) + .with(flame_layer) + .init(); + + _guard +} + +#[cfg(not(feature = "tracing"))] +fn init_tracing() {} + fn examples(c: &mut Criterion) { + let _tracing = init_tracing(); + + #[cfg(feature = "tracing")] + let _span = tracing::span!(tracing::Level::TRACE, "BENCH ITER").entered(); + c.bench_function("parse postal", |b| { let input = std::include_str!("../tests/fixtures/postal_address.terminated.input.bnf"); b.iter(|| input.parse::().unwrap()); diff --git a/src/append_vec.rs b/src/append_vec.rs new file mode 100644 index 0000000..92c7880 --- /dev/null +++ b/src/append_vec.rs @@ -0,0 +1,104 @@ +/// Create a new id type for an [`AppendOnlyVec`], which will be a wrapped [`usize`]. +/// Example usage: `append_only_vec_id!(pub(crate) ProductionId)`; +macro_rules! append_only_vec_id { + ($visible:vis $id:ident) => { + #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] + $visible struct $id(usize); + + impl From for $id { + fn from(id: usize) -> Self { + Self(id) + } + } + + impl From<$id> for usize { + fn from(id: $id) -> Self { + id.0 + } + } + }; +} + +pub(crate) use append_only_vec_id; + +/// Vector type which does *not* allow item removal during lifetime. +/// Useful for data structures with complex, shared ownership, such as graphs. 
+#[derive(Debug, Clone)] +pub(crate) struct AppendOnlyVec { + vec: Vec, + id_type: std::marker::PhantomData, +} + +impl AppendOnlyVec +where + I: From + Into, +{ + pub fn new() -> Self { + Self::default() + } + pub fn len(&self) -> usize { + self.vec.len() + } + fn next_id(&self) -> I { + I::from(self.len()) + } + pub fn push(&mut self, item: T) -> I { + let id = self.next_id(); + self.vec.push(item); + id + } + pub fn push_with_id(&mut self, build: F) -> &T + where + F: Fn(I) -> T, + { + let id = self.next_id(); + let item = build(id); + let id = self.push(item); + self.get(id).expect("failed to get appended item") + } + pub fn get(&self, id: I) -> Option<&T> { + self.vec.get::(id.into()) + } + pub fn iter(&self) -> impl Iterator { + self.vec.iter() + } +} + +impl Default for AppendOnlyVec { + fn default() -> Self { + Self::from(vec![]) + } +} + +impl From> for AppendOnlyVec { + fn from(vec: Vec) -> Self { + Self { + vec, + id_type: std::marker::PhantomData, + } + } +} + +impl IntoIterator for AppendOnlyVec { + type Item = as IntoIterator>::Item; + type IntoIter = as IntoIterator>::IntoIter; + fn into_iter(self) -> Self::IntoIter { + self.vec.into_iter() + } +} + +impl<'a, T, K> IntoIterator for &'a AppendOnlyVec { + type Item = &'a T; + type IntoIter = std::slice::Iter<'a, T>; + fn into_iter(self) -> Self::IntoIter { + self.vec.iter() + } +} + +impl<'a, T, K> IntoIterator for &'a mut AppendOnlyVec { + type Item = &'a mut T; + type IntoIter = std::slice::IterMut<'a, T>; + fn into_iter(self) -> Self::IntoIter { + self.vec.iter_mut() + } +} diff --git a/src/earley.rs b/src/earley.rs deleted file mode 100644 index b227cba..0000000 --- a/src/earley.rs +++ /dev/null @@ -1,681 +0,0 @@ -use crate::{ - grammar::{ParseTree, ParseTreeNode}, - Term, -}; - -/// Identifier assigned to each `Production`, which are used to ignore duplicate parsing attempts. 
-#[derive(Debug, Clone, PartialEq, Eq, Hash)] -struct ProductionId(usize); - -/// `crate::Production` offers multiple possible "right hand side" `Expression`s, which is overly flexible for Earley parsing. -/// `earley::Production` is a one-to-one relationship of `Term` -> `Expression`. -struct Production<'gram> { - id: ProductionId, - lhs: &'gram Term, - rhs: &'gram crate::Expression, -} - -/// `Production`s organized to be queried during parsing. -/// Not to be confused with `crate::Grammar`. -struct Grammar<'gram> { - starting_production_ids: Vec, - productions: Vec>, - production_ids_by_lhs: std::collections::HashMap<&'gram Term, Vec>, -} - -impl<'gram> Grammar<'gram> { - /// Create a new `Grammar` which references `crate::Grammar` - pub fn new(grammar: &'gram crate::Grammar) -> Self { - let starting_term = &grammar - .productions_iter() - .next() - .expect("Grammar must have one production to parse") - .lhs; - - let productions: Vec = grammar - .productions_iter() - .flat_map(|prod| prod.rhs_iter().map(|rhs| (&prod.lhs, rhs))) - .enumerate() - .map(|(idx, (lhs, rhs))| Production { - id: ProductionId(idx), - lhs, - rhs, - }) - .collect(); - - let mut production_ids_by_lhs = std::collections::HashMap::new(); - - for prod in &productions { - production_ids_by_lhs - .entry(prod.lhs) - .or_insert_with(Vec::new) - .push(prod.id.clone()); - } - - let starting_production_ids = production_ids_by_lhs - .get(starting_term) - .expect("starting Term has no production") - .clone(); - - Self { - starting_production_ids, - productions, - production_ids_by_lhs, - } - } - pub fn starting_iter(&self) -> impl Iterator> { - self.starting_production_ids - .iter() - .map(|id| &self.productions[id.0]) - } - /// Get `Production` parts by `ProductionId` (useful when building `ParseTree`) - pub fn get_production_parts_by_id( - &self, - prod_id: &ProductionId, - ) -> (&'gram Term, &'gram crate::Expression) { - self.productions - .get(prod_id.0) - .map(|p| (p.lhs, p.rhs)) - 
.expect("invalid Production ID") - } - /// Get `Production` by the LHS `Term` (useful when predicting new `State`) - pub fn get_productions_by_lhs( - &self, - lhs: &'gram Term, - ) -> impl Iterator> { - self.production_ids_by_lhs - .get(lhs) - .into_iter() - .flat_map(|v| v.iter()) - .map(|id| &self.productions[id.0]) - } -} - -#[derive(Debug)] -struct Terms<'gram> { - slice: &'gram [Term], -} - -impl<'gram> Terms<'gram> { - pub fn new(slice: &'gram [Term]) -> Self { - Self { slice } - } - pub fn matching(&self) -> Option<&'gram Term> { - self.slice.get(0) - } - pub fn advance_by(&self, step: usize) -> &'gram [Term] { - &self.slice[step..] - } -} - -/// Earley parsing operates on "state sets". -/// `StateId` is this "state set" identifier. -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -struct StateId(usize); - -/// A sliding window over the input strings being parsed. -#[derive(Clone)] -struct InputRange<'gram> { - input: &'gram str, - start: usize, - len: usize, -} - -impl<'gram> InputRange<'gram> { - pub fn new(input: &'gram str) -> Self { - Self { - input, - start: 0, - len: 0, - } - } - pub fn next(&self) -> Option<&str> { - let next_idx = self.start + self.len; - self.input.get(next_idx..) 
- } - pub fn after(&self) -> Self { - Self { - input: self.input, - start: self.start + self.len, - len: 0, - } - } - pub fn advance_by(&self, step: usize) -> Self { - let max_len = self.input.len() - self.start; - Self { - input: self.input, - start: self.start, - len: std::cmp::min(self.len + step, max_len), - } - } - pub fn is_complete(&self) -> bool { - self.start == 0 && self.len == self.input.len() - } - pub fn state_id(&self) -> StateId { - StateId(self.start + self.len) - } -} - -/// A clear view of `InputRange`, in the format "InputRange(before | current | after)" -/// e.g., "`InputRange`(["1", "+", "("] | ["2"] | ["*", "3", "-", "4", ")"])" -impl<'gram> std::fmt::Debug for InputRange<'gram> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let before = &self.input[..self.start]; - let scanned = &self.input[self.start..][..self.len]; - let after = &self.input[self.start..][self.len..]; - write!(f, "InputRange({:?} | {:?} | {:?})", before, scanned, after) - } -} - -/// A matched `Term` which has been partially accepted -#[derive(Debug, Clone, Copy)] -enum TermMatch<'gram> { - /// The `Term` matched with a terminal string - Terminal(&'gram str), - /// The `Term` matched with completed non-terminal `State` - NonTerminal(AppendOnlyVecKey), -} - -/// One state in an Earley parser -#[derive(Debug)] -struct State<'gram> { - /// LHS `Term` which is being parsed - lhs: &'gram Term, - /// matched `Term`s which have already been parsed - matched_terms: Vec>, - /// unmatched `Term`s which have yet to be parsed - unmatched_terms: Terms<'gram>, - /// unique `ProductionId` for which `Production` created this state - production_id: ProductionId, - /// input text range which is being parsed - input_range: InputRange<'gram>, -} - -impl<'gram> State<'gram> { - /// Create new `State` without advancing matched/unmatched `Term`s - pub fn new( - lhs: &'gram Term, - production_id: ProductionId, - unmatched_terms: &'gram [Term], - input_range: 
InputRange<'gram>, - ) -> Self { - let matched_terms = Vec::with_capacity(0); - let unmatched_terms = Terms::new(unmatched_terms); - Self { - lhs, - matched_terms, - unmatched_terms, - production_id, - input_range, - } - } - /// Create a new `State` by advancing matched/unmatched `Term`s - pub fn new_term_match( - state: &State<'gram>, - matched_term: TermMatch<'gram>, - input_range_step: usize, - ) -> Self { - let mut matched_terms = Vec::with_capacity(state.matched_terms.len() + 1); - matched_terms.extend_from_slice(&state.matched_terms); - matched_terms.push(matched_term); - - Self { - lhs: state.lhs, - matched_terms, - unmatched_terms: Terms::new(state.unmatched_terms.advance_by(1)), - production_id: state.production_id.clone(), - input_range: state.input_range.advance_by(input_range_step), - } - } - /// If the `State` has scanned the full input text AND originates from a starting production - /// then it is a complete parse - pub fn is_complete(&self, starting_prod_ids: &[ProductionId]) -> bool { - starting_prod_ids.contains(&self.production_id) && self.input_range.is_complete() - } -} - -/// Create new `State` by finding `Production` with the current matching `Term` on the LHS -fn predict<'gram, 'a>( - matching: &'gram Term, - input_range: &'a InputRange<'gram>, - grammar: &'a Grammar<'gram>, -) -> impl Iterator> + 'a { - grammar.get_productions_by_lhs(matching).map(|prod| { - State::new( - prod.lhs, - prod.id.clone(), - &prod.rhs.terms, - input_range.after(), - ) - }) -} - -/// Create new `State` if the current matching `Term` matches the next inpu text -fn scan<'gram, 'a>(state: &'a State<'gram>) -> impl Iterator> { - state - .unmatched_terms - .matching() - .zip(state.input_range.next()) - .and_then(|(matching, next_input)| match matching { - Term::Terminal(term) if next_input.starts_with(term) => Some(term), - _ => None, - }) - .map(|term| { - let term_match = TermMatch::Terminal(term); - State::new_term_match(state, term_match, term.len()) - }) - 
.into_iter() -} - -/// Create new `State` by finding incomplete `State`s which are pending on another `State`. -/// -/// For example, given a pending Earley state " ::= • " ( is pending on ) -/// and a completed Earley state " ::= 'A' •". -/// -/// Then we say " ::= • " is *completed* by " ::= 'A' •", -/// which yields a new state: " ::= •" -/// -/// Note: An `State` can only be completed by another `State` with matching `StateId`. -/// This has been omitted from this example, but must be respected in real parsing. -fn complete<'gram, 'a>( - key: AppendOnlyVecKey, - input_range: &'a InputRange<'gram>, - parent: &'a State<'gram>, -) -> State<'gram> { - let term_match = TermMatch::NonTerminal(key); - State::new_term_match(parent, term_match, input_range.len) -} - -/// Earley parsing will attempt to repeat states. If not de-duplicated, this causes infinite parsing loops as well as useless processing. -/// New `State`s are de-duplicated by `StateProcessingKey`, -/// which is a combination of `InputRange`, `ProductionId`, and unmatched `Term`s. 
-#[derive(Debug, PartialEq, Eq, Hash)] -struct StateProcessingKey { - input_start: usize, - input_len: usize, - production_id: ProductionId, - unmatched_term_len: usize, -} - -impl<'gram> StateProcessingKey { - pub fn from_state(state: &State<'gram>) -> Self { - Self { - input_start: state.input_range.start, - input_len: state.input_range.len, - production_id: state.production_id.clone(), - unmatched_term_len: state.unmatched_terms.slice.len(), - } - } -} - -/// When completing `State`s, used to find matching `State`s -#[derive(Debug, PartialEq, Eq, Hash)] -struct StateCompletionKey<'gram> { - state_id: StateId, - term: Option<&'gram Term>, -} - -impl<'gram> StateCompletionKey<'gram> { - pub fn from_state(state: &State<'gram>) -> Self { - let state_id = state.input_range.state_id(); - let term = state.unmatched_terms.matching(); - Self { state_id, term } - } - pub fn from_complete(state: &State<'gram>) -> Self { - let state_id = StateId(state.input_range.start); - let term = Some(state.lhs); - Self { state_id, term } - } -} - -#[derive(Debug, Clone, Copy)] -struct AppendOnlyVecKey(usize); - -#[derive(Debug, Clone)] -struct AppendOnlyVec { - vec: Vec, -} - -impl AppendOnlyVec { - pub fn push(&mut self, item: T) -> AppendOnlyVecKey { - let idx = self.vec.len(); - self.vec.push(item); - AppendOnlyVecKey(idx) - } - - pub fn get(&self, key: AppendOnlyVecKey) -> Option<&T> { - self.vec.get(key.0) - } -} - -impl Default for AppendOnlyVec { - fn default() -> Self { - Self { vec: Vec::new() } - } -} - -/// De-duplication set when creating new `State`s -type StateUniqueSet = std::collections::HashSet; - -/// Map for finding matching `State`s on `complete` -type StateCompletionMap<'gram> = - std::collections::HashMap, Vec>; - -/// Arena allocator for `State`s which also manages: -/// * de-duplication of new `State`s -/// * a queue of unprocessed `State`s -/// * a map of `State`s for completion matching -#[derive(Debug, Default)] -struct StateArena<'gram> { - arena: 
AppendOnlyVec>, - unprocessed: std::collections::VecDeque, - processed_set: StateUniqueSet, - matching_map: StateCompletionMap<'gram>, -} - -/// Unprocessed `State` fields. -/// Does not return `State` directly, for simpler lifetimes. -/// Full `State` is available via `StateArena::get` and `key` -struct Unprocessed<'gram> { - key: AppendOnlyVecKey, - matching: Option<&'gram Term>, - input_range: InputRange<'gram>, -} - -impl<'gram> StateArena<'gram> { - pub fn new() -> Self { - Self::default() - } - /// Allocate new `State`s - pub fn alloc_extend(&mut self, iter: impl Iterator>) { - for state in iter { - let state_key = StateProcessingKey::from_state(&state); - let is_new_state = self.processed_set.insert(state_key); - - if !is_new_state { - continue; - } - - let matching_state_key = StateCompletionKey::from_state(&state); - - let state_key = self.arena.push(state); - self.unprocessed.push_back(state_key); - - if let Some(Term::Nonterminal(_)) = matching_state_key.term { - self.matching_map - .entry(matching_state_key) - .or_insert_with(Vec::new) - .push(state_key); - } - } - } - /// Get `State` stored in arena by `ArenaKey` - pub fn get(&self, key: AppendOnlyVecKey) -> Option<&State<'gram>> { - self.arena.get(key) - } - /// Get `State`s which match for state completion - pub fn get_matching(&self, state: &State<'gram>) -> impl Iterator> { - let key = StateCompletionKey::from_complete(state); - self.matching_map - .get(&key) - .into_iter() - .flat_map(|keys| keys.iter()) - .filter_map(|key| self.get(*key)) - } - /// Pop next unprocessed state fields from front of queue - pub fn pop_unprocessed(&mut self) -> Option> { - self.unprocessed - .pop_front() - .and_then(|key| self.arena.get(key).map(|state| (state, key))) - .map(|(state, key)| Unprocessed { - key, - input_range: state.input_range.clone(), - matching: state.unmatched_terms.matching(), - }) - } -} - -/// Iterator for parsing input according to `Grammar` -struct ParseIter<'gram> { - grammar: Grammar<'gram>, 
- state_arena: StateArena<'gram>, -} - -impl<'gram> ParseIter<'gram> { - pub fn new(grammar: &'gram crate::Grammar, input: &'gram str) -> Self { - let grammar = Grammar::new(grammar); - - let state_arena = StateArena::new(); - - let mut parse_iter = Self { - grammar, - state_arena, - }; - - let starting_input_range = InputRange::new(input); - let starting_states = parse_iter.grammar.starting_iter().map(|prod| { - State::new( - prod.lhs, - prod.id.clone(), - &prod.rhs.terms, - starting_input_range.clone(), - ) - }); - - parse_iter.state_arena.alloc_extend(starting_states); - - parse_iter - } - fn get_parse_tree(&self, state: &State<'gram>) -> ParseTree<'gram> { - let (lhs, _) = self - .grammar - .get_production_parts_by_id(&state.production_id); - - let rhs = state - .matched_terms - .iter() - .filter_map(|child| match child { - TermMatch::Terminal(term) => Some(ParseTreeNode::Terminal(term)), - TermMatch::NonTerminal(key) => { - let state = self.state_arena.get(*key); - state.map(|state| ParseTreeNode::Nonterminal(self.get_parse_tree(state))) - } - }) - .collect(); - - ParseTree::new(lhs, rhs) - } -} - -impl<'gram> Iterator for ParseIter<'gram> { - type Item = ParseTree<'gram>; - - fn next(&mut self) -> Option { - while let Some(Unprocessed { - key, - matching, - input_range, - }) = self.state_arena.pop_unprocessed() - { - // buffer for when new states are created - let mut created_states = Vec::::new(); - match matching { - // predict - Some(matching @ Term::Nonterminal(_)) => { - let predictions = predict(matching, &input_range, &self.grammar); - self.state_arena.alloc_extend(predictions); - } - // scan - Some(Term::Terminal(_)) => { - let state = self.state_arena.get(key)?; - let scanned = scan(state); - created_states.extend(scanned); - self.state_arena.alloc_extend(created_states.drain(..)); - } - // complete - None => { - let state = self.state_arena.get(key)?; - if state.is_complete(&self.grammar.starting_production_ids) { - let parse_tree = 
self.get_parse_tree(state); - return Some(parse_tree); - } - - let completed = self - .state_arena - .get_matching(state) - .map(|parent| complete(key, &input_range, parent)); - created_states.extend(completed); - - self.state_arena.alloc_extend(created_states.drain(..)); - } - } - } - None - } -} - -pub fn parse<'gram>( - grammar: &'gram crate::Grammar, - input: &'gram str, -) -> impl Iterator> { - ParseIter::new(grammar, input) -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::Grammar; - - #[test] - fn parse_dna_left_recursive() { - let grammar: Grammar = " ::= | - ::= \"A\" | \"C\" | \"G\" | \"T\"" - .parse() - .unwrap(); - - let input = "GATTACA"; - - let parses = parse(&grammar, input); - assert_eq!(parses.count(), 1); - } - - #[test] - fn parse_dna_right_recursive() { - let grammar: Grammar = " ::= | - ::= \"A\" | \"C\" | \"G\" | \"T\"" - .parse() - .unwrap(); - - let input = "GATTACA"; - - let parses = parse(&grammar, input); - assert_eq!(parses.count(), 1); - } - - #[test] - fn parse_ambiguous() { - let grammar: Grammar = " ::= | - ::= \"END\" - ::= \"END\"" - .parse() - .unwrap(); - - let input = "END"; - - let parses = parse(&grammar, input); - assert_eq!(parses.count(), 2); - } - - #[test] - fn parse_complete_empty() { - let grammar: Grammar = " ::= \"hi\" - ::= \"\"" - .parse() - .unwrap(); - - let input = "hi"; - - let parses = parse(&grammar, input); - assert_eq!(parses.count(), 1); - } - - #[test] - fn parse_empty() { - let grammar: Grammar = " ::= \"\"".parse().unwrap(); - - let input = ""; - - let parses = parse(&grammar, input); - assert_eq!(parses.count(), 1); - } - - // (source: ) - // Sum -> Sum [+-] Product - // Sum -> Product - // Product -> Product [*/] Factor - // Product -> Factor - // Factor -> '(' Sum ')' - // Factor -> Number - // Number -> [0-9] Number - // Number -> [0-9] - #[test] - fn parse_math() { - let grammar: Grammar = " ::= - ::= - ::= - ::= - ::= \"+\" | \"-\" - ::= \"*\" | \"/\" - ::= \"(\" \")\" - ::= - ::= - ::= 
- ::= \"0\" | \"1\" | \"2\" | \"3\" | \"4\" | \"5\" | \"6\" | \"7\" | \"8\" | \"9\" - ".parse().unwrap(); - - let input = "1+(2*3-4)"; - - let parses: Vec<_> = parse(&grammar, input).collect(); - - let expected_parse_tree = " - ::= -├── ::= -│ └── ::= -│ └── ::= -│ └── ::= -│ └── ::= \"1\" -│ └── \"1\" -├── ::= \"+\" -│ └── \"+\" -└── ::= - └── ::= \"(\" \")\" - ├── \"(\" - ├── ::= - │ ├── ::= - │ │ └── ::= - │ │ ├── ::= - │ │ │ └── ::= - │ │ │ └── ::= - │ │ │ └── ::= \"2\" - │ │ │ └── \"2\" - │ │ ├── ::= \"*\" - │ │ │ └── \"*\" - │ │ └── ::= - │ │ └── ::= - │ │ └── ::= \"3\" - │ │ └── \"3\" - │ ├── ::= \"-\" - │ │ └── \"-\" - │ └── ::= - │ └── ::= - │ └── ::= - │ └── ::= \"4\" - │ └── \"4\" - └── \")\"\n" - .trim_start(); - - assert_eq!(parses.len(), 1); - let parse_tree = format!("{}", parses[0]); - assert_eq!(parse_tree, expected_parse_tree) - } -} diff --git a/src/earley/grammar.rs b/src/earley/grammar.rs new file mode 100644 index 0000000..dc71574 --- /dev/null +++ b/src/earley/grammar.rs @@ -0,0 +1,192 @@ +use crate::append_vec::{append_only_vec_id, AppendOnlyVec}; +use crate::tracing; +use std::rc::Rc; + +append_only_vec_id!(pub(crate) ProductionId); + +/// A [`crate::Term`] which has been "matched" while parsing input +#[derive(Debug, Clone)] +pub(crate) enum TermMatch<'gram> { + /// [`crate::Term::Terminal`] which matched with a string literal + Terminal(&'gram str), + /// [`crate::Term::Nonterminal`] which was matched with a fully completed production + Nonterminal(Rc>), +} + +/// A `Term` to be "matched" with input +#[derive(Debug, Clone)] +pub(crate) enum TermMatching<'gram> { + /// A [`crate::Term`] which has not yet been matched + Unmatched(&'gram crate::Term), + /// A [`crate::Term`] which has been matched + Matched(TermMatch<'gram>), +} + +/// [`crate::Production`] offers multiple possible "right hand side" [`crate::Expression`]s, which is overly flexible for Earley parsing. 
+/// [`Production`] is a one-to-one relationship of [`crate::Term`] -> [`crate::Expression`]. +#[derive(Debug)] +pub(crate) struct Production<'gram> { + pub id: ProductionId, + pub lhs: &'gram crate::Term, + rhs: &'gram crate::Expression, +} + +impl<'gram> Production<'gram> { + pub fn start_matching(&self) -> ProductionMatching<'gram> { + let prod_id = self.id; + let lhs = self.lhs; + let rhs = self.rhs.terms_iter().map(TermMatching::Unmatched).collect(); + ProductionMatching { + prod_id, + lhs, + rhs, + matched_count: 0, + } + } +} + +/// An attempt at matching a [`Production`]'s "right hand side" [`crate::Term`]s +#[derive(Debug, Clone)] +pub(crate) struct ProductionMatching<'gram> { + pub prod_id: ProductionId, + pub lhs: &'gram crate::Term, + /// "right hand side" [`TermMatching`]s which are partitioned by the matched and unmatched. + /// For example: [Matched, Matched, Matched, Unmatched, Unmatched] + rhs: Vec>, + /// The progress cursor used to separate [`TermMatching`]s in the "right hand side" + matched_count: usize, +} + +impl<'gram> ProductionMatching<'gram> { + /// Attempt to "complete" the production, by having no unmatched terms remaining. 
+ pub fn complete(&self) -> Option> { + let rhs: Option> = self + .rhs + .iter() + .map(|term| match term { + TermMatching::Unmatched(_) => None, + TermMatching::Matched(term) => Some(term.clone()), + }) + .collect(); + + rhs.map(|rhs| { + let input_len = rhs + .iter() + .map(|term| match term { + TermMatch::Terminal(term) => term.len(), + TermMatch::Nonterminal(prod) => prod.input_len, + }) + .sum(); + + ProductionMatch { + lhs: self.lhs, + rhs, + input_len, + } + }) + } + /// Get the next unmatched [`crate::Term`] + pub fn next(&self) -> Option<&'gram crate::Term> { + self.rhs.get(self.matched_count).map(|term| match term { + TermMatching::Matched(_) => { + unreachable!("terms ahead of matching cursor cannot already be matched") + } + TermMatching::Unmatched(term) => *term, + }) + } + /// Get how many [`crate::Term`] have been matched + pub fn matched_count(&self) -> usize { + self.matched_count + } + /// Add a [`TermMatch`]. + /// Does **not** check if the added term is a valid match. That responsibility is on the caller, + /// which likely has more context for faster matching of terms. + pub fn add_term_match(&self, term_match: TermMatch<'gram>) -> Option { + // only match term if there is next + self.next().map(|_| { + let Self { + lhs, + matched_count, + rhs, + prod_id, + } = self; + let prod_id = *prod_id; + + let mut rhs = rhs.clone(); + rhs[*matched_count] = TermMatching::Matched(term_match); + let matched_count = matched_count + 1; + + Self { + lhs, + matched_count, + prod_id, + rhs, + } + }) + } +} + +/// A fully complete [`ProductionMatching`]. 
+/// Created via [`ProductionMatching::complete`] +#[derive(Debug, Clone)] +pub(crate) struct ProductionMatch<'gram> { + pub lhs: &'gram crate::Term, + pub rhs: Vec>, + pub input_len: usize, +} + +type ProdArena<'gram> = AppendOnlyVec, ProductionId>; +type ProdTermMap<'gram> = std::collections::HashMap<&'gram crate::Term, Vec>; + +/// Similar to [`crate::Grammar`], but using [`Production`] and tables useful for parsing. +#[derive(Debug)] +pub(crate) struct GrammarMatching<'gram> { + productions: ProdArena<'gram>, + prods_by_lhs: ProdTermMap<'gram>, +} + +impl<'gram, 'a> GrammarMatching<'gram> { + pub fn new(grammar: &'gram crate::Grammar) -> Self { + let _span = tracing::span!(tracing::Level::TRACE, "GrammarMatching::new").entered(); + + let mut productions = AppendOnlyVec::::new(); + let mut prods_by_lhs = ProdTermMap::new(); + let mut prods_by_rhs = ProdTermMap::new(); + + let flat_prod_iter = grammar + .productions_iter() + .flat_map(|prod| prod.rhs_iter().map(|rhs| (&prod.lhs, rhs))); + + for (lhs, rhs) in flat_prod_iter { + let prod = productions.push_with_id(|id| Production { id, lhs, rhs }); + let id = prod.id; + + prods_by_lhs.entry(lhs).or_default().push(id); + + for rhs in rhs.terms_iter() { + prods_by_rhs.entry(rhs).or_default().push(id); + } + } + + Self { + prods_by_lhs, + productions, + } + } + pub fn get_production_by_id(&'a self, prod_id: ProductionId) -> Option<&'a Production<'gram>> { + self.productions.get(prod_id) + } + pub fn get_productions_by_lhs( + &self, + lhs: &'gram crate::Term, + ) -> impl Iterator> { + self.prods_by_lhs + .get(lhs) + .into_iter() + .flatten() + .filter_map(|prod_id| self.get_production_by_id(*prod_id)) + } + pub fn productions_iter(&self) -> impl Iterator> { + self.productions.iter() + } +} diff --git a/src/earley/input_range.rs b/src/earley/input_range.rs new file mode 100644 index 0000000..4f358dd --- /dev/null +++ b/src/earley/input_range.rs @@ -0,0 +1,79 @@ +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub(crate) 
struct InputRangeOffset { + pub start: usize, + pub len: usize, +} + +impl InputRangeOffset { + pub fn total_len(&self) -> usize { + self.start + self.len + } +} + +/// A sliding window over the input strings being parsed. +#[derive(Clone)] +pub(crate) struct InputRange<'gram> { + input: &'gram str, + pub offset: InputRangeOffset, +} + +impl<'gram> InputRange<'gram> { + pub fn new(input: &'gram str) -> Self { + Self { + input, + offset: InputRangeOffset { start: 0, len: 0 }, + } + } + pub fn next(&self) -> &'gram str { + let next_idx = self.offset.start + self.offset.len; + &self.input[next_idx..] + } + pub fn after(&self) -> Self { + Self { + input: self.input, + offset: InputRangeOffset { + start: self.offset.start + self.offset.len, + len: 0, + }, + } + } + pub fn advance_by(&self, step: usize) -> Self { + let InputRangeOffset { start, len } = self.offset; + let max_len = self.input.len() - start; + let len = std::cmp::min(len + step, max_len); + Self { + input: self.input, + offset: InputRangeOffset { start, len }, + } + } + pub fn is_complete(&self) -> bool { + self.offset.start == 0 && self.offset.len == self.input.len() + } +} + +/// A clear view of [`InputRange`], in the format "InputRange(before | current | after)" +/// e.g., "InputRange(["1", "+", "("] | ["2"] | ["*", "3", "-", "4", ")"])" +impl<'gram> std::fmt::Debug for InputRange<'gram> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let InputRangeOffset { start, len, .. 
} = self.offset; + let before = &self.input[..start]; + let scanned = &self.input[start..][..len]; + let after = &self.input[start..][len..]; + write!(f, "InputRange(\"{before}|{scanned}|{after}\")",) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn debug_fmt() { + let input = "GATTACA"; + let input_range = InputRange::new(input).advance_by(3).after().advance_by(2); + + let debug_format = format!("{input_range:?}"); + + assert_eq!(debug_format, "InputRange(\"GAT|TA|CA\")"); + } +} diff --git a/src/earley/mod.rs b/src/earley/mod.rs new file mode 100644 index 0000000..d4b8afa --- /dev/null +++ b/src/earley/mod.rs @@ -0,0 +1,613 @@ +mod grammar; +mod input_range; +mod traversal; + +use crate::{tracing, ParseTree, ParseTreeNode, Term}; +use grammar::{GrammarMatching, ProductionMatch, TermMatch}; +use input_range::InputRange; +use std::rc::Rc; +use traversal::{EarleyStep, Traversal, TraversalCompletionMap, TraversalId, TraversalQueue}; + +fn predict<'gram, 'a>( + traversal: &'a Traversal<'gram>, + nonterminal: &'gram Term, + grammar: &'a GrammarMatching<'gram>, +) -> impl Iterator> + 'a { + grammar + .get_productions_by_lhs(nonterminal) + .map(|prod| Traversal::start_production(prod, &traversal.input_range)) +} + +fn complete_nullable<'gram, 'a>( + traversal: &'a Traversal<'gram>, + nonterminal: &'gram Term, + null_match_map: &'a NullMatchMap<'gram>, +) -> impl Iterator> + 'a { + null_match_map + .get(nonterminal) + .into_iter() + .flatten() + .filter_map(|matched_production| { + let term_match = TermMatch::Nonterminal(matched_production.clone()); + traversal.match_term(term_match) + }) +} + +fn scan<'gram>( + traversal: &Traversal<'gram>, + terminal: &'gram str, +) -> impl Iterator> { + let scanned = if traversal.input_range.next().starts_with(terminal) { + let term_match = TermMatch::Terminal(terminal); + traversal.match_term(term_match) + } else { + None + }; + + scanned.into_iter() +} + +fn complete<'gram, 'a>( + complete_traversal: &'a 
Traversal<'gram>,
+    prod_match: &'a Rc<ProductionMatch<'gram>>,
+    arena: &'a crate::append_vec::AppendOnlyVec<Traversal<'gram>, TraversalId>,
+    incomplete: &'a TraversalCompletionMap<'gram>,
+) -> impl Iterator<Item = Traversal<'gram>> + 'a {
+    incomplete.get(complete_traversal).filter_map(|id| {
+        let term_match = TermMatch::Nonterminal(prod_match.clone());
+
+        arena
+            .get(id)
+            .and_then(|traversal| traversal.match_term(term_match))
+    })
+}
+
+type NullMatchMap<'gram> =
+    std::collections::HashMap<&'gram crate::Term, Vec<Rc<ProductionMatch<'gram>>>>;
+
+fn find_null_prod_matches(grammar: Rc<GrammarMatching>) -> NullMatchMap {
+    let mut null_matches = NullMatchMap::new();
+    let input = "";
+
+    for starting_prod in grammar.productions_iter() {
+        let starting_term = starting_prod.lhs;
+        let is_nullable_productions = false;
+        let parses = crate::earley::parse_matching(
+            grammar.clone(),
+            input,
+            starting_term,
+            is_nullable_productions,
+        );
+
+        for parse in parses {
+            null_matches.entry(starting_term).or_default().push(parse);
+        }
+    }
+
+    null_matches
+}
+
+fn parse_tree(prod_match: Rc<ProductionMatch>) -> ParseTree {
+    let rhs = prod_match
+        .rhs
+        .iter()
+        .map(|term_match| match term_match {
+            TermMatch::Terminal(term) => ParseTreeNode::Terminal(term),
+            TermMatch::Nonterminal(prod_match) => {
+                ParseTreeNode::Nonterminal(parse_tree(prod_match.clone()))
+            }
+        })
+        .collect::<Vec<_>>();
+
+    ParseTree::new(prod_match.lhs, rhs)
+}
+struct ParseIter<'gram> {
+    grammar: Rc<GrammarMatching<'gram>>,
+    null_match_map: NullMatchMap<'gram>,
+    traversal_queue: TraversalQueue<'gram>,
+    starting_term: &'gram Term,
+}
+
+impl<'gram> ParseIter<'gram> {
+    pub fn new(
+        grammar: Rc<GrammarMatching<'gram>>,
+        input: &'gram str,
+        starting_term: &'gram Term,
+        is_nullable_productions: bool,
+    ) -> Self {
+        let input_range = InputRange::new(input);
+        let traversal_queue = TraversalQueue::new(&grammar, input_range, starting_term);
+        let null_match_map = if is_nullable_productions {
+            find_null_prod_matches(grammar.clone())
+        } else {
+            NullMatchMap::new()
+        };
+
+        Self {
+            grammar,
+            traversal_queue,
+            starting_term,
+            null_match_map,
+        }
+    }
+}
+
+impl<'gram> Iterator for ParseIter<'gram> { + type Item = Rc>; + + fn next(&mut self) -> Option { + let _span = tracing::span!(tracing::Level::TRACE, "ParseIter::next").entered(); + self.traversal_queue + .handle_pop(|id, arena, incomplete, created| { + let _span = tracing::span!(tracing::Level::TRACE, "ParseIter::handler").entered(); + let traversal = arena.get(id).expect("invalid traversal ID"); + + match traversal.earley() { + EarleyStep::Predict(nonterminal) => { + let _span = tracing::span!(tracing::Level::TRACE, "Predict").entered(); + created.extend(predict(traversal, nonterminal, &self.grammar)); + created.extend(complete_nullable( + traversal, + nonterminal, + &self.null_match_map, + )); + } + EarleyStep::Scan(terminal) => { + let _span = tracing::span!(tracing::Level::TRACE, "Scan").entered(); + created.extend(scan(traversal, terminal)); + } + EarleyStep::Complete(prod_match) => { + let _span = tracing::span!(tracing::Level::TRACE, "Complete").entered(); + created.extend(complete(traversal, &prod_match, arena, incomplete)); + + { + let _span = + tracing::span!(tracing::Level::TRACE, "full_prod_match").entered(); + + let is_full_traversal = traversal.input_range.is_complete() + && traversal.matching.lhs == self.starting_term; + + if is_full_traversal { + return Some(prod_match); + } + } + } + } + + None + }) + } +} + +pub fn parse<'gram>( + grammar: &'gram crate::Grammar, + input: &'gram str, +) -> impl Iterator> { + let _span = tracing::span!(tracing::Level::TRACE, "parse").entered(); + + let first_prod = grammar + .productions_iter() + .next() + .expect("Grammar must have one production to parse"); + + let grammar = GrammarMatching::new(grammar); + let grammar = Rc::new(grammar); + + let starting_term = &first_prod.lhs; + let is_nullable_productions = true; + + parse_matching(grammar, input, starting_term, is_nullable_productions).map(parse_tree) +} + +pub(crate) fn parse_matching<'gram>( + grammar: Rc>, + input: &'gram str, + starting_term: &'gram 
Term, + is_nullable_productions: bool, +) -> impl Iterator>> { + let _span = tracing::span!(tracing::Level::TRACE, "parse_matching").entered(); + + ParseIter::new(grammar, input, starting_term, is_nullable_productions) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::Grammar; + use quickcheck::{Arbitrary, Gen, QuickCheck, TestResult}; + + #[test] + fn undefined_prod() { + let grammar: Grammar = " + ::= | + ::= 'a' + " + .parse() + .unwrap(); + + let input = "a"; + + let parses = parse(&grammar, input); + assert_eq!(parses.count(), 1); + } + + #[test] + fn dna_left_recursive() { + let grammar: Grammar = " ::= | + ::= \"A\" | \"C\" | \"G\" | \"T\"" + .parse() + .unwrap(); + + let input = "GATTACA"; + + let parses = parse(&grammar, input); + assert_eq!(parses.count(), 1); + } + + #[test] + fn dna_right_recursive() { + let grammar: Grammar = " ::= | + ::= \"A\" | \"C\" | \"G\" | \"T\"" + .parse() + .unwrap(); + + let input = "GATTACA"; + + let parses = parse(&grammar, input); + assert_eq!(parses.count(), 1); + } + + #[test] + fn ambiguous() { + let grammar: Grammar = " ::= | + ::= \"END\" + ::= \"END\"" + .parse() + .unwrap(); + + let input = "END"; + + let parses = parse(&grammar, input); + assert_eq!(parses.count(), 2); + } + + #[test] + fn optional_noop() { + let grammar: Grammar = " + ::= | 'a' + ::= " + .parse() + .unwrap(); + + let input = "a"; + + let parses = parse(&grammar, input); + assert_eq!(parses.count(), 2); + } + + #[test] + fn recursive_nested() { + let grammar: Grammar = " + ::= | 'a' + ::= + " + .parse() + .unwrap(); + + let input = "a"; + + let parses = parse(&grammar, input); + assert_eq!(parses.count(), 2); + } + + #[test] + fn empty_right_recursive() { + let grammar: Grammar = " ::= '' | 'a' ".parse().unwrap(); + + let input = "aaaaaaaaaa"; + + let parses = parse(&grammar, input); + assert_eq!(parses.count(), 1); + } + + #[test] + fn empty_left_recursive() { + let grammar: Grammar = " ::= '' | 'a'".parse().unwrap(); + + let input = 
"aaaaaaaaaa"; + + let parses = parse(&grammar, input); + assert_eq!(parses.count(), 1); + } + + #[test] + fn complete_empty() { + let grammar: Grammar = " ::= \"hi\" + ::= \"\"" + .parse() + .unwrap(); + + let input = "hi"; + + let parses = parse(&grammar, input); + assert_eq!(parses.count(), 1); + } + + #[test] + fn empty() { + let grammar: Grammar = " ::= \"\"".parse().unwrap(); + + let input = ""; + + let parses = parse(&grammar, input); + assert_eq!(parses.count(), 1); + } + + #[test] + fn nested_empty_post() { + let grammar: Grammar = " + ::= + ::= 'a' + ::= ''" + .parse() + .unwrap(); + + let input = "a"; + + let parses = parse(&grammar, input); + assert_eq!(parses.count(), 1); + } + + #[test] + fn nested_empty_pre() { + let grammar: Grammar = " + ::= + ::= 'a' + ::= ''" + .parse() + .unwrap(); + + let input = "a"; + + let parses = parse(&grammar, input); + assert_eq!(parses.count(), 1); + } + + #[test] + fn nested_empty_pre_and_post() { + let grammar: Grammar = " + ::= + ::= 'a' + ::= ''" + .parse() + .unwrap(); + + let input = "a"; + + let parses = parse(&grammar, input); + assert_eq!(parses.count(), 1); + } + + #[test] + fn empty_inline() { + let grammar: Grammar = " + ::= '' + ::= 'a'" + .parse() + .unwrap(); + + let input = "aa"; + + let parses = parse(&grammar, input); + assert_eq!(parses.count(), 1); + } + + #[test] + fn empty_ambiguous() { + let grammar: Grammar = " + ::= | + ::= '' + ::= ''" + .parse() + .unwrap(); + + let input = ""; + + let parses = parse(&grammar, input); + assert_eq!(parses.count(), 2); + } + + #[test] + fn empty_first_nested() { + // this structure exposes improper "nullable" production detection + let grammar: Grammar = " + ::= '' | '' + ::= + ::= + " + .parse() + .unwrap(); + + let input = ""; + + let parses = parse(&grammar, input); + assert_eq!(parses.count(), 2); + } + + #[test] + fn optional_whitespace() { + let grammar: Grammar = " + ::= + | '' + + ::= '(' + ::= ')' + + ::= '' | + ::= ' ' | ' ' + " + .parse() + .unwrap(); 
+ + let input = "()"; + + assert!( + grammar.parse_input(input).next().is_some(), + "can't parse: {input}" + ); + } + + #[test] + fn qualified_whitespace() { + let grammar: Grammar = " + ::= + | + ::= + | 'unqualified' + ::= 'QUALIFIER:' + ::= + | 'qualified' + ::= ' ' | ' ' + " + .parse() + .unwrap(); + + let input = "QUALIFIER:qualified unqualified"; + + assert!( + grammar.parse_input(input).next().is_some(), + "can't parse: {input}" + ); + } + + #[derive(Debug, Clone)] + struct NestedEmptyGrammar(Grammar); + impl Arbitrary for NestedEmptyGrammar { + fn arbitrary(g: &mut Gen) -> Self { + let mut grammar: Grammar = " + ::= + ::= 'a' " + .parse() + .unwrap(); + + let mut expressions: Vec<_> = grammar + .productions_iter_mut() + .flat_map(|prod| prod.rhs_iter_mut()) + .collect(); + + let expr_indexes: Vec = (0..expressions.len()).collect(); + let expr_choice_index = g.choose(&expr_indexes).unwrap(); + let expr_choice: &mut crate::Expression = expressions[*expr_choice_index]; + + let term_choice_indexes: Vec = (0..expr_choice.terms.len()).collect(); + let term_choice_index = g.choose(&term_choice_indexes).unwrap(); + + expr_choice + .terms + .insert(*term_choice_index, Term::Nonterminal(String::from("empty"))); + + grammar.add_production(" ::= ''".parse().unwrap()); + + Self(grammar) + } + } + + fn prop_empty_rules_allow_parse(grammar: NestedEmptyGrammar) -> TestResult { + let input = "a"; + + let mut parses = parse(&grammar.0, input); + TestResult::from_bool(parses.next().is_some()) + } + + #[test] + fn empty_rules_allow_parse() { + QuickCheck::new() + .tests(1000) + .quickcheck(prop_empty_rules_allow_parse as fn(NestedEmptyGrammar) -> TestResult) + } + + // (source: ) + #[test] + fn empty_noop() { + let grammar: Grammar = " + ::= '' | + ::= " + .parse() + .unwrap(); + + let input = ""; + + let parses = parse(&grammar, input); + assert_eq!(parses.count(), 2); + } + + // (source: ) + // Sum -> Sum [+-] Product + // Sum -> Product + // Product -> Product [*/] Factor + 
// Product -> Factor + // Factor -> '(' Sum ')' + // Factor -> Number + // Number -> [0-9] Number + // Number -> [0-9] + #[test] + fn math() { + let grammar: Grammar = " ::= + ::= + ::= + ::= + ::= \"+\" | \"-\" + ::= \"*\" | \"/\" + ::= \"(\" \")\" + ::= + ::= + ::= + ::= \"0\" | \"1\" | \"2\" | \"3\" | \"4\" | \"5\" | \"6\" | \"7\" | \"8\" | \"9\" + ".parse().unwrap(); + + let input = "1+(2*3-4)"; + + let parses: Vec<_> = parse(&grammar, input).collect(); + + let expected_parse_tree = " + ::= +├── ::= +│ └── ::= +│ └── ::= +│ └── ::= +│ └── ::= \"1\" +│ └── \"1\" +├── ::= \"+\" +│ └── \"+\" +└── ::= + └── ::= \"(\" \")\" + ├── \"(\" + ├── ::= + │ ├── ::= + │ │ └── ::= + │ │ ├── ::= + │ │ │ └── ::= + │ │ │ └── ::= + │ │ │ └── ::= \"2\" + │ │ │ └── \"2\" + │ │ ├── ::= \"*\" + │ │ │ └── \"*\" + │ │ └── ::= + │ │ └── ::= + │ │ └── ::= \"3\" + │ │ └── \"3\" + │ ├── ::= \"-\" + │ │ └── \"-\" + │ └── ::= + │ └── ::= + │ └── ::= + │ └── ::= \"4\" + │ └── \"4\" + └── \")\"\n" + .trim_start(); + + assert_eq!(parses.len(), 1); + let parse_tree = format!("{}", parses[0]); + assert_eq!(parse_tree, expected_parse_tree) + } +} diff --git a/src/earley/traversal.rs b/src/earley/traversal.rs new file mode 100644 index 0000000..3e4d032 --- /dev/null +++ b/src/earley/traversal.rs @@ -0,0 +1,214 @@ +use super::grammar::{ + GrammarMatching, Production, ProductionId, ProductionMatch, ProductionMatching, TermMatch, +}; +use super::input_range::{InputRange, InputRangeOffset}; +use crate::{ + append_vec::{append_only_vec_id, AppendOnlyVec}, + tracing, Term, +}; +use std::collections::{HashMap, HashSet, VecDeque}; +use std::rc::Rc; + +/// The three main steps of the "Earley" parsing algorithm +#[derive(Debug)] +pub(crate) enum EarleyStep<'gram> { + /// If the next [`crate::Term`] is [`crate::Term::Nonterminal`] then "predict" more [`Traversal`]s + Predict(&'gram Term), + /// If the next [`crate::Term`] is [`crate::Term::Terminal`] then "scan" input text + Scan(&'gram String), + /// If the 
[`ProductionMatching`] has no unmatched [`crate::Term`]s then "complete" pending [`Traversal`]s
+    Complete(Rc<ProductionMatch<'gram>>),
+}
+
+append_only_vec_id!(pub(crate) TraversalId);
+
+/// A step in traversing a [`crate::Grammar`]
+#[derive(Debug)]
+pub(crate) struct Traversal<'gram> {
+    pub input_range: InputRange<'gram>,
+    pub matching: ProductionMatching<'gram>,
+}
+
+impl<'gram> Traversal<'gram> {
+    pub fn start_production(prod: &Production<'gram>, input_range: &InputRange<'gram>) -> Self {
+        let _span = tracing::span!(tracing::Level::TRACE, "start_production").entered();
+        let input_range = input_range.after();
+        let matching = prod.start_matching();
+        Self {
+            matching,
+            input_range,
+        }
+    }
+    pub fn duplicate_key(&self) -> TraversalDuplicateKey {
+        let _span = tracing::span!(tracing::Level::TRACE, "duplicate_key").entered();
+        TraversalDuplicateKey {
+            input_range: self.input_range.offset.clone(),
+            prod_id: self.matching.prod_id,
+            matched_term_count: self.matching.matched_count(),
+        }
+    }
+    pub fn earley(&self) -> EarleyStep<'gram> {
+        let _span = tracing::span!(tracing::Level::TRACE, "earley").entered();
+        match self.matching.next() {
+            None => {
+                let prod_match = self
+                    .matching
+                    .complete()
+                    .expect("matching must be complete because no next term");
+                EarleyStep::Complete(Rc::new(prod_match))
+            }
+            Some(term) => match term {
+                Term::Nonterminal(_) => EarleyStep::Predict(term),
+                Term::Terminal(term) => EarleyStep::Scan(term),
+            },
+        }
+    }
+    pub fn match_term(&self, term_match: TermMatch<'gram>) -> Option<Self> {
+        let _span = tracing::span!(tracing::Level::TRACE, "match_term").entered();
+        let input_len = match &term_match {
+            TermMatch::Terminal(term) => term.len(),
+            TermMatch::Nonterminal(prod) => prod.input_len,
+        };
+
+        self.matching.add_term_match(term_match).map(|matching| {
+            let input_range = self.input_range.advance_by(input_len);
+
+            Self {
+                input_range,
+                matching,
+            }
+        })
+    }
+}
+
+/// Key used for ignoring duplicate [`Traversal`]s
+#[derive(Debug,
PartialEq, Eq, Hash)]
+pub(crate) struct TraversalDuplicateKey {
+    input_range: InputRangeOffset,
+    prod_id: ProductionId,
+    matched_term_count: usize,
+}
+
+#[derive(Debug, Default)]
+pub(crate) struct TraversalCompletionMap<'gram> {
+    map: HashMap<TermCompletionKey<'gram>, Vec<TraversalId>>,
+}
+
+impl<'gram> TraversalCompletionMap<'gram> {
+    pub fn get(
+        &'_ self,
+        complete_traversal: &Traversal<'gram>,
+    ) -> impl Iterator<Item = TraversalId> + '_ {
+        let key = TermCompletionKey::new(
+            complete_traversal.matching.lhs,
+            complete_traversal.input_range.offset.start,
+        );
+        self.map.get(&key).into_iter().flatten().cloned()
+    }
+    pub fn insert(&mut self, traversal: &Traversal<'gram>, id: TraversalId) -> bool {
+        match traversal.matching.next() {
+            Some(unmatched @ Term::Nonterminal(_)) => {
+                let key =
+                    TermCompletionKey::new(unmatched, traversal.input_range.offset.total_len());
+                self.map.entry(key).or_default().push(id);
+                true
+            }
+            _ => false,
+        }
+    }
+}
+
+/// Key used for "incomplete" [`Traversal`]
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub(crate) struct TermCompletionKey<'gram> {
+    input_start: usize,
+    matching: &'gram Term,
+}
+
+impl<'gram> TermCompletionKey<'gram> {
+    pub fn new(matching: &'gram Term, input_start: usize) -> Self {
+        Self {
+            matching,
+            input_start,
+        }
+    }
+}
+
+#[derive(Debug, Default)]
+pub(crate) struct TraversalQueue<'gram> {
+    arena: AppendOnlyVec<Traversal<'gram>, TraversalId>,
+    queue: VecDeque<TraversalId>,
+    incomplete: TraversalCompletionMap<'gram>,
+    processed: HashSet<TraversalDuplicateKey>,
+}
+
+impl<'gram> TraversalQueue<'gram> {
+    pub fn new(
+        grammar: &GrammarMatching<'gram>,
+        input_range: InputRange<'gram>,
+        starting_term: &'gram Term,
+    ) -> Self {
+        let queue = VecDeque::new();
+        let starting_traversals = grammar
+            .get_productions_by_lhs(starting_term)
+            .map(|prod| Traversal::start_production(prod, &input_range));
+
+        let mut traversal_queue = Self {
+            queue,
+            ..Default::default()
+        };
+
+        traversal_queue.extend(starting_traversals);
+
+        traversal_queue
+    }
+
+    /// Extend queue with new [`Traversal`]s.
Ignores duplicates, according to [`TraversalDuplicateKey`]
+    fn extend<I>(&mut self, traversals: I)
+    where
+        I: Iterator<Item = Traversal<'gram>>,
+    {
+        let _span = tracing::span!(tracing::Level::TRACE, "Queue::extend").entered();
+        for traversal in traversals {
+            let processed_key = traversal.duplicate_key();
+            let is_new_traversal = self.processed.insert(processed_key);
+
+            if !is_new_traversal {
+                continue;
+            }
+
+            let id = self.arena.push(traversal);
+            self.queue.push_back(id);
+
+            let traversal = self.arena.get(id).unwrap();
+            self.incomplete.insert(traversal, id);
+        }
+    }
+
+    /// Pop the next [`Traversal`] from the queue, and invoke a provided "handler" function.
+    /// Any newly created [`Traversal`] by the "handler" should be placed in the provided output buffer,
+    /// which will be added to the queue (and filtered for duplicates).
+    pub fn handle_pop<H>(&mut self, mut handler: H) -> Option<Rc<ProductionMatch<'gram>>>
+    where
+        H: FnMut(
+            TraversalId,
+            &AppendOnlyVec<Traversal<'gram>, TraversalId>,
+            &mut TraversalCompletionMap<'gram>,
+            &mut Vec<Traversal<'gram>>,
+        ) -> Option<Rc<ProductionMatch<'gram>>>,
+    {
+        let _span = tracing::span!(tracing::Level::TRACE, "Queue::handle_pop").entered();
+        let mut created = Vec::<Traversal>::new();
+
+        while let Some(id) = self.queue.pop_front() {
+            let prod_match = handler(id, &self.arena, &mut self.incomplete, &mut created);
+            self.extend(created.drain(..));
+
+            if prod_match.is_some() {
+                return prod_match;
+            }
+        }
+
+        None
+    }
+}
diff --git a/src/grammar.rs b/src/grammar.rs
index 5f53218..6376639 100644
--- a/src/grammar.rs
+++ b/src/grammar.rs
@@ -16,7 +16,7 @@ pub enum ParseTreeNode<'gram> {
     Nonterminal(ParseTree<'gram>),
 }
 
-/// A tree derived by successing parsing an input string via [`Grammar::parse_input`]
+/// A tree derived by successfully parsing an input string via [`Grammar::parse_input`]
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct ParseTree<'gram> {
     /// the "left hand side" `Term` used for this `ParseTree`
diff --git a/src/lib.rs b/src/lib.rs
index 9cc380c..4fd46fc 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,5 +1,6
@@ #![doc = include_str!("../README.md")] +mod append_vec; mod earley; mod error; mod expression; @@ -8,6 +9,7 @@ mod parsers; mod production; mod slice_iter; mod term; +mod tracing; pub use crate::error::Error; pub use crate::expression::Expression; pub use crate::grammar::{Grammar, ParseTree, ParseTreeNode}; diff --git a/src/tracing.rs b/src/tracing.rs new file mode 100644 index 0000000..447e8b9 --- /dev/null +++ b/src/tracing.rs @@ -0,0 +1,27 @@ +#[cfg(feature = "tracing")] +mod defs { + pub(crate) use tracing::span; + pub(crate) use tracing::Level; +} + +#[cfg(not(feature = "tracing"))] +mod defs { + pub struct Span {} + + impl Span { + pub fn entered(&self) -> Self { + Self {} + } + } + + macro_rules! span { + ($($any:tt)*) => {{ + use crate::tracing::Span; + Span {} + }}; + } + + pub(crate) use span; +} + +pub(crate) use defs::*;