From 06311e6c438f5bc3e1524bf624ea026997604757 Mon Sep 17 00:00:00 2001 From: Igor Date: Tue, 5 Nov 2024 17:19:46 +0400 Subject: [PATCH] feat: the new parser based on winnow --- Cargo.lock | 45 +- brush-parser/Cargo.toml | 3 + brush-parser/benches/parser.rs | 106 ++- brush-parser/src/ast.rs | 10 +- brush-parser/src/lib.rs | 3 + brush-parser/src/parser.rs | 2 +- brush-parser/src/parser2.rs | 293 +++++++ brush-parser/src/parser2/assignment.rs | 182 +++++ brush-parser/src/parser2/complete_command.rs | 198 +++++ .../complete_command/compound_command.rs | 726 ++++++++++++++++++ .../parser2/complete_command/extended_test.rs | 239 ++++++ .../complete_command/function_definition.rs | 107 +++ .../complete_command/simple_command.rs | 168 ++++ .../src/parser2/custom_combinators.rs | 456 +++++++++++ brush-parser/src/parser2/expansion.rs | 2 + brush-parser/src/parser2/io.rs | 197 +++++ brush-parser/src/parser2/pipeline.rs | 117 +++ brush-parser/src/parser2/precedence.rs | 258 +++++++ brush-parser/src/parser2/trivia.rs | 84 ++ brush-parser/src/parser2/word.rs | 423 ++++++++++ 20 files changed, 3604 insertions(+), 15 deletions(-) create mode 100644 brush-parser/src/parser2.rs create mode 100644 brush-parser/src/parser2/assignment.rs create mode 100644 brush-parser/src/parser2/complete_command.rs create mode 100644 brush-parser/src/parser2/complete_command/compound_command.rs create mode 100644 brush-parser/src/parser2/complete_command/extended_test.rs create mode 100644 brush-parser/src/parser2/complete_command/function_definition.rs create mode 100644 brush-parser/src/parser2/complete_command/simple_command.rs create mode 100644 brush-parser/src/parser2/custom_combinators.rs create mode 100644 brush-parser/src/parser2/expansion.rs create mode 100644 brush-parser/src/parser2/io.rs create mode 100644 brush-parser/src/parser2/pipeline.rs create mode 100644 brush-parser/src/parser2/precedence.rs create mode 100644 brush-parser/src/parser2/trivia.rs create mode 100644 
brush-parser/src/parser2/word.rs diff --git a/Cargo.lock b/Cargo.lock index 7e670b9f..c435a2d3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -75,6 +75,21 @@ version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" +[[package]] +name = "anstream" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ca84f3628370c59db74ee214b3263d58f9aadd9b4fe7e711fd87dc452b7f163" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon 1.0.2", + "colorchoice", + "is-terminal", + "utf8parse", +] + [[package]] name = "anstream" version = "0.6.18" @@ -84,7 +99,7 @@ dependencies = [ "anstyle", "anstyle-parse", "anstyle-query", - "anstyle-wincon", + "anstyle-wincon 3.0.6", "colorchoice", "is_terminal_polyfill", "utf8parse", @@ -114,6 +129,16 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "anstyle-wincon" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c677ab05e09154296dd37acecd46420c17b9713e8366facafa8fc0885167cf4c" +dependencies = [ + "anstyle", + "windows-sys 0.48.0", +] + [[package]] name = "anstyle-wincon" version = "3.0.6" @@ -331,6 +356,7 @@ dependencies = [ "thiserror 2.0.3", "tracing", "utf8-chars", + "winnow", ] [[package]] @@ -531,7 +557,7 @@ version = "4.5.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b17a95aa67cc7b5ebd32aa5370189aa0d79069ef1c64ce893bd30fb24bff20ec" dependencies = [ - "anstream", + "anstream 0.6.18", "anstyle", "clap_lex", "strsim", @@ -1210,7 +1236,7 @@ version = "2.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "80b84a66a325082740043a6c28bbea400c129eac0d3a27673a1de971e44bf1f7" dependencies = [ - "anstream", + "anstream 0.6.18", "anstyle", "backtrace", "os_info", @@ -3012,6 +3038,19 @@ version = "0.52.6" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" +[[package]] +name = "winnow" +version = "0.6.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36c1fec1a2bb5866f07c25f68c26e565c4c200aebb96d7e55710c19d3e8ac49b" +dependencies = [ + "anstream 0.3.2", + "anstyle", + "is-terminal", + "memchr", + "terminal_size", +] + [[package]] name = "xtask" version = "0.1.0" diff --git a/brush-parser/Cargo.toml b/brush-parser/Cargo.toml index 475a83bc..b06eb184 100644 --- a/brush-parser/Cargo.toml +++ b/brush-parser/Cargo.toml @@ -15,11 +15,14 @@ rust-version.workspace = true bench = false [features] +default = ["debug"] fuzz-testing = ["dep:arbitrary"] +debug = ["winnow/debug"] [dependencies] arbitrary = { version = "1.4.1", optional = true, features = ["derive"] } cached = "0.54.0" +winnow = { version = "0.6.20", features = ["simd"] } indenter = "0.3.3" peg = "0.8.4" thiserror = "2.0.3" diff --git a/brush-parser/benches/parser.rs b/brush-parser/benches/parser.rs index f7f90eb4..8bf5aba8 100644 --- a/brush-parser/benches/parser.rs +++ b/brush-parser/benches/parser.rs @@ -13,15 +13,12 @@ mod unix { .unwrap() } - fn parse_sample_script() -> brush_parser::ast::Program { - let input = r#" - for f in A B C; do - echo "${f@L}" >&2 - done - "#; - + fn parse_sample_script(input: &str) -> brush_parser::ast::Program { parse_script(input) } + fn parse_sample_script2(input: &str) -> brush_parser::ast::Program { + brush_parser::parse_program(brush_parser::ParserOptions::default(), input).unwrap() + } fn benchmark_parsing_script(c: &mut Criterion, script_path: &std::path::Path) { let contents = std::fs::read_to_string(script_path).unwrap(); @@ -37,8 +34,13 @@ mod unix { } pub(crate) fn criterion_benchmark(c: &mut Criterion) { + let input = r#" + for f in A B C; do + echo "${f@L}" >&2 + done + "#; c.bench_function("parse_sample_script", |b| { - b.iter(|| 
black_box(parse_sample_script())) + b.iter(|| black_box(parse_sample_script(input))) }); const POSSIBLE_BASH_COMPLETION_SCRIPT_PATH: &str = @@ -50,6 +52,84 @@ mod unix { benchmark_parsing_script(c, &well_known_complicated_script); } } + + pub(crate) fn compare_parsers(c: &mut Criterion) { + // compare_parsers_cached(c); + compare_parsers_uncached(c); + } + + fn compare_parsers_uncached(c: &mut Criterion) { + let mut group = c.benchmark_group("compare_parsers"); + // prevent caching + let mut i: usize = 0; + group.bench_function("old_parser_uncached", |b| { + b.iter_batched( + || { + i += 1; + format!( + r#" + for f in A B C; do + echo {i} "${{f@L}}" >&2 + done + "# + ) + }, + |input| black_box(parse_sample_script(input.as_str())), + criterion::BatchSize::SmallInput, + ) + }); + let mut i: usize = 0; + group.bench_function("new_parser_uncached", |b| { + b.iter_batched( + || { + i += 1; + format!( + r#" + for f in A B C; do + echo {i} "${{f@L}}" >&2 + done + "# + ) + }, + |input| { + black_box( + brush_parser::parse_program( + brush_parser::ParserOptions::default(), + input.as_str(), + ) + .unwrap(), + ) + }, + criterion::BatchSize::SmallInput, + ) + }); + + group.finish(); + } + fn compare_parsers_cached(c: &mut Criterion) { + let input = r#" + for f in A B C; do + echo "${f@L}" >&2 + done + "#; + let mut group = c.benchmark_group("compare_parsers_cached"); + + group.bench_function("old_parser_cached", |b| { + b.iter(|| black_box(parse_sample_script(input))) + }); + group.bench_function("new_parser_cached", |b| { + b.iter(|| { + black_box(black_box( + brush_parser::cacheable_parse_program( + brush_parser::ParserOptions::default(), + input.to_string(), + ) + .unwrap(), + )) + }) + }); + group.finish(); + } } #[cfg(unix)] @@ -58,8 +138,16 @@ criterion::criterion_group! 
{ config = criterion::Criterion::default().with_profiler(pprof::criterion::PProfProfiler::new(100, pprof::criterion::Output::Flamegraph(None))); targets = unix::criterion_benchmark } + +#[cfg(unix)] +criterion::criterion_group! { + name = compare_parsers; + config = criterion::Criterion::default().with_profiler(pprof::criterion::PProfProfiler::new(100, pprof::criterion::Output::Flamegraph(None))); + targets =unix::compare_parsers +} + #[cfg(unix)] -criterion::criterion_main!(benches); +criterion::criterion_main!(compare_parsers); #[cfg(not(unix))] fn main() -> () {} diff --git a/brush-parser/src/ast.rs b/brush-parser/src/ast.rs index cdc40c02..d61282c9 100644 --- a/brush-parser/src/ast.rs +++ b/brush-parser/src/ast.rs @@ -42,6 +42,12 @@ pub enum SeparatorOperator { Sequence, } +impl Default for SeparatorOperator { + fn default() -> Self { + SeparatorOperator::Sequence + } +} + impl Display for SeparatorOperator { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { @@ -1031,7 +1037,7 @@ impl Display for ExtendedTestExpr { } /// A unary predicate usable in an extended test expression. -#[derive(Clone, Debug)] +#[derive(Clone, Copy, Debug)] #[cfg_attr(feature = "fuzz-testing", derive(arbitrary::Arbitrary))] #[cfg_attr(test, derive(PartialEq, Eq))] pub enum UnaryPredicate { @@ -1120,7 +1126,7 @@ impl Display for UnaryPredicate { } /// A binary predicate usable in an extended test expression. -#[derive(Clone, Debug)] +#[derive(Clone, Copy, Debug)] #[cfg_attr(feature = "fuzz-testing", derive(arbitrary::Arbitrary))] #[cfg_attr(test, derive(PartialEq, Eq))] pub enum BinaryPredicate { diff --git a/brush-parser/src/lib.rs b/brush-parser/src/lib.rs index 55bb9dde..f63c291f 100644 --- a/brush-parser/src/lib.rs +++ b/brush-parser/src/lib.rs @@ -1,5 +1,6 @@ //! Implements a tokenizer and parsers for POSIX / bash shell syntax. 
+// #![feature(test)] #![deny(missing_docs)] pub mod arithmetic; @@ -11,8 +12,10 @@ pub mod word; mod error; mod parser; +mod parser2; mod tokenizer; +pub use parser2::{parse_program, cacheable_parse_program}; pub use error::{ParseError, TestCommandParseError, WordParseError}; pub use parser::{parse_tokens, Parser, ParserOptions, SourceInfo}; pub use tokenizer::{tokenize_str, unquote_str, SourcePosition, Token, TokenLocation}; diff --git a/brush-parser/src/parser.rs b/brush-parser/src/parser.rs index 1d1c9af5..3a6b218d 100644 --- a/brush-parser/src/parser.rs +++ b/brush-parser/src/parser.rs @@ -3,7 +3,7 @@ use crate::error; use crate::tokenizer::{Token, TokenEndReason, Tokenizer, TokenizerOptions, Tokens}; /// Options used to control the behavior of the parser. -#[derive(Clone, Eq, Hash, PartialEq)] +#[derive(Debug, Clone, Eq, Hash, PartialEq)] pub struct ParserOptions { /// Whether or not to enable extended globbing (a.k.a. `extglob`). pub enable_extended_globbing: bool, diff --git a/brush-parser/src/parser2.rs b/brush-parser/src/parser2.rs new file mode 100644 index 00000000..ada467da --- /dev/null +++ b/brush-parser/src/parser2.rs @@ -0,0 +1,293 @@ +use crate::{ast, ParserOptions}; + +mod assignment; +mod complete_command; +mod custom_combinators; +mod expansion; +mod io; +mod pipeline; +mod precedence; +mod trivia; +mod word; + +use winnow::{ + combinator::{cut_err, eof, opt, preceded, repeat_till, terminated, trace}, + PResult, Parser, +}; + +use self::complete_command::insignificant; + +type Input<'b> = winnow::Stateful, ParserOptions>; + +/// Top level function to start parsing a script +pub fn parse_program( + state: ParserOptions, + input: &str, +) -> Result { + let parse_result = program.parse(new_input(state, input)); + + match parse_result { + Ok(program) => { + tracing::debug!(target: "parse", "PROG: {:?}", program); + Ok(program) + } + Err(parse_error) => { + tracing::debug!(target: "parse", "Parse error: {:?}", parse_error); + // TODO: errors + 
Err(crate::error::ParseError::ParsingAtEndOfInput) + } + } +} + +#[cached::proc_macro::cached(size = 64, result = true)] +pub fn cacheable_parse_program( + state: ParserOptions, + input: String, +) -> Result { + parse_program(state, input.as_str()) +} + +pub(crate) fn new_input(options: ParserOptions, s: &str) -> Input<'_> { + winnow::Stateful { + input: winnow::Located::new(winnow::BStr::new(s)), + state: options, + } +} + +/// `program := bom? insignificant_lines* complete_command* insignificant_lines* eof` +pub(crate) fn program(i: &mut Input<'_>) -> PResult { + trace( + "program", + // TODO: streaming + preceded( + // Remove BOM if present + (trace("BOM", opt(b"\xEF\xBB\xBF")), insignificant), + repeat_till( + 0.., + terminated( + cut_err(complete_command::complete_command.map(ast::CompoundList)), + insignificant, + ), + eof.void(), + ), + ) + .map(|(complete_commands, ())| ast::Program { complete_commands }), + ) + .parse_next(i) +} + +#[cfg(test)] +mod tests { + // TODO: tests https://github.com/andrewhickman/protox/blob/main/protox-parse/src/parse/tests.rs + use super::*; + + pub(crate) type Result = std::result::Result< + T, + winnow::error::ParseError, winnow::error::ContextError>, + >; + + pub(crate) fn input<'i>(i: &'i str) -> Input<'i> { + crate::parser2::new_input(crate::ParserOptions::default(), i) + } + + macro_rules! test_variants_ok { + ($parser:ident($($case:ident($i:literal)),+ $(,)?) -> $expected:expr) => { + $( + #[test] + fn $case() -> std::result::Result<(), winnow::error::ParseError, winnow::error::ContextError>> { + assert_eq!($parser.parse(crate::parser2::new_input(crate::ParserOptions::default(), ($i)))?, $expected); + Ok(()) + } + )+ + + }; + } + pub(crate) use test_variants_ok; + + macro_rules! 
test_variants_err { + ($parser:ident($($case:ident($i:literal)),+ $(,)?)) => { + $( + #[test] + fn $case() { + assert_matches::assert_matches!($parser.parse(crate::parser2::new_input(crate::ParserOptions::default(), $i)), Err(_)); + } + )+ + + }; + } + pub(crate) use test_variants_err; + + pub(crate) fn expect_echo(word: &str) -> ast::CompoundListItem { + ast::CompoundListItem( + ast::AndOrList { + first: ast::Pipeline { + bang: false, + seq: vec![ast::Command::Simple(ast::SimpleCommand { + prefix: None, + word_or_name: Some(ast::Word::new("echo")), + suffix: Some(ast::CommandSuffix(vec![ + ast::CommandPrefixOrSuffixItem::Word(ast::Word::new(word)), + ])), + })], + }, + additional: vec![], + }, + ast::SeparatorOperator::Sequence, + ) + } + + use ast::*; + + #[test] + fn empty() -> Result<()> { + assert_eq!( + program.parse(input(""))?, + Program { + complete_commands: vec![] + } + ); + Ok(()) + } + + #[test] + fn basic() -> Result<()> { + let i = input( + r#" + +#!/usr/bin/env bash + +for f in A B C; do + + # sdfsdf + echo "${f@L}" >&2 + + done + +"#, + ); + let expected = ast::Program { + complete_commands: vec![CompoundList(vec![CompoundListItem( + AndOrList { + first: Pipeline { + bang: false, + seq: vec![Command::Compound( + CompoundCommand::ForClause(ForClauseCommand { + variable_name: "f".into(), + values: Some(vec![Word::new("A"), Word::new("B"), Word::new("C")]), + body: DoGroupCommand(CompoundList(vec![CompoundListItem( + AndOrList { + first: Pipeline { + bang: false, + seq: vec![Command::Simple(SimpleCommand { + prefix: None, + word_or_name: Some(Word::new("echo")), + suffix: Some(CommandSuffix(vec![ + CommandPrefixOrSuffixItem::Word(Word::new( + "${f@L}", + )), + CommandPrefixOrSuffixItem::IoRedirect( + IoRedirect::File( + None, + IoFileRedirectKind::DuplicateOutput, + IoFileRedirectTarget::Fd(2), + ), + ), + ])), + })], + }, + additional: vec![], + }, + SeparatorOperator::Sequence, + )])), + }), + None, + )], + }, + additional: vec![], + }, + 
SeparatorOperator::Sequence, + )])], + }; + + let r = program.parse(i)?; + assert_eq!(r, expected); + Ok(()) + } + + #[test] + fn two_complete_commands() -> Result<()> { + let i = input( + " + echo hello # comment + # comment 2 + echo world; + # comment3 + ", + ); + let expected = Program { + complete_commands: vec![ + CompoundList(vec![expect_echo("hello")]), + CompoundList(vec![expect_echo("world")]), + ], + }; + assert_eq!(program.parse(i)?, expected); + Ok(()) + } + + #[test] + fn ambigiuos_for() -> crate::parser2::tests::Result<()> { + let i = input(r#"for for in for; do for=for; done; echo $for"#); + let expected = + ast::Program { + complete_commands: vec![CompoundList(vec![ + CompoundListItem( + AndOrList { + first: Pipeline { + bang: false, + seq: vec![Command::Compound( + CompoundCommand::ForClause(ForClauseCommand { + variable_name: "for".into(), + values: Some(vec![Word::new("for")]), + body: DoGroupCommand(CompoundList(vec![CompoundListItem( + AndOrList { + first: Pipeline { + bang: false, + seq: vec![Command::Simple(SimpleCommand { + prefix: Some(CommandPrefix(vec![CommandPrefixOrSuffixItem::AssignmentWord( + Assignment { + name: AssignmentName::VariableName("for".into()), + value: AssignmentValue::Scalar(Word::new("for")), + append: false, + }, + Word::new("for=for"), + )])), + word_or_name: None, + suffix: None, + })], + }, + additional: vec![], + }, + SeparatorOperator::Sequence, + )])), + }), + None, + )], + }, + additional: vec![], + }, + SeparatorOperator::Sequence, + ), + expect_echo("$for"), + ])], + }; + + let r = program.parse(i)?; + assert_eq!(r, expected); + + Ok(()) + } + + // KCORE=$(($SUDO "$PERF" buildid-cache -v -f -k /proc/kcore >/dev/null) 2>&1) + // echo $((time -p $* >/dev/null) 2>&1) | awk '{print $4 "u " $6 "s " $2 "r"}' + // hg_relative_sourcedir=$((cd $sourcedir; pwd) | sed -e "s|$(hg root)/||") +} diff --git a/brush-parser/src/parser2/assignment.rs b/brush-parser/src/parser2/assignment.rs new file mode 100644 index 
00000000..b1fc6cf4 --- /dev/null +++ b/brush-parser/src/parser2/assignment.rs @@ -0,0 +1,182 @@ +use std::{borrow::Cow, str::Utf8Error}; + +use crate::ast; + +use winnow::{ + combinator::{alt, cut_err, delimited, opt, repeat, separated_pair, trace}, + prelude::*, + stream::AsChar as _, + token::{one_of, take_while}, + PResult, +}; + +use super::{ + custom_combinators::{expand_later, non_posix_extension}, + trivia::line_trailing, + word, Input, +}; + +// assignment_word? +/// `assignment := name '+'? '=' ( array_value | scalar_value )` +pub fn assignment(i: &mut Input<'_>) -> PResult<(ast::Assignment, ast::Word)> { + trace( + "assignment", + separated_pair( + (assignment_name, opt("+").map(|append| append.is_some())), + "=", + cut_err(alt((non_posix_extension(array::array_value), scalar_value))), + ), + ) + .with_taken() + .try_map::<_, _, Utf8Error>(|(((name, append), value), span)| { + let assignment = ast::Assignment { + name, + value, + append, + }; + let word = ast::Word::new(std::str::from_utf8(span)?); + Ok((assignment, word)) + }) + .parse_next(i) +} + +/// NAME +/// In the shell command language, a word consisting solely of underscores, digits, and alphabetics +/// from the portable character set. The first character of a name is not a digit. +/// https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/V1_chap03.html#tag_03_216 +/// `identifier := (_ +/// | alpha) (_ | alphanum)` A word that is allowed in the assignment as a name +pub(crate) fn name<'i>(i: &mut Input<'i>) -> PResult<&'i str> { + trace( + "identifier", + ( + one_of(|c| (c as char).is_alpha() || c == b'_'), + take_while(0.., |c| (c as char).is_alphanum() || c == b'_'), + ), + ) + .take() + .try_map(std::str::from_utf8) + .parse_next(i) +} + +/// `name := identifier array-index? 
` +fn assignment_name(i: &mut Input<'_>) -> PResult { + trace( + "name", + (name, opt(array::array_index)).map(|(name, index)| { + if let Some(index) = index { + ast::AssignmentName::ArrayElementName(name.into(), index.into()) + } else { + ast::AssignmentName::VariableName(name.into()) + } + }), + ) + .parse_next(i) +} + +/// `scalar-value := string?` +pub fn scalar_value<'i>(i: &mut Input<'i>) -> PResult { + trace( + "scalar_value", + // NOTE: maybe be empty: `a=` + opt(alt(( + // a=[$(echo hello)] + expand_later.map(|s| Cow::Borrowed(s)), + word::word, + ))) + .map(|s| s.unwrap_or_default()), + ) + .map(|v| ast::AssignmentValue::Scalar(ast::Word { value: v.into() })) + .parse_next(i) +} + +mod array { + use std::borrow::Cow; + + use winnow::combinator::cut_err; + + use crate::parser2::custom_combinators; + + use super::*; + + /// array-value := '(' (line-trailing* array-element line-trailing*)* ')' + pub(super) fn array_value<'i>(i: &mut Input<'i>) -> PResult { + trace( + "array_value", + delimited( + "(", + // TODO: I'm worried about line_trailing. it's api is not clear what it actually do + repeat(0.., delimited(line_trailing, array_element, line_trailing)), + cut_err(")"), + ) + .map(|elements: Vec<_>| ast::AssignmentValue::Array(elements)), + ) + .parse_next(i) + } + + // NOTE: we cant possibly tell at this point what array is this: indexed (declare -a) or + // associative (declare -A). The differences between them: + // `name[subscript]=value` `subscript` in indexed array is an arithmetic expression, but in + // associative it is an arbitrary string + + // manual https://www.gnu.org/software/bash/manual/html_node/Arrays.html states: + // > associative arrays use arbitrary strings + // > name[subscript]=value + // > The subscript is treated as an **arithmetic** expression that must evaluate to a number. 
To + // > explicitly declare an array, use + + /// array-index := '[' _* ']' + pub fn array_index<'i>(i: &mut Input<'i>) -> PResult<&'i str> { + custom_combinators::take_inside(b'[', b']') + // delimited( + // '[', + // custom_combinators::take_unil_unbalanced(b"[", b"]"), + // cut_err(']'), + // ) + .try_map(std::str::from_utf8) + .parse_next(i) + } + + /// array-element := ( array-index '=' string? ) | string + fn array_element<'i>(i: &mut Input<'i>) -> PResult<(Option, ast::Word)> { + alt(( + separated_pair(array_index, "=", opt(word::word)).try_map::<_, _, Utf8Error>( + |(key, value)| { + Ok(( + Some(ast::Word::new(key)), + ast::Word::from(String::from(value.unwrap_or(Cow::Borrowed("")))), + )) + }, + ), + word::word.map(|w| (None, ast::Word::from(String::from(w)))), + )) + .parse_next(i) + } +} + +#[cfg(test)] +mod tests { + use crate::parser2::new_input; + use crate::parser2::tests::input; + use crate::parser2::tests::Result; + + use super::*; + + #[test] + fn test_array() -> Result<()> { + let i = input("a=( a b c )"); + let expect = assignment.parse(i)?; + dbg!(&expect); + Ok(()) + // parse("a="); + } + + #[test] + fn test_subshell() -> Result<()> { + let i = input("GPG_TTY=$(tty)"); + let expect = assignment.parse(i)?; + dbg!(&expect); + + Ok(()) + } +} diff --git a/brush-parser/src/parser2/complete_command.rs b/brush-parser/src/parser2/complete_command.rs new file mode 100644 index 00000000..be3dd340 --- /dev/null +++ b/brush-parser/src/parser2/complete_command.rs @@ -0,0 +1,198 @@ +use winnow::{ + ascii::line_ending, + combinator::{ + alt, cut_err, dispatch, eof, fail, not, opt, peek, preceded, repeat, terminated, trace, + }, + prelude::*, + stream::Stream as _, + token::{any, take}, + PResult, +}; + +pub mod compound_command; +mod extended_test; +mod function_definition; +mod simple_command; + +use crate::{ + ast::{self, SeparatorOperator}, + parser2::trivia::line_trailing, +}; + +use super::{ + custom_combinators::{self, non_posix_extension}, + 
io::redirect_list, + pipeline, + trivia::{self, comment, line_space, space, trim_whitespace}, + Input, +}; + +pub(crate) fn insignificant<'i>(i: &mut Input<'i>) -> PResult<()> { + trace("newline_list", move |i: &mut Input<'i>| { + while i.eof_offset() > 0 { + match peek(any).parse_next(i)? { + b' ' => space(1..).parse_next(i)?, + trivia::LF | trivia::CR => line_ending.void().parse_next(i)?, + trivia::COMMENT => (comment, alt((line_ending, eof))).void().parse_next(i)?, + _ => break, + } + } + Ok(()) + }) + .parse_next(i) +} + +// compound_list : linebreak term +// | linebreak term separator +// ; +// term : term separator and_or +// | and_or + +// complete_commands: complete_commands newline_list complete_command +// | complete_command +// ; +// complete_command : list separator_op +// | list +// ; +// list : list separator_op and_or +// | and_or + +// separator_op : '&' +// | ';' +// ; +// separator : separator_op linebreak +// | newline_list + +// newline_list : NEWLINE +// | newline_list NEWLINE +// ; +// linebreak : newline_list +// | /* empty */ +// TODO: possibly smallvec +type CompleteCommand = Vec; + +// [ echo hello && true ; echo world || false ; #comment \n] +// [echo hello && :\n] +pub(crate) fn complete_command(i: &mut Input<'_>) -> PResult { + trace( + "complete_command", + repeat( + 1.., + preceded( + // N.B emulate `repeat_till` but instead return accumulated result on + // backtrack. because if `repeat_till` fails, it discards + // all the accumulated output... + not(line_ending), + ( + and_or, + // `;` `&` or the end of the line + alt(( + trim_whitespace(0.., separator_op, 0..), + line_space.value(SeparatorOperator::default()), + )), + ) + .map(|(ao, sep)| ast::CompoundListItem(ao, sep)), + ), + ), + ) + .parse_next(i) +} + +/// `compound-list` +/// it is equivalent to a sequence of lists, separated by characters, that can be preceded +/// or followed by an arbitrary number of characters. 
+ +/// https://pubs.opengroup.org/onlinepubs/9799919799/utilities/V3_chap02.html 2.9.3 Lists +/// A list is a sequence of one or more AND-OR lists separated by the operators ';' and '&'. +/// `complete-command := (and-or whitespace* (';' | '&') whitespace*)+` +/// A sequence of commands +/// `compound-list := (and-or ((separator-op | line-trailing) line-trailing*))*` +pub fn compound_list(i: &mut Input<'_>) -> PResult { + trace( + "compound_list", + preceded( + insignificant, + repeat(0.., terminated(complete_command, insignificant)) + .fold(Vec::new, |mut acc, c| { + acc.extend(c.into_iter()); + acc + }) + // N.B: An empty compound list doesn't allowed by the Posix spec + // See: https://unix.stackexchange.com/questions/349632/can-a-function-in-sh-have-zero-statements + // A portable posix compliant script will always needs to provide a non-empty + // `compound_list`. + // TODO: error context explanation + .verify(|l: &Vec<_>| !l.is_empty()), + ), + ) + .map(ast::CompoundList) + .parse_next(i) +} + +/// `separator-op := !(';;' | '&&') ';' | '&' ` +pub fn separator_op(i: &mut Input<'_>) -> PResult { + trace( + "separator_op", + preceded( + // it is case-clause. (or an empty command, but it is not allowed to have and empty + // complete_command) + not(alt((";;", ";&", "&&"))), + alt(( + b';'.value(ast::SeparatorOperator::Sequence), + b'&'.value(ast::SeparatorOperator::Async), + )), + ), + ) + .parse_next(i) +} + +/// https://pubs.opengroup.org/onlinepubs/9799919799/utilities/V3_chap02.html +/// 2.9.3 Lists +/// An AND-OR list is a sequence of one or more pipelines separated by the operators "&&" and "||". 
+/// `and-or := pipeline` +pub fn and_or(i: &mut Input<'_>) -> PResult { + trace("and_or", (pipeline::pipeline, and_or_items)) + .map(|(first, additional)| ast::AndOrList { first, additional }) + .parse_next(i) +} + +/// `and-or-items := ( ('&&' | '||') line-trailing* pipeline)*` +fn and_or_items(i: &mut Input<'_>) -> PResult> { + trace( + "and_or_items", + repeat( + 0.., + dispatch!(take::<_, Input<'_>, _>(2u8); + // `line-trailing` indicates that: + // ``` + // echo hello [&& # my comment + // ]echo world + // ``` + b"&&" => preceded((line_trailing, space(0..)), cut_err(pipeline::pipeline)).map(ast::AndOr::And), + b"||" => preceded((line_trailing, space(0..)), cut_err(pipeline::pipeline)).map(ast::AndOr::Or), + _ => fail + ), + ), + ) + .parse_next(i) +} + +/// command := simple-command | function-definition | compound-command | extended-test +pub fn command(i: &mut Input<'_>) -> PResult { + trace( + "command", + alt(( + simple_command::simple_command.map(ast::Command::Simple), + function_definition::function_definition.map(ast::Command::Function), + ( + compound_command::compound_command, + opt(preceded(space(0..), redirect_list)), + ) + .map(|(c, r)| ast::Command::Compound(c, r)), + // N.B. Extended test commands are bash extensions. 
+ non_posix_extension(extended_test::extended_test_command) + .map(ast::Command::ExtendedTest), + )), + ) + .parse_next(i) +} diff --git a/brush-parser/src/parser2/complete_command/compound_command.rs b/brush-parser/src/parser2/complete_command/compound_command.rs new file mode 100644 index 00000000..c065b757 --- /dev/null +++ b/brush-parser/src/parser2/complete_command/compound_command.rs @@ -0,0 +1,726 @@ +/// 2.9.4 Compound Commands https://pubs.opengroup.org/onlinepubs/9799919799/utilities/V3_chap02.html +use winnow::{ + ascii::line_ending, + combinator::{alt, cut_err, delimited, opt, preceded, repeat, separated, terminated, trace}, + error::ContextError, + prelude::*, + PResult, +}; + +use crate::{ + ast::{self}, + parser2::{ + custom_combinators::non_posix_extension, + trivia::{line_trailing, space, trim_whitespace}, + word, Input, + }, +}; + +use self::word::space_after_word; +use super::compound_list; + +/// `compound-command := brace-group | arithmetic-command | subshell | if-clause | for-clause | +/// while-or-until-clause | arithmetic-for-clause` +pub fn compound_command(i: &mut Input<'_>) -> PResult { + trace( + "compound_command", + alt(( + brace_group.map(ast::CompoundCommand::BraceGroup), + // N.B. The arithmetic command is a non-sh extension. + non_posix_extension(arithmetics::arithmetic_command) + .map(ast::CompoundCommand::Arithmetic), + subshell.map(ast::CompoundCommand::Subshell), + if_clause::if_clause.map(ast::CompoundCommand::IfClause), + for_clause::for_clause.map(ast::CompoundCommand::ForClause), + while_or_until_clause, + case_clause::case_clause.map(ast::CompoundCommand::CaseClause), + // N.B. The arithmetic for clause command is a non-sh extension. 
+ non_posix_extension(arithmetics::arithmetic_for_clause) + .map(ast::CompoundCommand::ArithmeticForClause), + )), + ) + .parse_next(i) +} + +/// `brace-group := '{' (line-trailing | whitespace) whitespace* compound-list whitespace* '}'` +fn brace_group(i: &mut Input<'_>) -> PResult { + trace( + "brace_group", + bracketed("{", compound_list, "}").map(ast::BraceGroupCommand), + ) + .parse_next(i) +} + +// TODO: somehow stop parsing earlier +// because for now the `compound_list` unsuccessfuly tries to parse `done` and unable to do so +// checks everything down the the grammar:( + +/// A combinator that encapsulates something inside brackets and tailored for the bash grammar +/// `bracketed := bracket1 (line-trailing | whitespace) whitespace* parser whitespace* bracket2` +fn bracketed<'i, Output, Bracket1, I1, Bracket2, I2>( + bracket1: Bracket1, + parser: impl Parser, Output, ContextError>, + bracket2: Bracket2, +) -> impl Parser, Output, ContextError> +where + Bracket1: Parser, I1, ContextError>, + Bracket2: Parser, I2, ContextError>, +{ + trace( + "bracketed", + delimited( + // `whitespace(1)` because of https://www.shellcheck.net/wiki/SC1054 + // > { is only recognized as the start of a command group when it's a separate token + (bracket1, alt((line_ending.void(), space(1).void()))), + cut_err(parser), + // TODO: maybe remove this whitespace + preceded(space(0..), cut_err(bracket2)), + ), + ) +} + +fn subshell(i: &mut Input<'_>) -> PResult { + trace("subshell", bracketed("(", compound_list, ")")) + .map(ast::SubshellCommand) + .parse_next(i) +} + +/// `sequential-sep := (';' line-trailing?) 
| line-trailing` +fn sequential_sep(i: &mut Input<'_>) -> PResult<()> { + // echo [;] echo\n + // echo[;] # comment\n + // echo[ #comment\n] + // echo[ \n] + trace("sequential_sep", alt((";".void(), line_trailing.void()))).parse_next(i) +} + +mod for_clause { + use crate::parser2::complete_command::insignificant; + + use super::*; + + pub(super) fn for_clause(i: &mut Input<'_>) -> PResult { + trace( + "for_clause", + preceded( + ("for", space(1..)), + ( + // TODO: how to use cut_err correctly here + cut_err(word::word), + // for i[ \n + // \n + //\n + // in 1 2 3 ; \n + // \n + // ]do ... + delimited(insignificant, cut_err(in_range), insignificant), + do_group, + ) + .map(|(i, values, body)| ast::ForClauseCommand { + variable_name: i.into_owned(), + values, + body, + }), + ), + ) + .parse_next(i) + } + + fn in_range(i: &mut Input<'_>) -> PResult>> { + terminated( + // N.B. range may be optional + opt(preceded( + ("in", space(1..)), + repeat(0.., terminated(word::word, space(0..))), + )), + sequential_sep, + ) + .map(|v| { + v.map(|v: Vec<_>| { + v.into_iter() + .map(|s| ast::Word::from(s.into_owned())) + .collect() + }) + }) + .parse_next(i) + } +} + +/// `do-group := do compound-list done` +fn do_group(i: &mut Input<'_>) -> PResult { + bracketed("do", compound_list, "done") + .map(ast::DoGroupCommand) + .parse_next(i) +} + +mod if_clause { + + use super::*; + + pub(super) fn if_clause(i: &mut Input<'_>) -> PResult { + trace( + "if_clause", + winnow::seq!(ast::IfClauseCommand{ + _: ("if", space_after_word), + condition: preceded(space(0..), cut_err(compound_list)), + _: cut_err(delimited(space(0..), "then", space_after_word)), + then: trim_whitespace(0.., cut_err(compound_list), 0..), + elses: else_part.map(|v| (!v.is_empty()).then_some(v)), + _: preceded(space(0..), cut_err("fi")) + }), + ) + .parse_next(i) + } + + fn else_part(i: &mut Input<'_>) -> PResult> { + trace( + "else_part", + ( + repeat(0.., preceded(space(0..), conditional_else_part)), + 
opt(preceded(space(0..), unconditional_else_part)), + ) + .map(|(mut cs, u): (Vec<_>, _)| { + if let Some(uncond) = u { + cs.push(uncond); + } + cs + }), + ) + .parse_next(i) + } + fn conditional_else_part(i: &mut Input<'_>) -> PResult { + winnow::seq!(ast::ElseClause{ + _: ("elif", space_after_word), + condition: preceded(space(0..), cut_err(compound_list)).map(|c| Some(c)), + _: cut_err(delimited(space(0..), "then", space_after_word)), + body: preceded(space(0..), cut_err(compound_list)) + }) + .parse_next(i) + } + fn unconditional_else_part(i: &mut Input<'_>) -> PResult { + preceded( + ("else", space_after_word), + preceded(space(0..), cut_err(compound_list)), + ) + .map(|body| ast::ElseClause { + condition: None, + body, + }) + .parse_next(i) + } +} + +/// `while-or-until-clause := ('while' | 'until') compound_list do_group` +fn while_or_until_clause(i: &mut Input<'_>) -> PResult { + #[derive(Clone, Copy)] + enum T { + Until, + While, + } + trace( + "while_until_clause", + ( + terminated( + alt(("while".value(T::While), "until".value(T::Until))), + space_after_word, + ), + cut_err((trim_whitespace(0.., compound_list, 0..), do_group)), + ), + ) + .map(|(ty, (c, d))| { + let c = ast::WhileOrUntilClauseCommand(c, d); + match ty { + T::While => ast::CompoundCommand::WhileClause(c), + T::Until => ast::CompoundCommand::UntilClause(c), + } + }) + .parse_next(i) +} + +pub mod case_clause { + use std::borrow::Cow; + + use super::*; + pub fn case_clause(i: &mut Input<'_>) -> PResult { + trace( + "case_clause", + ( + // value + delimited( + ("case", space(1..)), + cut_err(word::word), + ( + cut_err(alt((line_trailing, space(1..)))), + repeat(0.., line_trailing).map(|()| ()), + ), + ), + // cases + delimited( + ( + cut_err((space(0..), "in", space_after_word)), + repeat(0.., line_trailing).map(|()| ()), + ), + ( + repeat( + 0.., + preceded( + (repeat(0.., line_trailing).map(|()| ()), space(0..)), + case_item(false), + ), + ), + preceded(space(0..), opt(case_item(true))), + 
), + (space(0..), cut_err("esac")), + ), + ), + ) + .map(|(w, (mut cases, last)): (_, (Vec<_>, _))| { + if let Some(last_item) = last { + cases.push(last_item); + } + ast::CaseClauseCommand { + value: ast::Word::from(w.into_owned()), + cases, + } + }) + .parse_next(i) + } + + fn case_item<'i, 's>( + is_last: bool, + ) -> impl Parser, ast::CaseItem, ContextError> + 's { + move |i: &mut Input<'i>| { + trace( + "case_item", + ( + terminated( + delimited(opt("("), pattern, cut_err(")")), + opt(line_trailing), + ), + opt(compound_list), + move |i: &mut Input<'i>| { + let post_action_parser = + delimited(space(0..), case_item_post_action, opt(line_trailing)); + if is_last { + opt(post_action_parser) + .map(|p| p.unwrap_or(ast::CaseItemPostAction::ExitCase)) + .parse_next(i) + } else { + cut_err(post_action_parser).parse_next(i) + } + }, + ), + ) + .map(|(p, c, post)| ast::CaseItem { + patterns: p + .into_iter() + .map(|p| ast::Word::from(p.into_owned())) + .collect(), + cmd: c, + post_action: post, + }) + .parse_next(i) + } + } + fn case_item_post_action(i: &mut Input<'_>) -> PResult { + alt(( + ";;&".value(ast::CaseItemPostAction::ContinueEvaluatingCases), + ";;".value(ast::CaseItemPostAction::ExitCase), + ";&".value(ast::CaseItemPostAction::UnconditionallyExecuteNextCaseItem), + )) + .parse_next(i) + } + + // a | b | c ) + fn pattern<'i>(i: &mut Input<'i>) -> PResult>> { + separated(1.., trim_whitespace(0.., word::word, 0..), "|").parse_next(i) + } +} + +// TODO: arithmetic +mod arithmetics { + // TODO: $(( http://www.oilshell.org/blog/2016/11/18.html + use std::str::Utf8Error; + + use winnow::combinator::separated_pair; + + use super::*; + + pub fn arithmetic_command<'i>(i: &mut Input<'i>) -> PResult { + trace( + "arithmetic_command", + delimited("((", trim_whitespace(0.., arithmetic_expression, 0..), "))"), + ) + .map(|expr| ast::ArithmeticCommand { expr }) + .parse_next(i) + } + fn arithmetic_expression<'i>(i: &mut Input<'i>) -> PResult { + repeat(0.., 
trim_whitespace(0.., arithmetic_expression_piece, 0..)) + .with_taken() + .try_map(|((), s)| { + Ok::<_, Utf8Error>(ast::UnexpandedArithmeticExpr { + value: std::str::from_utf8(s)?.to_string(), + }) + }) + .parse_next(i) + } + fn arithmetic_expression_piece<'i>(i: &mut Input<'i>) -> PResult<()> { + alt(( + trim_whitespace( + 0.., + delimited( + "(", + repeat(0.., trim_whitespace(0.., arithmetic_expression_piece, 0..)), + ")", + ), + 0.., + ), + arithmetic_end, + )) + .parse_next(i) + } + fn arithmetic_end<'i>(i: &mut Input<'i>) -> PResult<()> { + // // TODO: evaluate arithmetic end; the semicolon is used in arithmetic for loops. + alt(("))", ";")).void().parse_next(i) + } + + // N.B. The arithmetic for loop is a non-sh extension. + pub fn arithmetic_for_clause(i: &mut Input<'_>) -> PResult { + winnow::seq!(ast::ArithmeticForClauseCommand{ + _: separated_pair("for", space(0..), "(("), + initializer: terminated(trim_whitespace(0.., opt(arithmetic_expression), 0..), ";"), + condition: terminated(trim_whitespace(0.., opt(arithmetic_expression), 0..), ";"), + updater: trim_whitespace(0.., opt(arithmetic_expression), 0..), + _: trim_whitespace(0.., "))", 0..), + _: sequential_sep, + body: do_group + }) + .parse_next(i) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use self::{case_clause::case_clause, for_clause::for_clause, if_clause::if_clause}; + use crate::parser2::tests::{expect_echo, test_variants_err, test_variants_ok}; + + mod test_brace_group { + + use super::*; + + test_variants_ok! {brace_group( + basic("{ echo hello;}"), + new_lines("{ + + echo hello + + }"), + comments("{ #comment + + #comment 1 +echo hello ; # comment 2 +# comment 3 + +}"), + + ) -> + ast::BraceGroupCommand(ast::CompoundList(vec![expect_echo("hello")])) + } + + test_variants_err! 
{brace_group( + empty("{}"), + without_space("{:;}"), + without_semicolon("{:;}"), + wrong_comment("{ # comment}"), + newline_semicolon("{ : + ;}"), + )} + } + + mod test_compound_list { + use crate::parser2::tests::input; + + use super::*; + + #[test] + fn for_loop_with_newline() -> crate::parser2::tests::Result<()> { + use ast::*; + let i = input("for i in 1; do echo hello; done\n"); + let expected = CompoundList(vec![CompoundListItem( + AndOrList { + first: Pipeline { + bang: false, + seq: vec![Command::Compound( + CompoundCommand::ForClause(ForClauseCommand { + variable_name: "i".into(), + values: Some(vec![Word::new("1")]), + body: DoGroupCommand(CompoundList(vec![expect_echo("hello")])), + }), + None, + )], + }, + additional: vec![], + }, + SeparatorOperator::Sequence, + )]); + assert_eq!(compound_list.parse(i)?, expected); + Ok(()) + } + + test_variants_ok! {compound_list( + basic(r#"echo hello"#), + basic_semicolon(r#"echo hello"#), + basic_with_comment(r#"echo hello #comment"#), + start_with_a_comment( + r#"#comment + # comment 2 + echo hello + #comment3 + "# + ), + ) -> + ast::CompoundList(vec![expect_echo("hello")]) + } + test_variants_ok! {compound_list( + list(r#"echo hello ; echo world;"#), + multiline_with_comments( + r#"echo hello #comment + # comment 2 + echo world + # comment + "# + ), + ) -> + ast::CompoundList(vec![expect_echo("hello"), expect_echo("world")]) + } + } + + mod test_while_until { + use super::*; + test_variants_ok! 
{while_or_until_clause( + multiline_predicate( + r#"while + echo hello + echo world + do + echo body + done"# + ), + a_lot_of_comments( + r#"while #comment1 + # comment 2 + echo hello + echo world + # comment 3 + do # comment 4 + echo body # comment 5 + # comment 6 + done"# + ) + ) -> + ast::CompoundCommand::WhileClause(ast::WhileOrUntilClauseCommand(ast::CompoundList(vec![ + expect_echo("hello"), + expect_echo("world"), + ]), ast::DoGroupCommand(ast::CompoundList(vec![ expect_echo("body") ])))) + } + } + + mod test_for_clause { + use crate::parser2::tests::input; + + use super::*; + + fn expect_for_clause(values: Option>) -> ast::ForClauseCommand { + ast::ForClauseCommand { + variable_name: "i".into(), + values, + body: ast::DoGroupCommand(ast::CompoundList(vec![expect_echo("hello")])), + } + } + + test_variants_ok! {for_clause( + basic( + r#"for i in 1 2 3; do + echo hello + done"# + ), + oneline( + r#"for i in 1 2 3; do echo hello; done"# + ), + comments( + r#"for i # comment + # comment 2 + in 1 2 3 # comment 3 + # comment 4 + do + echo hello + done"# + ) + ) -> expect_for_clause(Some(vec![ast::Word::new("1"), ast::Word::new("2"), ast::Word::new("3") ])) + } + test_variants_ok! {for_clause( + without_values( + r#"for i; do + echo hello + done"# + ), + without_values_with_comments( + r#"for i # comment + # comment 2 + do + echo hello + done"# + ), + ) -> expect_for_clause(None) + } + test_variants_ok! {for_clause( + empty_values( + r#"for i in ; do + echo hello + done"# + ), + ) -> expect_for_clause(Some(vec![])) + } + + #[test] + fn ambigiuos_done() -> crate::parser2::tests::Result<()> { + let i = input( + r#"for i; do +echo done + +done"#, + ); + let r = for_clause.parse(i)?; + let expected = ast::ForClauseCommand { + variable_name: "i".into(), + values: None, + body: ast::DoGroupCommand(ast::CompoundList(vec![expect_echo("done")])), + }; + assert_eq!(r, expected); + + Ok(()) + } + } + + mod test_if_else { + use super::*; + + test_variants_ok! 
{if_clause( + simple_oneliner(r#"if echo hello; then echo world; fi"#), + multiline( + r#"if + echo hello + + then echo world + +fi"# + ), + comments( + r#"if echo hello # comment1 +# comment2 + then # comment 3 + echo world ; + # comment 4 + fi"# + ), + + + ) -> ast::IfClauseCommand{ + condition: ast::CompoundList(vec![expect_echo("hello")]), + then: ast::CompoundList(vec![expect_echo("world")]), + elses: None + } + } + + test_variants_ok! {if_clause( + if_else( + r#"if echo hello + + then + echo world; + +else echo elseworld + fi"# + ), + + ) -> ast::IfClauseCommand{ + condition: ast::CompoundList(vec![expect_echo("hello")]), + then: ast::CompoundList(vec![expect_echo("world")]), + elses: Some(vec![ast::ElseClause{ + condition: None, + body: ast::CompoundList(vec![expect_echo("elseworld")]) + }]) + } + } + test_variants_ok! {if_clause( + if_elif_else( + r#"if echo hello + + then # comment + echo world + elif echo elif1 +then + echo elifthen1; else + echo elseend + fi"# + ), + ) -> ast::IfClauseCommand{ + condition: ast::CompoundList(vec![expect_echo("hello")]), + then: ast::CompoundList(vec![expect_echo("world")]), + elses: Some(vec![ + ast::ElseClause{ + condition: Some(ast::CompoundList(vec![expect_echo("elif1")])), + body: ast::CompoundList(vec![expect_echo("elifthen1")]) + }, + ast::ElseClause{ + condition: None, + body: ast::CompoundList(vec![expect_echo("elseend")]) + }, + ]) + } + } + } + + mod test_case_clause { + use super::*; + test_variants_ok! 
{case_clause( + complex( + r#"case "1" # comment 1 + + in + + 1);; # comment1 + +# comment 3 +2 | "patt#ern2" | 2he@llo,he_llo) echo hello + +;& + (:) echo world; ;;& + esac"# + ), + ) -> ast::CaseClauseCommand{ + value: ast::Word::new("1"), + cases: vec![ + ast::CaseItem{ + patterns: vec![ast::Word::new("1")], + cmd: None, + post_action: ast::CaseItemPostAction::ExitCase + }, + ast::CaseItem{ + patterns: vec![ast::Word::new("2"), ast::Word::new("patt#ern2"), ast::Word::new("2he@llo,he_llo")], + cmd: Some(ast::CompoundList(vec![expect_echo("hello")])), + post_action: ast::CaseItemPostAction::UnconditionallyExecuteNextCaseItem + }, + ast::CaseItem{ + patterns: vec![ast::Word::new(":")], + cmd: Some(ast::CompoundList(vec![expect_echo("world")])), + post_action: ast::CaseItemPostAction::ContinueEvaluatingCases + } + ] + + } + } + } +} diff --git a/brush-parser/src/parser2/complete_command/extended_test.rs b/brush-parser/src/parser2/complete_command/extended_test.rs new file mode 100644 index 00000000..5658079e --- /dev/null +++ b/brush-parser/src/parser2/complete_command/extended_test.rs @@ -0,0 +1,239 @@ +use winnow::combinator::{cut_err, delimited, empty, peek, separated_pair, terminated}; +use winnow::token::{any, one_of, take_till, take_until}; + +use crate::parser2::trivia::space; +use crate::parser2::word; + +use self::custom_combinators::take_inside; + +use super::*; + +// TODO: https://github.com/oils-for-unix/oils/issues/3 + +pub fn extended_test_command(i: &mut Input<'_>) -> PResult { + delimited( + "[[", + terminated( + trim_whitespace(0.., extended_test_expression, 0..), + line_trailing, + ), + "]]", + ) + .parse_next(i) +} + +fn extended_test_expression(i: &mut Input<'_>) -> PResult { + trace( + "extended_test_expression", + custom_combinators::precedence( + // prefix operators + alt((custom_combinators::unary_op( + 1, + trim_whitespace(0.., "!", 0..), + ),)), + // postfix operators + fail, + // binary operators + alt(( + custom_combinators::binary_op( + 
2, + custom_combinators::Assoc::Left, + delimited( + repeat(0.., line_trailing).map(|()| ()), + trim_whitespace(0.., "||", 0..), + repeat(0.., line_trailing).map(|()| ()), + ), + ), + custom_combinators::binary_op( + 2, + custom_combinators::Assoc::Left, + delimited( + repeat(0.., line_trailing).map(|()| ()), + trim_whitespace(0.., "&&", 0..), + repeat(0.., line_trailing).map(|()| ()), + ), + ), + )), + // operands + alt(( + delimited( + "(", + trim_whitespace(0.., extended_test_expression, 0..), + ")", + ), //subexpression handled via recursion + binary_test, + separated_pair(conditional_unary_operator, space(1..), word::word).map(|(p, w)| { + ast::ExtendedTestExpr::UnaryTest(p, ast::Word::from(String::from(w))) + }), + word::word.map(|w| { + ast::ExtendedTestExpr::UnaryTest( + ast::UnaryPredicate::StringHasNonZeroLength, + ast::Word::from(String::from(w)), + ) + }), + )), + |op: custom_combinators::Operation<&[u8], &[u8], &[u8], _>| { + //evaluating the expression step by step + use self::custom_combinators::Operation::*; + match op { + Prefix(b"!", o) => Ok(ast::ExtendedTestExpr::Not(Box::from(o))), + Binary(lhs, b"||", rhs) => { + Ok(ast::ExtendedTestExpr::Or(Box::from(lhs), Box::from(rhs))) + } + Binary(lhs, b"&&", rhs) => { + Ok(ast::ExtendedTestExpr::And(Box::from(lhs), Box::from(rhs))) + } + _ => Err("Invalid combination"), + } + }, + ), + ) + .parse_next(i) +} + +fn binary_test(i: &mut Input<'_>) -> PResult { + trace( + "binary_test", + alt(( + ( + word::word, + trim_whitespace(0.., binary_predicate, 0..), + word::word, + ) + .map(|(l, p, r)| { + ast::ExtendedTestExpr::BinaryTest( + p, + ast::Word::from(l.to_string()), + ast::Word::from(r.to_string()), + ) + }), + binary_test_regex, + )), + ) + .parse_next(i) +} +fn binary_predicate(i: &mut Input<'_>) -> PResult { + let v = |v| empty.value(v); + trace( + "binary_predicate_basic", + terminated(alt(( + one_of((b"=", b"==")).value(ast::BinaryPredicate::StringExactlyMatchesPattern), + 
"!=".value(ast::BinaryPredicate::StringDoesNotExactlyMatchPattern), + "<".value(ast::BinaryPredicate::LeftSortsBeforeRight), + ">".value(ast::BinaryPredicate::LeftSortsAfterRight), + preceded( + "-", + dispatch! {take::<_, Input<'_>, _>(2usize); + b"ef" => v(ast::BinaryPredicate::FilesReferToSameDeviceAndInodeNumbers), + b"eq" => v(ast::BinaryPredicate::ArithmeticEqualTo), + b"ge" => v(ast::BinaryPredicate::ArithmeticGreaterThanOrEqualTo), + b"gt" => v(ast::BinaryPredicate::ArithmeticGreaterThan), + b"le" => v(ast::BinaryPredicate::ArithmeticLessThanOrEqualTo), + b"lt" => v(ast::BinaryPredicate::ArithmeticLessThan), + b"ne" => v(ast::BinaryPredicate::ArithmeticNotEqualTo), + b"nt" => v(ast::BinaryPredicate::LeftFileIsNewerOrExistsWhenRightDoesNot), + b"ot" => v(ast::BinaryPredicate::LeftFileIsOlderOrDoesNotExistWhenRightDoes), + _ => fail, + }, + ), + + )), space(1..)), + ) + .parse_next(i) +} + +fn binary_test_regex(i: &mut Input<'_>) -> PResult { + trace( + "binary_test_regex", + (word::word, trim_whitespace(0.., "=~", 0..), regex_word).map(|(l, _, r)| { + if r.starts_with(['\'', '\"']) { + // TODO: Confirm it ends with that too? + ast::ExtendedTestExpr::BinaryTest( + ast::BinaryPredicate::StringContainsSubstring, + ast::Word::from(l.into_owned()), + ast::Word::new(r), + ) + } else { + ast::ExtendedTestExpr::BinaryTest( + ast::BinaryPredicate::StringMatchesRegex, + ast::Word::from(l.into_owned()), + ast::Word::new(r), + ) + } + }), + ) + .parse_next(i) +} + +fn conditional_unary_operator(i: &mut Input<'_>) -> PResult { + let v = |v| empty.value(v); + trace( + "extended_unary_expression", + preceded( + "-", + dispatch! 
{any; + b'a' => v(ast::UnaryPredicate::FileExists), + b'b' => v(ast::UnaryPredicate::FileExistsAndIsBlockSpecialFile), + b'c' => v(ast::UnaryPredicate::FileExistsAndIsCharSpecialFile), + b'd' => v(ast::UnaryPredicate::FileExistsAndIsDir), + b'e' => v(ast::UnaryPredicate::FileExists), + b'f' => v(ast::UnaryPredicate::FileExistsAndIsRegularFile), + b'g' => v(ast::UnaryPredicate::FileExistsAndIsSetgid), + b'h' => v(ast::UnaryPredicate::FileExistsAndIsSymlink), + b'k' => v(ast::UnaryPredicate::FileExistsAndHasStickyBit), + b'n' => v(ast::UnaryPredicate::StringHasNonZeroLength), + b'o' => v(ast::UnaryPredicate::ShellOptionEnabled), + b'p' => v(ast::UnaryPredicate::FileExistsAndIsFifo), + b'r' => v(ast::UnaryPredicate::FileExistsAndIsReadable), + b's' => v(ast::UnaryPredicate::FileExistsAndIsNotZeroLength), + b't' => v(ast::UnaryPredicate::FdIsOpenTerminal), + b'u' => v(ast::UnaryPredicate::FileExistsAndIsSetuid), + b'v' => v(ast::UnaryPredicate::ShellVariableIsSetAndAssigned), + b'w' => v(ast::UnaryPredicate::FileExistsAndIsWritable), + b'x' => v(ast::UnaryPredicate::FileExistsAndIsExecutable), + b'z' => v(ast::UnaryPredicate::StringHasZeroLength), + b'G' => v(ast::UnaryPredicate::FileExistsAndOwnedByEffectiveGroupId), + b'L' => v(ast::UnaryPredicate::FileExistsAndIsSymlink), + b'N' => v(ast::UnaryPredicate::FileExistsAndModifiedSinceLastRead), + b'O' => v(ast::UnaryPredicate::FileExistsAndOwnedByEffectiveUserId), + b'R' => v(ast::UnaryPredicate::ShellVariableIsSetAndNameRef), + b'S' => v(ast::UnaryPredicate::FileExistsAndIsSocket), + _ => fail + + }, + ), + ) + .parse_next(i) +} + +fn regex_word<'i>(i: &mut Input<'i>) -> PResult<&'i str> { + const REGEX_STOP: &[u8] = &[b' ', b'"', b'\'', b'\r', b'\n', b'\t']; + repeat( + 1.., + dispatch! 
{peek(any); + b'(' => take_inside(b'(', b')'), + b'"' => delimited("\"", take_until(0.., "\""), cut_err("\"")), + b'\'' => delimited("'", take_until(0.., "'"), cut_err("'")), + _ => take_till(1.., REGEX_STOP) + }, + ) + .with_taken() + .try_map(|((), s)| std::str::from_utf8(s)) + .parse_next(i) +} + +#[cfg(test)] +mod tests { + use crate::parser2::new_input; + use crate::ParserOptions; + + use super::*; + #[test] + fn test_parse_regex_word() { + let input = r#"[[ " sss " =~ ( ss()s ) ]]"#; + let r = extended_test_command.parse(new_input(ParserOptions::default(), input)); + dbg!(r); + + // let input = r#"[[ a =~ ^[0-9]{8}$" "]] ]]"#; + } +} diff --git a/brush-parser/src/parser2/complete_command/function_definition.rs b/brush-parser/src/parser2/complete_command/function_definition.rs new file mode 100644 index 00000000..47e992fa --- /dev/null +++ b/brush-parser/src/parser2/complete_command/function_definition.rs @@ -0,0 +1,107 @@ +/// 2.9.5 Function Definition Command +use winnow::{ + combinator::{cut_err, opt, preceded, repeat, separated_pair, trace}, + error::ErrMode, + PResult, Parser as _, +}; + +use crate::{ + ast, + parser2::{ + custom_combinators::non_posix_extension, + io::redirect_list, + trivia::{line_trailing, space, trim_whitespace}, + word::{self, space_after_word}, + Input, + }, +}; + +use super::{compound_command, insignificant}; + +pub fn function_definition<'i>(i: &mut Input<'i>) -> PResult { + trace("function_definition", move |i: &mut Input<'i>| { + // N.B. Non-sh extensions allows use of the 'function' word to indicate a function + let has_keyword = opt(non_posix_extension(( + "function", + space_after_word, + insignificant, + ))) + .parse_next(i)? + .is_some(); + // NOTE: there should be special rule for function identifier because + // characters such as '$' | "'" | '"' are not allowed in Bash here. But it + // unnesessary extra maintenance cost. 
+ // Maybe we should allow string here because any external command can be + // named whatewer it wants, why not internal functions can be named `foo\;bar` ? + let name_parser = move |i: &mut Input<'i>| { + if has_keyword { + // N.B if error occurs, stop parsing only if we have `function` keyword + cut_err(word::non_empty(word::word)).parse_next(i) + } else { + word::non_empty(word::word).parse_next(i) + } + }; + let (name, body) = separated_pair( + trim_whitespace(0.., name_parser, 0..), + ( + "(", + space(0..), + cut_err(")"), + repeat(0.., line_trailing).map(|()| ()), + space(0..), + ), + cut_err(function_body), + ) + .parse_next(i)?; + Ok::<_, ErrMode<_>>((name, body)) + }) + .with_taken() + .try_map(|((fname, body), source)| { + let source = std::str::from_utf8(source)?; + Ok::<_, std::str::Utf8Error>(ast::FunctionDefinition { + fname: fname.into_owned(), + body, + source: source.to_string(), + }) + }) + .parse_next(i) +} + +fn function_body(i: &mut Input<'_>) -> PResult { + ( + compound_command::compound_command, + (preceded(space(0..), opt(redirect_list))), + ) + .map(|(c, r)| ast::FunctionBody(c, r)) + .parse_next(i) +} + +#[cfg(test)] +mod tests { + use crate::parser2::new_input; + use crate::ParserOptions; + + use super::*; + + #[test] + fn parse_function_definition() { + fn parse<'i>(i: &'i str) { + let io = function_definition + .parse_next(&mut new_input(ParserOptions::default(), i)) + .unwrap(); + dbg!(io); + } + + parse( + r#"function + + + +d@d1#ddd () + +{ + : +} 2>&1"#, + ) + } +} diff --git a/brush-parser/src/parser2/complete_command/simple_command.rs b/brush-parser/src/parser2/complete_command/simple_command.rs new file mode 100644 index 00000000..0a5a5489 --- /dev/null +++ b/brush-parser/src/parser2/complete_command/simple_command.rs @@ -0,0 +1,168 @@ +use std::borrow::Cow; + +use winnow::{ + combinator::{ + alt, cut_err, delimited, eof, fail, not, opt, preceded, repeat, separated, trace, + }, + dispatch, + token::{any, one_of}, + PResult, 
Parser, +}; + +use crate::{ + ast, + parser2::{ + assignment, + custom_combinators::{expand_later, non_posix_extension}, + io::{self, process_substitution}, + trivia::space, + word::{self}, + Input, + }, +}; + +/// `simple-command := (prefix (name suffix?)?) | (name suffix?)` +pub fn simple_command(i: &mut Input<'_>) -> PResult { + trace("simple_command", move |i: &mut Input<'_>| { + let prefix = opt(cmd_prefix).parse_next(i)?; + let (cmd_name, suffix) = if prefix.is_some() { + // if there is a prefix the rest of the command is optional + opt(( + // N.B should be a whitespace between cmd_prefix and cmd_name + delimited(space(1..), cmd_name.map(Some), space(1..)), + opt(cmd_suffix), + )) + .map(|o| o.unwrap_or((None, None))) + .parse_next(i)? + } else { + (cmd_name.map(Some), opt(preceded(space(1..), cmd_suffix))).parse_next(i)? + }; + Ok(ast::SimpleCommand { + prefix, + word_or_name: cmd_name.map(|n| ast::Word::from(n.into_owned())), + suffix, + }) + }) + .parse_next(i) +} + +pub fn cmd_prefix(i: &mut Input<'_>) -> PResult { + trace( + "cmd_prefix", + separated( + 1.., + alt(( + io::io_redirect.map(|i| ast::CommandPrefixOrSuffixItem::IoRedirect(i)), + assignment::assignment.map(|(assignment, word)| { + ast::CommandPrefixOrSuffixItem::AssignmentWord(assignment, word) + }), + )), + space(1..), + ), + ) + .map(ast::CommandPrefix) + .parse_next(i) +} + +// TODO: check rules +// cmd_name : WORD /* Apply rule 7a */ +// ; +// cmd_word : WORD /* Apply rule 7b */ +fn cmd_name<'i>(i: &mut Input<'i>) -> PResult> { + trace( + "cmd_name", + alt(( + // N.B maybe subshell $() or `` that we will expand later inside the interpreter + expand_later.map(|s| Cow::Borrowed(s)), + // Disallow empty names. + // This differs from Bash. But according to: + // https://unix.stackexchange.com/questions/66965/files-with-empty-names + // filenames cannot be empty. So it is a nice user experience enhancement. 
+ word::non_reserved(word::non_empty(word::word)), + )), + ) + .parse_next(i) +} + +pub fn cmd_suffix(i: &mut Input<'_>) -> PResult { + trace( + "cmd_suffix", + repeat( + 1.., + delimited( + // N.B backtrack optimization + // TODO: use constants + (not(one_of(('#', ';', '&', '|', '\n', '\r'))), not(eof)), + alt(( + io::io_redirect.map(|i| ast::CommandPrefixOrSuffixItem::IoRedirect(i)), + assignment::assignment.map(|(assignment, word)| { + ast::CommandPrefixOrSuffixItem::AssignmentWord(assignment, word) + }), + word::word.map(|w| { + ast::CommandPrefixOrSuffixItem::Word(ast::Word::from(w.into_owned())) + }), + non_posix_extension(process_substitution).map(|(kind, subshell)| { + ast::CommandPrefixOrSuffixItem::ProcessSubstitution(kind, subshell) + }), + )), + // a newline maybe escaped + // echo \ + // hello + (space(0..), opt((b"\\\n", space(0..)))), + ), + ), + ) + .map(ast::CommandSuffix) + .parse_next(i) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::parser2::tests::test_variants_ok; + use ast::*; + + test_variants_ok! {simple_command( + basic( + r#"echo hello "wor ld" hel-lo world"# + ), + escapes( + r#""echo" hello \ + wor\ ld \ + hel-lo\ + world"# + ), + ) -> SimpleCommand{ + prefix: None, + word_or_name: Some(Word::new("echo")), + suffix: Some(CommandSuffix(vec![CommandPrefixOrSuffixItem::Word(Word::new("hello")), + CommandPrefixOrSuffixItem::Word(Word::new("wor ld")), + CommandPrefixOrSuffixItem::Word(Word::new("hel-lo")), + CommandPrefixOrSuffixItem::Word(Word::new("world")) + ])) + } + } + test_variants_ok! 
{simple_command( + assignment( + r#"FOO=1 &1"# + ) + ) -> SimpleCommand{ + prefix: Some(CommandPrefix(vec!{ + CommandPrefixOrSuffixItem::AssignmentWord( + Assignment { name: AssignmentName::VariableName("FOO".into()), value: AssignmentValue::Scalar(Word::new("1")), append: false }, + Word::new("FOO=1") + ), + CommandPrefixOrSuffixItem::IoRedirect(IoRedirect::File(None, + IoFileRedirectKind::Write, + IoFileRedirectTarget::Filename(Word::new("filename")))) + })), + word_or_name: Some(Word::new("echo")), + suffix: Some(CommandSuffix(vec![CommandPrefixOrSuffixItem::Word(Word::new("hello")), + CommandPrefixOrSuffixItem::Word(Word::new("wor ld")), + CommandPrefixOrSuffixItem::Word(Word::new("hel-lo")), + CommandPrefixOrSuffixItem::Word(Word::new("world")), + CommandPrefixOrSuffixItem::IoRedirect(IoRedirect::File(Some(2), IoFileRedirectKind::DuplicateOutput, IoFileRedirectTarget::Fd(1))) + ])) + } + } +} diff --git a/brush-parser/src/parser2/custom_combinators.rs b/brush-parser/src/parser2/custom_combinators.rs new file mode 100644 index 00000000..6024da6f --- /dev/null +++ b/brush-parser/src/parser2/custom_combinators.rs @@ -0,0 +1,456 @@ +use winnow::{ + ascii::take_escaped, + combinator::{cut_err, delimited, fail, opt, peek, trace}, + dispatch, + error::{ContextError, ErrMode, ErrorKind, ParserError}, + stream::{AsBytes, Offset as _, Stream}, + token::{any, literal, one_of, take, take_till, take_until}, + PResult, Parser, +}; + +use crate::parser2::trivia::ESCAPE; + +use super::Input; + +pub(crate) fn non_posix_extension<'i, 's, ParseNext, Output>( + mut parser: ParseNext, +) -> impl Parser, Output, ContextError> +where + ParseNext: Parser, Output, ContextError>, +{ + trace("non_posix_extension", move |i: &mut Input<'i>| { + if !i.state.sh_mode { + parser.parse_next(i) + } else { + // TODO: errors + Err(ErrMode::from_error_kind(i, ErrorKind::Verify)) + } + }) +} + +// Temporary special case for subshell that goes onto the cmd_name or cmd_suffix +// in the original 
parser +pub(crate) fn expand_later<'i>(i: &mut Input<'i>) -> PResult<&'i str> { + trace( + "expand_later", + dispatch! {any; + b'$' => take_inside(b'(', b')').void(), + b'`' => take_till(0.., b'`').void(), + _ => fail, + } + .take() + .try_map(std::str::from_utf8), + ) + .parse_next(i) +} + +pub(crate) fn take_inside<'i, 's>( + opening: u8, + closing: u8, +) -> impl Parser, as Stream>::Slice, ContextError> { + move |i: &mut Input<'i>| { + trace("take_inside", move |i: &mut Input<'i>| { + trace("opening", opening).parse_next(i)?; + let start = i.checkpoint(); + cut_err( + (move |i: &mut Input<'i>| { + let mut opened: u32 = 1; + while i.eof_offset() > 0 + && opened != 0 + // N.B it does not consume characters from the second param -> + && opt(take_till(0.., (opening, closing, ESCAPE))).parse_next(i)?.is_some() + { + // -> consume these characters: (opening, closing or escape) + match i.next_token().unwrap() { + ESCAPE => { + opt(any).parse_next(i)?; + } + c if c == opening => { + opened += 1; + } + c if c == closing => { + opened -= 1; + } + // Can not happen. + _ => unreachable!(), + }; + // special case. but it is meaningless to use this function + // with the same brackets (e.g `"`) because the next one + // is always the closing + // `[abc]`d`fg` + if opening == closing { + opened %= 2; + } + } + + Ok(opened) + }) + // TODO: error unclosed delimiter explanation + .verify(|opened: &u32| *opened == 0), + ) + .parse_next(i)?; + + // take everything consumed + let mut offset = i.offset_from(&start); + i.reset(&start); + offset -= 1; + let take = i.next_slice(offset); + + trace("closing", cut_err(closing)).parse_next(i)?; + Ok(take) + }) + .parse_next(i) + } +} + +/// implementation of a Pratt parsing table for use with the bash extended tests + +/// A unary operator. +pub struct Unary { + value: V, + precedence: Q, +} + +/// A binary operator. +pub struct Binary { + value: V, + precedence: Q, + assoc: Assoc, +} + +/// A single evaluation step. 
+pub enum Operation { + /// A prefix operation. + Prefix(P1, O), + /// A postfix operation. + Postfix(O, P2), + /// A binary operation. + Binary(O, P3, O), +} + +/// Associativity for binary operators. +#[derive(Copy, Clone, PartialEq, Eq)] +pub enum Assoc { + /// Left associative. + Left, + /// Right associative. + Right, +} + +/// Element for operator stack. +enum Operator { + Prefix(P1, Q), + Postfix(P2, Q), + Binary(P3, Q, Assoc), +} + +impl Operator +where + Q: Ord + Copy, +{ + fn precedence(&self) -> Q { + match self { + Operator::Prefix(_, p) => *p, + Operator::Postfix(_, p) => *p, + Operator::Binary(_, p, _) => *p, + } + } + + fn is_postfix(&self) -> bool { + match self { + Operator::Postfix(_, _) => true, + _ => false, + } + } +} + +/// Runs the inner parser and transforms the result into an unary operator with the given +/// precedence. +/// +/// Intended for use with [precedence]. +/// # Arguments +/// * `precedence` The precedence of the operator. +/// * `parser` The parser to apply. +pub fn unary_op( + precedence: Precedence, + mut parser: ParseNext, +) -> impl Parser, Error> +where + Input: Stream, + Error: ParserError, + ParseNext: Parser, + Precedence: Ord + Copy, +{ + trace("unary_op", move |input: &mut Input| { + parser + .parse_next(input) + .map(|value| Unary { value, precedence }) + }) +} + +/// Runs the inner parser and transforms the result into a binary operator with the given precedence +/// and associativity. +/// +/// Intended for use with [precedence]. +/// # Arguments +/// * `precedence` The precedence of the operator. +/// * `assoc` The associativity of the operator. +/// * `parser` The parser to apply. 
+pub fn binary_op( + precedence: Precedence, + assoc: Assoc, + mut parser: ParseNext, +) -> impl Parser, Error> +where + Input: Stream, + Error: ParserError, + ParseNext: Parser, + Precedence: Ord + Copy, +{ + trace("unary_op", move |input: &mut Input| { + parser.parse_next(input).map(|value| Binary { + value, + precedence, + assoc, + }) + }) +} + +pub fn precedence< + Input, + Output, + Error, + EXT, + ParseOperand, + Fold, + ParsePrefix, + ParseBinary, + ParsePostfix, + P1, + P2, + P3, + Q, +>( + mut prefix: ParsePrefix, + mut postfix: ParsePostfix, + mut binary: ParseBinary, + mut operand: ParseOperand, + mut fold: Fold, +) -> impl Parser +where + Input: Stream, + ParseOperand: Parser, + Fold: FnMut(Operation) -> Result, + Error: ParserError, //+ winnow::error::FromExternalError, + ParsePrefix: Parser, Error>, + ParsePostfix: Parser, Error>, + ParseBinary: Parser, Error>, + Q: Ord + Copy, +{ + move |i: &mut Input| { + let mut operands = Vec::new(); + let mut operators = Vec::new(); + 'main: loop { + 'prefix: loop { + let start = i.checkpoint(); + let len = i.eof_offset(); + match prefix.parse_next(i) { + Err(ErrMode::Backtrack(_)) => { + i.reset(&start); + break 'prefix; + } + Err(e) => return Err(e), + Ok(op) => { + // infinite loop check: the parser must always consume + if i.eof_offset() == len { + return Err(ErrMode::assert( + i, + "`precedence` parsers must always consume", + )); + } + operators.push(Operator::Prefix(op.value, op.precedence)); + } + } + } + + let start = i.checkpoint(); + let op = match operand.parse_next(i) { + Ok(op) => op, + Err(ErrMode::Backtrack(e)) => { + // TODO: error handling + return Err(ErrMode::Backtrack(e.append(i, &start, ErrorKind::Fail))); + } + Err(e) => return Err(e), + }; + operands.push(op); + + 'postfix: loop { + let start = i.checkpoint(); + let len = i.eof_offset(); + match postfix.parse_next(i) { + Err(ErrMode::Backtrack(_)) => { + i.reset(&start); + break 'postfix; + } + Err(e) => return Err(e), + Ok(op) => { + // 
infinite loop check: the parser must always consume + if i.eof_offset() == len { + return Err(ErrMode::assert( + i, + "`precedence` parsers must always consume", + )); + } + + while operators + .last() + .map(|lhs| lhs.precedence() <= op.precedence) + .unwrap_or(false) + { + let value = operands.pop().unwrap(); + let operation = match operators.pop().unwrap() { + Operator::Prefix(op, _) => Operation::Prefix(op, value), + Operator::Postfix(op, _) => Operation::Postfix(value, op), + Operator::Binary(op, _, _) => match operands.pop() { + Some(lhs) => Operation::Binary(lhs, op, value), + None => { + // TODO: proper error + return Err(ErrMode::from_error_kind(i, ErrorKind::Fail)); + } + }, + }; + let result = match fold(operation) { + Err(e) => { + // TODO: error + return Err(ErrMode::from_error_kind(i, ErrorKind::Fail)); + // return Err(ErrMode::Backtrack( + // Error::from_external_error(i, ErrorKind::Fail, e), + // )); + } + Ok(r) => r, + }; + operands.push(result); + } + operators.push(Operator::Postfix(op.value, op.precedence)); + } + } + } + + let start = i.checkpoint(); + let len = i.eof_offset(); + match binary.parse_next(i) { + Err(ErrMode::Backtrack(_)) => { + i.reset(&start); + break 'main; + } + Err(e) => return Err(e), + Ok(op) => { + while operators + .last() + .map(|lhs| { + lhs.precedence() < op.precedence + || (op.assoc == Assoc::Left && lhs.precedence() == op.precedence) + || (lhs.is_postfix()) + }) + .unwrap_or(false) + { + let value = operands.pop().unwrap(); + let operation = match operators.pop().unwrap() { + Operator::Prefix(op, _) => Operation::Prefix(op, value), + Operator::Postfix(op, _) => Operation::Postfix(value, op), + Operator::Binary(op, _, _) => match operands.pop() { + Some(lhs) => Operation::Binary(lhs, op, value), + None => { + // TODO: proper error + return Err(ErrMode::from_error_kind(i, ErrorKind::Fail)); + } + }, + }; + let result = match fold(operation) { + Err(e) => { + return Err(ErrMode::from_error_kind(i, ErrorKind::Fail)); + 
// TODO: error + // return Err(ErrMode::Backtrack(Error::from_external_error( + // i, + // ErrorKind::Fail, + // e, + // ))); + } + Ok(r) => r, + }; + operands.push(result); + } + operators.push(Operator::Binary(op.value, op.precedence, op.assoc)); + } + } + + if i.eof_offset() == len { + return Err(ErrMode::assert( + i, + "`precedence` either operand or operator must consume input", + )); + } + } + + while operators.len() > 0 { + let value = match operands.pop() { + Some(o) => o, + None => { + // TODO: proper error + return Err(ErrMode::from_error_kind(i, ErrorKind::Fail)); + } + }; + let operation = match operators.pop().unwrap() { + Operator::Prefix(op, _) => Operation::Prefix(op, value), + Operator::Postfix(op, _) => Operation::Postfix(value, op), + Operator::Binary(op, _, _) => match operands.pop() { + Some(lhs) => Operation::Binary(lhs, op, value), + None => { + // TODO: proper error + return Err(ErrMode::from_error_kind(i, ErrorKind::Fail)); + } + }, + }; + let result = match fold(operation) { + Ok(r) => r, + Err(e) => { + return Err(ErrMode::from_error_kind(i, ErrorKind::Fail)); + // TODO: error + // return Err(ErrMode::Backtrack(Error::from_external_error( + // i, + // ErrorKind::Fail, + // e, + // ))); + } + }; + operands.push(result); + } + + if operands.len() == 1 { + return Ok(operands.pop().unwrap()); + } else { + // TODO: proper error + return Err(ErrMode::from_error_kind(i, ErrorKind::Fail)); + } + } +} + +#[cfg(test)] +mod tests { + + use crate::parser2::tests::input; + use crate::parser2::tests::Result; + + use super::*; + #[test] + fn test_take_inside() -> Result<()> { + // let i = input(r#"(aaa(aaa) a(aa))"#); + // let r = take_inside(b'(', b')').parse(i)?; + // dbg!(std::str::from_utf8(r).unwrap()); + + let i = input(r#"`1111`1` 222 `333`444 `"#); + let r = take_inside(b'`', b'`').parse(i)?; + dbg!(std::str::from_utf8(r).unwrap()); + Ok(()) + } +} diff --git a/brush-parser/src/parser2/expansion.rs b/brush-parser/src/parser2/expansion.rs new 
file mode 100644 index 00000000..9c32495b --- /dev/null +++ b/brush-parser/src/parser2/expansion.rs @@ -0,0 +1,2 @@ + +// TODO: there should be word expansions diff --git a/brush-parser/src/parser2/io.rs b/brush-parser/src/parser2/io.rs new file mode 100644 index 00000000..caaa4231 --- /dev/null +++ b/brush-parser/src/parser2/io.rs @@ -0,0 +1,197 @@ +use crate::ast; + +use winnow::{ + ascii::{digit1, line_ending}, + combinator::{ + alt, cut_err, delimited, empty, fail, opt, peek, preceded, repeat, repeat_till, terminated, + trace, + }, + dispatch, + error::ContextError, + token::{any, literal, one_of, take}, + PResult, Parser, +}; + +use super::{ + custom_combinators::non_posix_extension, + trivia::{comment, space, trim_whitespace}, + word, Input, +}; + +pub fn io_redirect(i: &mut Input<'_>) -> PResult { + trace( + "io_redirect", + alt(( + |i: &mut Input<'_>| { + let n = opt(io_number).parse_next(i)?; + alt(( + io_file.map(move |(kind, target)| ast::IoRedirect::File(n, kind, target)), + io_here.map(move |h| ast::IoRedirect::HereDocument(n, h)), + non_posix_extension(("<<<", preceded(space(0..), cut_err(word::word)))).map( + move |(_, w)| { + ast::IoRedirect::HereString(n, ast::Word::from(w.to_string())) + }, + ), + )) + .parse_next(i) + }, + non_posix_extension( + alt(( + ("&>>".value(true), preceded(space(0..), cut_err(word::word))), + ("&>".value(false), preceded(space(0..), cut_err(word::word))), + )) + .map(|(append, f)| { + ast::IoRedirect::OutputAndError(ast::Word::from(f.to_string()), append) + }), + ), + )), + ) + .parse_next(i) +} + +pub fn redirect_list(i: &mut Input<'_>) -> PResult { + repeat(1.., terminated(io_redirect, space(0..))) + .map(ast::RedirectList) + .parse_next(i) +} + +fn io_fd(i: &mut Input<'_>) -> PResult { + trace("fd", digit1.parse_to()).parse_next(i) +} + +fn io_number(i: &mut Input<'_>) -> PResult { + // N.B. An I/O number must be a string of only digits, and it must be + // followed by a '<' or '>' character (but not consume them). 
+ trace("io_number", terminated(io_fd, peek(one_of((b'<', b'>'))))).parse_next(i) +} + +fn io_here(i: &mut Input<'_>) -> PResult { + trace("io_here", |i: &mut Input<'_>| { + let remove_tabs = + alt((literal("<<-").value(true), literal("<<").value(false))).parse_next(i)?; + let (tag, requires_expansion) = delimited( + space(0..), + cut_err(word::word).with_taken().map(|(tag, raw_tag)| { + // from the manual: + // > No parameter expansion, command substitution, arithmetic expansion, or pathname + // > expansion is performed on word. If any characters in word are quoted + let requires_expansion = !raw_tag + .into_iter() + .any(|c| *c == b'"' || *c == b'\'' || *c == b'\\'); + (tag, requires_expansion) + }), + // TODO: line_space or what + (space(0..), opt(comment), line_ending), + ) + .parse_next(i)?; + let r = cut_err( + repeat_till(0.., any, (line_ending, tag.as_bytes())) + .map(|((), _)| ()) + .take(), + ) + .try_map(std::str::from_utf8) + .map(|doc| ast::IoHereDocument { + remove_tabs, + requires_expansion, + here_end: ast::Word::from(tag.to_string()), + doc: ast::Word::from(doc.to_string()), + }) + .parse_next(i); + r + }) + .parse_next(i) +} + +fn io_file<'i>(i: &mut Input<'i>) -> PResult<(ast::IoFileRedirectKind, ast::IoFileRedirectTarget)> { + trace("io_file", alt(( + preceded(('<', space(0..)), io_filename).map(|f| (ast::IoFileRedirectKind::Write, f)), + preceded(('>', space(0..)), io_filename).map(|f| (ast::IoFileRedirectKind::Read, f)), + (dispatch! 
{take::<_, Input<'i>, _>(2usize); + b">>" => preceded(space(0..), cut_err(io_filename)).map(|f| (ast::IoFileRedirectKind::Append, f)), + b"<&" => preceded(space(0..), cut_err(filename_or_fd)).map(|f| (ast::IoFileRedirectKind::DuplicateInput, f)), + b">&" => preceded(space(0..), cut_err(filename_or_fd)).map(|f| (ast::IoFileRedirectKind::DuplicateOutput, f)), + b"<>" => preceded(space(0..), cut_err(io_filename)).map(|f| (ast::IoFileRedirectKind::ReadAndWrite, f)), + b">|" => preceded(space(0..), cut_err(io_filename)).map(|f| (ast::IoFileRedirectKind::Clobber, f)), + _ => fail + }) + ))) + .parse_next(i) +} + +fn io_filename(i: &mut Input<'_>) -> PResult { + trace( + "io_filename", + alt(( + // N.B. Process substitution forms are extensions to the POSIX standard. + non_posix_extension(process_substitution).map(|(kind, subshell)| { + ast::IoFileRedirectTarget::ProcessSubstitution(kind, subshell) + }), + word::word.map(|f| ast::IoFileRedirectTarget::Filename(ast::Word::from(f.to_string()))), + )), + ) + .parse_next(i) +} + +fn filename_or_fd(i: &mut Input<'_>) -> PResult { + trace( + "io_filename_or_fd", + alt((io_fd.map(ast::IoFileRedirectTarget::Fd), io_filename)), + ) + .parse_next(i) +} + +pub fn process_substitution( + i: &mut Input<'_>, +) -> PResult<(ast::ProcessSubstitutionKind, ast::SubshellCommand)> { + use super::complete_command::compound_list; + trace( + "process_substitution", + ( + dispatch! 
{peek(any); + b'<' => empty.value(ast::ProcessSubstitutionKind::Read), + b'>' => empty.value(ast::ProcessSubstitutionKind::Write), + _ => fail, + }, + preceded( + "(", + cut_err(terminated(trim_whitespace(0.., compound_list, 0..), ")")), + ) + .map(ast::SubshellCommand), + ), + ) + .parse_next(i) +} + +#[cfg(test)] +mod tests { + use crate::parser2::new_input; + use crate::ParserOptions; + + use super::*; + + #[test] + fn parse_heredoc() { + fn parse<'i>(i: &'i str) { + let io = io_here + .parse_next(&mut new_input(ParserOptions::default(), i)) + .unwrap(); + dbg!(io); + } + parse( + r#"<(i: &'i str) { + let io = process_substitution + .parse_next(&mut new_input(ParserOptions::default(), i)) + .unwrap(); + dbg!(io); + } + parse(r#"<()"#) + } +} diff --git a/brush-parser/src/parser2/pipeline.rs b/brush-parser/src/parser2/pipeline.rs new file mode 100644 index 00000000..1279b631 --- /dev/null +++ b/brush-parser/src/parser2/pipeline.rs @@ -0,0 +1,117 @@ +use crate::ast; + +use winnow::{ + combinator::{cut_err, delimited, fail, opt, peek, repeat, separated_pair, trace}, + dispatch, + stream::Accumulate, + token::take, + PResult, Parser, +}; + +use super::{ + complete_command::command, + trivia::{line_trailing, space}, + Input, +}; + +/// `pipeline := bang? 
pipe_sequence` +///```yacc +/// pipeline : pipe_sequence +/// | Bang pipe_sequence +/// ``` +pub fn pipeline(i: &mut Input<'_>) -> PResult { + trace( + "pipeline", + separated_pair( + opt("!").map(|bang| bang.is_some()), + space(0..), + pipe_sequence, + ), + ) + .map(|(bang, seq)| ast::Pipeline { bang, seq }) + .parse_next(i) +} + +/// ```yacc +/// pipe_sequence : command +/// | pipe_sequence '|' linebreak command +/// ``` +/// `pipe_sequence := command | (command (pipe_operator line_trailing* command)*)` +pub fn pipe_sequence(i: &mut Input<'_>) -> PResult> { + trace("pipe_sequence", |i: &mut Input<'_>| { + let mut current = trace("first_command", command).parse_next(i)?; + + let pipe = delimited( + space(0..), + pipe_operator, + (opt(line_trailing), space(0..)), + ); + let r = trace( + "remaining_pipe_sequence", + opt( + repeat(1.., (pipe, cut_err(command))).fold(Vec::new, |mut acc, (p, c)| { + if matches!(p, PipeOperator::Extension) { + add_pipe_extension_redirection(&mut current); + } + acc.accumulate(std::mem::replace(&mut current, c)); + acc + }), + ), + ) + .parse_next(i)?; + Ok(r.unwrap_or_else(|| vec![current])) + }) + .parse_next(i) +} + +#[derive(Clone, Copy)] +enum PipeOperator { + Simple, + Extension, +} + +/// ` pipe-operator := !'||' ('|&' | '|') ` +fn pipe_operator(i: &mut Input<'_>) -> PResult { + trace( + "pipe_operator", + dispatch!(peek::<_, &[u8],_,_>(take(2usize)); + b"||" => fail, + b"|&" => take(2usize).value(PipeOperator::Extension), + _ => "|".value(PipeOperator::Simple), + ), + ) + .parse_next(i) +} + +// add `2>&1` to the command if the pipeline is `|&` +pub fn add_pipe_extension_redirection(c: &mut ast::Command) { + let r = ast::IoRedirect::File( + Some(2), + ast::IoFileRedirectKind::DuplicateOutput, + ast::IoFileRedirectTarget::Fd(1), + ); + + fn add_to_redirect_list(l: &mut Option, r: ast::IoRedirect) { + if let Some(l) = l { + l.0.push(r); + } else { + let v = vec![r]; + *l = Some(ast::RedirectList(v)); + } + } + + match c { + 
ast::Command::Simple(c) => { + let r = ast::CommandPrefixOrSuffixItem::IoRedirect(r); + if let Some(l) = &mut c.suffix { + l.0.push(r); + } else { + c.suffix = Some(ast::CommandSuffix(vec![r])); + } + } + ast::Command::Compound(_, l) => add_to_redirect_list(l, r), + ast::Command::Function(f) => add_to_redirect_list(&mut f.body.1, r), + // TODO: redirect_list for extended tests + ast::Command::ExtendedTest(_) => (), + }; +} diff --git a/brush-parser/src/parser2/precedence.rs b/brush-parser/src/parser2/precedence.rs new file mode 100644 index 00000000..6ba877e7 --- /dev/null +++ b/brush-parser/src/parser2/precedence.rs @@ -0,0 +1,258 @@ +use std::marker::PhantomData; + +use winnow::error::ContextError; +use winnow::stream::{Stream, StreamIsPartial}; +use winnow::Parser; + +use super::Input; + +struct Neither(PhantomData); + +struct Unary(F, PhantomData) +where + F: FnMut(O) -> O; + +struct Binary(F, PhantomData) +where + F: FnMut(O, O) -> O; + +trait OperandType { + type T; +} +impl OperandType for Unary +where + F: FnMut(O) -> O, +{ + type T = O; +} +impl OperandType for Binary +where + F: FnMut(O, O) -> O, +{ + type T = O; +} +impl OperandType for Neither { + type T = O; +} + +struct PrecedenceInner +where + Input: Stream + StreamIsPartial, + _OperatorParser: Parser, +{ + parser: _OperatorParser, + precedence: PrecedenceKind, + assoc: Assoc, + _phantom: PhantomData, + _phantom2: PhantomData<_OperatorOutput>, +} + +struct Precedence +where + Input: Stream + StreamIsPartial, + _OperatorParser: Parser, +{ + op: Operator, + inner: PrecedenceInner, +} + +impl + Precedence +where + Operator: OperandType, + Input: Stream + StreamIsPartial, + OperatorParser: Parser, +{ + fn new(op: Operator, precedence: PrecedenceKind, parser: OperatorParser) -> Self { + Self { + op, + inner: PrecedenceInner { + parser, + precedence, + assoc: Assoc::Neither, + _phantom: PhantomData::, + _phantom2: PhantomData::<_ParserOutput>, + }, + } + } + + #[inline(always)] + fn unary(self, f: F) -> 
Precedence, Input, OperatorParser, _ParserOutput> + where + F: FnMut(Operand) -> Operand, + { + Precedence { + op: Unary(f, PhantomData::), + inner: self.inner, + } + } + + #[inline(always)] + fn binary(self, f: F) -> Precedence, Input, OperatorParser, _ParserOutput> + where + F: FnMut(Operand, Operand) -> Operand, + { + Precedence { + op: Binary(f, PhantomData::), + inner: self.inner, + } + } + #[inline(always)] + fn left(mut self, strength: u32) -> Self { + self.inner.assoc = Assoc::Left(strength); + Self { + op: self.op, + inner: self.inner, + } + } + #[inline(always)] + fn right(mut self, strength: u32) -> Self { + self.inner.assoc = Assoc::Right(strength); + Self { + op: self.op, + inner: self.inner, + } + } +} + +#[derive(Clone, Copy)] +enum PrecedenceKind { + Prefix, + Infix, + Postfix, + Nilfix, +} + +#[derive(Clone, Copy)] +enum Assoc { + Right(u32), + Left(u32), + Neither, +} + +#[inline(always)] +fn prefix( + parser: OperatorParser, +) -> Precedence, Input, OperatorParser, ParserOutput> +where + Input: Stream + StreamIsPartial, + OperatorParser: Parser, +{ + Precedence::new( + Neither::(PhantomData::), + PrecedenceKind::Prefix, + parser, + ) +} + +#[inline(always)] +fn infix( + parser: OperatorParser, +) -> Precedence, Input, OperatorParser, _ParserOutput> +where + Input: Stream + StreamIsPartial, + OperatorParser: Parser, +{ + Precedence::new( + Neither::(PhantomData::), + PrecedenceKind::Infix, + parser, + ) +} +#[inline(always)] +fn postfix( + parser: OperatorParser, +) -> Precedence, Input, OperatorParser, _ParserOutput> +where + Input: Stream + StreamIsPartial, + OperatorParser: Parser, +{ + Precedence::new( + Neither::(PhantomData::), + PrecedenceKind::Postfix, + parser, + ) +} + +trait ApplyPrecedence {} + +impl ApplyPrecedence + for Precedence +where + Operator: OperandType, + Input: Stream + StreamIsPartial, + _OperatorParser: Parser, +{ +} + +fn precedence<'i, ParseOperand, Operand>( + parser: ParseOperand, + ops: ( + impl ApplyPrecedence, 
Operand>, + impl ApplyPrecedence, Operand>, + ), +) -> impl Parser, Operand, ContextError> +where + ParseOperand: Parser, Operand, ContextError>, +{ + parser +} + +#[cfg(test)] +mod tests { + use winnow::ascii::digit1; + use winnow::token::literal; + use winnow::PResult; + + use crate::parser2::new_input; + use crate::ParserOptions; + + use super::*; + + // NOTE: "+".prefix() is a bad design. It pollutes the parser namespace with pratt domain + // functions + + #[test] + fn test_api() { + // "+".prefix().left(2) + // unary('-').prefix().left(2) + + fn parse() -> PResult<()> { + let mut i = new_input(ParserOptions::default(), "1"); + precedence( + digit1.map(|_| 1usize), + ( + prefix("-").right(1), + infix("+").binary(|a, b| a + b).left(0), + ), + ) + .parse_next(&mut i)?; + Ok(()) + } + parse().unwrap(); + + // precedence(digit1, + // ( + // prefix("-").right(1).unary(|a| - b) + // infix("+").left(0).binary(|a, b| a + b) + // prefix("*").left(3).binary(|a, b| a * b) + // prefix("/").left(3).binary(|a, b| a / b) + // prefix("!").unary(|a| !a) + // ) + // ) + // precedence("111", vec![prefix(left(2, "+"))]); + + // precedence(); + } +} +// assoc neither +// https://github.com/segeljakt/pratt/issues/2 + +// let calc = pratt( +// digits1.map(Expr::Int), +// ( +// '-'.prefix(Right(1), |r| unary(r, Op::Neg)); +// '+'.infix(Left(0), |l, r| binary(l, r, Op::Add)); +// '!'.prefix(Right(3), |r| unary(r, Op::Fact)); +// ) +// ); diff --git a/brush-parser/src/parser2/trivia.rs b/brush-parser/src/parser2/trivia.rs new file mode 100644 index 00000000..15fbfc70 --- /dev/null +++ b/brush-parser/src/parser2/trivia.rs @@ -0,0 +1,84 @@ +use std::ops::RangeInclusive; + +use winnow::{ + ascii::line_ending, + combinator::{alt, delimited, eof, opt, trace}, + error::ParserError, + prelude::*, + stream::{AsChar, Stream, StreamIsPartial}, + token::take_while, + PResult, +}; + +use super::Input; + +pub(crate) const COMMENT: u8 = b'#'; + +pub(crate) const LF: u8 = b'\n'; +pub(crate) const CR: u8 
= b'\r'; + +// wschar = ( %x20 / ; Space +// %x09 ) ; Horizontal tab +pub(crate) const WHITESPACE_CHARS: (u8, u8) = (b' ', b'\t'); + +pub(crate) const ESCAPE: u8 = b'\\'; + +// non-ascii = %x80-D7FF / %xE000-10FFFF +// - ASCII is 0xxxxxxx +// - First byte for UTF-8 is 11xxxxxx +// - Subsequent UTF-8 bytes are 10xxxxxx +pub(crate) const NON_ASCII: RangeInclusive = 0x80..=0xff; + +/// `non-eol = %x09 / %x20-7E / non-ascii` +pub(crate) const NON_EOL: (u8, RangeInclusive, RangeInclusive) = + (0x09, 0x20..=0x7E, NON_ASCII); + +/// `comment := # non-eol*` +pub fn comment(i: &mut Input<'_>) -> PResult<()> { + (COMMENT, take_while(0.., NON_EOL)).void().parse_next(i) +} + +// TODO: upstream into winnow because winnow has space0, space1 +// and want to switch to ranges +pub fn space(occurencies: R) -> impl Parser +where + R: Into, + Input: winnow::stream::StreamIsPartial + Stream, + ::Token: winnow::stream::AsChar + Clone, + Error: ParserError, +{ + trace("space", take_while(occurencies, (' ', '\t')).void()) +} + +/// `line-space = whitespace* [comment]?` +pub fn line_space(i: &mut Input<'_>) -> PResult<()> { + trace("line_space", (space(0..), opt(comment))) + .void() + .parse_next(i) +} + +/// `line-trailing = line-space* line-ending` +pub(crate) fn line_trailing(i: &mut Input<'_>) -> PResult<()> { + trace("line_trailing", (line_space, alt((line_ending, eof)))) + .void() + .parse_next(i) +} + +pub fn trim_whitespace<'i, Input, F, O, E, R1, R2>( + occurencies1: R1, + inner: F, + occurencies2: R2, +) -> impl Parser +where + R1: Into, + R2: Into, + Input: Stream + StreamIsPartial, + ::Token: AsChar + Clone, + E: ParserError, + F: Parser, +{ + trace( + "trim_whitespace", + delimited(space(occurencies1), inner, space(occurencies2)), + ) +} diff --git a/brush-parser/src/parser2/word.rs b/brush-parser/src/parser2/word.rs new file mode 100644 index 00000000..e96fd5b4 --- /dev/null +++ b/brush-parser/src/parser2/word.rs @@ -0,0 +1,423 @@ +use std::{borrow::Cow, cell::RefCell, 
ops::RangeInclusive}; + +use std::str; +use winnow::combinator::fail; +use winnow::token::take_till; +use winnow::{ + combinator::{alt, cut_err, delimited, empty, not, opt, peek, preceded, repeat, trace}, + dispatch, + error::ContextError, + token::{any, one_of, take, take_while}, + PResult, Parser, +}; + +use crate::parser2::custom_combinators::non_posix_extension; + +use super::custom_combinators::take_inside; +use super::{ + trivia::{line_trailing, space, ESCAPE, NON_ASCII, WHITESPACE_CHARS}, + Input, +}; + +// 2.4 Reserved Words +// https://pubs.opengroup.org/onlinepubs/9799919799/utilities/V3_chap02.html + +// PERF: improved lookup from using alt(RESERVED) where RESERVED = ["!", "for", ...] 164.60 ns/iter +// to 8.89 ns/iter. The compiler automatically optimizes the lookup based on the string length +fn reserved_word<'i>(i: &mut Input<'i>) -> PResult<()> { + const WORD_SEP: &[u8] = &[ + b' ', b'\t', b'\r', b'\n', b'&', b'(', b')', b';', b'|', b'<', b'>', + ]; + trace( + "reserved_word", + // TODO: set of token separators \n \t \r ' ' + dispatch! {take_till::<_, Input<'i>, _>(1.., WORD_SEP); + b"!" | b"{" | b"}" | b"case" | b"do" | b"done" | b"elif" | b"else" + | b"esac" | b"fi" | b"for" | b"if" | b"in" | b"then" | b"until" | b"while" => empty, + b"[[" | b"]]" | b"function" | b"select" => non_posix_extension(empty), + _ => fail, + }, + ) + .parse_next(i) +} +// +pub fn non_reserved<'i>( + parser: impl Parser, Cow<'i, str>, ContextError>, +) -> impl Parser, Cow<'i, str>, ContextError> { + trace("non_reserved_word", preceded(not(reserved_word), parser)) +} + +pub fn word<'i>(i: &mut Input<'i>) -> PResult> { + trace( + "word", + dispatch! 
{peek(any); + b'"' => double_quoted::double_quoted, + b'\'' => single_quoted, + _ => unquoted_string::unquoted_string, + }, + ) + .parse_next(i) +} + +pub(crate) fn space_after_word(i: &mut Input<'_>) -> PResult<()> { + trace( + "space_after_keyword", + alt((line_trailing, cut_err(space(1..)))), + ) + .void() + .parse_next(i) +} + +pub fn non_empty<'i>( + parser: impl Parser, Cow<'i, str>, ContextError>, +) -> impl Parser, Cow<'i, str>, ContextError> { + trace( + "non_empty_string", + parser.verify(|s: &Cow<'i, str>| !s.as_bytes().is_empty()), + ) +} + +mod unquoted_string { + use super::*; + + /// The first character is treated differently from the remaining ones: for example, you cannot use + /// a `#` as the first character, but it is possible to use it inside the word: + /// - `#echo` is a comment + /// but - `ec#ho` is a string + const FIRST_CHAR: ( + RangeInclusive, + RangeInclusive, + u8, + RangeInclusive, + RangeInclusive, + RangeInclusive, + RangeInclusive, + RangeInclusive, + ) = ( + // ascii hex codes + // `$`, `%` + 0x24..=0x25, + // `*` - `:` + 0x2A..=0x3A, + // '=' + 0x3D, + // '?' - '[' + 0x3F..=0x5B, + // ']' - `_` + 0x5D..=0x5F, + // 'a' - '{' + 0x61..=0x7B, + // '}' - '~' + 0x7D..=0x7E, + NON_ASCII, + ); + + // TODO: I'm worried that we are forgetting what we escaped while parsing; + // what if the interpreter requires this information (probably, according to the POSIX spec; I'm not + // sure)? + // NOTE: word expansion is performed in word.rs expansion_parser + + fn transform_escaped<'i>(i: &mut Input<'i>) -> PResult<&'i str> { + // TODO: list of special symbols + let v = |v: &'static str| take(1u8).value(v); + trace( + "unquoted/transform_escaped", + dispatch! {peek::<_, u8, _, _>(any); + b'\\' => v(r"\"), + b' ' => v(r" "), + b'\n' => v(r""), + b'"' => v(r#"""#), + b';' => v(r";"), + b'!' => v(r"!"), + // N.B for the interpreter + b'$' => v(r"\$"), + // Do nothing. 
+ _ => empty.value(r"") + }, + ) + .parse_next(i) + } + + pub fn unquoted_string<'i>(i: &mut Input<'i>) -> PResult> { + trace( + "unquoted_string", + non_empty(fold_escaped( + // the first chunk until an escape + trace( + "first_chunk", + opt((one_of(FIRST_CHAR), unquoted_chars(0..))) + .take() + .try_map(std::str::from_utf8), + ), + // decide what to do with the escaped character + transform_escaped, + // remaining chunks + trace("chunk", unquoted_chars(0..)), + )), + ) + .parse_next(i) + } + + // characters that are allowed inside the unquoted string + fn unquoted_chars<'i, R: Into>( + occurencies: R, + ) -> impl Parser, &'i str, ContextError> { + // includes a single quote + // basic-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii + const UNQUOTED_CHARS: ( + RangeInclusive, + RangeInclusive, + u8, + RangeInclusive, + RangeInclusive, + RangeInclusive, + RangeInclusive, + RangeInclusive, + ) = ( + 0x23..=0x25, + 0x2A..=0x3A, + 0x3D, + 0x3F..=0x5B, + 0x5D..=0x5F, + 0x61..=0x7B, + 0x7D..=0x7E, + NON_ASCII, + ); + trace("unquoted_chars", take_while(occurencies, UNQUOTED_CHARS)) + .try_map(std::str::from_utf8) + } +} + +// - `a s` -> `a` `s` +// - `a\ s` -> `a s` +// - `a\ +//s` -> `as` +// quoted +// - `"a\ s"` -> `a\ s` +// - `a\ +//s` -> `as` +pub fn single_quoted<'i>(i: &mut Input<'i>) -> PResult> { + take_inside(b'\'', b'\'') + .try_map(std::str::from_utf8) + .map(Cow::Borrowed) + .parse_next(i) +} + +mod double_quoted { + + // https://pubs.opengroup.org/onlinepubs/9799919799/utilities/V3_chap02.html + // The quote character sequence dollar-sign single-quote and the single-character quote + // characters (backslash, single-quote, and double-quote) that were present in the original + // word shall be removed unless they have themselves been quoted + + use super::*; + + pub(super) fn double_quoted<'i>(i: &mut Input<'i>) -> PResult> { + trace( + "double_quoted_string", + delimited('"', string_body, cut_err('"')), + ) + .parse_next(i) + } + + // characters that are allowed 
inside the double quoted string + fn unescaped_chars<'i, R: Into>( + occurencies: R, + ) -> impl Parser, &'i str, ContextError> { + // includes a single quote + // basic-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii + const BASIC_UNESCAPED: ( + (u8, u8), + u8, + RangeInclusive, + RangeInclusive, + RangeInclusive, + ) = (WHITESPACE_CHARS, 0x21, 0x23..=0x5B, 0x5D..=0x7E, NON_ASCII); + trace( + "double_quoted_unescaped_chars", + take_while(occurencies, BASIC_UNESCAPED), + ) + .try_map(std::str::from_utf8) + } + + // special rules for escaping characters inside double quoted string + fn transform_escaped<'i>(i: &mut Input<'i>) -> PResult<&'i str> { + trace( + "double_quoted_transform_escaped", + dispatch! {peek::<_, u8, _, _>(any); + b'\\' => take(1u8).value(r"\"), + b'\n' => take(1u8).value(r""), + b'"' => take(1u8).value(r#"""#), + b'!' => take(1u8).value(r"\!"), + // N.B for the interpreter + b'$' => take(1u8).value(r"\$"), + // Do nothing. Preserve `\` + _ => empty.value(r"\") + }, + ) + .parse_next(i) + } + + fn string_body<'i>(i: &mut Input<'i>) -> PResult> { + // escaped_transform(unescaped_chars(0..), '\\', transform_escaped) + trace( + "double_quoted_string_body", + fold_escaped( + unescaped_chars(0..), + transform_escaped, + unescaped_chars(0..), + ), + ) + .parse_next(i) + } +} + +fn fold_escaped<'i>( + mut first_chunk: impl Parser, &'i str, ContextError>, + mut transform_escaped: impl Parser, &'i str, ContextError> + 'i, + mut chunk: impl Parser, &'i str, ContextError>, +) -> impl Parser, Cow<'i, str>, ContextError> { + trace("fold_escaped", move |i: &mut Input<'i>| { + // TODO: remove RefCell when fold(init, ) changes its api from FnMut to FnOnce + // because SAFETY: this closure is called only once here. 
+ + // the first chunk before the escape + let res = RefCell::new(Cow::Borrowed(first_chunk.by_ref().parse_next(i)?)); + + // process the escaped character and the next chunks + repeat( + 0.., + preceded( + ESCAPE, + ( + // transform the escaped char, for example: consume '\n' from the stream but + // append nothing to the result "" + transform_escaped.by_ref(), + // the next chunk after escape + chunk.by_ref(), + ), + ), + ) + .fold( + || &res, + |acc, (escaped, remaining)| { + let mut c = acc.borrow_mut(); + match &mut *c { + Cow::Borrowed(lhs) => { + let mut s = + String::with_capacity(lhs.len() + escaped.len() + remaining.len()); + s.push_str(lhs); + s.push_str(escaped); + s.push_str(remaining); + *c = Cow::Owned(s); + } + Cow::Owned(s) => { + s.reserve(escaped.len() + remaining.len()); + s.push_str(escaped); + s.push_str(remaining); + } + }; + acc + }, + ) + .parse_next(i)?; + Ok(res.take()) + }) +} + +#[cfg(test)] +mod tests { + use crate::parser2::tests::input; + + use super::*; + + macro_rules! test_word { + ($parser:ident ( $($case:ident ($i:literal) -> $expected:literal),+ $(,)?) ) => { + $( + #[test] + fn $case() -> crate::parser2::tests::Result<()> { + assert_eq!($parser.parse(crate::parser2::new_input(crate::ParserOptions::default(), $i))?, $expected); + Ok(()) + } + )+ + + }; + } + mod test_double_quoted_string { + use super::*; + use crate::parser2::word::double_quoted::double_quoted; + + test_word! {double_quoted ( + empty(r#""""#) -> "", + basic(r#""hello world""#) -> "hello world", + escaped_quote(r#""a\"b""#) -> r#"a"b"#, + escaped_from_start(r#""\"ab""#) -> r#""ab"#, + escaped_newline(r#""\ \ \ \ \ \a\ + bc""#) -> r"\ \ \ \ \ \a bc" + + )} + } + mod test_single_quoted_string { + use super::*; + + test_word! 
{single_quoted ( + empty("''") -> "", + basic("'hello world'") -> "hello world", + double_quote_inside(r#"'a"b'"#) -> r#"a"b"#, + escaped_newline(r#"'\ \ \ \ \ \a\ + bc'"#) -> r#"\ \ \ \ \ \a\ + bc"# + )} + } + + mod test_unquoted_string { + use crate::parser2::new_input; + use crate::ParserOptions; + + use super::*; + use unquoted_string::unquoted_string; + test_word! {unquoted_string ( + basic("ec#h1o") -> "ec#h1o", + basic2("_echo") -> "_echo", + basic3("4_echo") -> "4_echo", + complicated(r#"-:1[zd*fa:a]]d1#2_-:@:"#) -> r#"-:1[zd*fa:a]]d1#2_-:@:"#, + escaped_letter(r"\t") -> "t", + escaped_keychar(r"a\!b") -> "a!b", + escaped_keychar2(r"a\;b") -> "a;b", + escaped_escape(r"ec\\ho") -> r"ec\ho", + escaped_newline("\\ \\ \\ \\ \\ \\a\\\nbc") -> r#" abc"#, + escaped_newline2("a\\\n\\ ") -> r#"a "# + )} + + #[test] + fn test_empty_string() { + assert_matches::assert_matches!( + unquoted_string.parse(new_input(ParserOptions::default(), "")), + Err(_) + ); + assert_matches::assert_matches!( + unquoted_string.parse(new_input(ParserOptions::default(), "\\\n")), + Err(_) + ); + } + } + + extern crate test; + use test::Bencher; + + #[test] + fn test_reserved_keyword() { + let mut input = input("while"); + let r = reserved_word.parse(input.clone()).unwrap(); + dbg!(r); + } + + // #[bench] + // fn bench_reserved_keyword(b: &mut Bencher) { + // let mut input = input("while"); + // b.iter(|| { + // test::black_box(reserved_word.parse(input.clone()).unwrap()); + // }); + // } + // TODO: fix unquoted word d\\d +}