diff --git a/Cargo.toml b/Cargo.toml index 124b3ad6..b93f58a0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -57,7 +57,7 @@ compact_str = "0.8.0" chrono = { version = "0.4", default-features = false, features = ["clock", "std", "wasmbind"] } delegate = "0.12.0" thiserror = "1.0" -winnow = { version = "0.6", features = ["simd"] } +winnow = { version = "0.6.24", features = ["simd"] } num-integer = "0.1.44" num-traits = "0.2" arrayvec = "0.7" diff --git a/benches/encoding_primitives.rs b/benches/encoding_primitives.rs index b8e8615c..a9c252e2 100644 --- a/benches/encoding_primitives.rs +++ b/benches/encoding_primitives.rs @@ -44,8 +44,6 @@ mod benchmark { println!("# Values: {NUM_VALUES}"); // TODO: For now, these benchmarks only write values that can be serialized in 8 bytes or fewer. - // This is because `VarUInt` has a bug[1] that causes it to encode very large u64s incorrectly. - // [1]: https://github.com/amazon-ion/ion-rust/issues/689 let unsigned_values = generate_unsigned_values(u64::MIN, (2 << 49) - 1); let signed_values = generate_signed_values(-2 << 49, (2 << 49) - 1); diff --git a/src/lazy/encoding.rs b/src/lazy/encoding.rs index 786c8fa1..4d794430 100644 --- a/src/lazy/encoding.rs +++ b/src/lazy/encoding.rs @@ -23,7 +23,6 @@ use crate::lazy::never::Never; use crate::lazy::text::buffer::{whitespace_and_then, IonParser, TextBuffer}; use crate::lazy::text::encoded_value::EncodedTextValue; use crate::lazy::text::matched::MatchedValue; -use crate::lazy::text::parse_result::fatal_parse_error; use crate::lazy::text::raw::r#struct::{LazyRawTextFieldName, RawTextStructIterator}; use crate::lazy::text::raw::reader::LazyRawTextReader_1_0; use crate::lazy::text::raw::sequence::{ @@ -42,7 +41,7 @@ use crate::{ }; use std::fmt::Debug; use std::io; -use winnow::combinator::{alt, opt, separated_pair}; +use winnow::combinator::{alt, cut_err, opt, separated_pair}; use winnow::Parser; /// Marker trait for types that represent an Ion encoding. @@ -266,21 +265,21 @@ pub trait TextEncoding<'top>: fn list_matcher() -> impl IonParser<'top, EncodedTextValue<'top, Self>> { let make_iter = |buffer: TextBuffer<'top>| RawTextListIterator::::new(buffer); let end_matcher = (whitespace_and_then(opt(",")), whitespace_and_then("]")).take(); - Self::container_matcher("a list", "[", make_iter, end_matcher) + Self::container_matcher("reading a list", "[", make_iter, end_matcher) .map(|nested_expr_cache| EncodedTextValue::new(MatchedValue::List(nested_expr_cache))) } fn sexp_matcher() -> impl IonParser<'top, EncodedTextValue<'top, Self>> { let make_iter = |buffer: TextBuffer<'top>| RawTextSExpIterator::::new(buffer); let end_matcher = whitespace_and_then(")"); - Self::container_matcher("an s-expression", "(", make_iter, end_matcher) + Self::container_matcher("reading an s-expression", "(", make_iter, end_matcher) .map(|nested_expr_cache| EncodedTextValue::new(MatchedValue::SExp(nested_expr_cache))) } fn struct_matcher() -> impl IonParser<'top, EncodedTextValue<'top, Self>> { let make_iter = |buffer: TextBuffer<'top>| RawTextStructIterator::new(buffer); let end_matcher = (whitespace_and_then(opt(",")), whitespace_and_then("}")).take(); - Self::container_matcher("a struct", "{", make_iter, end_matcher) + Self::container_matcher("reading a struct", "{", make_iter, end_matcher) .map(|nested_expr_cache| EncodedTextValue::new(MatchedValue::Struct(nested_expr_cache))) } @@ -321,7 +320,7 @@ pub trait TextEncoding<'top>: return input.incomplete(label); } Err(e) => { - return fatal_parse_error(*input, format!("failed to parse {label}: {e:?}")) + return input.invalid(format!("{e}")).context(label).cut(); } }; // If there are no errors, add the new child expr to the cache. @@ -359,9 +358,11 @@ impl<'top> TextEncoding<'top> for TextEncoding_1_0 { fn field_expr_matcher() -> impl IonParser<'top, LazyRawFieldExpr<'top, Self>> { // A (name, eexp) pair separated_pair( - whitespace_and_then(TextBuffer::match_struct_field_name), - whitespace_and_then(":"), - whitespace_and_then(TextBuffer::match_annotated_value::), + whitespace_and_then(TextBuffer::match_struct_field_name) + .context("matching a struct field name"), + whitespace_and_then(":").context("matching a struct field delimiter (`:`)"), + whitespace_and_then(TextBuffer::match_annotated_value::) + .context("matching a struct field value"), ) .map(|(field_name, invocation)| { LazyRawFieldExpr::NameValue(LazyRawTextFieldName::new(field_name), invocation) @@ -384,7 +385,7 @@ impl<'top> TextEncoding<'top> for TextEncoding_1_1 { } fn field_expr_matcher() -> impl IonParser<'top, LazyRawFieldExpr<'top, Self>> { - alt(( + cut_err(alt(( // A (name, eexp) pair. Check for this first to prevent `(:` from being considered // the beginning of an s-expression. separated_pair( @@ -407,7 +408,8 @@ impl<'top> TextEncoding<'top> for TextEncoding_1_1 { }), // An e-expression TextBuffer::match_e_expression.map(LazyRawFieldExpr::EExp), - )) + ))) + .context("matching a struct field") } } diff --git a/src/lazy/text/buffer.rs b/src/lazy/text/buffer.rs index 9090c2aa..25e0506b 100644 --- a/src/lazy/text/buffer.rs +++ b/src/lazy/text/buffer.rs @@ -3,15 +3,13 @@ use std::ops::Range; use std::str::FromStr; use winnow::ascii::alphanumeric1; -use winnow::combinator::{ - alt, delimited, empty, eof, not, opt, peek, preceded, repeat, separated_pair, terminated, -}; +use winnow::combinator::{alt, delimited, empty, eof, not, opt, peek, preceded, repeat, separated_pair, terminated}; use winnow::error::{ErrMode, Needed}; use winnow::stream::{ Accumulate, CompareResult, ContainsToken, FindSlice, Location, SliceLen, Stream, StreamIsPartial, }; -use winnow::token::{one_of, take_till, take_until, take_while}; +use winnow::token::{one_of, rest, take_till, take_until, take_while}; use winnow::{dispatch, Parser}; use crate::lazy::decoder::{LazyRawValueExpr, RawValueExpr}; @@ -24,8 +22,8 @@ use crate::lazy::text::matched::{ MatchedFloat, MatchedInt, MatchedString, MatchedSymbol, MatchedTimestamp, MatchedTimestampOffset, MatchedValue, }; -use crate::lazy::text::parse_result::{fatal_parse_error, InvalidInputError, IonParseError}; use crate::lazy::text::parse_result::{IonMatchResult, IonParseResult}; +use crate::lazy::text::parse_result::IonParseError; use crate::lazy::text::raw::v1_1::arg_group::{EExpArg, EExpArgExpr, TextEExpArgGroup}; use crate::lazy::text::raw::v1_1::reader::{MacroIdRef, SystemMacroAddress, TextEExpression_1_1}; use crate::lazy::text::value::{ @@ -162,15 +160,6 @@ impl<'top> TextBuffer<'top> { self.context } - #[inline(never)] - pub(crate) fn incomplete(&self, label: &'static str) -> IonParseResult<'top, T> { - if self.is_final_data() { - fatal_parse_error(*self, format!("ran out of data while parsing {label}")) - } else { - Err(ErrMode::Incomplete(Needed::Unknown)) - } - } - /// Returns a subslice of the [`TextBuffer`] that starts at `offset` and continues for /// `length` bytes. The subslice is considered to be 'final' data (i.e. not a potentially /// incomplete buffer). @@ -279,7 +268,9 @@ impl<'top> TextBuffer<'top> { if let Some(&byte) = self.bytes().first() { if WHITESPACE_BYTES.contains_token(byte) || byte == b'/' { - return full_match_optional_comments_and_whitespace(self); + return full_match_optional_comments_and_whitespace + .context("reading whitespace/comments") + .parse_next(self); } } self.match_nothing() @@ -332,16 +323,16 @@ impl<'top> TextBuffer<'top> { .parse_next(self)?; // `major` and `minor` are base 10 digits. Turning them into `&str`s is guaranteed to succeed. let major_version = u8::from_str(matched_major.as_text().unwrap()).map_err(|_| { - let error = InvalidInputError::new(matched_major) - .with_label("parsing an IVM major version") - .with_description("value did not fit in an unsigned byte"); - ErrMode::Cut(IonParseError::Invalid(error)) + matched_major + .invalid("value did not fit in an unsigned byte") + .context("reading an IVM major version") + .cut_err() })?; let minor_version = u8::from_str(matched_minor.as_text().unwrap()).map_err(|_| { - let error = InvalidInputError::new(matched_minor) - .with_label("parsing an IVM minor version") - .with_description("value did not fit in an unsigned byte"); - ErrMode::Cut(IonParseError::Invalid(error)) + matched_major + .invalid("value did not fit in an unsigned byte") + .context("reading an IVM minor version") + .cut_err() })?; let marker = LazyRawTextVersionMarker::::new(matched_marker, major_version, minor_version); @@ -356,10 +347,10 @@ impl<'top> TextBuffer<'top> { fn full_match_annotations<'t>(input: &mut TextBuffer<'t>) -> IonMatchResult<'t> { let matched = one_or_more(TextBuffer::match_annotation).parse_next(input)?; if matched.len() > u16::MAX as usize { - let error = InvalidInputError::new(matched) - .with_description("the maximum supported annotations sequence length is 65KB") - .with_label("parsing annotations"); - Err(ErrMode::Cut(IonParseError::Invalid(error))) + matched + .invalid("the maximum supported annotations sequence length is 65KB") + .context("reading an annotations sequence") + .cut() } else { Ok(matched) } @@ -520,6 +511,7 @@ impl<'top> TextBuffer<'top> { .map(LazyRawTextValue_1_1::from) .map(RawStreamItem::Value), )) + .context("reading a v1.1 top-level expression") .parse_next(self) } @@ -549,10 +541,11 @@ impl<'top> TextBuffer<'top> { Self::match_clob_value, E::struct_matcher(), )), - Invalid(byte) => |input: &mut TextBuffer<'top>| { - let error = InvalidInputError::new(*input) - .with_label(format!("a value cannot begin with '{}'", char::from(byte))); - Err(ErrMode::Backtrack(IonParseError::Invalid(error))) + Invalid(_byte) => |input: &mut TextBuffer<'top>| { + input + .unrecognized() + .context("reading a value") + .backtrack() }, } .with_taken() @@ -630,20 +623,20 @@ impl<'top> TextBuffer<'top> { let system_id = match id { MacroIdRef::LocalName(name) => { let Some(macro_address) = ION_1_1_SYSTEM_MACROS.address_for_name(name) else { - return fatal_parse_error( - *self, - format!("Found unrecognized system macro name: '{}'", name), - ); + return self + .invalid(format!("found unrecognized system macro name: '{}'", name)) + .context("reading an e-expression's macro ID as a local name") + .cut(); }; // This address came from the system table, so we don't need to validate it. MacroIdRef::SystemAddress(SystemMacroAddress::new_unchecked(macro_address)) } MacroIdRef::LocalAddress(address) => { let Some(system_address) = SystemMacroAddress::new(address) else { - return fatal_parse_error( - *self, - format!("Found out-of-bounds system macro address {}", address), - ); + return self + .invalid(format!("found out-of-bounds system macro address {}", address)) + .context("reading an e-expression's macro ID as a system address") + .cut(); }; MacroIdRef::SystemAddress(system_address) } @@ -679,10 +672,10 @@ impl<'top> TextBuffer<'top> { .macro_table() .macro_with_id(id) .ok_or_else(|| { - ErrMode::Cut(IonParseError::Invalid( - InvalidInputError::new(*input) - .with_description(format!("could not find macro with id {:?}", id)), - )) + (*input) + .invalid(format!("could not find macro with id {:?}", id)) + .context("reading an e-expression") + .cut_err() })? .reference(); let signature_params: &'top [Parameter] = macro_ref.signature().parameters(); @@ -693,13 +686,13 @@ impl<'top> TextBuffer<'top> { None => { for param in &signature_params[index..] { if !param.can_be_omitted() { - return fatal_parse_error( - *input, - format!( + return input + .invalid(format!( "e-expression did not include an argument for param '{}'", param.name() - ), - ); + )) + .context("reading an e-expression") + .cut() } } break; @@ -710,13 +703,13 @@ impl<'top> TextBuffer<'top> { Ok(_closing_delimiter) => Ok((id, macro_ref, arg_expr_cache)), Err(ErrMode::Incomplete(_)) => input.incomplete("an e-expression"), Err(_e) => { - fatal_parse_error( - *input, - format!( + (*input) + .invalid(format!( "macro {id} signature has {} parameter(s), e-expression had an extra argument", signature_params.len() - ), - ) + )) + .context("reading an e-expression's arguments") + .cut() } } }; @@ -784,23 +777,21 @@ impl<'top> TextBuffer<'top> { // This check exists to offer a more human-friendly error message; without it, // the user simply sees a parsing failure. if self.bytes().starts_with(b"(::") { - return fatal_parse_error( - *self, - format!("parameter '{}' has cardinality `ExactlyOne`; it cannot accept an expression group", parameter.name()), - ); + return self + .invalid(format!("parameter '{}' has cardinality `ExactlyOne`; it cannot accept an expression group", parameter.name())) + .context("reading an e-expression argument with `exactly-one` cardinality") + .cut() } let maybe_expr = Self::match_sexp_item_1_1 .map(|expr| expr.map(EExpArgExpr::::from)) .parse_next(self)?; match maybe_expr { Some(expr) => Ok(EExpArg::new(parameter, expr)), - None => fatal_parse_error( - *self, - format!( - "expected argument for required parameter '{}'", - parameter.name() - ), - ), + None => self.invalid(format!( + "expected argument for required parameter '{}'", + parameter.name() + )).context("reading an e-expression argument with `exactly-one` cardinality") + .cut() } } @@ -858,12 +849,10 @@ impl<'top> TextBuffer<'top> { parameter: &'top Parameter, ) -> IonParseResult<'top, Option>> { if self.match_empty_arg_group(parameter).is_ok() { - return Err(ErrMode::Cut(IonParseError::Invalid( - InvalidInputError::new(*self).with_description(format!( - "parameter '{}' is one-or-more (`+`) and cannot accept an empty stream", - parameter.name() - )), - ))); + return self + .unrecognized() + .context("reading an e-expression argument with `one-or-more` cardinality") + .backtrack(); } self.match_zero_or_more(parameter) @@ -874,10 +863,9 @@ impl<'top> TextBuffer<'top> { parameter: &'top Parameter, ) -> IonParseResult<'top, TextEExpArgGroup<'top>> { if parameter.rest_syntax_policy() == RestSyntaxPolicy::NotAllowed { - return Err(ErrMode::Backtrack(IonParseError::Invalid( - InvalidInputError::new(*self) - .with_description("parameter does not support rest syntax"), - ))); + return self.unrecognized() + .context("reading a parameter that does not support rest syntax") + .backtrack(); } let mut cache = BumpVec::new_in(self.context().allocator()); let parser = |input: &mut TextBuffer<'top>| { @@ -1561,7 +1549,7 @@ impl<'top> TextBuffer<'top> { } } } - self.incomplete("a text value without closing delimiter") + self.incomplete("reading a text value without closing delimiter") } #[cold] @@ -1572,14 +1560,18 @@ impl<'top> TextBuffer<'top> { allow_unescaped_newlines: bool, ) -> IonParseResult<'top, ()> { if byte == b'\n' && !allow_unescaped_newlines { - let error = InvalidInputError::new(self.slice_to_end(index)) - .with_description("unescaped newlines are not allowed in short string literals"); - return Err(ErrMode::Cut(IonParseError::Invalid(error))); + return self + .slice_to_end(index) + .invalid("unescaped newlines are not allowed in short string literals") + .context("reading a string") + .cut(); } if !WHITESPACE_BYTES.contains(&byte) { - let error = InvalidInputError::new(self.slice_to_end(index)) - .with_description("unescaped control characters are not allowed in text literals"); - return Err(ErrMode::Cut(IonParseError::Invalid(error))); + return self + .slice_to_end(index) + .invalid(format!("unescaped control characters are not allowed in text literals; byte {byte:02X}")) + .context("reading a string") + .cut(); } Ok(()) } @@ -1648,9 +1640,7 @@ impl<'top> TextBuffer<'top> { match self.bytes().first() { Some(byte) if byte.is_ascii_digit() => full_match_timestamp(self), - Some(_) => Err(ErrMode::Backtrack(IonParseError::Invalid( - InvalidInputError::new(*self), - ))), + Some(_) => self.unrecognized().backtrack(), None => self.incomplete("a timestamp"), } } @@ -1906,8 +1896,9 @@ impl<'top> TextBuffer<'top> { for byte in self.bytes().iter().copied() { if !Self::byte_is_legal_clob_ascii(byte) { let message = format!("found an illegal byte '{:0x}' in clob", byte); - let error = InvalidInputError::new(*self).with_description(message); - return Err(ErrMode::Cut(IonParseError::Invalid(error))); + return self.invalid(message) + .context("reading a clob") + .cut(); } } // Return success without consuming @@ -2101,6 +2092,16 @@ where repeat::<_, _, (), _, _>(n, parser).take() } +pub fn incomplete_is_ok<'data, P>( + parser: P, +) -> impl Parser, TextBuffer<'data>, IonParseError<'data>> +where + P: Parser, TextBuffer<'data>, IonParseError<'data>>, +{ + // If we run out of input while applying the parser, consider the rest of the input to match. + alt((parser.complete_err(), rest)) +} + #[cfg(test)] mod tests { use super::*; diff --git a/src/lazy/text/matched.rs b/src/lazy/text/matched.rs index 045a7d7b..53635e80 100644 --- a/src/lazy/text/matched.rs +++ b/src/lazy/text/matched.rs @@ -30,7 +30,6 @@ use crate::lazy::span::Span; use crate::lazy::str_ref::StrRef; use crate::lazy::text::as_utf8::AsUtf8; use crate::lazy::text::buffer::TextBuffer; -use crate::lazy::text::parse_result::InvalidInputError; use crate::result::{DecodingError, IonFailure}; use crate::{ Decimal, Int, IonError, IonResult, IonType, RawSymbolRef, Timestamp, TimestampPrecision, @@ -245,11 +244,9 @@ impl MatchedFloat { let text = sanitized.as_utf8(matched_input.offset())?; let float = f64::from_str(text).map_err(|e| { - let error: IonError = InvalidInputError::new(matched_input) - .with_description(format!("encountered an unexpected error ({:?})", e)) - .with_label("parsing a float") - .into(); - error + matched_input + .invalid(format!("encountered an unexpected error ({:?})", e)) + .context("reading a float") })?; Ok(float) } diff --git a/src/lazy/text/parse_result.rs b/src/lazy/text/parse_result.rs index b2a155c8..767ae04b 100644 --- a/src/lazy/text/parse_result.rs +++ b/src/lazy/text/parse_result.rs @@ -3,11 +3,11 @@ use crate::lazy::text::buffer::TextBuffer; use crate::position::Position; -use crate::result::{DecodingError, IonFailure}; +use crate::result::{DecodingError, IncompleteError}; use crate::{IonError, IonResult}; use std::borrow::Cow; -use std::fmt::{Debug, Display}; -use winnow::error::{ErrMode, ErrorKind, ParseError, ParserError}; +use std::fmt::Debug; +use winnow::error::{AddContext, ErrMode, ErrorKind, Needed, ParserError}; use winnow::stream::Stream; use winnow::PResult; @@ -35,132 +35,208 @@ pub(crate) type IonParseResult<'a, O> = PResult>; /// particular item. pub(crate) type IonMatchResult<'a> = IonParseResult<'a, TextBuffer<'a>>; -#[derive(Debug, PartialEq)] -pub enum IonParseError<'data> { - // When nom reports that the data was incomplete, it doesn't provide additional context. - Incomplete, - // When we encounter illegal text Ion, we'll have more information to share with the user. - Invalid(InvalidInputError<'data>), +/// Returned to explain why the current parser could not successfully read the provided input. +/// The included `IonParseErrorKind` gives more granular detail, and indicates whether the reader +/// should continue trying other parsers if any alternatives remain. +#[derive(Debug, Clone)] +pub struct IonParseError<'data> { + /// The input that could not be parsed successfully. + input: TextBuffer<'data>, + /// A human-friendly name for what the parser was working on when the error occurred + context: &'static str, + /// An explanation of what went wrong. + kind: IonParseErrorKind, } -/// Describes a problem that occurred while trying to parse a given input `TextBuffer`. -/// -/// When returned as part of an `IonParseResult`, an `IonParseError` is always wrapped in -/// an [`ErrMode`] (see `IonParseResult`'s documentation for details). If the `ErrMode` is -/// a non-fatal `Error`, the `IonParseError`'s `description` will be `None`. If the `winnow::ErrMode` is -/// a fatal `Failure`, the `description` will be `Some(String)`. In this way, using an -/// `IonParseError` only incurs heap allocation costs when parsing is coming to an end. -#[derive(Debug, PartialEq)] -pub struct InvalidInputError<'data> { - // The input that being parsed when the error arose - input: TextBuffer<'data>, - // A human-friendly name for what the parser was working on when the error occurred - label: Option>, - // The nature of the error--what went wrong? - description: Option>, - // The nom ErrorKind, which indicates which nom-provided parser encountered the error we're - // bubbling up. - nom_error_kind: Option, +impl<'data> IonParseError<'data> { + /// Allows the context (a label for the task that was in progress) to be set on a manually + /// constructed error. Mirrors winnow's [`Parser::context`](winnow::Parser::context) method. + pub(crate) fn context(mut self, context: &'static str) -> Self { + self.context = context; + self + } + + /// Converts this `IonParseError` into an `ErrMode::Cut`. + pub(crate) fn cut_err(self) -> ErrMode { + ErrMode::Cut(self) + } + + /// Converts this `IonParseError` into an `IonParseResult` containing an `Err(ErrMode::Cut)`. + pub(crate) fn cut(self) -> IonParseResult<'data, T> { + Err(ErrMode::Cut(self)) + } + + pub(crate) fn backtrack_err(self) -> ErrMode { + ErrMode::Backtrack(self) + } + + pub(crate) fn backtrack(self) -> IonParseResult<'data, T> { + Err(ErrMode::Backtrack(self)) + } } -impl<'data> InvalidInputError<'data> { - /// Constructs a new `IonParseError` from the provided `input` text. - pub(crate) fn new(input: TextBuffer<'data>) -> Self { - InvalidInputError { - input, - label: None, - description: None, - nom_error_kind: None, +/// This should be overwritten by all parsers. If a parser does not overwrite it, this +/// will produce a coherent (albeit vague) error message. +const DEFAULT_CONTEXT: &str = "reading Ion data"; + +/// Adds error constructor methods to the `TextBuffer` type. +impl<'data> TextBuffer<'data> { + #[inline(never)] + pub(crate) fn incomplete_err_mode( + &self, + context: &'static str, + ) -> ErrMode> { + if self.is_final_data() { + self.invalid("stream is incomplete; ran out of data") + .context(context) + .cut_err() + } else { + ErrMode::Incomplete(Needed::Unknown) } } - /// Constructs a new `IonParseError` from the provided `input` text and `description`. - pub(crate) fn with_label>>(mut self, label: D) -> Self { - self.label = Some(label.into()); - self + pub fn incomplete(&self, context: &'static str) -> IonParseResult<'data, T> { + Err(self.incomplete_err_mode(context)) } - /// Constructs a new `IonParseError` from the provided `input` text and `description`. - pub(crate) fn with_description>>(mut self, description: D) -> Self { - self.description = Some(description.into()); - self + pub fn unrecognized(&self) -> IonParseError<'data> { + IonParseError { + input: *self, + context: DEFAULT_CONTEXT, + kind: IonParseErrorKind::Unrecognized(UnrecognizedInputError::new()), + } } - /// Constructs a new `IonParseError` from the provided `input` text and `description`. - pub(crate) fn with_nom_error_kind(mut self, nom_error_kind: ErrorKind) -> Self { - self.nom_error_kind = Some(nom_error_kind); - self + pub fn invalid(&self, description: impl Into>) -> IonParseError<'data> { + IonParseError { + input: *self, + context: DEFAULT_CONTEXT, + kind: IonParseErrorKind::Invalid(InvalidInputError::new(description)), + } } +} + +#[derive(Debug, Clone, PartialEq)] +pub enum IonParseErrorKind { + /// The parser ran out of input in the middle of an expression. + Incomplete, + /// The parser did not recognize the head of the provided input. It may be recognized by another + /// parser if there are more to try. + Unrecognized(UnrecognizedInputError), + /// The parser detected that it was the correct parser to handle the provided input, + /// but the input was malformed in some way. For example, if the list parser were reading this + /// stream: + /// ```ion + /// ["foo" "bar"] + /// ``` + /// it would know from the leading `[` that this is a list and the list parser is therefore the + /// correct parser to handle it. However, there was a comma missing after `"foo"`. The list + /// parser can say definitively that the input not valid Ion data. + Invalid(InvalidInputError), +} + +#[derive(Clone, Debug, PartialEq)] +pub struct UnrecognizedInputError { + /// The winnow-provided parser kind that rejected the input. + winnow_error_kind: Option, +} - /// Returns a reference to the `description` text, if any. - pub fn description(&self) -> Option<&str> { - self.description.as_deref() +impl UnrecognizedInputError { + fn new() -> Self { + Self { + winnow_error_kind: None, + } } - pub fn label(&self) -> Option<&str> { - self.label.as_deref() + fn with_winnow_error_kind(winnow_error_kind: ErrorKind) -> Self { + Self { + winnow_error_kind: Some(winnow_error_kind), + } } +} - // TODO: Decide how to expose 'input'. +/// Describes a problem that occurred while trying to parse a given input `TextBuffer`. +/// +/// When returned as part of an `IonParseResult`, an `IonParseError` is always wrapped in +/// an [`ErrMode`] (see `IonParseResult`'s documentation for details). If the `ErrMode` is +/// a non-fatal `Error`, the `IonParseError`'s `description` will be `None`. If the `winnow::ErrMode` is +/// a fatal `Failure`, the `description` will be `Some(String)`. In this way, using an +/// `IonParseError` only incurs heap allocation costs when parsing is coming to an end. +#[derive(Debug, PartialEq, Clone)] +pub struct InvalidInputError { + // A human-friendly description of the error. + description: Cow<'static, str>, } -impl<'data> From> for IonParseError<'data> { - fn from(value: InvalidInputError<'data>) -> Self { - IonParseError::Invalid(value) +impl InvalidInputError { + /// Constructs a new `IonParseError` from the provided `input` text. + pub(crate) fn new(description: impl Into>) -> Self { + InvalidInputError { + description: description.into(), + } + } +} + +impl<'data> AddContext> for IonParseError<'data> { + fn add_context( + mut self, + _input: &TextBuffer<'data>, + _token_start: & as Stream>::Checkpoint, + context: &'static str, + ) -> Self { + self.context = context; + self } } // We cannot provide an analogous impl for `Incomplete` because it is missing necessary data. -impl From> for IonError { - fn from(invalid_input_error: InvalidInputError<'_>) -> Self { - let mut message = String::from( - invalid_input_error - .description() - .unwrap_or("invalid Ion syntax encountered"), - ); - if let Some(label) = invalid_input_error.label { - message.push_str("\n while "); - message.push_str(label.as_ref()); - } +impl From> for IonError { + fn from(ion_parse_error: IonParseError<'_>) -> Self { + use IonParseErrorKind::*; + let mut message: String = match ion_parse_error.kind { + Incomplete => { + return IonError::Incomplete(IncompleteError::new( + ion_parse_error.context, + ion_parse_error.input.offset(), + )); + } + Invalid(e) => e.description.into(), + Unrecognized(..) => "found unrecognized syntax".into(), + }; + + message.push_str("\n while "); + message.push_str(ion_parse_error.context.as_ref()); use std::fmt::Write; - let input = invalid_input_error.input; + let input = ion_parse_error.input; - // Make displayable strings showing the contents of the first and last 32 characters - // of the buffer. If the buffer is smaller than 32 bytes, fewer characters will be shown. + // Make displayable strings showing the contents of the first 32 characters + // of the buffer. If the buffer is smaller than 32 bytes, fewer characters will be shown + // and a leading or trailing '...' will be displayed as appropriate. const NUM_CHARS_TO_SHOW: usize = 32; - let (buffer_head, buffer_tail) = match input.as_text() { + let buffer_head = match input.as_text() { // The buffer contains UTF-8 bytes, so we'll display it as text Ok(text) => { let mut head_chars = text.chars(); let mut head = (&mut head_chars) .take(NUM_CHARS_TO_SHOW) - .collect::(); + .collect::() + // XXX: Newlines wreck the formatting, so escape them in output. + // We may wish to do this for other whitespace too. + .replace("\n", "\\n"); if head_chars.next().is_some() { head.push_str("..."); } - let mut tail_chars = text.chars().rev(); - let tail_backwards = (&mut tail_chars) - .take(NUM_CHARS_TO_SHOW) - .collect::>(); - let mut tail = String::new(); - if tail_chars.next().is_some() { - tail.push_str("..."); - } - tail.push_str(tail_backwards.iter().rev().collect::().as_str()); - (head, tail) + head } // The buffer contains non-text bytes, so we'll show its contents as formatted hex // pairs instead. Err(_) => { let head = format!( "{:X?}", - &invalid_input_error.input.bytes()[..NUM_CHARS_TO_SHOW.min(input.len())] + &input.bytes()[..NUM_CHARS_TO_SHOW.min(input.len())] ); - let tail_bytes_to_take = input.bytes().len().min(NUM_CHARS_TO_SHOW); - let buffer_tail = &input.bytes()[input.len() - tail_bytes_to_take..]; - let tail = format!("{:X?}", buffer_tail); - (head, tail) + head } }; // The offset and buffer head will often be helpful to the programmer. The buffer tail @@ -170,149 +246,88 @@ impl From> for IonError { message, r#" offset={} - buffer head=<{}> - buffer tail=<{}> + input={} buffer len={} "#, - invalid_input_error.input.offset(), + input.offset(), buffer_head, - buffer_tail, input.len(), ) .unwrap(); - let position = Position::with_offset(invalid_input_error.input.offset()) - .with_length(invalid_input_error.input.len()); + let position = Position::with_offset(input.offset()).with_length(input.len()); let decoding_error = DecodingError::new(message).with_position(position); IonError::Decoding(decoding_error) } } -impl<'data> From>> for IonParseError<'data> { - fn from(value: ErrMode>) -> Self { - use winnow::error::ErrMode::*; - match value { - Incomplete(_) => IonParseError::Incomplete, - Backtrack(e) => e, - Cut(e) => e, - } - } -} - -/// Allows an `IonParseError` to be constructed from a `(&str, ErrorKind)` tuple, which is the -/// data provided by core `nom` parsers if they do not match the input. -impl<'data> From<(TextBuffer<'data>, ErrorKind)> for IonParseError<'data> { - fn from((input, error_kind): (TextBuffer<'data>, ErrorKind)) -> Self { - InvalidInputError::new(input) - .with_nom_error_kind(error_kind) - .into() - } -} - -/// Allows an [`ErrMode`] to be converted into an [IonParseError] by calling `.into()`. -impl<'data> From, IonParseError<'data>>> for IonParseError<'data> { - fn from(parse_error: ParseError, IonParseError<'data>>) -> Self { - parse_error.into_inner() - } -} - -/// Allows `IonParseError` to be used as the error type in various `nom` functions. +/// Allows `IonParseError` to be used as the error type in various `winnow` parsers. impl<'data> ParserError> for IonParseError<'data> { fn from_error_kind(input: &TextBuffer<'data>, error_kind: ErrorKind) -> Self { - InvalidInputError::new(*input) - .with_nom_error_kind(error_kind) - .into() + IonParseError { + input: *input, + context: DEFAULT_CONTEXT, + kind: IonParseErrorKind::Unrecognized(UnrecognizedInputError::with_winnow_error_kind( + error_kind, + )), + } } fn append( self, - input: &TextBuffer<'data>, + _input: &TextBuffer<'data>, _checkpoint: & as Stream>::Checkpoint, _kind: ErrorKind, ) -> Self { // When an error stack is being built, this method is called to give the error // type an opportunity to aggregate the errors into a collection or a more descriptive - // message. For now, we simply allow the most recent error to take precedence. - IonParseError::Invalid(InvalidInputError::new(*input)) - } -} - -/// `Result, _>` has a method called `transpose` that converts it into an `Option>`, -/// allowing it to be easily used in places like iterators that expect that return type. -/// This trait defines a similar extension method for `Result<(TextBuffer, Option)>`. -pub(crate) trait ToIteratorOutput<'data, T> { - fn transpose(self) -> Option>; -} - -impl<'data, T> ToIteratorOutput<'data, T> for IonResult<(TextBuffer<'data>, Option)> { - fn transpose(self) -> Option> { - match self { - Ok((_remaining, Some(value))) => Some(Ok(value)), - Ok((_remaining, None)) => None, - Err(e) => Some(Err(e)), - } + // message. We don't currently use this feature as it can be expensive in the common case. + // We may find a use for adding it behind a debugging-oriented feature gate. + self } } /// Converts the output of a text Ion parser (any of `IonParseResult`, `IonParseError`, -/// or `winnow::Err`) into a general-purpose `IonResult`. If the implementing type -/// does not have its own `label` and `input`, the specified values will be used. -pub(crate) trait AddContext<'data, T> { - fn with_context<'a>( - self, - label: impl Into>, - input: TextBuffer<'data>, - ) -> IonResult - where - 'data: 'a; +/// or `winnow::ErrMode`) into a general-purpose `IonResult`. +// This is necessary because `winnow`'s `ErrMode::Incomplete` variant doesn't store an IonParseError +// like its other variants, so the input and context aren't available. This method adds them back +// in as a kludge. Winnow v0.7.0 should address this; see: +// +pub(crate) trait WithContext<'data, T> { + fn with_context(self, label: &'static str, input: TextBuffer<'data>) -> IonResult; } -impl<'data, T> AddContext<'data, T> for ErrMode> { - fn with_context<'a>( - self, - label: impl Into>, - input: TextBuffer<'data>, - ) -> IonResult - where - 'data: 'a, - { - let ipe = IonParseError::from(self); - ipe.with_context(label, input) +impl<'data, T> WithContext<'data, T> for ErrMode> { + fn with_context(self, label: &'static str, input: TextBuffer<'data>) -> IonResult { + use ErrMode::*; + let ion_parse_error = match self { + // We only apply the label and input in the `Incomplete` case. Other cases already have + // these fields populated, potentially by more precise information. + Incomplete(_) => IonParseError { + input, + context: label, + kind: IonParseErrorKind::Incomplete, + }, + Backtrack(e) | Cut(e) => e, + }; + + let ion_error = IonError::from(ion_parse_error); + Err(ion_error) } } // Turns an IonParseError into an IonResult -impl<'data, T> AddContext<'data, T> for IonParseError<'data> { - fn with_context<'a>( - self, - label: impl Into>, - input: TextBuffer<'data>, - ) -> IonResult - where - 'data: 'a, - { - match self { - IonParseError::Incomplete => IonResult::incomplete( - format!( - "{}; buffer utf-8: {}", - label.into(), - input.as_text().unwrap_or("") - ), - input.offset(), - ), - IonParseError::Invalid(invalid_input_error) => Err(IonError::from(invalid_input_error)), +impl<'data, T> WithContext<'data, T> for IonParseError<'data> { + fn with_context(mut self, label: &'static str, input: TextBuffer<'data>) -> IonResult { + if self.kind == IonParseErrorKind::Incomplete { + self.input = input; + self.context = label; } + Err(IonError::from(self)) } } -impl<'data, T> AddContext<'data, T> for IonParseResult<'data, T> { - fn with_context<'a>( - self, - label: impl Into>, - input: TextBuffer<'data>, - ) -> IonResult - where - 'data: 'a, - { +impl<'data, T> WithContext<'data, T> for IonParseResult<'data, T> { + fn with_context(self, label: &'static str, input: TextBuffer<'data>) -> IonResult { match self { // No change needed in the ok case Ok(matched) => Ok(matched), @@ -320,43 +335,3 @@ impl<'data, T> AddContext<'data, T> for IonParseResult<'data, T> { } } } - -/// Constructs a `winnow::error::ErrMode::Cut` that contains an `IonParseError` describing the problem -/// that was encountered. -pub(crate) fn fatal_parse_error>, O>( - input: TextBuffer<'_>, - description: D, -) -> IonParseResult<'_, O> { - Err(ErrMode::Cut( - InvalidInputError::new(input) - .with_description(description) - .into(), - )) -} - -/// An extension trait that allows a [Result] of any kind to be mapped to an -/// `IonParseResult` concisely. -pub(crate) trait OrFatalParseError { - fn or_fatal_parse_error( - self, - input: TextBuffer<'_>, - label: L, - ) -> IonParseResult<'_, T>; -} - -/// See the documentation for [OrFatalParseError]. -impl OrFatalParseError for Result -where - E: Debug, -{ - fn or_fatal_parse_error( - self, - input: TextBuffer<'_>, - label: L, - ) -> IonParseResult<'_, T> { - match self { - Ok(value) => Ok(value), - Err(error) => fatal_parse_error(input, format!("{label}: {error:?}")), - } - } -} diff --git a/src/lazy/text/raw/reader.rs b/src/lazy/text/raw/reader.rs index 2cbb59e5..f3359b9e 100644 --- a/src/lazy/text/raw/reader.rs +++ b/src/lazy/text/raw/reader.rs @@ -6,9 +6,10 @@ use crate::lazy::encoding::TextEncoding_1_0; use crate::lazy::expanded::EncodingContextRef; use crate::lazy::raw_stream_item::{EndPosition, LazyRawStreamItem, RawStreamItem}; use crate::lazy::streaming_raw_reader::RawReaderState; -use crate::lazy::text::buffer::TextBuffer; -use crate::lazy::text::parse_result::AddContext; +use crate::lazy::text::buffer::{incomplete_is_ok, TextBuffer}; +use crate::lazy::text::parse_result::WithContext; use crate::{Encoding, IonResult}; +use winnow::Parser; /// A text Ion 1.0 reader that yields [`LazyRawStreamItem`]s representing the top level values found /// in the provided input stream. @@ -58,11 +59,9 @@ impl<'data> LazyRawTextReader_1_0<'data> { .match_top_level_item_1_0() .with_context("reading a top-level value", self.input)?; - let _trailing_ws = self - .input - .match_optional_comments_and_whitespace() + let _trailing_ws = incomplete_is_ok(TextBuffer::match_optional_comments_and_whitespace) + .parse_next(&mut self.input) .with_context("reading trailing top-level whitespace/comments", self.input)?; - Ok(matched_item) } diff --git a/src/lazy/text/raw/sequence.rs b/src/lazy/text/raw/sequence.rs index 3b6323a7..1b42233f 100644 --- a/src/lazy/text/raw/sequence.rs +++ b/src/lazy/text/raw/sequence.rs @@ -14,7 +14,7 @@ use crate::lazy::decoder::{ use crate::lazy::encoding::TextEncoding; use crate::lazy::text::buffer::{whitespace_and_then, TextBuffer}; use crate::lazy::text::matched::MatchedValue; -use crate::lazy::text::parse_result::AddContext; +use crate::lazy::text::parse_result::WithContext; use crate::lazy::text::raw::v1_1::reader::RawTextSequenceCacheIterator; use crate::lazy::text::value::{LazyRawTextValue, RawTextAnnotationsIterator}; use crate::{IonResult, IonType}; @@ -116,7 +116,8 @@ impl<'data, E: TextEncoding<'data>> Iterator for RawTextListIterator<'data, E> { peek("]").value(None), terminated( E::value_expr_matcher(), - whitespace_and_then(alt((",", peek("]")))), + whitespace_and_then(alt((",", peek("]")))) + .context("reading a list value delimiter (`,`)"), ) .map(Some), ))) @@ -130,7 +131,7 @@ impl<'data, E: TextEncoding<'data>> Iterator for RawTextListIterator<'data, E> { } Err(e) => { self.has_returned_error = true; - e.with_context("reading the next list value", self.input) + e.with_context("reading a list value", self.input) .transpose() } } @@ -205,7 +206,7 @@ impl<'data, E: TextEncoding<'data>> Iterator for RawTextSExpIterator<'data, E> { Ok(None) => None, Err(e) => { self.has_returned_error = true; - e.with_context("reading the next s-expression value", self.input) + e.with_context("reading an s-expression value", self.input) .transpose() } } diff --git a/src/lazy/text/raw/struct.rs b/src/lazy/text/raw/struct.rs index d6120e76..6010faac 100644 --- a/src/lazy/text/raw/struct.rs +++ b/src/lazy/text/raw/struct.rs @@ -5,7 +5,7 @@ use crate::lazy::encoding::{TextEncoding, TextEncoding_1_0, TextEncoding_1_1}; use crate::lazy::span::Span; use crate::lazy::text::buffer::{whitespace_and_then, TextBuffer}; use crate::lazy::text::matched::MatchedFieldName; -use crate::lazy::text::parse_result::AddContext; +use crate::lazy::text::parse_result::WithContext; use crate::{IonResult, RawSymbolRef}; use std::marker::PhantomData; use std::ops::Range; @@ -54,7 +54,7 @@ impl<'top, E: TextEncoding<'top>> Iterator for RawTextStructIterator<'top, E> { Ok(None) => None, Err(e) => { self.has_returned_error = true; - e.with_context("reading the next struct field", self.input) + e.with_context("reading a struct field", self.input) .transpose() } } diff --git a/src/lazy/text/raw/v1_1/reader.rs b/src/lazy/text/raw/v1_1/reader.rs index 6c070981..4b5b11d2 100644 --- a/src/lazy/text/raw/v1_1/reader.rs +++ b/src/lazy/text/raw/v1_1/reader.rs @@ -17,13 +17,14 @@ use crate::lazy::expanded::EncodingContextRef; use crate::lazy::raw_stream_item::{EndPosition, LazyRawStreamItem, RawStreamItem}; use crate::lazy::span::Span; use crate::lazy::streaming_raw_reader::RawReaderState; -use crate::lazy::text::buffer::TextBuffer; +use crate::lazy::text::buffer::{incomplete_is_ok, TextBuffer}; use crate::lazy::text::matched::MatchedValue; -use crate::lazy::text::parse_result::AddContext; +use crate::lazy::text::parse_result::WithContext; use crate::lazy::text::raw::v1_1::arg_group::{EExpArg, TextEExpArgGroup}; use crate::lazy::text::value::{LazyRawTextValue, RawTextAnnotationsIterator}; use crate::{v1_1, Encoding, IonResult}; +use winnow::Parser; pub struct LazyRawTextReader_1_1<'data> { input: TextBuffer<'data>, } @@ -84,9 +85,8 @@ impl<'data> LazyRawReader<'data, TextEncoding_1_1> for LazyRawTextReader_1_1<'da .match_top_level_item_1_1() .with_context("reading a v1.1 top-level value", self.input)?; - let _trailing_ws = self - .input - .match_optional_comments_and_whitespace() + let _trailing_ws = incomplete_is_ok(TextBuffer::match_optional_comments_and_whitespace) + .parse_next(&mut self.input) .with_context( "reading trailing top-level whitespace/comments in v1.1", self.input,