Skip to content

Commit

Permalink
Implements flexsym annotation reading, adds binary 1.1 roundtrip unit…
Browse files Browse the repository at this point in the history
… tests (#788)

* Adds binary 1.1 roundtrip unit tests, `has_annotations` no longer invokes `annotations`
* Implements reading binary FlexSym annotations
* Fixes timestamp decoding of offsets
* 1.1 binary writer now uses 1 for UTC and 0 for unknown
  • Loading branch information
zslayton authored Jun 10, 2024
1 parent 8d1178e commit 81e5fd9
Show file tree
Hide file tree
Showing 12 changed files with 437 additions and 131 deletions.
12 changes: 9 additions & 3 deletions src/lazy/binary/encoded_value.rs
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
use crate::lazy::binary::raw::type_descriptor::Header;
use crate::lazy::binary::raw::v1_1::immutable_buffer::AnnotationsEncoding;
use crate::IonType;
use std::ops::Range;

pub(crate) trait EncodedHeader: Copy {
type TypeCode;
fn ion_type(&self) -> IonType;
fn type_code(&self) -> Self::TypeCode;
fn length_code(&self) -> u8;
fn low_nibble(&self) -> u8;

fn is_null(&self) -> bool;
}
Expand All @@ -22,7 +23,7 @@ impl EncodedHeader for Header {
self.ion_type_code
}

fn length_code(&self) -> u8 {
fn low_nibble(&self) -> u8 {
self.length_code
}

Expand Down Expand Up @@ -77,7 +78,10 @@ pub(crate) struct EncodedValue<HeaderType: EncodedHeader> {
// sequence itself.
pub annotations_header_length: u8,
// The number of bytes used to encode the series of symbol IDs inside the annotations wrapper.
pub annotations_sequence_length: u8,
pub annotations_sequence_length: u16,
// Whether the annotations sequence is encoded as `FlexSym`s or as symbol addresses.
// In Ion 1.0, they are always encoded as symbol addresses.
pub annotations_encoding: AnnotationsEncoding,
// The offset of the type descriptor byte within the overall input stream.
pub header_offset: usize,
// The number of bytes used to encode the optional length VarUInt following the header byte.
Expand Down Expand Up @@ -237,6 +241,7 @@ mod tests {
use crate::binary::IonTypeCode;
use crate::lazy::binary::encoded_value::EncodedValue;
use crate::lazy::binary::raw::type_descriptor::Header;
use crate::lazy::binary::raw::v1_1::immutable_buffer::AnnotationsEncoding;
use crate::{IonResult, IonType};

#[test]
Expand All @@ -250,6 +255,7 @@ mod tests {
},
annotations_header_length: 3,
annotations_sequence_length: 1,
annotations_encoding: AnnotationsEncoding::SymbolAddress,
header_offset: 200,
length_length: 0,
value_body_length: 3,
Expand Down
4 changes: 3 additions & 1 deletion src/lazy/binary/immutable_buffer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ use crate::binary::var_uint::VarUInt;
use crate::lazy::binary::encoded_value::EncodedValue;
use crate::lazy::binary::raw::r#struct::LazyRawBinaryFieldName_1_0;
use crate::lazy::binary::raw::type_descriptor::{Header, TypeDescriptor, ION_1_0_TYPE_DESCRIPTORS};
use crate::lazy::binary::raw::v1_1::immutable_buffer::AnnotationsEncoding;
use crate::lazy::binary::raw::value::{LazyRawBinaryValue_1_0, LazyRawBinaryVersionMarker_1_0};
use crate::lazy::decoder::LazyRawFieldExpr;
use crate::lazy::encoder::binary::v1_1::flex_int::FlexInt;
Expand Down Expand Up @@ -704,6 +705,7 @@ impl<'a> ImmutableBuffer<'a> {
// If applicable, these are populated by the caller: `read_annotated_value()`
annotations_header_length: 0,
annotations_sequence_length: 0,
annotations_encoding: AnnotationsEncoding::SymbolAddress,
header_offset,
length_length,
value_body_length: value_length,
Expand Down Expand Up @@ -745,7 +747,7 @@ impl<'a> ImmutableBuffer<'a> {
}

lazy_value.encoded_value.annotations_header_length = wrapper.header_length;
lazy_value.encoded_value.annotations_sequence_length = wrapper.sequence_length;
lazy_value.encoded_value.annotations_sequence_length = wrapper.sequence_length as u16;
lazy_value.encoded_value.total_length += wrapper.header_length as usize;
// Modify the input to include the annotations
lazy_value.input = input;
Expand Down
42 changes: 37 additions & 5 deletions src/lazy/binary/raw/v1_1/annotations_iterator.rs
Original file line number Diff line number Diff line change
@@ -1,23 +1,55 @@
#![allow(non_camel_case_types)]
use crate::lazy::binary::raw::v1_1::immutable_buffer::ImmutableBuffer;
use crate::{IonResult, RawSymbolRef};
use crate::lazy::binary::raw::v1_1::immutable_buffer::{AnnotationsEncoding, ImmutableBuffer};
use crate::lazy::encoder::binary::v1_1::flex_sym::FlexSymValue;
use crate::{IonResult, RawSymbolRef, SymbolId};

/// Iterates over a slice of bytes, lazily reading them as a sequence of FlexUInt- or
/// FlexSym-encoded symbol IDs.
pub struct RawBinaryAnnotationsIterator_1_1<'a> {
buffer: ImmutableBuffer<'a>,
encoding: AnnotationsEncoding,
}

impl<'a> RawBinaryAnnotationsIterator_1_1<'a> {
pub(crate) fn new(buffer: ImmutableBuffer<'a>) -> RawBinaryAnnotationsIterator_1_1<'a> {
Self { buffer }
pub(crate) fn new(
buffer: ImmutableBuffer<'a>,
encoding: AnnotationsEncoding,
) -> RawBinaryAnnotationsIterator_1_1<'a> {
Self { buffer, encoding }
}
}

impl<'a> Iterator for RawBinaryAnnotationsIterator_1_1<'a> {
type Item = IonResult<RawSymbolRef<'a>>;

fn next(&mut self) -> Option<Self::Item> {
todo!()
if self.buffer.is_empty() {
return None;
}
use AnnotationsEncoding::*;
let (raw_symbol, remaining_input) = match self.encoding {
SymbolAddress => match self.buffer.read_flex_uint() {
Ok((flex_uint, remaining_input)) => (
RawSymbolRef::SymbolId(flex_uint.value() as SymbolId),
remaining_input,
),
Err(error) => return Some(Err(error)),
},
FlexSym => {
let (flex_sym, remaining_input) = match self.buffer.read_flex_sym() {
Ok((flex_sym, remaining_input)) => (flex_sym, remaining_input),
Err(error) => return Some(Err(error)),
};
let raw_symbol = match flex_sym.value() {
FlexSymValue::SymbolRef(raw_symbol) => raw_symbol,
FlexSymValue::Opcode(_) => {
todo!("FlexSym escapes in annotation sequences")
}
};
(raw_symbol, remaining_input)
}
};
self.buffer = remaining_input;
Some(Ok(raw_symbol))
}
}
126 changes: 108 additions & 18 deletions src/lazy/binary/raw/v1_1/immutable_buffer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ use crate::lazy::binary::encoded_value::EncodedValue;
use crate::lazy::binary::raw::v1_1::value::{
LazyRawBinaryValue_1_1, LazyRawBinaryVersionMarker_1_1,
};
use crate::lazy::binary::raw::v1_1::{Header, LengthType, Opcode, ION_1_1_OPCODES};
use crate::lazy::binary::raw::v1_1::{Header, LengthType, Opcode, OpcodeType, ION_1_1_OPCODES};
use crate::lazy::encoder::binary::v1_1::fixed_int::FixedInt;
use crate::lazy::encoder::binary::v1_1::fixed_uint::FixedUInt;
use crate::lazy::encoder::binary::v1_1::flex_int::FlexInt;
Expand Down Expand Up @@ -173,12 +173,6 @@ impl<'a> ImmutableBuffer<'a> {
Ok((flex_sym, remaining))
}

/// Attempts to decode an annotations wrapper at the beginning of the buffer and returning
/// its subfields in an [`AnnotationsWrapper`].
pub fn read_annotations_wrapper(&self, _opcode: Opcode) -> ParseResult<'a, AnnotationsWrapper> {
todo!();
}

/// Reads a `NOP` encoding primitive from the buffer. If it is successful, returns an `Ok(_)`
/// containing the number of bytes that were consumed.
///
Expand All @@ -191,10 +185,10 @@ impl<'a> ImmutableBuffer<'a> {
let opcode = self.peek_opcode()?;

// We need to determine the size of the nop..
let (size, remaining) = if opcode.length_code == 0xC {
let (size, remaining) = if opcode.low_nibble() == 0xC {
// Size 0; the nop is contained entirely within the OpCode.
(0, self.consume(1))
} else if opcode.length_code == 0xD {
} else if opcode.low_nibble() == 0xD {
// We have a flexuint telling us how long our nop is.
let after_header = self.consume(1);
let (len, rest) = after_header.read_flex_uint()?;
Expand Down Expand Up @@ -278,7 +272,7 @@ impl<'a> ImmutableBuffer<'a> {
/// Reads a value from the buffer. The caller must confirm that the buffer is not empty and that
/// the next byte (`type_descriptor`) is not a NOP.
pub fn read_value(self, opcode: Opcode) -> IonResult<LazyRawBinaryValue_1_1<'a>> {
if opcode.is_annotation_wrapper() {
if opcode.is_annotations_sequence() {
self.read_annotated_value(opcode)
} else {
self.read_value_without_annotations(opcode)
Expand Down Expand Up @@ -309,6 +303,7 @@ impl<'a> ImmutableBuffer<'a> {
// If applicable, these are populated by the caller: `read_annotated_value()`
annotations_header_length: 0,
annotations_sequence_length: 0,
annotations_encoding: AnnotationsEncoding::SymbolAddress,
header_offset,
length_length,
value_body_length: value_length,
Expand Down Expand Up @@ -340,19 +335,114 @@ impl<'a> ImmutableBuffer<'a> {

/// Reads an annotations wrapper and its associated value from the buffer. The caller must confirm
/// that the next byte in the buffer (`type_descriptor`) begins an annotations wrapper.
fn read_annotated_value(
fn read_annotated_value(self, opcode: Opcode) -> IonResult<LazyRawBinaryValue_1_1<'a>> {
let (annotations_seq, input_after_annotations) = self.read_annotations_sequence(opcode)?;
let opcode = input_after_annotations.peek_opcode()?;
let mut value = input_after_annotations.read_value_without_annotations(opcode)?;
value.encoded_value.annotations_header_length = annotations_seq.header_length;
value.encoded_value.annotations_sequence_length = annotations_seq.sequence_length;
value.encoded_value.annotations_encoding = annotations_seq.encoding;
value.encoded_value.total_length +=
annotations_seq.header_length as usize + annotations_seq.sequence_length as usize;
// Rewind the input to include the annotations sequence
value.input = self;
Ok(value)
}

fn read_annotations_sequence(self, opcode: Opcode) -> ParseResult<'a, EncodedAnnotations> {
match opcode.opcode_type {
OpcodeType::AnnotationFlexSym => self.read_flex_sym_annotations_sequence(opcode),
OpcodeType::SymbolAddress => self.read_symbol_address_annotations_sequence(opcode),
_ => unreachable!("read_annotations_sequence called for non-annotations opcode"),
}
}

fn read_flex_sym_annotations_sequence(
self,
mut _type_descriptor: Opcode,
) -> IonResult<LazyRawBinaryValue_1_1<'a>> {
todo!();
opcode: Opcode,
) -> ParseResult<'a, EncodedAnnotations> {
let input_after_opcode = self.consume(1);
// TODO: This implementation actively reads the annotations, which isn't necessary.
// At this phase of parsing we can just identify the buffer slice that contains
// the annotations and remember their encoding; later on, the annotations iterator
// can actually do the reading. That optimization would be impactful for FlexSyms
// that represent inline text.
let (sequence, remaining_input) = match opcode.low_nibble() {
7 => {
let (flex_sym, remaining_input) = input_after_opcode.read_flex_sym()?;
let sequence = EncodedAnnotations {
encoding: AnnotationsEncoding::FlexSym,
header_length: 1, // 0xE7
sequence_length: u16::try_from(flex_sym.size_in_bytes()).map_err(|_| {
IonError::decoding_error(
"the maximum supported annotations sequence length is 65KB.",
)
})?,
};
(sequence, remaining_input)
}
8 => {
let (flex_sym1, input_after_sym1) = input_after_opcode.read_flex_sym()?;
let (flex_sym2, input_after_sym2) = input_after_sym1.read_flex_sym()?;
let combined_length = flex_sym1.size_in_bytes() + flex_sym2.size_in_bytes();
let sequence = EncodedAnnotations {
encoding: AnnotationsEncoding::FlexSym,
header_length: 1, // 0xE8
sequence_length: u16::try_from(combined_length).map_err(|_| {
IonError::decoding_error(
"the maximum supported annotations sequence length is 65KB.",
)
})?,
};
(sequence, input_after_sym2)
}
9 => {
let (flex_uint, remaining_input) = input_after_opcode.read_flex_uint()?;
let sequence = EncodedAnnotations {
encoding: AnnotationsEncoding::FlexSym,
header_length: u8::try_from(1 + flex_uint.size_in_bytes()).map_err(|_| {
IonError::decoding_error("found a 256+ byte annotations header")
})?,
sequence_length: u16::try_from(flex_uint.value()).map_err(|_| {
IonError::decoding_error(
"the maximum supported annotations sequence length is 65KB.",
)
})?,
};
(
sequence,
remaining_input.consume(sequence.sequence_length as usize),
)
}
_ => unreachable!("reading flexsym annotations sequence with invalid length code"),
};
Ok((sequence, remaining_input))
}

fn read_symbol_address_annotations_sequence(
self,
_opcode: Opcode,
) -> ParseResult<'a, EncodedAnnotations> {
todo!()
}
}

#[derive(Clone, Copy, Debug, PartialEq)]
pub enum AnnotationsEncoding {
SymbolAddress,
FlexSym,
}

/// Represents the data found in an Ion 1.0 annotations wrapper.
pub struct AnnotationsWrapper {
/// Represents the data found in an Ion 1.1 annotations sequence
#[derive(Clone, Copy, Debug)]
pub struct EncodedAnnotations {
pub encoding: AnnotationsEncoding,
// The number of bytes used to represent the annotations opcode and the byte length prefix
// (in the case of 0xE9). As a result, this will almost always be 1 or 2.
pub header_length: u8,
pub sequence_length: u8,
pub expected_value_length: usize,
// The number of bytes used to represent the annotations sequence itself. Because these
// can be encoded with inline text, it's possible for the length to be non-trivial.
pub sequence_length: u16,
}

#[cfg(test)]
Expand Down
22 changes: 11 additions & 11 deletions src/lazy/binary/raw/v1_1/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -536,17 +536,17 @@ mod tests {
#[case("2024T", &[0x80, 0x36])]
#[case("2023-10T", &[0x81, 0x35, 0x05])]
#[case("2023-10-15T", &[0x82, 0x35, 0x7D])]
#[case("2023-10-15T05:04Z", &[0x83, 0x35, 0x7D, 0x85, 0x08])]
#[case("2023-10-15T05:04:03Z", &[0x84, 0x35, 0x7D, 0x85, 0x38, 0x00])]
#[case("2023-10-15T05:04:03.123-00:00", &[0x85, 0x35, 0x7D, 0x85, 0x30, 0xEC, 0x01])]
#[case("2023-10-15T05:04:03.000123-00:00", &[0x86, 0x35, 0x7D, 0x85, 0x30, 0xEC, 0x01, 0x00])]
#[case("2023-10-15T05:04:03.000000123-00:00", &[0x87, 0x35, 0x7D, 0x85, 0x30, 0xEC, 0x01, 0x00, 0x00])]
#[case("2023-10-15T05:04+01:00", &[0x88, 0x35, 0x7D, 0x85, 0x20, 0x00])]
#[case("2023-10-15T05:04-01:00", &[0x88, 0x35, 0x7D, 0x85, 0xE0, 0x03])]
#[case("2023-10-15T05:04:03+01:00", &[0x89, 0x35, 0x7D, 0x85, 0x20, 0x0C])]
#[case("2023-10-15T05:04:03.123+01:00", &[0x8A, 0x35, 0x7D, 0x85, 0x20, 0x0C, 0x7B, 0x00])]
#[case("2023-10-15T05:04:03.000123+01:00", &[0x8B, 0x35, 0x7D, 0x85, 0x20, 0x0C, 0x7B, 0x00, 0x00])]
#[case("2023-10-15T05:04:03.000000123+01:00", &[0x8C, 0x35, 0x7D, 0x85, 0x20, 0x0C, 0x7B, 0x00, 0x00, 0x00])]
#[case("2023-10-15T05:04Z", &[0x83, 0x35, 0x7D, 0x85, 0x00])]
#[case("2023-10-15T05:04:03Z", &[0x84, 0x35, 0x7D, 0x85, 0x30, 0x00])]
#[case("2023-10-15T05:04:03.123-00:00", &[0x85, 0x35, 0x7D, 0x85, 0x38, 0xEC, 0x01])]
#[case("2023-10-15T05:04:03.000123-00:00", &[0x86, 0x35, 0x7D, 0x85, 0x38, 0xEC, 0x01, 0x00])]
#[case("2023-10-15T05:04:03.000000123-00:00", &[0x87, 0x35, 0x7D, 0x85, 0x38, 0xEC, 0x01, 0x00, 0x00])]
#[case("2023-10-15T05:04+01:00", &[0x88, 0x35, 0x7D, 0x85, 0xE0, 0x01])]
#[case("2023-10-15T05:04-01:00", &[0x88, 0x35, 0x7D, 0x85, 0xA0, 0x01])]
#[case("2023-10-15T05:04:03+01:00", &[0x89, 0x35, 0x7D, 0x85, 0xE0, 0x0D])]
#[case("2023-10-15T05:04:03.123+01:00", &[0x8A, 0x35, 0x7D, 0x85, 0xE0, 0x0D, 0x7B, 0x00])]
#[case("2023-10-15T05:04:03.000123+01:00", &[0x8B, 0x35, 0x7D, 0x85, 0xE0, 0x0D, 0x7B, 0x00, 0x00])]
#[case("2023-10-15T05:04:03.000000123+01:00", &[0x8C, 0x35, 0x7D, 0x85, 0xE0, 0x0D, 0x7B, 0x00, 0x00, 0x00])]
fn timestamps_short(#[case] expected_txt: &str, #[case] ion_data: &[u8]) -> IonResult<()> {
use crate::lazy::decoder::{LazyRawReader, LazyRawValue};
use crate::lazy::text::raw::v1_1::reader::LazyRawTextReader_1_1;
Expand Down
2 changes: 1 addition & 1 deletion src/lazy/binary/raw/v1_1/struct.rs
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ impl<'top> RawBinaryStructIterator_1_1<'top> {
bytes_to_skip: 0,
struct_type: match opcode_type {
// TODO: Delimited struct handling
OpcodeType::Struct => StructType::FlexSym,
OpcodeType::Struct => StructType::SymbolAddress,
_ => unreachable!("Unexpected opcode for structure"),
},
}
Expand Down
Loading

0 comments on commit 81e5fd9

Please sign in to comment.