Skip to content

Commit 623891f

Browse files
committed
feat: read array values
1 parent eedd5a2 commit 623891f

File tree

4 files changed

+761
-0
lines changed

4 files changed

+761
-0
lines changed

open-variant/README.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,16 @@ section is meant to be shared across many data records. It holds the version of
1313
the format as well as a list of keys. The data section holds the serialized
1414
variant data, with all keys replaced by their index in the metadata section.
1515

16+
```text
17+
JSON string [{"key1": 1, "key2": 2}, {"key1": 3, "key2": 4}]
18+
Variant Metadata ["key1", "key2"]
19+
Variant Values [{0: 1, 1: 2}, {0: 3, 1: 4}]
20+
```
21+
22+
By pulling out the keys, space is saved in the serialized form. In addition,
23+
string comparisons only have to occur once to look up the field ids. Afterwards,
24+
fields can be looked up in the values data by integer id.
25+
1626
## Example
1727

1828
```rust

open-variant/src/values/mod.rs

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
//! Read and write the values part of the variant format.
2+
3+
mod read;
4+
pub mod write;
5+
6+
pub use read::{ArrayRef, ObjectRef, VariantRef};
7+
8+
/// Basic type of a variant value.
///
/// The discriminants are the raw `u8` values accepted by the [`TryFrom<u8>`]
/// implementation. For [`BasicType::Primitive`], a more specific type is given
/// by [`PrimitiveTypeId`].
#[repr(u8)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BasicType {
    /// A primitive value; see [`PrimitiveTypeId`] for the concrete type.
    Primitive = 0,
    /// A string stored in the short-string encoding.
    ShortString = 1,
    /// An object value.
    Object = 2,
    /// An array value.
    Array = 3,
}

impl TryFrom<u8> for BasicType {
    type Error = ();

    /// Convert from u8 to [`BasicType`]. Will return an error if the value is not a valid [`BasicType`].
    fn try_from(value: u8) -> Result<Self, ()> {
        match value {
            0 => Ok(Self::Primitive),
            1 => Ok(Self::ShortString),
            2 => Ok(Self::Object),
            3 => Ok(Self::Array),
            _ => Err(()),
        }
    }
}
34+
35+
/// Specific type of a primitive variant value.
///
/// The discriminants are the raw `u8` type ids accepted by the
/// [`TryFrom<u8>`] implementation. The enum is `#[non_exhaustive]`, so code
/// outside this crate must include a wildcard arm when matching on it.
#[repr(u8)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum PrimitiveTypeId {
    Null = 0,
    BoolTrue = 1,
    BoolFalse = 2,
    Int8 = 3,
    Int16 = 4,
    Int32 = 5,
    Int64 = 6,
    // Kept next to Float64 for readability; its id (14) is out of sequence.
    Float32 = 14,
    Float64 = 7,
    Decimal4 = 8,           // 32-bit
    Decimal8 = 9,           // 64-bit
    Decimal16 = 10,         // 128-bit
    Date32 = 11,
    TimestampMicro = 12,    // (with timezone)
    TimestampMicroNTZ = 13, // (without timezone)
    // 14 is Float32
    Binary = 15,
    String = 16,
    BinaryFromDictionary = 17,
    StringFromDictionary = 18,
}

impl TryFrom<u8> for PrimitiveTypeId {
    type Error = ();

    /// Convert from u8 to [`PrimitiveTypeId`]. Will return an error if the value is not a valid [`PrimitiveTypeId`].
    fn try_from(value: u8) -> Result<Self, ()> {
        match value {
            0 => Ok(Self::Null),
            1 => Ok(Self::BoolTrue),
            2 => Ok(Self::BoolFalse),
            3 => Ok(Self::Int8),
            4 => Ok(Self::Int16),
            5 => Ok(Self::Int32),
            6 => Ok(Self::Int64),
            7 => Ok(Self::Float64),
            8 => Ok(Self::Decimal4),
            9 => Ok(Self::Decimal8),
            10 => Ok(Self::Decimal16),
            11 => Ok(Self::Date32),
            12 => Ok(Self::TimestampMicro),
            13 => Ok(Self::TimestampMicroNTZ),
            14 => Ok(Self::Float32),
            15 => Ok(Self::Binary),
            16 => Ok(Self::String),
            17 => Ok(Self::BinaryFromDictionary),
            18 => Ok(Self::StringFromDictionary),
            _ => Err(()),
        }
    }
}

open-variant/src/values/read.rs

Lines changed: 259 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,259 @@
1+
// TODO: make this codebase not care about whether there is more data after
2+
// the value.
3+
// TODO: implement function to shrink to the slice where the value is, if people
4+
// want that.
5+
6+
use super::{BasicType, PrimitiveTypeId};
7+
8+
/// A view into a variant data buffer.
9+
#[derive(Clone)]
10+
pub struct VariantRef<'a>(&'a [u8]);
11+
12+
// TODO: a nice debug implementation would be awesome. TBH could use debug_struct?
13+
14+
impl<'a> VariantRef<'a> {
15+
pub fn try_new(data: &'a [u8]) -> Result<Self, String> {
16+
if data.is_empty() {
17+
return Err("Empty buffer".into());
18+
}
19+
Ok(Self(data))
20+
}
21+
22+
pub fn basic_type(&self) -> BasicType {
23+
let header = self.0[0];
24+
(header & 0b11).try_into().expect("Invalid BasicType")
25+
}
26+
27+
pub fn primitive_type_id(&self) -> PrimitiveTypeId {
28+
let header = self.0[0];
29+
(header >> 2).try_into().expect("Invalid PrimitiveTypeId")
30+
}
31+
32+
pub fn get_bool(&self) -> bool {
33+
match self.primitive_type_id() {
34+
PrimitiveTypeId::BoolTrue => true,
35+
PrimitiveTypeId::BoolFalse => false,
36+
_ => panic!("Not a boolean"),
37+
}
38+
}
39+
40+
pub fn get_i64(&self) -> i64 {
41+
if !matches!(self.primitive_type_id(), PrimitiveTypeId::Int64) {
42+
panic!("Not an i64");
43+
}
44+
// debug_assert_eq!(self.0.len(), 9); // 1 byte header + 8 byte i64
45+
i64::from_le_bytes(self.0[1..9].try_into().unwrap())
46+
}
47+
48+
pub fn get_f64(&self) -> f64 {
49+
if !matches!(self.primitive_type_id(), PrimitiveTypeId::Float64) {
50+
panic!("Not an f64");
51+
}
52+
// debug_assert_eq!(self.0.len(), 9); // 1 byte header + 8 byte f64
53+
f64::from_le_bytes(self.0[1..9].try_into().unwrap())
54+
}
55+
56+
pub fn get_string<'b>(&'b self) -> &'a str {
57+
if !matches!(self.primitive_type_id(), PrimitiveTypeId::String) {
58+
panic!("Not a string");
59+
}
60+
let size = i32::from_le_bytes(self.0[1..5].try_into().unwrap()) as usize;
61+
let start = 5;
62+
let end = start + size;
63+
std::str::from_utf8(&self.0[start..end]).unwrap()
64+
}
65+
66+
pub fn get_object<'b>(&'b self) -> Result<ObjectRef<'a>, String> {
67+
ObjectRef::try_new(self)
68+
}
69+
70+
pub fn get_array<'b>(&'b self) -> Result<ArrayRef<'a>, String> {
71+
ArrayRef::try_new(self)
72+
}
73+
}
74+
75+
/// A view into an object variant data buffer.
76+
///
77+
/// This has been validated that it is an object.
78+
pub struct ObjectRef<'a> {
79+
len: usize,
80+
field_id_width: u8,
81+
offset_width: u8,
82+
field_ids: &'a [u8],
83+
offsets: &'a [u8],
84+
values: &'a [u8],
85+
}
86+
87+
impl<'a> ObjectRef<'a> {
88+
/// Try to create a new ObjectRef from a VariantRef.
89+
///
90+
/// Will return an error if the VariantRef is not an object. Also returns
91+
/// an error if the object is not valid.
92+
pub fn try_new(data: &VariantRef<'a>) -> Result<Self, String> {
93+
if !matches!(data.basic_type(), BasicType::Object) {
94+
return Err("Not an object".into());
95+
}
96+
let mut data = data.0;
97+
98+
// Parse out the header
99+
let header = data[0] >> 2;
100+
let offset_width = (header & 0b11) + 1;
101+
let field_id_width = ((header >> 2) & 0b11) + 1;
102+
let is_large = (header >> 4) & 1;
103+
data = &data[1..];
104+
105+
let len = if is_large == 1 {
106+
// i32 for number of elements
107+
let len = i32::from_le_bytes(data[..4].try_into().unwrap()) as usize;
108+
data = &data[4..];
109+
len
110+
} else {
111+
// i8 for number of elements
112+
let len = i8::from_le_bytes(data[..1].try_into().unwrap()) as usize;
113+
data = &data[1..];
114+
len
115+
};
116+
117+
let field_id_len = len * field_id_width as usize;
118+
let field_ids = &data[..field_id_len];
119+
data = &data[field_id_len..];
120+
121+
let offset_len = (len + 1) * offset_width as usize;
122+
let offsets = &data[..offset_len];
123+
data = &data[offset_len..];
124+
125+
Ok(Self {
126+
len,
127+
field_id_width,
128+
offset_width,
129+
field_ids,
130+
offsets,
131+
values: data,
132+
})
133+
}
134+
135+
pub fn get_field<'b>(&'b self, field_id: usize) -> Option<VariantRef<'a>> {
136+
// Fields are required to be sorted by field_id, so we can binary search
137+
let field_id = field_id as u64;
138+
let mut left = 0;
139+
let mut right = self.len as u64;
140+
while left < right {
141+
let mid = left + (right - left) / 2;
142+
let mid_field_id = self.get_field_id(mid as usize);
143+
match mid_field_id.cmp(&field_id) {
144+
std::cmp::Ordering::Equal => return Some(VariantRef(self.get_value(mid as usize))),
145+
std::cmp::Ordering::Less => left = mid + 1,
146+
std::cmp::Ordering::Greater => right = mid,
147+
}
148+
}
149+
None
150+
}
151+
152+
fn get_value<'b>(&'b self, idx: usize) -> &'a [u8] {
153+
let start = self.get_offset(idx);
154+
155+
// Offsets are NOT guaranteed to be monotonic. It's a substantial
156+
// computation to find the end of the value or the next offset,
157+
// so instead we provide the buffer starting at the variant.
158+
// let end = (0..(self.len + 1))
159+
// .map(|i| self.get_offset(i))
160+
// .filter(|offset| *offset > start)
161+
// .min()
162+
// .expect("No other offset found");
163+
let end = self.get_offset(self.len);
164+
&self.values[start..end]
165+
}
166+
167+
fn get_field_id(&'a self, idx: usize) -> u64 {
168+
let start = idx * self.field_id_width as usize;
169+
let end = start + self.field_id_width as usize;
170+
match self.field_id_width {
171+
1 => u8::from_le_bytes(self.field_ids[start..end].try_into().unwrap()) as u64,
172+
2 => u16::from_le_bytes(self.field_ids[start..end].try_into().unwrap()) as u64,
173+
4 => u32::from_le_bytes(self.field_ids[start..end].try_into().unwrap()) as u64,
174+
8 => u64::from_le_bytes(self.field_ids[start..end].try_into().unwrap()),
175+
_ => unreachable!(),
176+
}
177+
}
178+
179+
fn get_offset(&'a self, idx: usize) -> usize {
180+
let start = idx * self.offset_width as usize;
181+
let end = start + self.offset_width as usize;
182+
match self.offset_width {
183+
1 => u8::from_le_bytes(self.offsets[start..end].try_into().unwrap()) as usize,
184+
2 => u16::from_le_bytes(self.offsets[start..end].try_into().unwrap()) as usize,
185+
4 => u32::from_le_bytes(self.offsets[start..end].try_into().unwrap()) as usize,
186+
8 => u64::from_le_bytes(self.offsets[start..end].try_into().unwrap()) as usize,
187+
_ => unreachable!(),
188+
}
189+
}
190+
}
191+
192+
/// A view into an array variant data buffer.
193+
///
194+
/// This has been validated that it is an array.
195+
pub struct ArrayRef<'a> {
196+
len: usize,
197+
offset_width: u8,
198+
offsets: &'a [u8],
199+
values: &'a [u8],
200+
}
201+
202+
impl<'a> ArrayRef<'a> {
203+
pub fn try_new(data: &VariantRef<'a>) -> Result<Self, String> {
204+
if !matches!(data.basic_type(), BasicType::Array) {
205+
return Err("Not an array".into());
206+
}
207+
let mut data = data.0;
208+
209+
let header = data[0] >> 2;
210+
let is_large = header >> 2 & 1 == 1;
211+
let offset_width = (header & 0b11) + 1;
212+
213+
data = &data[1..];
214+
215+
let len = if is_large {
216+
// i32 for number of elements
217+
let len = i32::from_le_bytes(data[..4].try_into().unwrap()) as usize;
218+
data = &data[4..];
219+
len
220+
} else {
221+
// i8 for number of elements
222+
let len = i8::from_le_bytes(data[..1].try_into().unwrap()) as usize;
223+
data = &data[1..];
224+
len
225+
};
226+
227+
let offset_len = (len + 1) * offset_width as usize;
228+
let offsets = &data[..offset_len];
229+
let values = &data[offset_len..];
230+
231+
Ok(Self {
232+
len,
233+
offset_width,
234+
offsets,
235+
values,
236+
})
237+
}
238+
239+
pub fn get_element<'b>(&'b self, index: usize) -> Option<VariantRef<'a>> {
240+
if index >= self.len {
241+
return None;
242+
}
243+
let start = self.get_offset(index);
244+
let end = self.get_offset(index + 1);
245+
Some(VariantRef(&self.values[start..end]))
246+
}
247+
248+
fn get_offset(&self, idx: usize) -> usize {
249+
let start = idx * self.offset_width as usize;
250+
let end = start + self.offset_width as usize;
251+
match self.offset_width {
252+
1 => u8::from_le_bytes(self.offsets[start..end].try_into().unwrap()) as usize,
253+
2 => u16::from_le_bytes(self.offsets[start..end].try_into().unwrap()) as usize,
254+
4 => u32::from_le_bytes(self.offsets[start..end].try_into().unwrap()) as usize,
255+
8 => u64::from_le_bytes(self.offsets[start..end].try_into().unwrap()) as usize,
256+
_ => unreachable!(),
257+
}
258+
}
259+
}

0 commit comments

Comments
 (0)