 
 use anyhow::anyhow;
 use arrow_array::RecordBatch;
-use arrow_json::reader::{infer_json_schema_from_iterator, Decoder, DecoderOptions};
-use arrow_schema::{DataType, Field, Schema};
+use arrow_json::reader::{infer_json_schema_from_iterator, ReaderBuilder};
+use arrow_schema::{DataType, Field, Fields, Schema};
 use datafusion::arrow::util::bit_util::round_upto_multiple_of_64;
 use serde_json::Value;
 use std::{collections::HashMap, sync::Arc};
 
-use super::EventFormat;
+use super::{EventFormat, Metadata, Tags};
 use crate::utils::{arrow::get_field, json::flatten_json_body};
 
 pub struct Event {
     pub data: Value,
-    pub tags: String,
-    pub metadata: String,
+    pub tags: Tags,
+    pub metadata: Metadata,
 }
 
 impl EventFormat for Event {
@@ -43,10 +43,9 @@ impl EventFormat for Event {
     // also extract the arrow schema, tags and metadata from the incoming json
     fn to_data(
         self,
-        schema: &HashMap<String, Field>,
-    ) -> Result<(Self::Data, Schema, bool, String, String), anyhow::Error> {
+        schema: HashMap<String, Arc<Field>>,
+    ) -> Result<(Self::Data, Vec<Arc<Field>>, bool, Tags, Metadata), anyhow::Error> {
         let data = flatten_json_body(self.data)?;
-
         let stream_schema = schema;
 
         // incoming event may be a single json or a json array
@@ -63,18 +62,18 @@ impl EventFormat for Event {
             collect_keys(value_arr.iter()).expect("fields can be collected from array of objects");
 
         let mut is_first = false;
-        let schema = match derive_arrow_schema(stream_schema, fields) {
+        let schema = match derive_arrow_schema(&stream_schema, fields) {
             Ok(schema) => schema,
             Err(_) => match infer_json_schema_from_iterator(value_arr.iter().map(Ok)) {
                 Ok(infer_schema) => {
                     if let Err(err) = Schema::try_merge(vec![
-                        Schema::new(stream_schema.values().cloned().collect()),
+                        Schema::new(stream_schema.values().cloned().collect::<Fields>()),
                         infer_schema.clone(),
                     ]) {
                         return Err(anyhow!("Could not merge schema of this event with that of the existing stream. {:?}", err));
                     }
                     is_first = true;
-                    infer_schema
+                    infer_schema.fields.iter().cloned().collect()
                 }
                 Err(err) => {
                     return Err(anyhow!(
@@ -100,13 +99,13 @@ impl EventFormat for Event {
     // Convert the Data type (defined above) to arrow record batch
     fn decode(data: Self::Data, schema: Arc<Schema>) -> Result<RecordBatch, anyhow::Error> {
         let array_capacity = round_upto_multiple_of_64(data.len());
-        let value_iter: &mut (dyn Iterator<Item = Value>) = &mut data.into_iter();
+        let mut reader = ReaderBuilder::new(schema)
+            .with_batch_size(array_capacity)
+            .with_coerce_primitive(false)
+            .build_decoder()?;
 
-        let reader = Decoder::new(
-            schema,
-            DecoderOptions::new().with_batch_size(array_capacity),
-        );
-        match reader.next_batch(&mut value_iter.map(Ok)) {
+        reader.serialize(&data)?;
+        match reader.flush() {
             Ok(Some(recordbatch)) => Ok(recordbatch),
             Err(err) => Err(anyhow!("Failed to create recordbatch due to {:?}", err)),
             Ok(None) => unreachable!("all records are added to one rb"),
@@ -116,14 +115,17 @@
 
 // Returns arrow schema with the fields that are present in the request body
 // This schema is an input to convert the request body to arrow record batch
-fn derive_arrow_schema(schema: &HashMap<String, Field>, fields: Vec<&str>) -> Result<Schema, ()> {
+fn derive_arrow_schema(
+    schema: &HashMap<String, Arc<Field>>,
+    fields: Vec<&str>,
+) -> Result<Vec<Arc<Field>>, ()> {
     let mut res = Vec::with_capacity(fields.len());
     let fields = fields.into_iter().map(|field_name| schema.get(field_name));
     for field in fields {
         let Some(field) = field else { return Err(()) };
         res.push(field.clone())
     }
-    Ok(Schema::new(res))
+    Ok(res)
 }
 
 fn collect_keys<'a>(values: impl Iterator<Item = &'a Value>) -> Result<Vec<&'a str>, ()> {
@@ -145,7 +147,7 @@ fn collect_keys<'a>(values: impl Iterator<Item = &'a Value>) -> Result<Vec<&'a str>, ()> {
     Ok(keys)
 }
 
-fn fields_mismatch(schema: &Schema, body: &Value) -> bool {
+fn fields_mismatch(schema: &[Arc<Field>], body: &Value) -> bool {
     for (name, val) in body.as_object().expect("body is of object variant") {
         if val.is_null() {
             continue;
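
For reference, a minimal standalone sketch of the schema-inference path that `to_data` falls back to above, using `infer_json_schema_from_iterator` and `Schema::try_merge` from arrow-json/arrow-schema. The field names and sample event here are hypothetical, not taken from this change:

```rust
use std::sync::Arc;

use arrow_json::reader::infer_json_schema_from_iterator;
use arrow_schema::{DataType, Field, Schema};
use serde_json::json;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // A (hypothetical) event whose fields are not all known to the stream yet.
    let events = vec![json!({"level": "info", "latency_ms": 12})];

    // Infer an Arrow schema directly from the JSON values.
    let inferred = infer_json_schema_from_iterator(events.iter().map(Ok))?;

    // A (hypothetical) existing stream schema to reconcile against.
    let stream = Schema::new(vec![Field::new("level", DataType::Utf8, true)]);

    // try_merge errors out if a field appears in both schemas with incompatible
    // types; to_data turns that error into a rejection of the event.
    let merged = Schema::try_merge(vec![stream, inferred.clone()])?;
    println!("merged fields: {:?}", merged.fields());

    // The event format now passes fields around as Vec<Arc<Field>> rather than a Schema.
    let fields: Vec<Arc<Field>> = inferred.fields.iter().cloned().collect();
    assert_eq!(fields.len(), 2);
    Ok(())
}
```

If the merge fails, the event is rejected rather than silently widening the stream schema, which is what the error branch in the hunk above does.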
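The `decode` change swaps the removed `Decoder`/`DecoderOptions` pair for the push-based decoder built via `ReaderBuilder`. A minimal sketch of that API in isolation, assuming an arrow-json version that provides `build_decoder`, `serialize`, and `flush`; the schema and rows are made up for illustration:

```rust
use std::sync::Arc;

use arrow_array::RecordBatch;
use arrow_json::reader::ReaderBuilder;
use arrow_schema::{DataType, Field, Schema};
use serde_json::json;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // A (hypothetical) schema the incoming JSON rows are expected to follow.
    let schema = Arc::new(Schema::new(vec![
        Field::new("level", DataType::Utf8, true),
        Field::new("status", DataType::Int64, true),
    ]));

    let rows = vec![
        json!({"level": "info", "status": 200}),
        json!({"level": "error", "status": 500}),
    ];

    // Build a push-based decoder; this replaces the removed Decoder/DecoderOptions pair.
    let mut decoder = ReaderBuilder::new(schema)
        .with_batch_size(rows.len())
        .with_coerce_primitive(false)
        .build_decoder()?;

    // serialize() buffers the serde_json values; flush() emits them as one RecordBatch.
    decoder.serialize(&rows)?;
    let batch: RecordBatch = decoder.flush()?.expect("all rows fit in one batch");
    assert_eq!(batch.num_rows(), 2);
    Ok(())
}
```

Because all rows are pushed in a single `serialize` call and the batch size is rounded up to at least the row count, one `flush` yields the entire batch, which is why the `Ok(None)` arm in `decode` is treated as unreachable.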