|
20 | 20 | #![allow(deprecated)]
|
21 | 21 |
|
22 | 22 | use anyhow::anyhow;
|
23 |
| -use arrow_array::RecordBatch; |
| 23 | +use arrow_array::{RecordBatch, StringArray, StringViewArray}; |
24 | 24 | use arrow_json::reader::{infer_json_schema_from_iterator, ReaderBuilder};
|
25 | 25 | use arrow_schema::{DataType, Field, Fields, Schema};
|
26 | 26 | use datafusion::arrow::util::bit_util::round_upto_multiple_of_64;
|
@@ -105,22 +105,61 @@ impl EventFormat for Event {
|
105 | 105 | Ok((value_arr, schema, is_first))
|
106 | 106 | }
|
107 | 107 |
|
108 |
| - // Convert the Data type (defined above) to arrow record batch |
109 |
| - fn decode(data: Self::Data, schema: Arc<Schema>) -> Result<RecordBatch, anyhow::Error> { |
110 |
| - let array_capacity = round_upto_multiple_of_64(data.len()); |
111 |
| - let mut reader = ReaderBuilder::new(schema) |
112 |
| - .with_batch_size(array_capacity) |
113 |
| - .with_coerce_primitive(false) |
114 |
| - .build_decoder()?; |
115 |
| - |
116 |
| - reader.serialize(&data)?; |
117 |
| - match reader.flush() { |
118 |
| - Ok(Some(recordbatch)) => Ok(recordbatch), |
119 |
| - Err(err) => Err(anyhow!("Failed to create recordbatch due to {:?}", err)), |
120 |
| - Ok(None) => unreachable!("all records are added to one rb"), |
| 108 | + |
| 109 | +fn decode(data: Self::Data, schema: Arc<Schema>) -> Result<RecordBatch, anyhow::Error> { |
| 110 | + // First create a schema with Utf8 instead of Utf8View |
| 111 | + let temp_schema = Schema::new( |
| 112 | + schema |
| 113 | + .fields() |
| 114 | + .iter() |
| 115 | + .map(|field| { |
| 116 | + if matches!(field.data_type(), DataType::Utf8View) { |
| 117 | + Arc::new(Field::new(field.name(), DataType::Utf8, field.is_nullable())) |
| 118 | + } else { |
| 119 | + field.clone() |
| 120 | + } |
| 121 | + }) |
| 122 | + .collect::<Vec<_>>(), |
| 123 | + ); |
| 124 | + |
| 125 | + let array_capacity = round_upto_multiple_of_64(data.len()); |
| 126 | + let mut reader = ReaderBuilder::new(Arc::new(temp_schema)) |
| 127 | + .with_batch_size(array_capacity) |
| 128 | + .with_coerce_primitive(false) |
| 129 | + .with_strict_mode(false) |
| 130 | + .build_decoder()?; |
| 131 | + |
| 132 | + reader.serialize(&data)?; |
| 133 | + |
| 134 | + match reader.flush() { |
| 135 | + Ok(Some(temp_batch)) => { |
| 136 | + // Convert Utf8 arrays to Utf8View arrays where needed |
| 137 | + let new_columns: Vec<Arc<dyn arrow_array::Array>> = temp_batch |
| 138 | + .columns() |
| 139 | + .iter() |
| 140 | + .zip(schema.fields()) |
| 141 | + .map(|(col, field)| { |
| 142 | + if matches!(field.data_type(), DataType::Utf8View) { |
| 143 | + let string_array = col |
| 144 | + .as_any() |
| 145 | + .downcast_ref::<StringArray>() |
| 146 | + .expect("Expected StringArray"); |
| 147 | + Arc::new(StringViewArray::from( |
| 148 | + string_array.iter().map(|s| s.map(|s| s.to_string())).collect::<Vec<_>>() |
| 149 | + )) |
| 150 | + } else { |
| 151 | + col.clone() |
| 152 | + } |
| 153 | + }) |
| 154 | + .collect(); |
| 155 | + |
| 156 | + Ok(RecordBatch::try_new(schema, new_columns)?) |
121 | 157 | }
|
| 158 | + Err(err) => Err(anyhow!("Failed to create recordbatch due to {:?}", err)), |
| 159 | + Ok(None) => unreachable!("all records are added to one rb"), |
122 | 160 | }
|
123 | 161 | }
|
| 162 | +} |
124 | 163 |
|
125 | 164 | // Returns arrow schema with the fields that are present in the request body
|
126 | 165 | // This schema is an input to convert the request body to arrow record batch
|
@@ -179,7 +218,7 @@ fn valid_type(data_type: &DataType, value: &Value, schema_version: SchemaVersion
|
179 | 218 | DataType::Float16 | DataType::Float32 | DataType::Float64 => value.is_f64(),
|
180 | 219 | // All numbers can be cast as Float64 from schema version v1
|
181 | 220 | DataType::Int64 => value.is_i64() || is_parsable_as_number(value),
|
182 |
| - DataType::Utf8 => value.is_string(), |
| 221 | + DataType::Utf8View => value.is_string(), |
183 | 222 | DataType::List(field) => {
|
184 | 223 | let data_type = field.data_type();
|
185 | 224 | if let Value::Array(arr) = value {
|
|
0 commit comments