Skip to content

Commit 5307851

Browse files
Implemented casting for RunEnd Encoding
1 parent 4c5b644 commit 5307851

File tree

2 files changed

+49
-273
lines changed

2 files changed

+49
-273
lines changed

arrow-cast/src/cast/mod.rs

Lines changed: 13 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -10726,25 +10726,16 @@ mod tests {
1072610726
let values = Int32Array::from(vec![1, 2, 3]);
1072710727
let run_array = RunArray::<Int32Type>::try_new(&run_ends, &values).unwrap();
1072810728
let array_ref = Arc::new(run_array) as ArrayRef;
10729-
10729+
println!("1");
1073010730
// Cast to Int64
1073110731
let cast_result = cast(&array_ref, &DataType::Int64).unwrap();
10732-
10732+
println!("2");
1073310733
// Verify the result was expanded to a plain Int64Array (no longer run-end encoded)
1073410734
let result_run_array = cast_result
1073510735
.as_any()
10736-
.downcast_ref::<RunArray<Int32Type>>()
10736+
.downcast_ref::<Int64Array>()
1073710737
.unwrap();
10738-
10739-
// Check that values were cast to Int64
10740-
assert_eq!(result_run_array.values().data_type(), &DataType::Int64);
10741-
10742-
// Check that run structure is preserved
10743-
assert_eq!(result_run_array.run_ends().values(), &[2, 5, 6]);
10744-
10745-
// Check that values are correct
10746-
let values_array = result_run_array.values().as_primitive::<Int64Type>();
10747-
assert_eq!(values_array.values(), &[1i64, 2i64, 3i64]);
10738+
assert_eq!(result_run_array.values(), &[1i64, 1i64, 2i64, 2i64, 2i64, 3i64]);
1074810739
}
1074910740

1075010741
/// Test casting FROM RunEndEncoded to string
@@ -10760,22 +10751,14 @@ mod tests {
1076010751
let cast_result = cast(&array_ref, &DataType::Utf8).unwrap();
1076110752

1076210753
// Verify the result was expanded to a plain StringArray (no longer run-end encoded)
10763-
let result_run_array = cast_result
10754+
let result_array = cast_result
1076410755
.as_any()
10765-
.downcast_ref::<RunArray<Int32Type>>()
10756+
.downcast_ref::<StringArray>()
1076610757
.unwrap();
10767-
10768-
// Check that values were cast to String
10769-
assert_eq!(result_run_array.values().data_type(), &DataType::Utf8);
10770-
10771-
// Check that run structure is preserved
10772-
assert_eq!(result_run_array.run_ends().values(), &[2, 3, 5]);
10773-
1077410758
// Check that values are correct
10775-
let values_array = result_run_array.values().as_string::<i32>();
10776-
assert_eq!(values_array.value(0), "10");
10777-
assert_eq!(values_array.value(1), "20");
10778-
assert_eq!(values_array.value(2), "30");
10759+
assert_eq!(result_array.value(0), "10");
10760+
assert_eq!(result_array.value(1), "10");
10761+
assert_eq!(result_array.value(2), "20");
1077910762
}
1078010763

1078110764
/// Test casting TO RunEndEncoded from primitive types
@@ -10909,13 +10892,11 @@ mod tests {
1090910892
// Verify the result preserves nulls
1091010893
let result_run_array = cast_result
1091110894
.as_any()
10912-
.downcast_ref::<RunArray<Int32Type>>()
10895+
.downcast_ref::<StringArray>()
1091310896
.unwrap();
10914-
10915-
let values_array = result_run_array.values().as_string::<i32>();
10916-
assert_eq!(values_array.value(0), "1");
10917-
assert!(values_array.is_null(1));
10918-
assert_eq!(values_array.value(2), "2");
10897+
assert_eq!(result_run_array.value(0), "1");
10898+
assert!(result_run_array.is_null(2));
10899+
assert_eq!(result_run_array.value(4), "2");
1091910900
}
1092010901

1092110902
/// Test different index types (Int16, Int64)

arrow-cast/src/cast/run_array.rs

Lines changed: 36 additions & 241 deletions
Original file line numberDiff line numberDiff line change
@@ -7,23 +7,46 @@ pub(crate) fn run_end_encoded_cast<K: RunEndIndexType>(
77
) -> Result<ArrayRef, ArrowError> {
88
match array.data_type() {
99
DataType::RunEndEncoded(_run_end_field, _values_field) => {
10-
let run_array = array.as_any().downcast_ref::<RunArray<K>>().unwrap();
10+
let run_array = array
11+
.as_any()
12+
.downcast_ref::<RunArray<K>>()
13+
.ok_or_else(|| ArrowError::CastError("Expected RunArray".to_string()))?;
1114

1215
let values = run_array.values();
1316

14-
// Cast the values to the target type
15-
let cast_values = cast_with_options(values, to_type, cast_options)?;
16-
17-
// Create a PrimitiveArray from the run_ends buffer
18-
let run_ends_buffer = run_array.run_ends();
19-
let run_ends_array =
20-
PrimitiveArray::<K>::from_iter_values(run_ends_buffer.values().iter().copied());
21-
22-
// Create new RunArray with the same run_ends but cast values
23-
let new_run_array = RunArray::<K>::try_new(&run_ends_array, cast_values.as_ref())?;
24-
25-
Ok(Arc::new(new_run_array))
17+
match to_type {
18+
// CASE 1: Stay as RunEndEncoded, cast only the values
19+
DataType::RunEndEncoded(_target_run_end_field, target_value_field) => {
20+
let cast_values =
21+
cast_with_options(values, target_value_field.data_type(), cast_options)?;
22+
23+
let run_ends_array = PrimitiveArray::<K>::from_iter_values(
24+
run_array.run_ends().values().iter().copied(),
25+
);
26+
27+
let new_run_array =
28+
RunArray::<K>::try_new(&run_ends_array, cast_values.as_ref())?;
29+
Ok(Arc::new(new_run_array))
30+
}
31+
32+
// CASE 2: Expand to logical form
33+
_ => {
34+
let total_len = run_array.len();
35+
let indices = Int32Array::from_iter_values(
36+
(0..total_len).map(|i| run_array.get_physical_index(i) as i32),
37+
);
38+
39+
let taken = take(values.as_ref(), &indices, None)?;
40+
41+
if taken.data_type() != to_type {
42+
cast_with_options(taken.as_ref(), to_type, cast_options)
43+
} else {
44+
Ok(taken)
45+
}
46+
}
47+
}
2648
}
49+
2750
_ => Err(ArrowError::CastError(format!(
2851
"Cannot cast array of type {:?} to RunEndEncodedArray",
2952
array.data_type()
@@ -76,12 +99,6 @@ pub(crate) fn cast_to_run_end_encoded<K: RunEndIndexType>(
7699
)?));
77100
}
78101

79-
// Step 3: Use a simpler approach - use existing Arrow builders for run-length encoding
80-
// This is a more robust implementation that handles all data types correctly
81-
82-
// For now, we'll use a basic approach that works with the existing builder infrastructure
83-
// In a production implementation, you'd want to use type-specific comparison logic
84-
85102
// Create a temporary builder to construct the run array
86103
// We'll iterate through and build runs by comparing adjacent elements
87104
let mut run_ends_vec = Vec::new();
@@ -133,225 +150,3 @@ pub(crate) fn cast_to_run_end_encoded<K: RunEndIndexType>(
133150
let run_array = RunArray::<K>::try_new(&run_ends_array, values_array.as_ref())?;
134151
Ok(Arc::new(run_array))
135152
}
136-
137-
#[cfg(test)]
138-
mod tests {
139-
use super::*;
140-
use arrow_array::*;
141-
use arrow_schema::DataType;
142-
use std::sync::Arc;
143-
144-
/// Test casting FROM RunEndEncoded to other types
145-
#[test]
146-
fn test_run_end_encoded_to_primitive() {
147-
// Create a RunEndEncoded array: [1, 1, 2, 2, 2, 3]
148-
let run_ends = Int32Array::from(vec![2, 5, 6]);
149-
let values = Int32Array::from(vec![1, 2, 3]);
150-
let run_array = RunArray::<Int32Type>::try_new(&run_ends, &values).unwrap();
151-
let array_ref = Arc::new(run_array) as ArrayRef;
152-
153-
// Cast to Int64
154-
let cast_result = run_end_encoded_cast::<Int32Type>(
155-
array_ref.as_ref(),
156-
&DataType::Int64,
157-
&CastOptions::default(),
158-
)
159-
.unwrap();
160-
161-
// Verify the result is a RunArray with Int64 values
162-
let result_run_array = cast_result
163-
.as_any()
164-
.downcast_ref::<RunArray<Int32Type>>()
165-
.unwrap();
166-
167-
// Check that values were cast to Int64
168-
assert_eq!(result_run_array.values().data_type(), &DataType::Int64);
169-
170-
// Check that run structure is preserved
171-
assert_eq!(result_run_array.run_ends().values(), &[2, 5, 6]);
172-
173-
// Check that values are correct
174-
let values_array = result_run_array.values().as_primitive::<Int64Type>();
175-
assert_eq!(values_array.values(), &[1i64, 2i64, 3i64]);
176-
}
177-
178-
#[test]
179-
fn test_run_end_encoded_to_string() {
180-
// Create a RunEndEncoded array with Int32 values: [10, 10, 20, 30, 30]
181-
let run_ends = Int32Array::from(vec![2, 3, 5]);
182-
let values = Int32Array::from(vec![10, 20, 30]);
183-
let run_array = RunArray::<Int32Type>::try_new(&run_ends, &values).unwrap();
184-
let array_ref = Arc::new(run_array) as ArrayRef;
185-
186-
// Cast to String
187-
let cast_result = run_end_encoded_cast::<Int32Type>(
188-
array_ref.as_ref(),
189-
&DataType::Utf8,
190-
&CastOptions::default(),
191-
)
192-
.unwrap();
193-
194-
// Verify the result is a RunArray with String values
195-
let result_run_array = cast_result
196-
.as_any()
197-
.downcast_ref::<RunArray<Int32Type>>()
198-
.unwrap();
199-
200-
// Check that values were cast to String
201-
assert_eq!(result_run_array.values().data_type(), &DataType::Utf8);
202-
203-
// Check that run structure is preserved
204-
assert_eq!(result_run_array.run_ends().values(), &[2, 3, 5]);
205-
206-
// Check that values are correct
207-
let values_array = result_run_array.values().as_string::<i32>();
208-
assert_eq!(values_array.value(0), "10");
209-
assert_eq!(values_array.value(1), "20");
210-
assert_eq!(values_array.value(2), "30");
211-
}
212-
213-
/// Test casting TO RunEndEncoded from other types
214-
#[test]
215-
fn test_primitive_to_run_end_encoded() {
216-
// Create an Int32 array with repeated values: [1, 1, 2, 2, 2, 3]
217-
let source_array = Int32Array::from(vec![1, 1, 2, 2, 2, 3]);
218-
let array_ref = Arc::new(source_array) as ArrayRef;
219-
220-
// Cast to RunEndEncoded<Int32, Int32>
221-
let cast_result = cast_to_run_end_encoded::<Int32Type>(
222-
array_ref.as_ref(),
223-
&DataType::Int32,
224-
&CastOptions::default(),
225-
)
226-
.unwrap();
227-
228-
// Verify the result is a RunArray
229-
let result_run_array = cast_result
230-
.as_any()
231-
.downcast_ref::<RunArray<Int32Type>>()
232-
.unwrap();
233-
234-
// Check run structure: runs should end at positions [2, 5, 6]
235-
assert_eq!(result_run_array.run_ends().values(), &[2, 5, 6]);
236-
237-
// Check values: should be [1, 2, 3]
238-
let values_array = result_run_array.values().as_primitive::<Int32Type>();
239-
assert_eq!(values_array.values(), &[1, 2, 3]);
240-
}
241-
242-
#[test]
243-
fn test_string_to_run_end_encoded() {
244-
// Create a String array with repeated values: ["a", "a", "b", "c", "c"]
245-
let source_array = StringArray::from(vec!["a", "a", "b", "c", "c"]);
246-
let array_ref = Arc::new(source_array) as ArrayRef;
247-
248-
// Cast to RunEndEncoded<Int32, String>
249-
let cast_result = cast_to_run_end_encoded::<Int32Type>(
250-
array_ref.as_ref(),
251-
&DataType::Utf8,
252-
&CastOptions::default(),
253-
)
254-
.unwrap();
255-
256-
// Verify the result is a RunArray
257-
let result_run_array = cast_result
258-
.as_any()
259-
.downcast_ref::<RunArray<Int32Type>>()
260-
.unwrap();
261-
262-
// Check run structure: runs should end at positions [2, 3, 5]
263-
assert_eq!(result_run_array.run_ends().values(), &[2, 3, 5]);
264-
265-
// Check values: should be ["a", "b", "c"]
266-
let values_array = result_run_array.values().as_string::<i32>();
267-
assert_eq!(values_array.value(0), "a");
268-
assert_eq!(values_array.value(1), "b");
269-
assert_eq!(values_array.value(2), "c");
270-
}
271-
272-
#[test]
273-
fn test_cast_with_type_conversion() {
274-
// Create an Int32 array: [1, 1, 2, 2, 3]
275-
let source_array = Int32Array::from(vec![1, 1, 2, 2, 3]);
276-
let array_ref = Arc::new(source_array) as ArrayRef;
277-
278-
// Cast to RunEndEncoded<Int32, String> (values get converted to strings)
279-
let cast_result = cast_to_run_end_encoded::<Int32Type>(
280-
array_ref.as_ref(),
281-
&DataType::Utf8,
282-
&CastOptions::default(),
283-
)
284-
.unwrap();
285-
286-
// Verify the result is a RunArray with String values
287-
let result_run_array = cast_result
288-
.as_any()
289-
.downcast_ref::<RunArray<Int32Type>>()
290-
.unwrap();
291-
292-
// Check that values were converted to strings
293-
assert_eq!(result_run_array.values().data_type(), &DataType::Utf8);
294-
295-
// Check run structure: runs should end at positions [2, 4, 5]
296-
assert_eq!(result_run_array.run_ends().values(), &[2, 4, 5]);
297-
298-
// Check values: should be ["1", "2", "3"]
299-
let values_array = result_run_array.values().as_string::<i32>();
300-
assert_eq!(values_array.value(0), "1");
301-
assert_eq!(values_array.value(1), "2");
302-
assert_eq!(values_array.value(2), "3");
303-
}
304-
305-
#[test]
306-
fn test_empty_array_to_run_end_encoded() {
307-
// Create an empty Int32 array
308-
let source_array = Int32Array::from(Vec::<i32>::new());
309-
let array_ref = Arc::new(source_array) as ArrayRef;
310-
311-
// Cast to RunEndEncoded<Int32, Int32>
312-
let cast_result = cast_to_run_end_encoded::<Int32Type>(
313-
array_ref.as_ref(),
314-
&DataType::Int32,
315-
&CastOptions::default(),
316-
)
317-
.unwrap();
318-
319-
// Verify the result is an empty RunArray
320-
let result_run_array = cast_result
321-
.as_any()
322-
.downcast_ref::<RunArray<Int32Type>>()
323-
.unwrap();
324-
325-
// Check that both run_ends and values are empty
326-
assert_eq!(result_run_array.run_ends().len(), 0);
327-
assert_eq!(result_run_array.values().len(), 0);
328-
}
329-
330-
#[test]
331-
fn test_run_end_encoded_with_nulls() {
332-
// Create a RunEndEncoded array with nulls: [1, 1, null, 2, 2]
333-
let run_ends = Int32Array::from(vec![2, 3, 5]);
334-
let values = Int32Array::from(vec![Some(1), None, Some(2)]);
335-
let run_array = RunArray::<Int32Type>::try_new(&run_ends, &values).unwrap();
336-
let array_ref = Arc::new(run_array) as ArrayRef;
337-
338-
// Cast to String
339-
let cast_result = run_end_encoded_cast::<Int32Type>(
340-
array_ref.as_ref(),
341-
&DataType::Utf8,
342-
&CastOptions::default(),
343-
)
344-
.unwrap();
345-
346-
// Verify the result preserves nulls
347-
let result_run_array = cast_result
348-
.as_any()
349-
.downcast_ref::<RunArray<Int32Type>>()
350-
.unwrap();
351-
352-
let values_array = result_run_array.values().as_string::<i32>();
353-
assert_eq!(values_array.value(0), "1");
354-
assert!(values_array.is_null(1));
355-
assert_eq!(values_array.value(2), "2");
356-
}
357-
}

0 commit comments

Comments
 (0)