Merged
84 changes: 0 additions & 84 deletions src/common/callback_stream.rs

This file was deleted.

84 changes: 84 additions & 0 deletions src/common/map_last_stream.rs
@@ -0,0 +1,84 @@
use futures::{Stream, StreamExt, stream};
use std::task::Poll;

/// Maps the last element of the provided stream.
pub(crate) fn map_last_stream<T>(
mut input: impl Stream<Item = T> + Unpin,
map_f: impl FnOnce(T) -> T,
) -> impl Stream<Item = T> + Unpin {
let mut final_closure = Some(map_f);

// buffer the most recent value so that, once the input ends, we know which element was last and can map it before emitting
let mut current_value = None;

stream::poll_fn(move |cx| match futures::ready!(input.poll_next_unpin(cx)) {
Some(new_val) => {
match current_value.take() {
// This is the first value, so we store it and repoll to get the next value
None => {
current_value = Some(new_val);
cx.waker().wake_by_ref();
Poll::Pending
}

Some(existing) => {
current_value = Some(new_val);

Poll::Ready(Some(existing))
}
}
}
// the input is exhausted, so the buffered value (if any) is the last one: map it with the user-provided closure
None => match current_value.take() {
Some(existing) => {
// make sure we wake ourselves to finish the stream
cx.waker().wake_by_ref();

if let Some(closure) = final_closure.take() {
Poll::Ready(Some(closure(existing)))
} else {
unreachable!("the closure is only executed once")
}
}
None => Poll::Ready(None),
},
})
}

#[cfg(test)]
mod tests {
use super::*;
use futures::stream;

#[tokio::test]
async fn test_map_last_stream_empty_stream() {
Collaborator:

I don't think this works in all cases. Say the last partition for a task is empty. This behavior means we won't send any metrics for any partitions of the task (because we only send metrics for the entire task after the last partition is done).

It also means we may lose metrics from child tasks, because this task may have collected them.

Unfortunately we don't have a test for this case. We would certainly benefit from having one.

Contributor Author @cetra3 (Oct 6, 2025):

The Arrow Flight encoder will always send something, as far as I can tell. Even if no record batches are returned, you will still receive the encoded schema. (A sketch demonstrating this follows the network_shuffle.rs diff below.)

Collaborator:

Ah okay. This makes sense.

let input = stream::empty::<i32>();
let mapped = map_last_stream(input, |x| x + 10);
let result: Vec<i32> = mapped.collect().await;
assert_eq!(result, Vec::<i32>::new());
}

#[tokio::test]
async fn test_map_last_stream_single_element() {
let input = stream::iter(vec![5]);
let mapped = map_last_stream(input, |x| x * 2);
let result: Vec<i32> = mapped.collect().await;
assert_eq!(result, vec![10]);
}

#[tokio::test]
async fn test_map_last_stream_multiple_elements() {
let input = stream::iter(vec![1, 2, 3, 4]);
let mapped = map_last_stream(input, |x| x + 100);
let result: Vec<i32> = mapped.collect().await;
assert_eq!(result, vec![1, 2, 3, 104]); // Only the last element is transformed
}

#[tokio::test]
async fn test_map_last_stream_preserves_order() {
let input = stream::iter(vec![10, 20, 30, 40, 50]);
let mapped = map_last_stream(input, |x| x - 50);
let result: Vec<i32> = mapped.collect().await;
assert_eq!(result, vec![10, 20, 30, 40, 0]); // Last element: 50 - 50 = 0
}
}
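
For context on how this combinator is used by the PR (attaching task metrics to the final FlightData message of a stream, per the review thread above), here is a minimal self-contained sketch. The Msg type and the "encoded-task-metrics" payload are hypothetical stand-ins, not this repo's types, and map_last_stream from the file above is assumed to be in scope:

use futures::{StreamExt, stream};

// Hypothetical stand-in for a FlightData-like message with app_metadata.
#[allow(dead_code)]
struct Msg {
    payload: &'static str,
    app_metadata: Vec<u8>,
}

#[tokio::main]
async fn main() {
    let input = stream::iter(vec![
        Msg { payload: "batch-0", app_metadata: vec![] },
        Msg { payload: "batch-1", app_metadata: vec![] },
    ]);

    // Only the final message gets the (hypothetical) encoded metrics attached.
    let tagged = map_last_stream(input, |mut last| {
        last.app_metadata = b"encoded-task-metrics".to_vec();
        last
    });

    let out: Vec<Msg> = tagged.collect().await;
    assert!(out[0].app_metadata.is_empty());
    assert_eq!(out[1].app_metadata, b"encoded-task-metrics".to_vec());
}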
4 changes: 2 additions & 2 deletions src/common/mod.rs
@@ -1,7 +1,7 @@
-mod callback_stream;
+mod map_last_stream;
mod partitioning;
#[allow(unused)]
pub mod ttl_map;

-pub(crate) use callback_stream::with_callback;
+pub(crate) use map_last_stream::map_last_stream;
pub(crate) use partitioning::{scale_partitioning, scale_partitioning_props};
12 changes: 10 additions & 2 deletions src/execution_plans/network_coalesce.rs
@@ -14,11 +14,12 @@ use arrow_flight::decode::FlightRecordBatchStream;
use arrow_flight::error::FlightError;
use dashmap::DashMap;
use datafusion::common::{exec_err, internal_datafusion_err, internal_err, plan_err};
+use datafusion::datasource::schema_adapter::DefaultSchemaAdapterFactory;
use datafusion::error::DataFusionError;
use datafusion::execution::{SendableRecordBatchStream, TaskContext};
use datafusion::physical_plan::stream::RecordBatchStreamAdapter;
use datafusion::physical_plan::{DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties};
-use futures::{TryFutureExt, TryStreamExt};
+use futures::{StreamExt, TryFutureExt, TryStreamExt};
use http::Extensions;
use prost::Message;
use std::any::Any;
@@ -283,6 +284,8 @@ impl ExecutionPlan for NetworkCoalesceExec {
};

let metrics_collection_capture = self_ready.metrics_collection.clone();
+let adapter = DefaultSchemaAdapterFactory::from_schema(self.schema());
+let (mapper, _indices) = adapter.map_schema(&self.schema())?;
Collaborator:

This looks like 1:1 schema mapping. What does it do? Is this just a way to assert that the schema hasn't changed? I think adding a test which shows why this is necessary would be good.

Contributor Author @cetra3:

The schema does change. Arrow Flight hydrates dictionary values into real values, so the schema of the incoming record batch is different. We use the mapper here to map back to what the execution plan expects. (See the sketch after this file's diff.)

Collaborator:

I noticed that tests still pass without this line.

IIUC, the root problem was on the server: we were sending empty flight data to the client without sending the schema / dictionary message first. You've fixed this problem.

I don't see an issue on the client that this solves. The flight decoder in the client should be able to handle any message sent by the encoder on the server.

The metrics collector on the client passes flight data through unchanged, minus clearing the app_metadata.

Collaborator:

I would prefer to either have a test which shows why this is needed, or remove the lines. Let me know if you think otherwise, though!

Once again, I appreciate the contribution 🙏🏽 - the old empty flight data code was sketchy for sure.

Contributor Author @cetra3 (Oct 8, 2025):

test_metrics_collection_e2e_4 fails with this removed from both network plans.

Collaborator:

Ah, sorry, I commented on one but not the other. This LGTM.

Contributor Author @cetra3:

I've added an assert here to make sure the schema matches: a141a3b

let stream = async move {
let mut client = channel_resolver.get_flight_client_for_url(&url).await?;
let stream = client
@@ -297,7 +300,12 @@ impl ExecutionPlan for NetworkCoalesceExec {

Ok(
FlightRecordBatchStream::new_from_flight_data(metrics_collecting_stream)
-.map_err(map_flight_to_datafusion_error),
+.map_err(map_flight_to_datafusion_error)
+.map(move |batch| {
+let batch = batch?;
+
+mapper.map_batch(batch)
+}),
)
}
.try_flatten_stream();
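To illustrate the dictionary-hydration point from the thread above, here is a minimal sketch of the schema-mapping step, using the same DefaultSchemaAdapterFactory API the PR uses. The tag column and its types are illustrative, not taken from this repo:

use datafusion::arrow::array::StringArray;
use datafusion::arrow::datatypes::{DataType, Field, Schema};
use datafusion::arrow::record_batch::RecordBatch;
use datafusion::datasource::schema_adapter::DefaultSchemaAdapterFactory;
use std::sync::Arc;

fn main() -> datafusion::error::Result<()> {
    // The schema the execution plan advertises: a dictionary-encoded column.
    let plan_schema = Arc::new(Schema::new(vec![Field::new(
        "tag",
        DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
        true,
    )]));

    // What arrives over Flight after dictionary hydration: plain Utf8.
    let wire_schema = Arc::new(Schema::new(vec![Field::new("tag", DataType::Utf8, true)]));
    let batch = RecordBatch::try_new(
        wire_schema.clone(),
        vec![Arc::new(StringArray::from(vec!["a", "b"]))],
    )?;

    // Map the incoming batch back to the schema the plan expects.
    let adapter = DefaultSchemaAdapterFactory::from_schema(plan_schema.clone());
    let (mapper, _indices) = adapter.map_schema(&wire_schema)?;
    let mapped = mapper.map_batch(batch)?;
    assert_eq!(mapped.schema(), plan_schema);
    Ok(())
}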
12 changes: 11 additions & 1 deletion src/execution_plans/network_shuffle.rs
@@ -14,6 +14,7 @@ use arrow_flight::decode::FlightRecordBatchStream;
use arrow_flight::error::FlightError;
use dashmap::DashMap;
use datafusion::common::{exec_err, internal_datafusion_err, internal_err, plan_err};
+use datafusion::datasource::schema_adapter::DefaultSchemaAdapterFactory;
use datafusion::error::DataFusionError;
use datafusion::execution::{SendableRecordBatchStream, TaskContext};
use datafusion::physical_expr::Partitioning;
@@ -308,8 +309,12 @@ impl ExecutionPlan for NetworkShuffleExec {
let task_context = DistributedTaskContext::from_ctx(&context);
let off = self_ready.properties.partitioning.partition_count() * task_context.task_index;

+let adapter = DefaultSchemaAdapterFactory::from_schema(self.schema());
+let (mapper, _indices) = adapter.map_schema(&self.schema())?;

let stream = input_stage_tasks.into_iter().enumerate().map(|(i, task)| {
let channel_resolver = Arc::clone(&channel_resolver);
+let mapper = mapper.clone();

let ticket = Request::from_parts(
MetadataMap::from_headers(context_headers.clone()),
@@ -349,7 +354,12 @@

Ok(
FlightRecordBatchStream::new_from_flight_data(metrics_collecting_stream)
-.map_err(map_flight_to_datafusion_error),
+.map_err(map_flight_to_datafusion_error)
+.map(move |batch| {
+let batch = batch?;
+
+mapper.map_batch(batch)
+}),
)
}
.try_flatten_stream()
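Finally, to back the claim from the review threads that the server-side encoder always sends a schema message, here is a minimal sketch, assuming arrow-flight's FlightDataEncoderBuilder (the encoder this project uses); the column is illustrative:

use arrow_array::RecordBatch;
use arrow_flight::encode::FlightDataEncoderBuilder;
use arrow_flight::error::FlightError;
use arrow_schema::{DataType, Field, Schema};
use futures::{StreamExt, stream};
use std::sync::Arc;

#[tokio::main]
async fn main() {
    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));

    // Zero record batches: only the schema message should be produced.
    let empty = stream::empty::<Result<RecordBatch, FlightError>>();
    let encoder = FlightDataEncoderBuilder::new()
        .with_schema(schema)
        .build(empty);

    let messages: Vec<_> = encoder.collect().await;
    // Even with no data, the stream is not empty: the first (and here only)
    // FlightData message carries the encoded schema.
    assert!(!messages.is_empty());
    assert!(messages.iter().all(|m| m.is_ok()));
}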