apache · sven-weber-db · May 8, 2026 · sven-weber-db · May 7, 2026 · sven-weber-db
diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala
@@ -415,13 +415,17 @@ private[spark] abstract class BasePythonRunner[IN, OUT](
      */
     def writeNextInputToStream(dataOut: DataOutputStream): Boolean
 
-    def open(dataOut: DataOutputStream): Unit = Utils.logUncaughtExceptions {
+    def open(outputStream: DataOutputStream): Unit = Utils.logUncaughtExceptions {
       val isUnixDomainSock = authHelper.conf.get(PYTHON_UNIX_DOMAIN_SOCKET_ENABLED)
       lazy val sockPath = new File(
         authHelper.conf.get(PYTHON_UNIX_DOMAIN_SOCKET_DIR)
           .getOrElse(System.getProperty("java.io.tmpdir")),
         s".${UUID.randomUUID()}.sock")
       try {
+        // Buffer the initialization message, and send it together with its length.
+        val buffer = new ByteArrayOutputStream()
+        val dataOut = new DataOutputStream(buffer)
+
         // Partition index
         dataOut.writeInt(partitionIndex)
 
@@ -522,6 +526,13 @@ private[spark] abstract class BasePythonRunner[IN, OUT](
         writeCommand(dataOut)
 
         dataOut.flush()
+
+        // The initialization message is complete, write it to the stream with its length.
+        val messageBytes = buffer.toByteArray
+        outputStream.writeInt(SpecialLengths.START_OF_INIT_MESSAGE)
+        outputStream.writeInt(messageBytes.length)
+        outputStream.write(messageBytes)
+        outputStream.flush()
       } catch {
         case t: Throwable if NonFatal(t) || t.isInstanceOf[Exception] =>
           if (context.isCompleted() || context.isInterrupted()) {
@@ -1085,6 +1096,7 @@ private[spark] object SpecialLengths {
   val NULL = -5
   val START_ARROW_STREAM = -6
   val END_OF_MICRO_BATCH = -7
+  val START_OF_INIT_MESSAGE = -8
 }
 
 private[spark] object BarrierTaskContextMessageProtocol {

diff --git a/python/packaging/classic/setup.py b/python/packaging/classic/setup.py
@@ -267,6 +267,8 @@ def run(self):
             "pyspark",
             "pyspark.core",
             "pyspark.cloudpickle",
+            "pyspark.messages",
+            "pyspark.messages.socket",
             "pyspark.mllib",
             "pyspark.mllib.linalg",
             "pyspark.mllib.stat",

diff --git a/python/packaging/client/setup.py b/python/packaging/client/setup.py
@@ -148,6 +148,8 @@
     connect_packages = [
         "pyspark",
         "pyspark.cloudpickle",
+        "pyspark.messages",
+        "pyspark.messages.socket",
         "pyspark.mllib",
         "pyspark.mllib.linalg",
         "pyspark.mllib.stat",

diff --git a/python/pyspark/messages/__init__.py b/python/pyspark/messages/__init__.py
@@ -15,8 +15,12 @@
 # limitations under the License.
 #
 
+from pyspark.messages.spark_message_receiver import SparkMessageReceiver
 from pyspark.messages.zero_copy_byte_stream import ZeroCopyByteStream
+from pyspark.messages.socket.spark_socket_message_receiver import SparkSocketMessageReceiver
 
 __all__ = [
+    "SparkMessageReceiver",
+    "SparkSocketMessageReceiver",
     "ZeroCopyByteStream",
 ]
diff --git a/python/pyspark/messages/socket/__init__.py b/python/pyspark/messages/socket/__init__.py
@@ -0,0 +1,16 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
diff --git a/python/pyspark/messages/socket/spark_socket_message_receiver.py b/python/pyspark/messages/socket/spark_socket_message_receiver.py
@@ -0,0 +1,64 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from typing import BinaryIO
+
+from pyspark.serializers import read_int, SpecialLengths
+from pyspark.messages.zero_copy_byte_stream import ZeroCopyByteStream
+from pyspark.messages.spark_message_receiver import (
+    SparkMessageReceiver,
+)
+
+
+def _assert_message_id(message_id: int, expected: int) -> None:
+    assert message_id == expected, (
+        f"Expected message with id {expected} " + f"but got message with id {message_id} instead."
+    )
+
+
+class SparkSocketMessageReceiver(SparkMessageReceiver):
+    def __init__(self, infile: BinaryIO):
+        super().__init__()
+        self._infile = infile
+
+    def _do_get_init_message(self) -> ZeroCopyByteStream:
+        message_id = read_int(self._infile)
+        _assert_message_id(message_id, SpecialLengths.START_OF_INIT_MESSAGE)
+
+        # Read the length and init content
+        message_length = read_int(self._infile)
+        message_content = self._infile.read(message_length)
+
+        return ZeroCopyByteStream(memoryview(message_content))
+
+    def _do_get_data_stream(self) -> BinaryIO:
+        # For socket communication, we just pass along the underlying socket
+        # for the data channel. We already stripped the initialization data
+        # at this state. Therefore, any bytes following this are data bytes.
+        #
+        # Note: We deliberately did not introduce a message header for
+        # data messages to reduce the overhead, especially for small
+        # batch sizes and real-time-mode (RTM).
+        return self._infile
+
+    def _do_is_stream_finished(self) -> bool:
+        # Check if the stream is finished.
+        # If everything finished properly, we should read a
+        # 'END_OF_STREAM'. If we read somethign else this means
+        # the stream has unread data and something went wrong
+        # during processing.
+        return read_int(self._infile) == SpecialLengths.END_OF_STREAM
diff --git a/python/pyspark/messages/spark_message_receiver.py b/python/pyspark/messages/spark_message_receiver.py
@@ -0,0 +1,126 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from enum import Enum
+from functools import wraps
+from typing import BinaryIO, Callable, TypeVar
+from abc import ABC, abstractmethod
+
+from pyspark.messages.zero_copy_byte_stream import ZeroCopyByteStream
+
+
+T = TypeVar("T", bound="SparkMessageReceiver")
+R = TypeVar("R")
+
+
+class MessageState(Enum):
+    WAITING_FOR_INIT = 1
+    WAITING_FOR_DATA = 2
+    WAITING_FOR_FINISH = 3
+    DONE = 4
+
+
+class SparkMessageReceiver(ABC):
+    """
+    Generic class that implements receiving messages from Spark.
+    Caution: This class is STATEFUL. It is expected, that the
+    methods of this class are called in the following order:
+
+    1. Init -> 2. Data stream -> 3. Finish
+
+    This order is verified using assertions in the class. Each function
+    can be called EXACTLY ONCE in the specified order.
+    """
+
+    def __init__(self) -> None:
+        self._state = MessageState.WAITING_FOR_INIT
+
+    @staticmethod
+    def _state_transition(
+        required_state: MessageState, next_state: MessageState
+    ) -> Callable[[Callable[[T], R]], Callable[[T], R]]:
+        """Decorator to enforce state transitions."""
+
+        def decorator(func: Callable[[T], R]) -> Callable[[T], R]:
+            @wraps(func)
+            def wrapper(self: T) -> R:
+                assert self._state == required_state
+                result = func(self)
+                self._state = next_state
+                return result
+
+            return wrapper
+
+        return decorator
+
+    @_state_transition(MessageState.WAITING_FOR_INIT, MessageState.WAITING_FOR_DATA)
+    def get_init_message(self) -> ZeroCopyByteStream:
+        """
+        Returns:
+            the binary contents of the initial message as a ZeroCopyByteStream.
+        """
+        return self._do_get_init_message()
+
+    @_state_transition(MessageState.WAITING_FOR_DATA, MessageState.WAITING_FOR_FINISH)
+    def get_data_stream(self) -> BinaryIO:
+        """
+        Returns:
+            A binary stream containing the data to invoke the UDF on.
+        """
+        return self._do_get_data_stream()
+
+    @_state_transition(MessageState.WAITING_FOR_FINISH, MessageState.DONE)
+    def is_stream_finished(self) -> bool:
+        """
+        Checks if a finish message was received
+        from the JVM. The finish message itself only
+        has a message id and marks the end of the stream.
+        If bytes different from the finish id are read
+        this means something went wrong while consuming the stream.
+        """
+        return self._do_is_stream_finished()
+
+    @abstractmethod
+    def _do_get_init_message(self) -> ZeroCopyByteStream:
+        """
+        Returns the contents of the init message
+        as a 'ZeroCopyByteStream'.
+
+        To be implemented by child classes.
+        """
+        pass
+
+    @abstractmethod
+    def _do_get_data_stream(self) -> BinaryIO:
+        """
+        Returns the Spark data stream.
+
+        To be implemented by child classes.
+        """
+        pass
+
+    @abstractmethod
+    def _do_is_stream_finished(self) -> bool:
+        """
+        Blocking call that returns whether
+        the data stream from the JVM is finished.
+        This is implemented differently, depending
+        on the transport channel.
+
+        To be implemented by child classes.
+        """
+        pass
diff --git a/python/pyspark/serializers.py b/python/pyspark/serializers.py
@@ -63,6 +63,7 @@
 import zlib
 import itertools
 import pickle
+import codecs
 
 pickle_protocol = pickle.HIGHEST_PROTOCOL
 
@@ -84,6 +85,7 @@ class SpecialLengths:
     END_OF_STREAM = -4
     NULL = -5
     START_ARROW_STREAM = -6
+    START_OF_INIT_MESSAGE = -8
 
 
 class Serializer:
@@ -539,7 +541,7 @@ def loads(self, stream):
         elif length == SpecialLengths.NULL:
             return None
         s = stream.read(length)
-        return s.decode("utf-8") if self.use_unicode else s
+        return codecs.decode(s, "utf-8") if self.use_unicode else s
 
     def load_stream(self, stream):
         try:

diff --git a/python/pyspark/taskcontext.py b/python/pyspark/taskcontext.py
@@ -161,7 +161,7 @@ def _getOrCreate(cls: Type["TaskContext"]) -> "TaskContext":
         return cls._taskContext
 
     @classmethod
-    def _setTaskContext(cls: Type["TaskContext"], taskContext: "TaskContext") -> None:
+    def _setTaskContext(cls: Type["TaskContext"], taskContext: Optional["TaskContext"]) -> None:
         cls._taskContext = taskContext
 
     @classmethod