
Commit 0fc7546

gaogaotiantian authored and HyukjinKwon committed
[SPARK-56519][PYTHON] Isolate communication part from python udf worker
### What changes were proposed in this pull request?

We read everything we need for creating the Python UDF worker from the JVM at the very beginning, so we don't need to read anything while we process. This isolates the communication part from the processing logic.

A message interface is designed to hold all the information from the JVM. For now it does not change the protocol at all, but it is designed to support multiple protocols. The goal is to have a layer that contains all the needed information in a language-agnostic form (`int`, `str`, `bytes`, `None`, `list`, `dict`). In the future we can support an entirely different protocol while keeping support for the existing socket protocol - the new protocol just needs to fill in the blanks.

We still have some random workers that rely on `setup_spark_files` or `setup_broadcasts`, so in this PR we made all these worker utils support both the old and the new input: if the argument is an IO object, the util follows the original code path; if it is fed the concrete data, it simply uses that data to initialize everything.

### Why are the changes needed?

The current protocol is fragile because we mix communication with processing: no one knows exactly where the socket is going to be read, and the flow depends heavily on the actual eval type, which is horrible. We want to isolate the communication part and read just once - read everything we need, then process. That makes the protocol much more robust and easier to debug.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

`test_udf.py` and `test_udtf.py` passed locally. Still need to wait for the CI result.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes #55380 from gaogaotiantian/isolate-init-data.

Authored-by: Tian Gao <gaogaotiantian@hotmail.com>
Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>
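As a rough illustration of the dual-input pattern described above, here is a minimal sketch of a worker util that accepts either the raw stream (old protocol) or concrete, already-decoded data (new message layer). The signature, the wire-format helper, and the body are assumptions for illustration only, not the actual `pyspark.worker_util` code:

```python
import os
import sys
from typing import IO, List, Optional, Union


def _read_utf8(infile: IO) -> str:
    # Placeholder for the length-prefixed UTF-8 read the real worker uses.
    length = int.from_bytes(infile.read(4), "big")
    return infile.read(length).decode("utf-8")


def setup_spark_files(source: Union[IO, str],
                      python_includes: Optional[List[str]] = None) -> None:
    """Accept either the raw stream (old protocol) or concrete data (new)."""
    if hasattr(source, "read"):
        # Old path: decode the directory and the include list from the socket.
        spark_files_dir = _read_utf8(source)
        num_includes = int.from_bytes(source.read(4), "big")
        python_includes = [_read_utf8(source) for _ in range(num_includes)]
    else:
        # New path: the caller already extracted everything from the message.
        spark_files_dir = source
    for include in python_includes or []:
        sys.path.append(os.path.join(spark_files_dir, include))
```

Dispatching on `hasattr(source, "read")` keeps the old socket path alive while letting a new protocol hand in plain Python values.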
1 parent a2dd757 · commit 0fc7546

4 files changed: 414 additions & 135 deletions


python/pyspark/taskcontext.py

Lines changed: 3 additions & 3 deletions
```diff
@@ -14,7 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-from typing import ClassVar, Type, TypeVar, Dict, List, Optional, Union, cast
+from typing import Any, ClassVar, Type, TypeVar, Dict, List, Optional, Union, cast

 from pyspark.util import local_connect_and_auth
 from pyspark.serializers import read_int, write_int, write_with_length, UTF8Deserializer
@@ -130,7 +130,7 @@ class TaskContext:
     _cpus: Optional[int] = None
     _resources: Optional[Dict[str, "ResourceInformation"]] = None

-    def __new__(cls: Type["TaskContext"], **kwargs: Dict) -> "TaskContext":
+    def __new__(cls: Type["TaskContext"], **kwargs: Any) -> "TaskContext":
         """
         Even if users construct :class:`TaskContext` instead of using get, give them the singleton.
         """
@@ -142,7 +142,7 @@ def __new__(cls: Type["TaskContext"], **kwargs: Dict) -> "TaskContext":

     def __init__(
         self,
-        **kwargs: Dict,
+        **kwargs: Any,
     ) -> None:
         # Set attributes only if they are passed in and not None
         # The kwargs are auto-mapped to the private attributes of TaskContext
```
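A side note on the annotation change above: `**kwargs: T` types each individual keyword value as `T`, not the mapping as a whole, so `**kwargs: Dict` wrongly required every value to be a dict. A standalone illustration (not PySpark code):

```python
from typing import Any, Dict


def old_style(**kwargs: Dict) -> None:
    # Each value must itself be a dict here, so a type checker rejects
    # old_style(stageId=1) because 1 is an int, not a Dict.
    ...


def new_style(**kwargs: Any) -> None:
    # Values of any type are accepted, matching how TaskContext receives
    # heterogeneous attributes.
    ...


new_style(stageId=1, partitionId=0, attemptNumber=0)  # type-checks cleanly
```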

python/pyspark/worker.py

Lines changed: 47 additions & 79 deletions
```diff
@@ -57,8 +57,6 @@
 from pyspark.util import PythonEvalType
 from pyspark.serializers import (
     write_int,
-    read_long,
-    read_bool,
     write_long,
     read_int,
     SpecialLengths,
@@ -109,6 +107,7 @@
 )
 from pyspark import _NoValue, shuffle
 from pyspark.errors import PySparkRuntimeError, PySparkTypeError, PySparkValueError
+from pyspark.worker_message import WorkerInitInfo
 from pyspark.worker_util import (
     check_python_version,
     get_sock_file_to_executor,
@@ -118,7 +117,6 @@
     setup_broadcasts,
     setup_memory_limits,
     setup_spark_files,
-    utf8_deserializer,
     Conf,
 )
 from pyspark.logger.worker_io import capture_outputs
@@ -981,39 +979,25 @@ def profiling_func(*args, **kwargs):
     return profiling_func


-def read_single_udf(pickleSer, infile, eval_type, runner_conf, udf_index):
-    num_arg = read_int(infile)
-
-    args_offsets = []
-    kwargs_offsets = {}
-    for _ in range(num_arg):
-        offset = read_int(infile)
-        if read_bool(infile):
-            name = utf8_deserializer.loads(infile)
-            kwargs_offsets[name] = offset
-        else:
-            args_offsets.append(offset)
-
+def read_single_udf(pickleSer, udf_info, eval_type, runner_conf, udf_index):
     chained_func = None
-    for i in range(read_int(infile)):
-        f, return_type = read_command(pickleSer, infile)
+    for udf in udf_info.udfs:
+        f, return_type = read_command(pickleSer, udf)
         if chained_func is None:
             chained_func = f
         else:
             chained_func = chain(chained_func, f)

-    result_id = read_long(infile)
-
     # If chained_func is from pyspark.sql.worker, it is to read/write data source.
     # In this case, we check the data_source_profiler config.
     if getattr(chained_func, "__module__", "").startswith("pyspark.sql.worker."):
         profiler = runner_conf.data_source_profiler
     else:
         profiler = runner_conf.udf_profiler
     if profiler == "perf":
-        profiling_func = wrap_perf_profiler(chained_func, eval_type, result_id)
+        profiling_func = wrap_perf_profiler(chained_func, eval_type, udf_info.result_id)
     elif profiler == "memory":
-        profiling_func = wrap_memory_profiler(chained_func, eval_type, result_id)
+        profiling_func = wrap_memory_profiler(chained_func, eval_type, udf_info.result_id)
     else:
         profiling_func = chained_func

@@ -1027,6 +1011,8 @@ def read_single_udf(pickleSer, infile, eval_type, runner_conf, udf_index):
     # when they are processed in a for loop, raise them as RuntimeError's instead
     func = fail_on_stopiteration(profiling_func)

+    args_offsets, kwargs_offsets = udf_info.args, udf_info.kwargs
+
     # the last returnType will be the return type of UDF
     if eval_type == PythonEvalType.SQL_SCALAR_PANDAS_UDF:
         return func, args_offsets, kwargs_offsets, return_type
@@ -1105,7 +1091,7 @@ def read_single_udf(pickleSer, infile, eval_type, runner_conf, udf_index):
 # It expects the UDTF to be in a specific format and performs various checks to
 # ensure the UDTF is valid. This function also prepares a mapper function for applying
 # the UDTF logic to input rows.
-def read_udtf(pickleSer, infile, eval_type, runner_conf, eval_conf):
+def read_udtf(pickleSer, udtf_info, eval_type, runner_conf, eval_conf):
     if eval_type == PythonEvalType.SQL_ARROW_TABLE_UDF:
         if runner_conf.use_legacy_pandas_udtf_conversion:
             # NOTE: if timezone is set here, that implies respectSessionTimeZone is True
@@ -1126,50 +1112,35 @@ def read_udtf(pickleSer, infile, eval_type, runner_conf, eval_conf):
         # Each row is a group so do not batch but send one by one.
         ser = BatchedSerializer(CPickleSerializer(), 1)

-    # See 'PythonUDTFRunner.PythonUDFWriterThread.writeCommand'
-    num_arg = read_int(infile)
-    args_offsets = []
-    kwargs_offsets = {}
-    for _ in range(num_arg):
-        offset = read_int(infile)
-        if read_bool(infile):
-            name = utf8_deserializer.loads(infile)
-            kwargs_offsets[name] = offset
-        else:
-            args_offsets.append(offset)
-    num_partition_child_indexes = read_int(infile)
-    partition_child_indexes = [read_int(infile) for i in range(num_partition_child_indexes)]
-    has_pickled_analyze_result = read_bool(infile)
-    if has_pickled_analyze_result:
-        pickled_analyze_result = pickleSer._read_with_length(infile)
+    if udtf_info.pickled_analyze_result is not None:
+        pickled_analyze_result = pickleSer.loads(udtf_info.pickled_analyze_result)
     else:
         pickled_analyze_result = None
     # Initially we assume that the UDTF __init__ method accepts the pickled AnalyzeResult,
     # although we may set this to false later if we find otherwise.
-    handler = read_command(pickleSer, infile)
+    handler = read_command(pickleSer, udtf_info.handler)
     if not isinstance(handler, type):
         raise PySparkRuntimeError(
             f"Invalid UDTF handler type. Expected a class (type 'type'), but "
             f"got an instance of {type(handler).__name__}."
         )

-    return_type = _parse_datatype_json_string(utf8_deserializer.loads(infile))
+    return_type = _parse_datatype_json_string(udtf_info.return_type)
     if not isinstance(return_type, StructType):
         raise PySparkRuntimeError(
             f"The return type of a UDTF must be a struct type, but got {type(return_type)}."
         )
-    udtf_name = utf8_deserializer.loads(infile)

     # Update the handler that creates a new UDTF instance to first try calling the UDTF constructor
     # with one argument containing the previous AnalyzeResult. If that fails, then try a constructor
     # with no arguments. In this way each UDTF class instance can decide if it wants to inspect the
     # AnalyzeResult.
     udtf_init_args = inspect.getfullargspec(handler)
-    if has_pickled_analyze_result:
+    if pickled_analyze_result is not None:
         if len(udtf_init_args.args) > 2:
             raise PySparkRuntimeError(
                 errorClass="UDTF_CONSTRUCTOR_INVALID_IMPLEMENTS_ANALYZE_METHOD",
-                messageParameters={"name": udtf_name},
+                messageParameters={"name": udtf_info.name},
             )
         elif len(udtf_init_args.args) == 2:
             prev_handler = handler
@@ -1182,7 +1153,7 @@ def construct_udtf():
     elif len(udtf_init_args.args) > 1:
         raise PySparkRuntimeError(
             errorClass="UDTF_CONSTRUCTOR_INVALID_NO_ANALYZE_METHOD",
-            messageParameters={"name": udtf_name},
+            messageParameters={"name": udtf_info.name},
         )

     class UDTFWithPartitions:
@@ -1220,7 +1191,7 @@ def __init__(self, create_udtf: Callable, partition_child_indexes: list):
             self._create_udtf: Callable = create_udtf
             self._udtf = create_udtf()
             self._prev_arguments: list = list()
-            self._partition_child_indexes: list = partition_child_indexes
+            self._partition_child_indexes: list = udtf_info.partition_child_indexes
             self._eval_raised_skip_rest_of_input_table: bool = False

         def eval(self, *args, **kwargs) -> Iterator:
@@ -1614,13 +1585,13 @@ def _remove_partition_by_exprs(self, arg: Any) -> Any:

     # Instantiate the UDTF class.
     try:
-        if len(partition_child_indexes) > 0:
+        if len(udtf_info.partition_child_indexes) > 0:
             # Determine if this is an Arrow UDTF
             is_arrow_udtf = eval_type == PythonEvalType.SQL_ARROW_UDTF
             if is_arrow_udtf:
-                udtf = ArrowUDTFWithPartition(handler, partition_child_indexes)
+                udtf = ArrowUDTFWithPartition(handler, udtf_info.partition_child_indexes)
             else:
-                udtf = UDTFWithPartitions(handler, partition_child_indexes)
+                udtf = UDTFWithPartitions(handler, udtf_info.partition_child_indexes)
         else:
             udtf = handler()
     except Exception as e:
@@ -1640,11 +1611,11 @@ def _remove_partition_by_exprs(self, arg: Any) -> Any:
     # Check that the arguments provided to the UDTF call match the expected parameters defined
     # in the 'eval' method signature.
     try:
-        inspect.signature(udtf.eval).bind(*args_offsets, **kwargs_offsets)
+        inspect.signature(udtf.eval).bind(*udtf_info.args, **udtf_info.kwargs)
     except TypeError as e:
         raise PySparkRuntimeError(
             errorClass="UDTF_EVAL_METHOD_ARGUMENTS_DO_NOT_MATCH_SIGNATURE",
-            messageParameters={"name": udtf_name, "reason": str(e)},
+            messageParameters={"name": udtf_info.name, "reason": str(e)},
         ) from None

 def build_null_checker(return_type: StructType) -> Optional[Callable[[Any], None]]:
```
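The signature check kept in the hunk above relies on `inspect.signature(...).bind(...)`, which validates a call shape without actually invoking the function. A standalone illustration of the mechanism:

```python
import inspect


class MyUDTF:
    def eval(self, a, b, *, scale=1):
        yield (a + b) * scale


udtf = MyUDTF()

# Succeeds: the positional/keyword arguments fit eval's signature
# (self is excluded because udtf.eval is a bound method).
inspect.signature(udtf.eval).bind(1, 2, scale=3)

# Fails with TypeError, which the worker surfaces as
# UDTF_EVAL_METHOD_ARGUMENTS_DO_NOT_MATCH_SIGNATURE.
try:
    inspect.signature(udtf.eval).bind(1, 2, 3)
except TypeError as e:
    print(e)  # e.g. "too many positional arguments"
```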
```diff
@@ -1848,7 +1819,7 @@ def evaluate(*args: pd.Series, num_rows=1):
         return evaluate

     eval_func_kwargs_support, args_kwargs_offsets = wrap_kwargs_support(
-        getattr(udtf, "eval"), args_offsets, kwargs_offsets
+        getattr(udtf, "eval"), udtf_info.args, udtf_info.kwargs
     )
     eval = wrap_arrow_udtf(eval_func_kwargs_support, return_type)

@@ -2010,7 +1981,7 @@ def evaluate(*args: list, num_rows=1):
         return evaluate

     eval_func_kwargs_support, args_kwargs_offsets = wrap_kwargs_support(
-        getattr(udtf, "eval"), args_offsets, kwargs_offsets
+        getattr(udtf, "eval"), udtf_info.args, udtf_info.kwargs
     )
     eval = wrap_arrow_udtf(eval_func_kwargs_support, return_type)

@@ -2136,7 +2107,7 @@ def evaluate(*args: pa.RecordBatch):
         return evaluate

     eval_func_kwargs_support, args_kwargs_offsets = wrap_kwargs_support(
-        getattr(udtf, "eval"), args_offsets, kwargs_offsets
+        getattr(udtf, "eval"), udtf_info.args, udtf_info.kwargs
     )
     eval = wrap_pyarrow_udtf(eval_func_kwargs_support, return_type)

@@ -2238,7 +2209,7 @@ def evaluate(*a) -> tuple:
         return evaluate

     eval_func_kwargs_support, args_kwargs_offsets = wrap_kwargs_support(
-        getattr(udtf, "eval"), args_offsets, kwargs_offsets
+        getattr(udtf, "eval"), udtf_info.args, udtf_info.kwargs
     )
     eval = wrap_udtf(eval_func_kwargs_support, return_type)

@@ -2266,7 +2237,7 @@ def mapper(_, it):
         return mapper, None, ser, ser


-def read_udfs(pickleSer, infile, eval_type, runner_conf, eval_conf):
+def read_udfs(pickleSer, udf_info_list, eval_type, runner_conf, eval_conf):
     if eval_type in (
         PythonEvalType.SQL_ARROW_BATCHED_UDF,
         PythonEvalType.SQL_SCALAR_PANDAS_UDF,
@@ -2400,13 +2371,13 @@ def read_udfs(pickleSer, infile, eval_type, runner_conf, eval_conf):
         batch_size = int(os.environ.get("PYTHON_UDF_BATCH_SIZE", "100"))
         ser = BatchedSerializer(CPickleSerializer(), batch_size)

-    # Read all UDFs
-    num_udfs = read_int(infile)
     udfs = [
-        read_single_udf(pickleSer, infile, eval_type, runner_conf, udf_index=i)
-        for i in range(num_udfs)
+        read_single_udf(pickleSer, udf_info, eval_type, runner_conf, udf_index=udf_index)
+        for udf_index, udf_info in enumerate(udf_info_list)
     ]

+    num_udfs = len(udfs)
+
     def extract_key_value_indexes(grouped_arg_offsets):
         """
         Helper function to extract the key and value indexes from arg_offsets for the grouped and
@@ -3599,52 +3570,49 @@ def func(_, it):
 def main(infile, outfile):
     try:
         boot_time = time.time()
-        split_index = read_int(infile)
-        if split_index == -1:  # for unit tests
-            sys.exit(-1)
+        init_info = WorkerInitInfo.from_stream(infile)
         start_faulthandler_periodic_traceback()
-        check_python_version(infile)
+        check_python_version(init_info.python_version)

         memory_limit_mb = int(os.environ.get("PYSPARK_EXECUTOR_MEMORY_MB", "-1"))
         setup_memory_limits(memory_limit_mb)

-        task_context_json = json.loads(utf8_deserializer.loads(infile))
-        if task_context_json["isBarrier"]:
-            taskContext = BarrierTaskContext.from_json(task_context_json)
-        else:
-            taskContext = TaskContext.from_json(task_context_json)
-        TaskContext._setTaskContext(taskContext)
+        TaskContext._setTaskContext(init_info.task_context.to_task_context())

         shuffle.MemoryBytesSpilled = 0
         shuffle.DiskBytesSpilled = 0

-        setup_spark_files(infile)
-        setup_broadcasts(infile)
+        setup_spark_files(init_info.spark_files_dir, init_info.python_includes)
+        setup_broadcasts(
+            init_info.broadcast.variables,
+            init_info.broadcast.conn_info,
+            init_info.broadcast.auth_secret,
+        )

         _accumulatorRegistry.clear()
-        eval_type = read_int(infile)
-        runner_conf = RunnerConf(infile)
-        eval_conf = EvalConf(infile)
+        eval_type = init_info.eval_type
+        runner_conf = RunnerConf(init_info.runner_conf)
+        eval_conf = EvalConf(init_info.eval_conf)
         if eval_type == PythonEvalType.NON_UDF:
-            func, profiler, deserializer, serializer = read_command(pickleSer, infile)
+            func, profiler, deserializer, serializer = read_command(pickleSer, init_info.udf_info)
         elif eval_type in (
             PythonEvalType.SQL_TABLE_UDF,
             PythonEvalType.SQL_ARROW_TABLE_UDF,
             PythonEvalType.SQL_ARROW_UDTF,
         ):
             func, profiler, deserializer, serializer = read_udtf(
-                pickleSer, infile, eval_type, runner_conf, eval_conf
+                pickleSer, init_info.udf_info, eval_type, runner_conf, eval_conf
             )
         else:
             func, profiler, deserializer, serializer = read_udfs(
-                pickleSer, infile, eval_type, runner_conf, eval_conf
+                pickleSer, init_info.udf_info, eval_type, runner_conf, eval_conf
             )

         init_time = time.time()

         def process():
             iterator = deserializer.load_stream(infile)
-            out_iter = func(split_index, iterator)
+            out_iter = func(init_info.split_index, iterator)
             try:
                 serializer.dump_stream(out_iter, outfile)
             finally:
```

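The `pyspark.worker_message.WorkerInitInfo` class imported in `main` is not shown in this excerpt. As a rough sketch of its shape, inferred purely from the fields the new `main` reads above (field types, defaults, and the `from_stream` body are assumptions, not the actual implementation):

```python
from dataclasses import dataclass, field
from typing import IO, Any, Dict, List, Optional


@dataclass
class BroadcastInfo:
    variables: List[Any] = field(default_factory=list)
    conn_info: Optional[Any] = None   # socket/port details; exact type assumed
    auth_secret: Optional[str] = None


@dataclass
class WorkerInitInfo:
    split_index: int = -1
    python_version: str = ""
    task_context: Any = None          # wraps the decoded TaskContext data
    spark_files_dir: str = ""
    python_includes: List[str] = field(default_factory=list)
    broadcast: BroadcastInfo = field(default_factory=BroadcastInfo)
    eval_type: int = -1
    runner_conf: Dict[str, str] = field(default_factory=dict)
    eval_conf: Dict[str, str] = field(default_factory=dict)
    udf_info: Any = None              # per-UDF payloads: pickled bytes, offsets, result_id, ...

    @classmethod
    def from_stream(cls, infile: IO) -> "WorkerInitInfo":
        # The real implementation drains everything the JVM sends under the
        # current socket protocol into plain Python values (int, str, bytes,
        # None, list, dict), so the rest of the worker never touches the
        # stream during setup. A different protocol only has to produce the
        # same fields.
        raise NotImplementedError("protocol-specific decoding goes here")
```

Keeping the message limited to language-agnostic values is what lets a future protocol "fill in the blanks" without touching any of the processing code.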