[SPARK-52238][PYTHON] Python client for Declarative Pipelines #50963

Open · wants to merge 3 commits into base: master
33 changes: 33 additions & 0 deletions bin/spark-pipelines
@@ -0,0 +1,33 @@
#!/usr/bin/env bash

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Default to standard python3 interpreter unless told otherwise
if [[ -z "$PYSPARK_PYTHON" ]]; then
  PYSPARK_PYTHON=python3
fi

if [ -z "${SPARK_HOME}" ]; then
  source "$(dirname "$0")"/find-spark-home
fi

# Add the PySpark classes to the Python path:
export PYTHONPATH="${SPARK_HOME}/python/:$PYTHONPATH"
export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.10.9.9-src.zip:$PYTHONPATH"

"${SPARK_HOME}/bin/spark-submit" --conf spark.api.mode=connect "${SPARK_HOME}/python/pyspark/sql/pipelines/cli.py" "$@"
1 change: 1 addition & 0 deletions dev/requirements.txt
@@ -27,6 +27,7 @@ pytest-mypy-plugins==1.9.3
flake8==3.9.0
# See SPARK-38680.
pandas-stubs<1.2.0.54
types-PyYAML

# Documentation (SQL)
mkdocs
4 changes: 4 additions & 0 deletions dev/sparktestsupport/modules.py
@@ -556,6 +556,10 @@ def __hash__(self):
"pyspark.sql.tests.pandas.test_pandas_udf_window",
"pyspark.sql.tests.pandas.test_pandas_sqlmetrics",
"pyspark.sql.tests.pandas.test_converter",
"pyspark.sql.tests.pipelines.test_block_connect_access",
"pyspark.sql.tests.pipelines.test_cli",
"pyspark.sql.tests.pipelines.test_decorators",
"pyspark.sql.tests.pipelines.test_graph_element_registry",
"pyspark.sql.tests.test_python_datasource",
"pyspark.sql.tests.test_python_streaming_datasource",
"pyspark.sql.tests.test_readwriter",
3 changes: 3 additions & 0 deletions python/mypy.ini
@@ -188,3 +188,6 @@ ignore_missing_imports = True
; Ignore errors for proto generated code
[mypy-pyspark.sql.connect.proto.*, pyspark.sql.connect.proto, pyspark.sql.streaming.proto]
ignore_errors = True

[mypy-pyspark.sql.pipelines.proto.*]
ignore_errors = True
60 changes: 60 additions & 0 deletions python/pyspark/errors/error-conditions.json
@@ -14,6 +14,11 @@
"Arrow legacy IPC format is not supported in PySpark, please unset ARROW_PRE_0_15_IPC_FORMAT."
]
},
"ATTEMPT_ANALYSIS_IN_PIPELINE_QUERY_FUNCTION": {
"message": [
"Operations that trigger DataFrame analysis or execution are not allowed in pipeline query functions. Move code outside of the pipeline query function."
]
},
"ATTRIBUTE_NOT_CALLABLE": {
"message": [
"Attribute `<attr_name>` in provided object `<obj_name>` is not callable."
@@ -219,6 +224,11 @@
"Unexpected filter <name>."
]
},
"DECORATOR_ARGUMENT_NOT_CALLABLE": {
"message": [
"The first positional argument passed to @<decorator_name> must be callable. Either add @<decorator_name> with no parameters to your function, or pass options to @<decorator_name> using keyword arguments (e.g. <example_usage>)."
]
},
"DIFFERENT_PANDAS_DATAFRAME": {
"message": [
"DataFrames are not almost equal:",
@@ -336,6 +346,11 @@
"<field_name>: <obj> is not an instance of type <data_type>."
]
},
"GRAPH_ELEMENT_DEFINED_OUTSIDE_OF_DECLARATIVE_PIPELINE": {
"message": [
"APIs that define elements of a declarative pipeline can only be invoked within the context of defining a pipeline."
]
},
"HIGHER_ORDER_FUNCTION_SHOULD_RETURN_COLUMN": {
"message": [
"Function `<func_name>` should return Column, got <return_type>."
@@ -552,6 +567,11 @@
"Mixed type replacements are not supported."
]
},
"MULTIPLE_PIPELINE_SPEC_FILES_FOUND": {
"message": [
"Multiple pipeline spec files found in the directory `<dir_path>`. Please remove one or choose a particular one with the --spec argument."
]
},
"NEGATIVE_VALUE": {
"message": [
"Value for `<arg_name>` must be greater than or equal to 0, got '<arg_value>'."
@@ -839,6 +859,41 @@
"The Pandas SCALAR_ITER UDF outputs more rows than input rows."
]
},
"PIPELINE_SPEC_DICT_KEY_NOT_STRING": {
"message": [
"For pipeline spec field `<field_name>`, key should be a string, got <key_type>."
]
},
"PIPELINE_SPEC_DICT_VALUE_NOT_STRING": {
"message": [
"For pipeline spec field `<field_name>`, value for key `<key_name>` should be a string, got <value_type>."
]
},
"PIPELINE_SPEC_FIELD_NOT_DICT": {
"message": [
"Pipeline spec field `<field_name>` should be a dict, got <field_type>."
]
},
"PIPELINE_SPEC_FILE_DOES_NOT_EXIST": {
"message": [
"The pipeline spec file `<spec_path>` does not exist."
]
},
"PIPELINE_SPEC_FILE_NOT_FOUND": {
"message": [
"No pipeline.yaml or pipeline.yml file provided in arguments or found in directory `<dir_path>` or readable ancestor directories."
]
},
"PIPELINE_SPEC_UNEXPECTED_FIELD": {
"message": [
"Pipeline spec field `<field_name>` is unexpected."
]
},
"PIPELINE_UNSUPPORTED_DEFINITIONS_FILE_EXTENSION": {
"message": [
"Pipeline definitions file `<file_path>` has an unsupported extension. Supported extensions are `.py` and `.sql`."
]
},
"PIPE_FUNCTION_EXITED": {
"message": [
"Pipe function `<func_name>` exited with error code <error_code>."
@@ -1145,6 +1200,11 @@
"Pie plot requires either a `y` column or `subplots=True`."
]
},
"UNSUPPORTED_PIPELINES_DATASET_TYPE": {
"message": [
"Unsupported pipelines dataset type: <dataset_type>."
]
},
"UNSUPPORTED_PLOT_BACKEND": {
"message": [
"`<backend>` is not supported, it should be one of the values from <supported_backends>"
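The ATTEMPT_ANALYSIS_IN_PIPELINE_QUERY_FUNCTION condition above blocks eager DataFrame analysis or execution inside a pipeline query function. A minimal sketch of the pattern it rejects, assuming the @table decorator exported by pyspark.sql.pipelines (see the __init__.py below) and a spark session in scope when the definitions file is executed; neither assumption is spelled out in this diff:

from pyspark.sql.pipelines import table

@table
def active_customers():
    df = spark.read.table("raw_customers")
    # Calling df.count() or df.collect() here would trigger analysis/execution
    # and should raise ATTEMPT_ANALYSIS_IN_PIPELINE_QUERY_FUNCTION; compute
    # such values outside the query function instead.
    return df.filter("is_active = true")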
1 change: 1 addition & 0 deletions python/pyspark/sql/connect/proto/__init__.py
@@ -25,3 +25,4 @@
from pyspark.sql.connect.proto.common_pb2 import *
from pyspark.sql.connect.proto.ml_pb2 import *
from pyspark.sql.connect.proto.ml_common_pb2 import *
from pyspark.sql.connect.proto.pipelines_pb2 import *
31 changes: 31 additions & 0 deletions python/pyspark/sql/pipelines/__init__.py
@@ -0,0 +1,31 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from pyspark.sql.pipelines.api import (
    append_flow,
    create_streaming_table,
    materialized_view,
    table,
    temporary_view,
)

__all__ = [
    "append_flow",
    "create_streaming_table",
    "materialized_view",
    "table",
    "temporary_view",
]
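
A minimal sketch of how these exports might be used in a .py pipeline definitions file (one of the two supported extensions per the error conditions above). Only the five imported names come from __all__; the parameters shown (for example the target= keyword on append_flow and the positional table name for create_streaming_table) and the in-scope spark session are illustrative assumptions, not APIs confirmed by this diff:

from pyspark.sql.pipelines import (
    append_flow,
    create_streaming_table,
    materialized_view,
    table,
    temporary_view,
)

@materialized_view
def daily_orders():
    # A batch dataset recomputed by the pipeline.
    return spark.read.table("orders").groupBy("order_date").count()

@temporary_view
def active_users():
    # An intermediate view that is not persisted as a table.
    return spark.read.table("users").filter("is_active")

@table
def orders_clean():
    return spark.read.table("orders").dropna()

# Hypothetical: a streaming table populated by an append flow; the argument
# names below are assumptions for illustration only.
create_streaming_table("events")

@append_flow(target="events")
def ingest_events():
    return spark.readStream.table("raw_events")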