
Commit d1db344

Python client for Spark Declarative Pipelines
1 parent ba9e0ae commit d1db344

21 files changed: +2021 -1 lines changed

bin/spark-pipelines

Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
#!/usr/bin/env bash

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Default to standard python3 interpreter unless told otherwise
if [[ -z "$PYSPARK_PYTHON" ]]; then
  PYSPARK_PYTHON=python3
fi

if [ -z "${SPARK_HOME}" ]; then
  source "$(dirname "$0")"/find-spark-home
fi

# Add the PySpark classes to the Python path:
export PYTHONPATH="${SPARK_HOME}/python/:$PYTHONPATH"
export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.10.9.9-src.zip:$PYTHONPATH"

${SPARK_HOME}/bin/spark-submit --conf spark.api.mode=connect "${SPARK_HOME}"/python/pyspark/sql/pipelines/cli.py "$@"
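The launcher defaults PYSPARK_PYTHON to python3, locates SPARK_HOME, puts the PySpark sources and the bundled Py4J zip on PYTHONPATH, and then delegates every argument to spark-submit with spark.api.mode=connect, using python/pyspark/sql/pipelines/cli.py as the entry point. The concrete flags are defined in cli.py, which this excerpt does not show; judging from the error conditions added below, a typical invocation would be something like: bin/spark-pipelines --spec ./pipeline.yaml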

dev/sparktestsupport/modules.py

Lines changed: 5 additions & 1 deletion
@@ -553,7 +553,11 @@ def __hash__(self):
         "pyspark.sql.tests.pandas.test_pandas_udf_typehints_with_future_annotations",
         "pyspark.sql.tests.pandas.test_pandas_udf_window",
         "pyspark.sql.tests.pandas.test_pandas_sqlmetrics",
-        "pyspark.sql.tests.pandas.test_converter",
+        "pyspark.sql.tests.pandas.test_converter",
+        "pyspark.sql.tests.pipelines.test_block_connect_access",
+        "pyspark.sql.tests.pipelines.test_cli",
+        "pyspark.sql.tests.pipelines.test_decorators",
+        "pyspark.sql.tests.pipelines.test_graph_element_registry",
         "pyspark.sql.tests.test_python_datasource",
         "pyspark.sql.tests.test_python_streaming_datasource",
         "pyspark.sql.tests.test_readwriter",

python/mypy.ini

Lines changed: 3 additions & 0 deletions
@@ -188,3 +188,6 @@ ignore_missing_imports = True
 ; Ignore errors for proto generated code
 [mypy-pyspark.sql.connect.proto.*, pyspark.sql.connect.proto, pyspark.sql.streaming.proto]
 ignore_errors = True
+
+[mypy-pyspark.sql.pipelines.proto.*]
+ignore_errors = True

python/pyspark/errors/error-conditions.json

Lines changed: 60 additions & 0 deletions
@@ -219,6 +219,11 @@
       "Unexpected filter <name>."
     ]
   },
+  "DECORATOR_ARGUMENT_NOT_CALLABLE": {
+    "message": [
+      "The first positional argument passed to @<decorator_name> must be callable. Either add @<decorator_name> with no parameters to your function, or pass options to @<decorator_name> using keyword arguments (e.g. <example_usage>)."
+    ]
+  },
   "DIFFERENT_PANDAS_DATAFRAME": {
     "message": [
       "DataFrames are not almost equal:",
@@ -447,6 +452,11 @@
       "StructField does not have typeName. Use typeName on its type explicitly instead."
     ]
   },
+  "GRAPH_ELEMENT_DEFINED_OUTSIDE_OF_DECLARATIVE_PIPELINE": {
+    "message": [
+      "APIs that define elements of a declarative pipeline can only be invoked within the context of defining a pipeline."
+    ]
+  },
   "INVALID_TYPE_DF_EQUALITY_ARG": {
     "message": [
       "Expected type <expected_type> for `<arg_name>` but got type <actual_type>."
@@ -552,6 +562,11 @@
       "Mixed type replacements are not supported."
     ]
   },
+  "MULTIPLE_PIPELINE_SPEC_FILES_FOUND": {
+    "message": [
+      "Multiple pipeline spec files found in the directory `<dir_path>`. Please remove one or choose a particular one with the --spec argument."
+    ]
+  },
   "NEGATIVE_VALUE": {
     "message": [
       "Value for `<arg_name>` must be greater than or equal to 0, got '<arg_value>'."
@@ -844,6 +859,46 @@
       "Pipe function `<func_name>` exited with error code <error_code>."
     ]
   },
+  "PIPELINE_SPEC_FIELD_NOT_DICT": {
+    "message": [
+      "Pipeline spec field `<field_name>` should be a dict, got <field_type>."
+    ]
+  },
+  "PIPELINE_SPEC_FILE_DOES_NOT_EXIST": {
+    "message": [
+      "The pipeline spec file `<spec_path>` does not exist."
+    ]
+  },
+  "ATTEMPT_ANALYSIS_IN_PIPELINE_QUERY_FUNCTION": {
+    "message": [
+      "Operations that trigger DataFrame analysis or execution are not allowed in pipeline query functions. Move code outside of the pipeline query function."
+    ]
+  },
+  "PIPELINE_SPEC_FILE_NOT_FOUND": {
+    "message": [
+      "No pipeline.yaml or pipeline.yml file provided in arguments or found in directory `<dir_path>` or readable ancestor directories."
+    ]
+  },
+  "PIPELINE_SPEC_DICT_KEY_NOT_STRING": {
+    "message": [
+      "For pipeline spec field `<field_name>`, key should be a string, got <key_type>."
+    ]
+  },
+  "PIPELINE_SPEC_DICT_VALUE_NOT_STRING": {
+    "message": [
+      "For pipeline spec field `<field_name>`, value for key `<key_name>` should be a string, got <value_type>."
+    ]
+  },
+  "PIPELINE_SPEC_UNEXPECTED_FIELD": {
+    "message": [
+      "Pipeline spec field `<field_name>` is unexpected."
+    ]
+  },
+  "PIPELINE_UNSUPPORTED_DEFINITIONS_FILE_EXTENSION": {
+    "message": [
+      "Pipeline definitions file `<file_path>` has an unsupported extension. Supported extensions are `.py` and `.sql`."
+    ]
+  },
   "PLOT_INVALID_TYPE_COLUMN": {
     "message": [
       "Column <col_name> must be one of <valid_types> for plotting, got <col_type>."
@@ -1145,6 +1200,11 @@
       "Pie plot requires either a `y` column or `subplots=True`."
     ]
   },
+  "UNSUPPORTED_PIPELINES_DATASET_TYPE": {
+    "message": [
+      "Unsupported pipelines dataset type: <dataset_type>."
+    ]
+  },
   "UNSUPPORTED_PLOT_BACKEND": {
     "message": [
       "`<backend>` is not supported, it should be one of the values from <supported_backends>"
python/pyspark/sql/pipelines/__init__.py

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from pyspark.sql.pipelines.api import (
    append_flow,
    create_streaming_table,
    materialized_view,
    table,
    temporary_view,
)

__all__ = [
    "append_flow",
    "create_streaming_table",
    "materialized_view",
    "table",
    "temporary_view",
]
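Together with bin/spark-pipelines and the CLI, this package is the user-facing API. A minimal sketch of a Python definitions file built on these entry points might look as follows; dataset names, the source path, and the exact keyword arguments (target= on append_flow, the positional name on create_streaming_table) are assumptions for illustration, and the file is meant to be evaluated by the spark-pipelines CLI, which provides the pipeline context.

from pyspark.sql import SparkSession
from pyspark.sql.pipelines import (
    append_flow,
    create_streaming_table,
    materialized_view,
    table,
)

spark = SparkSession.builder.getOrCreate()


@materialized_view
def raw_events():
    # Batch source; the path and format are hypothetical.
    return spark.read.json("/data/raw/events")


@table
def events_cleaned():
    # Reads the dataset defined above by name.
    return spark.read.table("raw_events").where("event_type IS NOT NULL")


# Declare a streaming table up front and feed it with an append flow.
create_streaming_table("events_stream")


@append_flow(target="events_stream")
def ingest_events():
    # "rate" is a built-in streaming source that emits monotonically increasing rows.
    return spark.readStream.format("rate").load()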
