8 changes: 8 additions & 0 deletions sql/connect/common/src/main/protobuf/spark/connect/base.proto
@@ -26,6 +26,7 @@ import "spark/connect/expressions.proto";
import "spark/connect/relations.proto";
import "spark/connect/types.proto";
import "spark/connect/ml.proto";
import "spark/connect/pipelines.proto";

option java_multiple_files = true;
option java_package = "org.apache.spark.connect.proto";
@@ -399,6 +400,13 @@ message ExecutePlanResponse {
// ML command response
MlCommandResult ml_command_result = 20;

// Response containing pipeline events that are streamed back to the client during a pipeline
// run.
PipelineEventsResult pipeline_events_result = 21;

// Pipeline command response
PipelineCommandResult pipeline_command_result = 22;

// Support arbitrary result objects.
google.protobuf.Any extension = 999;
}
2 changes: 2 additions & 0 deletions sql/connect/common/src/main/protobuf/spark/connect/commands.proto
@@ -22,6 +22,7 @@ import "spark/connect/common.proto";
import "spark/connect/expressions.proto";
import "spark/connect/relations.proto";
import "spark/connect/ml.proto";
import "spark/connect/pipelines.proto";

package spark.connect;

@@ -51,6 +52,7 @@ message Command {
MergeIntoTableCommand merge_into_table_command = 16;
MlCommand ml_command = 17;
ExecuteExternalCommand execute_external_command = 18;
PipelineCommand pipeline_command = 19;

// This field is used to mark extensions to the protocol. When plugins generate arbitrary
// Commands they can add them here. During the planning the correct resolution is done.
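
A quick orientation note on how these two hunks fit together: the new `pipeline_command` member rides the existing `Command` oneof, and pipeline results come back through the two new `ExecutePlanResponse` fields added above. A minimal client-side sketch follows, assuming generated Python stubs under `pyspark.sql.connect.proto`; the `pipelines_pb2` module name is an assumption, since this PR only adds the .proto definition.

```python
# Sketch only: how the new pipeline command plugs into the existing Connect plumbing.
# `pipelines_pb2` is an assumed module name; base_pb2/commands_pb2 are the usual stubs.
from pyspark.sql.connect.proto import base_pb2, commands_pb2, pipelines_pb2

pipeline_cmd = pipelines_pb2.PipelineCommand()
pipeline_cmd.create_dataflow_graph.default_database = "default"

command = commands_pb2.Command()
command.pipeline_command.CopyFrom(pipeline_cmd)  # new oneof member added by this diff

plan = base_pb2.Plan()
plan.command.CopyFrom(command)
# `plan` is sent through the usual ExecutePlanRequest path; the streamed
# ExecutePlanResponse messages carry pipeline payloads in the new
# `pipeline_command_result` and `pipeline_events_result` fields.
```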
173 changes: 173 additions & 0 deletions sql/connect/common/src/main/protobuf/spark/connect/pipelines.proto
@@ -0,0 +1,173 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

syntax = "proto3";

package spark.connect;

import "spark/connect/relations.proto";
import "spark/connect/types.proto";

option java_multiple_files = true;
option java_package = "org.apache.spark.connect.proto";

// Dispatch object for pipelines commands. See each individual command for documentation.
message PipelineCommand {
oneof command_type {
CreateDataflowGraph create_dataflow_graph = 1;
DefineDataset define_dataset = 2;
DefineFlow define_flow = 3;
DropDataflowGraph drop_dataflow_graph = 4;
StartRun start_run = 5;
StopRun stop_run = 6;
DefineSqlGraphElements define_sql_graph_elements = 7;
}

// Request to create a new dataflow graph.
message CreateDataflowGraph {
// The default catalog.
optional string default_catalog = 1;

// The default database.
optional string default_database = 2;

// SQL configurations for all flows in this graph.
map<string, string> sql_conf = 5;

message Response {
// The ID of the created graph.
string dataflow_graph_id = 1;
}
}
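
For illustration, a hedged sketch of building this request and reading back the graph ID; the stub path and all literal values are placeholders, not part of the diff.

```python
# Sketch: create a dataflow graph and capture its ID (assumed stub path, placeholder values).
from pyspark.sql.connect.proto import pipelines_pb2

cmd = pipelines_pb2.PipelineCommand()
cmd.create_dataflow_graph.default_catalog = "spark_catalog"
cmd.create_dataflow_graph.default_database = "default"
cmd.create_dataflow_graph.sql_conf["spark.sql.shuffle.partitions"] = "4"
assert cmd.WhichOneof("command_type") == "create_dataflow_graph"

# The server answers with PipelineCommandResult.CreateDataflowGraphResult; the ID is what
# every later DefineDataset / DefineFlow / StartRun request refers to.
result = pipelines_pb2.PipelineCommandResult()
result.create_dataflow_graph_result.dataflow_graph_id = "graph-123"  # placeholder
graph_id = result.create_dataflow_graph_result.dataflow_graph_id
```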

// Drops the graph and stops any running attached flows.
message DropDataflowGraph {
// The graph to drop.
string dataflow_graph_id = 1;
}

// Request to define a dataset: a table, a materialized view, or a temporary view.
message DefineDataset {
// The graph to attach this dataset to.
string dataflow_graph_id = 1;

// Name of the dataset. Can be partially or fully qualified.
string dataset_name = 2;

// The type of the dataset.
DatasetType dataset_type = 3;

// Optional comment for the dataset.
optional string comment = 4;

// Optional table properties. Only applies to dataset_type == TABLE and dataset_type == MATERIALIZED_VIEW.
map<string, string> table_properties = 5;

// Optional partition columns for the dataset. Only applies to dataset_type == TABLE and
// dataset_type == MATERIALIZED_VIEW.
repeated string partition_cols = 6;

// Schema for the dataset. If unset, this will be inferred from incoming flows.
optional spark.connect.DataType schema = 7;

// The output table format of the dataset. Only applies to dataset_type == TABLE and
// dataset_type == MATERIALIZED_VIEW.
optional string format = 8;
}
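
A short sketch of defining a table-typed dataset; the names, properties, and stub path are illustrative assumptions.

```python
# Sketch: register a TABLE dataset on an existing graph (assumed stub path, placeholder names).
from pyspark.sql.connect.proto import pipelines_pb2

ds = pipelines_pb2.PipelineCommand.DefineDataset()
ds.dataflow_graph_id = "graph-123"                      # from CreateDataflowGraph.Response
ds.dataset_name = "sales.daily_totals"                  # partially qualified name
ds.dataset_type = pipelines_pb2.DatasetType.Value("TABLE")
ds.comment = "Daily aggregated sales"
ds.table_properties["owner"] = "pipelines-team"         # only meaningful for TABLE / MATERIALIZED_VIEW
ds.partition_cols.append("sale_date")
ds.format = "parquet"                                   # leaving `schema` unset lets it be inferred from flows
```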

// Request to define a flow targeting a dataset.
message DefineFlow {
// The graph to attach this flow to.
string dataflow_graph_id = 1;

// Name of the flow. For standalone flows, this must be a single-part name.
string flow_name = 2;

// Name of the dataset this flow writes to. Can be partially or fully qualified.
string target_dataset_name = 3;

// An unresolved relation that defines the dataset's flow.
spark.connect.Relation plan = 4;

// Default SQL configurations set when running this flow.
Contributor:

Nitpick: is the word "Default" relevant here? There's nothing more specific, right?

Contributor:

How is this related to the session in which the flow is defined? Is this an additional way to set configurations? I assume this takes precedence over what the session has configured?

Contributor Author:

Yeah, no need to say default - there is no more specific mechanism to set confs.

> How is this related to the session in which the flow is defined? Is this an additional way to set configurations? I assume this takes precedence over what the session has configured?

For now, this is not supported. Users have to set confs directly in the table / flow decorators for them to be applied to the pipeline.

map<string, string> sql_conf = 5;

// If true, this flow will only be run once per execution.
bool once = 6;
Contributor:

Care to elaborate? Is this a synonym for this is batch?

Contributor Author:

This corresponds to Trigger.Once in Spark - the flow runs once per update. This is similar to batch in triggered updates, but not in continuous ones (which we will add eventually).

}
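
To connect the pieces, a sketch of a flow that writes into the dataset defined above; the SQL text, IDs, and module paths are assumptions for illustration only.

```python
# Sketch: attach a flow that populates a target dataset (assumed stub paths, placeholder SQL).
from pyspark.sql.connect.proto import pipelines_pb2, relations_pb2

flow = pipelines_pb2.PipelineCommand.DefineFlow()
flow.dataflow_graph_id = "graph-123"
flow.flow_name = "daily_totals"                   # standalone flows use a single-part name
flow.target_dataset_name = "sales.daily_totals"

# The flow body is an unresolved relation; here, a plain SQL query relation.
plan = relations_pb2.Relation()
plan.sql.query = "SELECT sale_date, sum(amount) AS total FROM sales.raw GROUP BY sale_date"
flow.plan.CopyFrom(plan)

flow.sql_conf["spark.sql.ansi.enabled"] = "true"  # confs applied when this flow runs
flow.once = True                                  # run once per update (Trigger.Once semantics)
```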

// Resolves all datasets and flows and starts a pipeline update. Should be called after all
// graph elements are registered.
message StartRun {
// The graph to start.
string dataflow_graph_id = 1;
}

// Stops all running flows in the graph. This is a no-op if the graph is not running.
message StopRun {
// The ID of the graph to stop.
string dataflow_graph_id = 1;
}
}

// Parses the SQL file and registers all datasets and flows.
message DefineSqlGraphElements {
// The graph to attach these SQL graph elements to.
optional string dataflow_graph_id = 1;
Contributor:

I noticed that this is marked optional, but that the corresponding field in DefineDataset is not. How should we decide when to use optional?

cc @hvanhovell if there's a general recommendation on this.

Contributor Author:

What optional does is generate a has<FIELD> method in Java. We can use that to throw an exception when a field isn't present. Else, the field always has an empty string value.

So really, all of our primitives should have an optional designation. I will change that.

Contributor Author:

made all of these optional.


// The full path to the SQL file. Can be relative or absolute.
optional string sql_file_path = 2;

// The contents of the SQL file.
optional string sql_text = 3;
}
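
Two things are worth illustrating here: registering SQL-defined graph elements, and the explicit-presence behaviour that the `optional` discussion above is about. A hedged sketch follows; the stub path and SQL are placeholders, and the message is referenced at the top-level position it has in this revision.

```python
# Sketch: register graph elements from SQL, and check field presence (`optional` semantics).
from pyspark.sql.connect.proto import pipelines_pb2

elems = pipelines_pb2.DefineSqlGraphElements()

# Explicit presence: an unset optional scalar is distinguishable from one set to its default.
assert not elems.HasField("dataflow_graph_id")   # never set -> server can reject the request
elems.dataflow_graph_id = "graph-123"
assert elems.HasField("dataflow_graph_id")

elems.sql_file_path = "pipelines/daily_totals.sql"   # relative path, placeholder
# Placeholder SQL; the exact pipeline SQL syntax is outside the scope of this diff.
elems.sql_text = (
    "CREATE MATERIALIZED VIEW daily_totals AS "
    "SELECT sale_date, sum(amount) AS total FROM sales.raw GROUP BY sale_date"
)
```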

// Dispatch object for pipelines command results.
message PipelineCommandResult {
oneof result_type {
CreateDataflowGraphResult create_dataflow_graph_result = 1;
}
message CreateDataflowGraphResult {
// The ID of the created graph.
string dataflow_graph_id = 1;
}
}

// The type of dataset.
enum DatasetType {
// Safe default value. Should not be used.
DATASET_UNSPECIFIED = 0;
Contributor:

Linter rule should say: DATASET_TYPE_UNSPECIFIED

https://protobuf.dev/programming-guides/style/#enums

// A materialized view dataset which is published to the catalog
MATERIALIZED_VIEW = 1;
// A table which is published to the catalog
TABLE = 2;
// A view which is not published to the catalog
TEMPORARY_VIEW = 3;
}

// A response containing events emitted during the run of a pipeline.
message PipelineEventsResult {
repeated PipelineEvent events = 1;
Contributor:

Batching events should not be needed. gRPC server side streaming can return multiple 'events' at the same time, provided it can fit them in a single window (~30k).

Contributor Author:

That's fair. But I think the repeated field adds more flexibility in general. We can group events logically, rather than just to avoid network latency.

Contributor:

Per further feedback from @grundprinzip and @hvanhovell, I'm going to take this batching out. We can always add it back in the future if we come up with a use case for logical grouping.

Contributor:

The doc should be more explicit about how "complete" the set of events is that you receive here. Are these all events or just some? How do you know if more are coming or not?

Generally, I'd stand with Herman that if you don't expect to emit thousands of events per second, your code will be easier and simpler if you don't use a repeated field here and simply emit one event per message.

}

// An event emitted during the run of a graph.
message PipelineEvent {
Contributor:

Is this also supposed to include errors? If so, it'd be nice to understand what has failed... In that case adding a flow/dataset name would be nice.

Contributor Author:

Yeah, I can see the value in adding the dataset and flow name. But two things:

  1. OTOH, we wanted to keep PipelineEvents as a generic event bus rather than a structured logging format.
  2. It's possible an error happens that isn't scoped to a dataset/flow, making this field unpredictably empty.

But at the very least, the dataset/flow name will be in the error message.

Contributor:

To add on to what @aakash-db said, our main use case for these events is to print out to the console, and the string messages will include all the context that's needed for that. Once we have a use case that involves consuming the dataset/flow name programmatically, I'd be supportive of adding more structure to this.

Contributor:

Btw, errors should flow the regular way through the exception process and the error details. If we were to do it differently it would just create issues later.

Contributor:

@grundprinzip I actually agree with you. If the pipeline fails we should fail in the normal way. However, that failure can originate from multiple places. As a user I would like to be able to figure out what failed. We could embed that failure information in these events.

// The time of the event.
optional string timestamp = 1;
// The message that should be displayed to users.
optional string message = 2;
}
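
Finally, a hedged sketch of how a client might surface these events while iterating the ExecutePlanResponse stream; the iteration plumbing itself is not part of this diff, only the response fields shown above.

```python
# Sketch: print pipeline events from a stream of ExecutePlanResponse messages.
from typing import Iterator
from pyspark.sql.connect.proto import base_pb2

def print_pipeline_responses(responses: Iterator[base_pb2.ExecutePlanResponse]) -> None:
    for response in responses:
        if response.HasField("pipeline_events_result"):
            for event in response.pipeline_events_result.events:
                ts = event.timestamp if event.HasField("timestamp") else ""
                msg = event.message if event.HasField("message") else ""
                print(f"[{ts}] {msg}")
        elif response.HasField("pipeline_command_result"):
            result = response.pipeline_command_result
            if result.HasField("create_dataflow_graph_result"):
                print("created graph:", result.create_dataflow_graph_result.dataflow_graph_id)
```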