Skip to content
This repository was archived by the owner on Feb 16, 2024. It is now read-only.

Commit 8b7b6a5

Browse files
committed
Airflow demo with spark-k8s (#119)
## Description Add an Airflow job that executes and monitors a spark-k8s job in the demo.
1 parent a968ab5 commit 8b7b6a5

21 files changed

+342
-32
lines changed
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
---
# ClusterRole granting the Airflow demo the minimum verbs it needs:
# create/read SparkApplications (submitted by the DAG), read AirflowDB
# init status, watch the Airflow StatefulSets, and list PVCs.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: airflow-spark-clusterrole
rules:
  # Submit and monitor Spark jobs driven from the Airflow DAG.
  - apiGroups:
      - spark.stackable.tech
    resources:
      - sparkapplications
    verbs:
      - create
      - get
      - list
  # Read the AirflowDB resource to wait for database initialization.
  - apiGroups:
      - airflow.stackable.tech
    resources:
      - airflowdbs
    verbs:
      - create
      - get
      - list
  # Watch Airflow webserver/worker/scheduler StatefulSet rollouts.
  - apiGroups:
      - apps
    resources:
      - statefulsets
    verbs:
      - get
      - watch
      - list
  # Core API group: list PVCs used by the cluster.
  - apiGroups:
      - ""
    resources:
      - persistentvolumeclaims
    verbs:
      - list
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
---
# Binds airflow-spark-clusterrole to service accounts so the Airflow
# workers (and the demo Jobs) can talk to the Kubernetes API.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: airflow-spark-clusterrole-binding
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: airflow-spark-clusterrole
subjects:
  # NOTE(review): this grants the role to EVERY service account in the
  # cluster. Acceptable for a throwaway demo; for anything longer-lived,
  # bind to the specific Airflow service account (or at least scope to
  # system:serviceaccounts:<namespace>) instead.
  - apiGroup: rbac.authorization.k8s.io
    kind: Group
    name: system:serviceaccounts
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
---
# Job that waits for the Airflow cluster to come up, then unpauses and
# triggers the sparkapp_dag via the Airflow REST API.
apiVersion: batch/v1
kind: Job
metadata:
  name: start-pyspark-job
spec:
  template:
    spec:
      containers:
        - name: start-pyspark-job
          image: docker.stackable.tech/stackable/tools:0.2.0-stackable0.3.0
          # N.B. it is possible for the scheduler to report that a DAG exists, only for the worker task to fail if a pod is unexpectedly
          # restarted. Additionally, the db-init job takes a few minutes to complete before the cluster is deployed. The wait/watch steps
          # below are not "water-tight" but add a layer of stability by at least ensuring that the db is initialized and ready and that
          # all pods are reachable (albeit independent of each other).
          # Folded scalar (>-) joins lines with spaces, so the shell sees
          # one command chain — identical to the original inline form.
          command:
            - bash
            - -c
            - >-
              kubectl wait airflowdb/airflow --for jsonpath='{.status.condition}'=Ready --timeout 600s
              && kubectl rollout status --watch statefulset/airflow-webserver-default
              && kubectl rollout status --watch statefulset/airflow-worker-default
              && kubectl rollout status --watch statefulset/airflow-scheduler-default
              && curl -i -s --user airflow:airflow http://airflow-webserver-default:8080/api/v1/dags/sparkapp_dag
              && curl -i -s --user airflow:airflow -H 'Content-Type:application/json' -XPATCH http://airflow-webserver-default:8080/api/v1/dags/sparkapp_dag -d '{"is_paused": false}'
              && curl -i -s --user airflow:airflow -H 'Content-Type:application/json' -XPOST http://airflow-webserver-default:8080/api/v1/dags/sparkapp_dag/dagRuns -d '{}'
      restartPolicy: OnFailure
  backoffLimit: 20  # give some time for the Airflow cluster to be available
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
---
# Job that waits for the Airflow cluster to come up, then unpauses and
# triggers the date_demo DAG via the Airflow REST API.
apiVersion: batch/v1
kind: Job
metadata:
  name: start-date-job
spec:
  template:
    spec:
      containers:
        - name: start-date-job
          image: docker.stackable.tech/stackable/tools:0.2.0-stackable0.3.0
          # N.B. it is possible for the scheduler to report that a DAG exists, only for the worker task to fail if a pod is unexpectedly
          # restarted. Additionally, the db-init job takes a few minutes to complete before the cluster is deployed. The wait/watch steps
          # below are not "water-tight" but add a layer of stability by at least ensuring that the db is initialized and ready and that
          # all pods are reachable (albeit independent of each other).
          # Folded scalar (>-) joins lines with spaces, so the shell sees
          # one command chain — identical to the original inline form.
          command:
            - bash
            - -c
            - >-
              kubectl wait airflowdb/airflow --for jsonpath='{.status.condition}'=Ready --timeout 600s
              && kubectl rollout status --watch statefulset/airflow-webserver-default
              && kubectl rollout status --watch statefulset/airflow-worker-default
              && kubectl rollout status --watch statefulset/airflow-scheduler-default
              && curl -i -s --user airflow:airflow http://airflow-webserver-default:8080/api/v1/dags/date_demo
              && curl -i -s --user airflow:airflow -H 'Content-Type:application/json' -XPATCH http://airflow-webserver-default:8080/api/v1/dags/date_demo -d '{"is_paused": false}'
              && curl -i -s --user airflow:airflow -H 'Content-Type:application/json' -XPOST http://airflow-webserver-default:8080/api/v1/dags/date_demo/dagRuns -d '{}'
      restartPolicy: OnFailure
  backoffLimit: 20  # give some time for the Airflow cluster to be available

demos/airflow-scheduled-job/enable-and-run-dag.yaml

Lines changed: 0 additions & 17 deletions
This file was deleted.

demos/demos-v1.yaml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,10 @@ demos:
77
- airflow
88
- job-scheduling
99
manifests:
10-
- plainYaml: https://raw.githubusercontent.com/stackabletech/stackablectl/main/demos/airflow-scheduled-job/enable-and-run-dag.yaml
10+
- plainYaml: https://raw.githubusercontent.com/stackabletech/stackablectl/main/demos/airflow-scheduled-job/01-airflow-spark-clusterrole.yaml
11+
- plainYaml: https://raw.githubusercontent.com/stackabletech/stackablectl/main/demos/airflow-scheduled-job/02-airflow-spark-clusterrolebinding.yaml
12+
- plainYaml: https://raw.githubusercontent.com/stackabletech/stackablectl/main/demos/airflow-scheduled-job/03-enable-and-run-spark-dag.yaml
13+
- plainYaml: https://raw.githubusercontent.com/stackabletech/stackablectl/main/demos/airflow-scheduled-job/04-enable-and-run-date-dag.yaml
1114
hbase-hdfs-load-cycling-data:
1215
description: Copy data from S3 bucket to an HBase table
1316
stackableStack: hdfs-hbase
27.4 KB
Loading
131 KB
Loading
522 KB
Loading
202 KB
Loading

0 commit comments

Comments
 (0)