
Commit 6c5e056

Add demo data-warehouse-iceberg-trino-spark (#144)
## Description

Needs a larger k8s cluster! I use IONOS k8s with 9 nodes of 4 cores (8 threads), 20 GB RAM and 30 GB HDD disk each. Maybe we can also offer a smaller variant later on. Otherwise business as usual.

From the feature branch, run `stackablectl --additional-stacks-file stacks/stacks-v1.yaml --additional-releases-file releases.yaml --additional-demos-file demos/demos-v1.yaml demo install data-warehouse-iceberg-trino-spark`

I'm not happy with some parts, but I think an iterative approach is best:

* Shared bikes are currently not streamed into Kafka (one-time job instead).
* Some high-volume real-time data source would be great. Currently we use the water levels and duplicate them to get higher volumes.
* Some sort of upsert or deletion use cases would be great, but probably not on the large datasets, for the sake of our wallet ^^
* Better dashboards. The current ones were thrown together quickly.
* I would like to partition the water_level measurements by day but ran into apache/iceberg#5625. There might be a way around it by using a dedicated Spark context for compaction, but we can easily adopt the partitioning after the issue gets fixed (see the sketch below). Sorting during rewrites cost performance during compaction and did not provide real benefits in my initial measurements, so it is disabled for now.
* As always, I tracked my findings in https://github.com/stackabletech/stackablectl/issues/128

To get to the Spark UI: `kubectl port-forward $(kubectl get pod -o name | grep 'spark-ingest-into-warehouse-.*-driver') 4040`
1 parent 8e29058 commit 6c5e056
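The day-partitioning mentioned above is blocked by apache/iceberg#5625. Once that issue is resolved, adopting it should roughly amount to the following sketch. This is not part of the commit; it only reuses the catalog and S3 settings from the SparkApplication added below and the `days(timestamp)` / `fanout-enabled` hints from the TODO comment in the ingestion script.

```python
from pyspark.sql import SparkSession

# Sketch only: assumes the same Iceberg catalog and S3 settings as the SparkApplication in this commit.
spark = (
    SparkSession.builder.appName("partitioning-sketch")
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .config("spark.sql.catalog.warehouse", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.warehouse.type", "hive")
    .config("spark.sql.catalog.warehouse.uri", "thrift://hive-iceberg:9083")
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.access.key", "trino")
    .config("spark.hadoop.fs.s3a.secret.key", "trinotrino")
    .getOrCreate()
)

# Day-partitioned variant of the CREATE TABLE used in spark-ingest-into-warehouse.py,
# currently blocked by apache/iceberg#5625.
spark.sql("""
    CREATE TABLE IF NOT EXISTS warehouse.water_levels.measurements (
        station_uuid string,
        timestamp    timestamp,
        value        float
    )
    USING iceberg
    PARTITIONED BY (days(timestamp))
""")

# The streaming sink would additionally need .option("fanout-enabled", "true"),
# because incoming records are not sorted by the partition column.
```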

50 files changed: +8063 / -35 lines
@@ -0,0 +1,15 @@
FROM docker.stackable.tech/stackable/pyspark-k8s:3.3.0-stackable0.2.0

RUN curl -L -O http://search.maven.org/remotecontent?filepath=org/apache/ivy/ivy/2.5.0/ivy-2.5.0.jar
RUN java -jar ivy-2.5.0.jar -notransitive \
    -dependency org.apache.spark spark-sql-kafka-0-10_2.12 3.3.0 \
    -retrieve "/stackable/spark/jars/[artifact]-[revision](-[classifier]).[ext]"
RUN java -jar ivy-2.5.0.jar -confs compile \
    -dependency org.apache.spark spark-sql-kafka-0-10_2.12 3.3.0 \
    -retrieve "/stackable/spark/jars/[artifact]-[revision](-[classifier]).[ext]"
RUN java -jar ivy-2.5.0.jar -notransitive \
    -dependency org.apache.iceberg iceberg-spark-runtime-3.3_2.12 0.14.1 \
    -retrieve "/stackable/spark/jars/[artifact]-[revision](-[classifier]).[ext]"
RUN java -jar ivy-2.5.0.jar -confs compile \
    -dependency org.apache.iceberg iceberg-spark-runtime-3.3_2.12 0.14.1 \
    -retrieve "/stackable/spark/jars/[artifact]-[revision](-[classifier]).[ext]"

demos/data-warehouse-iceberg-trino-spark/WarehouseKafkaIngest.xml: +6,207 lines (large diff, not rendered by default)
@@ -0,0 +1,72 @@
---
apiVersion: batch/v1
kind: Job
metadata:
  name: create-nifi-ingestion-job
spec:
  template:
    spec:
      serviceAccountName: demo-serviceaccount
      initContainers:
        - name: wait-for-testdata
          image: docker.stackable.tech/stackable/tools:0.2.0-stackable0.3.0
          command: ["bash", "-c", "echo 'Waiting for all kafka brokers to be ready' && kubectl wait --for=condition=ready --timeout=30m pod -l app.kubernetes.io/instance=kafka -l app.kubernetes.io/name=kafka"]
      containers:
        - name: create-nifi-ingestion-job
          image: docker.stackable.tech/stackable/testing-tools:0.1.0-stackable0.1.0
          command: ["bash", "-c", "curl -O https://raw.githubusercontent.com/stackabletech/stackablectl/demo-data-warehouse-iceberg-trino-spark/demos/data-warehouse-iceberg-trino-spark/WarehouseKafkaIngest.xml && python -u /tmp/script/script.py"]
          volumeMounts:
            - name: script
              mountPath: /tmp/script
          env:
            - name: NAMESPACE
              valueFrom:
                fieldRef:
                  fieldPath: metadata.namespace
      volumes:
        - name: script
          configMap:
            name: create-nifi-ingestion-job-script
      restartPolicy: OnFailure
  backoffLimit: 50
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: create-nifi-ingestion-job-script
data:
  script.py: |
    from nipyapi.canvas import get_root_pg_id, schedule_process_group, list_all_controllers, schedule_controller
    from nipyapi.security import service_login
    from nipyapi.templates import get_template, upload_template, deploy_template
    import nipyapi
    import os
    import urllib3

    # As of 2022-08-29 we can't use "https://nifi:8443" here because <h2>The request contained an invalid host header [<code>nifi:8443</code>] in the request [<code>/nifi-api</code>]. Check for request manipulation or third-party intercept.</h2>
    ENDPOINT = f"https://nifi-node-default-0.nifi-node-default.{os.environ['NAMESPACE']}.svc.cluster.local:8443" # For local testing / developing replace it, afterwards change back to f"https://nifi-node-default-0.nifi-node-default.{os.environ['NAMESPACE']}.svc.cluster.local:8443"
    USERNAME = "admin"
    PASSWORD = "adminadmin"
    TEMPLATE_NAME = "WarehouseKafkaIngest"
    TEMPLATE_FILE = f"{TEMPLATE_NAME}.xml"

    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    nipyapi.config.nifi_config.host = f"{ENDPOINT}/nifi-api"
    nipyapi.config.nifi_config.verify_ssl = False

    print("Logging in")
    service_login(username=USERNAME, password=PASSWORD)
    print("Logged in")

    pg_id = get_root_pg_id()

    upload_template(pg_id, TEMPLATE_FILE)

    template_id = get_template(TEMPLATE_NAME).id
    deploy_template(pg_id, template_id, 200, 0)

    for controller in list_all_controllers():
        schedule_controller(controller, scheduled=True)

    schedule_process_group(pg_id, scheduled=True)
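While iterating on the flow it can be useful to stop everything that script.py started. The following is a hypothetical counterpart of mine, not part of the commit; it only reuses the nipyapi calls the script already imports, with the same endpoint and credentials.

```python
import os
import urllib3
import nipyapi
from nipyapi.security import service_login
from nipyapi.canvas import get_root_pg_id, schedule_process_group, list_all_controllers, schedule_controller

# Hypothetical teardown helper (not part of this commit): undoes what script.py scheduled.
ENDPOINT = f"https://nifi-node-default-0.nifi-node-default.{os.environ['NAMESPACE']}.svc.cluster.local:8443"
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
nipyapi.config.nifi_config.host = f"{ENDPOINT}/nifi-api"
nipyapi.config.nifi_config.verify_ssl = False
service_login(username="admin", password="adminadmin")

pg_id = get_root_pg_id()
schedule_process_group(pg_id, scheduled=False)        # stop all processors in the root group
for controller in list_all_controllers():
    schedule_controller(controller, scheduled=False)  # then disable the controller services
```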
@@ -0,0 +1,304 @@
# We can't simply create the SparkApplication object here as we have to wait for Kafka to be ready because
# * We currently don't restart failed Spark applications (see https://github.com/stackabletech/spark-k8s-operator/issues/157)
# * We currently auto-create topics and we need all the brokers to be available so that the topic is distributed among all the brokers
---
apiVersion: batch/v1
kind: Job
metadata:
  name: create-spark-ingestion-job
spec:
  template:
    spec:
      serviceAccountName: demo-serviceaccount
      initContainers:
        - name: wait-for-testdata
          image: docker.stackable.tech/stackable/tools:0.2.0-stackable0.3.0
          command: ["bash", "-c", "echo 'Waiting for all kafka brokers to be ready' && kubectl wait --for=condition=ready --timeout=30m pod -l app.kubernetes.io/instance=kafka -l app.kubernetes.io/name=kafka"]
      containers:
        - name: create-spark-ingestion-job
          image: docker.stackable.tech/stackable/tools:0.2.0-stackable0.3.0
          command: ["bash", "-c", "echo 'Submitting Spark job' && kubectl apply -f /tmp/manifest/spark-ingestion-job.yaml"]
          volumeMounts:
            - name: manifest
              mountPath: /tmp/manifest
      volumes:
        - name: manifest
          configMap:
            name: create-spark-ingestion-job-manifest
      restartPolicy: OnFailure
  backoffLimit: 50
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: create-spark-ingestion-job-manifest
data:
  spark-ingestion-job.yaml: |
    ---
    apiVersion: spark.stackable.tech/v1alpha1
    kind: SparkApplication
    metadata:
      name: spark-ingest-into-warehouse
    spec:
      version: "1.0"
      sparkImage: docker.stackable.tech/sbernauer/pyspark-k8s-with-iceberg:latest3 # docker.stackable.tech/stackable/pyspark-k8s:3.3.0-stackable0.2.0
      mode: cluster
      mainApplicationFile: local:///stackable/spark/jobs/spark-ingest-into-warehouse.py
      # deps:
      #   packages:
      #     - org.apache.iceberg:iceberg-spark-runtime-3.3_2.12:0.14.1
      #     - org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0
      sparkConf:
        spark.hadoop.fs.s3a.endpoint: http://minio:9000
        spark.hadoop.fs.s3a.path.style.access: "true"
        spark.hadoop.fs.s3a.access.key: trino
        spark.hadoop.fs.s3a.secret.key: trinotrino
        spark.sql.extensions: org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions
        spark.sql.catalog.warehouse: org.apache.iceberg.spark.SparkCatalog
        spark.sql.catalog.warehouse.type: hive
        spark.sql.catalog.warehouse.uri: thrift://hive-iceberg:9083
      volumes:
        - name: script
          configMap:
            name: write-iceberg-table-script
      job:
        resources:
          cpu:
            min: "100m"
            max: "1"
      driver:
        resources:
          cpu:
            min: "1"
            max: "1"
          memory:
            limit: "2Gi"
        volumeMounts:
          - name: script
            mountPath: /stackable/spark/jobs
      executor:
        instances: 4
        resources:
          cpu:
            min: "2"
            max: "4"
          memory:
            limit: "12Gi"
        volumeMounts:
          - name: script
            mountPath: /stackable/spark/jobs
    ---
    apiVersion: v1
    kind: ConfigMap
    metadata:
      name: write-iceberg-table-script
    data:
      spark-ingest-into-warehouse.py: |
        from pyspark.sql import SparkSession
        from pyspark.sql.types import StructType, StructField, StringType, LongType, ShortType, FloatType, DoubleType, BooleanType, TimestampType, MapType, ArrayType
        from pyspark.sql.functions import col, from_json, expr
        import time
        from datetime import datetime, timedelta

        spark = SparkSession.builder.appName("spark-ingest-into-warehouse").getOrCreate()
        # spark.sparkContext.setLogLevel("DEBUG")

        spark.sql("CREATE SCHEMA IF NOT EXISTS warehouse.water_levels LOCATION 's3a://warehouse/water-levels/'")
        spark.sql("CREATE SCHEMA IF NOT EXISTS warehouse.smart_city LOCATION 's3a://warehouse/smart-city/'")

        # Todo add PARTITIONED BY (days(timestamp))
        # Currently fails with org.apache.spark.sql.AnalysisException: days(timestamp) ASC NULLS FIRST is not currently supported
        # Don't forget to add option("fanout-enabled", "true") to iceberg sink as well
        # see https://github.com/apache/iceberg/issues/5625
        spark.sql("CREATE TABLE IF NOT EXISTS warehouse.water_levels.measurements (station_uuid string, timestamp timestamp, value float) USING iceberg")
        spark.sql("CREATE TABLE IF NOT EXISTS warehouse.water_levels.stations (uuid string, number bigint, short_name string, long_name string, km float, agency string, latitude double, longitude double, water_short_name string, water_long_name string) USING iceberg")
        spark.sql("CREATE TABLE IF NOT EXISTS warehouse.smart_city.shared_bikes_bike_status (bike_id string, vehicle_type_id string, latitude double, longitude double, is_reserved boolean, is_disabled boolean, last_reported timestamp) USING iceberg")
        spark.sql("CREATE TABLE IF NOT EXISTS warehouse.smart_city.shared_bikes_station_information (station_id string, name string, latitude double, longitude double) USING iceberg")
        spark.sql("CREATE TABLE IF NOT EXISTS warehouse.smart_city.shared_bikes_station_status (station_id string, num_bikes_available short, is_installed boolean, is_renting boolean, is_returning boolean, vehicle_types_available map<string,short>, last_reported timestamp) USING iceberg")

        schema = StructType([
            StructField("station_uuid", StringType(), True),
            StructField("timestamp", TimestampType(), True),
            StructField("value", FloatType(), True),
        ])
        spark \
            .readStream \
            .format("kafka") \
            .option("kafka.bootstrap.servers", "kafka:9092") \
            .option("subscribe", "water_levels_measurements") \
            .option("startingOffsets", "earliest") \
            .option("maxOffsetsPerTrigger", 50000000) \
            .load() \
            .selectExpr("cast(key as string)", "cast(value as string)") \
            .withColumn("json", from_json(col("value"), schema)) \
            .select("json.station_uuid", "json.timestamp", "json.value") \
            .writeStream \
            .queryName("ingest water_level measurements") \
            .format("iceberg") \
            .outputMode("append") \
            .trigger(processingTime='5 minutes') \
            .option("path", "warehouse.water_levels.measurements") \
            .option("checkpointLocation", "s3a://warehouse/water-levels/measurements/checkpoints") \
            .start()

        schema = StructType([
            StructField("uuid", StringType(), True),
            StructField("number", StringType(), True),
            StructField("shortname", StringType(), True),
            StructField("longname", StringType(), True),
            StructField("km", FloatType(), True),
            StructField("agency", StringType(), True),
            StructField("latitude", DoubleType(), True),
            StructField("longitude", DoubleType(), True),
            StructField("water",
                StructType([StructField("shortname", StringType(), True), StructField("longname", StringType(), True)]),
                True),
        ])
        spark \
            .readStream \
            .format("kafka") \
            .option("kafka.bootstrap.servers", "kafka:9092") \
            .option("subscribe", "water_levels_stations") \
            .option("startingOffsets", "earliest") \
            .option("maxOffsetsPerTrigger", 10000) \
            .load() \
            .selectExpr("cast(key as string)", "cast(value as string)") \
            .withColumn("json", from_json(col("value"), schema)) \
            .selectExpr(
                "json.uuid",
                "cast(json.number as bigint) as number",
                "json.shortname as short_name",
                "json.longname as long_name",
                "json.km", "json.agency",
                "json.latitude",
                "json.longitude",
                "json.water.shortname as water_short_name",
                "json.water.longname as water_long_name"
            ) \
            .writeStream \
            .queryName("ingest water_level stations") \
            .format("iceberg") \
            .outputMode("append") \
            .trigger(processingTime='2 minutes') \
            .option("path", "warehouse.water_levels.stations") \
            .option("checkpointLocation", "s3a://warehouse/water-levels/stations/checkpoints") \
            .start()

        schema = StructType([
            StructField("station_id", StringType(), True),
            StructField("lat", DoubleType(), True),
            StructField("lon", DoubleType(), True),
            StructField("name", StringType(), True),
        ])
        spark \
            .readStream \
            .format("kafka") \
            .option("kafka.bootstrap.servers", "kafka:9092") \
            .option("subscribe", "shared_bikes_station_information") \
            .option("startingOffsets", "earliest") \
            .option("maxOffsetsPerTrigger", 10000) \
            .load() \
            .selectExpr("cast(key as string)", "cast(value as string)") \
            .withColumn("json", from_json(col("value"), schema)) \
            .selectExpr("json.station_id", "json.name as name", "json.lat as latitude", "json.lon as longitude") \
            .writeStream \
            .queryName("ingest smart_city shared_bikes_station_information") \
            .format("iceberg") \
            .outputMode("append") \
            .trigger(processingTime='2 minutes') \
            .option("path", "warehouse.smart_city.shared_bikes_station_information") \
            .option("checkpointLocation", "s3a://warehouse/smart-city/shared_bikes_station_information/checkpoints") \
            .start()

        schema = StructType([
            StructField("station_id", StringType(), True),
            StructField("is_installed", BooleanType(), True),
            StructField("last_reported", TimestampType(), True),
            StructField("num_bikes_available", ShortType(), True),
            StructField("is_renting", BooleanType(), True),
            StructField("is_returning", BooleanType(), True),
            StructField("vehicle_types_available", ArrayType(StructType([StructField("count", ShortType(), True), StructField("vehicle_type_id", StringType(), True)]), True), True),
        ])
        spark \
            .readStream \
            .format("kafka") \
            .option("kafka.bootstrap.servers", "kafka:9092") \
            .option("subscribe", "shared_bikes_station_status") \
            .option("startingOffsets", "earliest") \
            .option("maxOffsetsPerTrigger", 10000) \
            .load() \
            .selectExpr("cast(key as string)", "cast(value as string)") \
            .withColumn("json", from_json(col("value"), schema)) \
            .selectExpr(
                "json.station_id",
                "json.num_bikes_available",
                "json.is_installed",
                "json.is_renting",
                "json.is_returning",
                "map_from_arrays(json.vehicle_types_available.vehicle_type_id, json.vehicle_types_available.count) as vehicle_types_available",
                "json.last_reported"
            ) \
            .writeStream \
            .queryName("ingest smart_city shared_bikes_station_status") \
            .format("iceberg") \
            .outputMode("append") \
            .trigger(processingTime='2 minutes') \
            .option("path", "warehouse.smart_city.shared_bikes_station_status") \
            .option("checkpointLocation", "s3a://warehouse/smart-city/shared_bikes_station_status/checkpoints") \
            .start()

        schema = StructType([
            StructField("lat", DoubleType(), True),
            StructField("lon", DoubleType(), True),
            StructField("bike_id", StringType(), True),
            StructField("is_reserved", BooleanType(), True),
            StructField("is_disabled", BooleanType(), True),
            StructField("vehicle_type_id", StringType(), True),
            StructField("last_reported", TimestampType(), True),
        ])
        spark \
            .readStream \
            .format("kafka") \
            .option("kafka.bootstrap.servers", "kafka:9092") \
            .option("subscribe", "shared_bikes_bike_status") \
            .option("startingOffsets", "earliest") \
            .option("maxOffsetsPerTrigger", 10000) \
            .load() \
            .selectExpr("cast(key as string)", "cast(value as string)") \
            .withColumn("json", from_json(col("value"), schema)) \
            .selectExpr("json.bike_id", "json.vehicle_type_id", "json.lat as latitude", "json.lon as longitude", "json.is_reserved", "json.is_disabled", "json.last_reported") \
            .writeStream \
            .queryName("ingest smart_city shared_bikes_bike_status") \
            .format("iceberg") \
            .outputMode("append") \
            .trigger(processingTime='2 minutes') \
            .option("path", "warehouse.smart_city.shared_bikes_bike_status") \
            .option("checkpointLocation", "s3a://warehouse/smart-city/shared_bikes_bike_status/checkpoints") \
            .start()

        # key: table name
        # value: compaction strategy
        tables_to_compact = {
            # "water_levels.measurements": ", strategy => 'sort', sort_order => 'station_uuid ASC NULLS LAST,timestamp DESC NULLS LAST'",
            "water_levels.measurements": "",
            "water_levels.stations": "",
            "warehouse.smart_city.shared_bikes_station_information": "",
            "warehouse.smart_city.shared_bikes_station_status": "",
            "warehouse.smart_city.shared_bikes_bike_status": "",
        }

        while True:
            expire_before = (datetime.now() - timedelta(hours=4)).strftime("%Y-%m-%d %H:%M:%S")
            for table, table_compaction_strategy in tables_to_compact.items():
                print(f"[{table}] Expiring snapshots older than 4 hours ({expire_before})")
                spark.sql(f"CALL warehouse.system.expire_snapshots(table => '{table}', older_than => TIMESTAMP '{expire_before}', retain_last => 50, stream_results => true)")

                print(f"[{table}] Removing orphaned files")
                spark.sql(f"CALL warehouse.system.remove_orphan_files(table => '{table}')")

                print(f"[{table}] Starting compaction")
                spark.sql(f"CALL warehouse.system.rewrite_data_files(table => '{table}'{table_compaction_strategy})")
                print(f"[{table}] Finished compaction")

            print("All tables compacted. Waiting 25min before scheduling next run...")
            time.sleep(25 * 60)  # Assuming compaction takes 5 min run every 30 minutes
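Once the streams are running, the tables and the effect of the maintenance loop can be inspected from any Spark session pointed at the same catalog. The sketch below is my own, under the assumption that the catalog and S3 settings are copied from the SparkApplication above; the final statement shows the sort-based rewrite that this commit leaves disabled.

```python
from pyspark.sql import SparkSession

# Assumes the same Iceberg catalog and S3 configuration as the SparkApplication above.
spark = (
    SparkSession.builder.appName("warehouse-inspection-sketch")
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .config("spark.sql.catalog.warehouse", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.warehouse.type", "hive")
    .config("spark.sql.catalog.warehouse.uri", "thrift://hive-iceberg:9083")
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.access.key", "trino")
    .config("spark.hadoop.fs.s3a.secret.key", "trinotrino")
    .getOrCreate()
)

# Row counts confirm the streaming queries are appending data.
spark.sql("SELECT count(*) FROM warehouse.water_levels.measurements").show()

# Iceberg metadata tables show what expire_snapshots and rewrite_data_files did.
spark.sql("SELECT committed_at, snapshot_id, operation FROM warehouse.water_levels.measurements.snapshots ORDER BY committed_at DESC").show(truncate=False)
spark.sql("SELECT count(*) AS data_files FROM warehouse.water_levels.measurements.files").show()

# The sort-based compaction that the commit disables would expand to roughly this call:
spark.sql(
    "CALL warehouse.system.rewrite_data_files("
    "table => 'water_levels.measurements', "
    "strategy => 'sort', "
    "sort_order => 'station_uuid ASC NULLS LAST,timestamp DESC NULLS LAST')"
)
```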
