diff --git a/demos/data-warehouse-iceberg-trino-spark/Dockerfile b/demos/data-warehouse-iceberg-trino-spark/Dockerfile
new file mode 100644
index 00000000..51790b30
--- /dev/null
+++ b/demos/data-warehouse-iceberg-trino-spark/Dockerfile
@@ -0,0 +1,18 @@
+FROM docker.stackable.tech/stackable/pyspark-k8s:3.3.0-stackable0.2.0
+
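+# Download Apache Ivy, used below to pull the extra jars into /stackable/spark/jars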
+RUN curl -L -O http://search.maven.org/remotecontent?filepath=org/apache/ivy/ivy/2.5.0/ivy-2.5.0.jar
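+# Fetch the Spark-Kafka connector: first the artifact itself (-notransitive), then its compile-scope dependencies (-confs compile)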
+RUN java -jar ivy-2.5.0.jar -notransitive \
+-dependency org.apache.spark spark-sql-kafka-0-10_2.12 3.3.0 \
+-retrieve "/stackable/spark/jars/[artifact]-[revision](-[classifier]).[ext]"
+RUN java -jar ivy-2.5.0.jar -confs compile \
+-dependency org.apache.spark spark-sql-kafka-0-10_2.12 3.3.0 \
+-retrieve "/stackable/spark/jars/[artifact]-[revision](-[classifier]).[ext]"
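+# Fetch the Iceberg runtime for Spark 3.3 / Scala 2.12 the same way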
+RUN java -jar ivy-2.5.0.jar -notransitive \
+-dependency org.apache.iceberg iceberg-spark-runtime-3.3_2.12 0.14.1 \
+-retrieve "/stackable/spark/jars/[artifact]-[revision](-[classifier]).[ext]"
+RUN java -jar ivy-2.5.0.jar -confs compile \
+-dependency org.apache.iceberg iceberg-spark-runtime-3.3_2.12 0.14.1 \
+-retrieve "/stackable/spark/jars/[artifact]-[revision](-[classifier]).[ext]"
diff --git a/demos/data-warehouse-iceberg-trino-spark/WarehouseKafkaIngest.xml b/demos/data-warehouse-iceberg-trino-spark/WarehouseKafkaIngest.xml
new file mode 100644
index 00000000..97b84728
--- /dev/null
+++ b/demos/data-warehouse-iceberg-trino-spark/WarehouseKafkaIngest.xml
@@ -0,0 +1,6207 @@
+<!--
+  NiFi template export "WarehouseKafkaIngest"
+  id: d13fd9a8-0183-1000-aa3e-58ff5911cf24, exported 10/13/2022 15:22:44 GMT
+
+  Summary of the flow defined by this template:
+
+  Controller services:
+    * JsonTreeReader (61545c67-9ff1-3dda): infers the schema, reads records starting at the root node
+    * JsonRecordSetWriter (fdb721b8-788c-37b4): writes record sets as JSON arrays without an embedded schema
+
+  Water-level ingestion (PEGELONLINE REST API):
+    * InvokeHTTP "Get station list" (GET https://www.pegelonline.wsv.de/webservices/rest-api/v2/stations.json)
+      -> PublishKafkaRecord_2_6 "Produce station records" (topic water_levels_stations on kafka:9092)
+    * InvokeHTTP "Get station list" -> SplitRecord (1 record per FlowFile)
+      -> EvaluateJsonPath "Extract station_uuid" ($.uuid)
+      -> InvokeHTTP "Get historic measurements" (GET https://www.pegelonline.wsv.de/webservices/rest-api/v2/stations/${station_uuid}/W/measurements.json?start=P30D)
+      -> JoltTransformJSON "Add station_uuid" -> DuplicateFlowFile (100 copies)
+      -> PublishKafkaRecord_2_6 (topic water_levels_measurements on kafka:9092, snappy compression)
+
+  Shared-bike ingestion (MobiData-BW GBFS feeds at https://ip-api.mobidata-bw.de/v1/NVBW/gbfs/v2/):
+    * InvokeHTTP processors fetching station_information.json, station_status.json and free_bike_status.json
+    * JoltTransformJSON "Extract data attribute" (shift operation that unwraps the "data" field)
+    * PublishKafkaRecord_2_6 processors publishing to the topics shared_bikes_station_information,
+      shared_bikes_station_status and shared_bikes_bike_status on kafka:9092
+
+  Failure relationships are routed to LogAttribute processors.
+-->
diff --git a/demos/data-warehouse-iceberg-trino-spark/create-nifi-ingestion-job.yaml b/demos/data-warehouse-iceberg-trino-spark/create-nifi-ingestion-job.yaml
new file mode 100644
index 00000000..8585097d
--- /dev/null
+++ b/demos/data-warehouse-iceberg-trino-spark/create-nifi-ingestion-job.yaml
@@ -0,0 +1,72 @@
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+ name: create-nifi-ingestion-job
+spec:
+ template:
+ spec:
+ serviceAccountName: demo-serviceaccount
+ initContainers:
+ - name: wait-for-testdata
+ image: docker.stackable.tech/stackable/tools:0.2.0-stackable0.3.0
+ command: ["bash", "-c", "echo 'Waiting for all kafka brokers to be ready' && kubectl wait --for=condition=ready --timeout=30m pod -l app.kubernetes.io/instance=kafka -l app.kubernetes.io/name=kafka"]
+ containers:
+ - name: create-nifi-ingestion-job
+ image: docker.stackable.tech/stackable/testing-tools:0.1.0-stackable0.1.0
+ command: ["bash", "-c", "curl -O https://raw.githubusercontent.com/stackabletech/stackablectl/demo-data-warehouse-iceberg-trino-spark/demos/data-warehouse-iceberg-trino-spark/WarehouseKafkaIngest.xml && python -u /tmp/script/script.py"]
+ volumeMounts:
+ - name: script
+ mountPath: /tmp/script
+ env:
+ - name: NAMESPACE
+ valueFrom:
+ fieldRef:
+ fieldPath: metadata.namespace
+ volumes:
+ - name: script
+ configMap:
+ name: create-nifi-ingestion-job-script
+ restartPolicy: OnFailure
+ backoffLimit: 50
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+ name: create-nifi-ingestion-job-script
+data:
+ script.py: |
+ from nipyapi.canvas import get_root_pg_id, schedule_process_group, list_all_controllers, schedule_controller
+ from nipyapi.security import service_login
+ from nipyapi.templates import get_template, upload_template, deploy_template
+ import nipyapi
+ import os
+ import urllib3
+
+ # As of 2022-08-29 we can't use "https://nifi:8443" here, because NiFi rejects the request with:
+ # "The request contained an invalid host header [nifi:8443] in the request [/nifi-api]. Check for request manipulation or third-party intercept."
+ ENDPOINT = f"https://nifi-node-default-0.nifi-node-default.{os.environ['NAMESPACE']}.svc.cluster.local:8443" # For local testing / development replace this, afterwards change it back to f"https://nifi-node-default-0.nifi-node-default.{os.environ['NAMESPACE']}.svc.cluster.local:8443"
+ USERNAME = "admin"
+ PASSWORD = "adminadmin"
+ TEMPLATE_NAME = "WarehouseKafkaIngest"
+ TEMPLATE_FILE = f"{TEMPLATE_NAME}.xml"
+
+ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
+ nipyapi.config.nifi_config.host = f"{ENDPOINT}/nifi-api"
+ nipyapi.config.nifi_config.verify_ssl = False
+
+ print("Logging in")
+ service_login(username=USERNAME, password=PASSWORD)
+ print("Logged in")
+
+ pg_id = get_root_pg_id()
+
+ upload_template(pg_id, TEMPLATE_FILE)
+
+ template_id = get_template(TEMPLATE_NAME).id
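+ # Instantiate the uploaded template on the root process group canvas (the last two arguments are the x/y position on the canvas)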
+ deploy_template(pg_id, template_id, 200, 0)
+
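+ # Enable all controller services of the deployed flow before starting the processors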
+ for controller in list_all_controllers():
+ schedule_controller(controller, scheduled=True)
+
+ schedule_process_group(pg_id, scheduled=True)
diff --git a/demos/data-warehouse-iceberg-trino-spark/create-spark-ingestion-job.yaml b/demos/data-warehouse-iceberg-trino-spark/create-spark-ingestion-job.yaml
new file mode 100644
index 00000000..252b5705
--- /dev/null
+++ b/demos/data-warehouse-iceberg-trino-spark/create-spark-ingestion-job.yaml
@@ -0,0 +1,304 @@
+# We can't simply create the SparkApplication object here as we have to wait for Kafka to be ready because
+# * We currently don't restart failed Spark applications (see https://github.com/stackabletech/spark-k8s-operator/issues/157)
+# * We currently auto-create topics and we need all the brokers to be available so that the topic is distributed among all the brokers
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+ name: create-spark-ingestion-job
+spec:
+ template:
+ spec:
+ serviceAccountName: demo-serviceaccount
+ initContainers:
+ - name: wait-for-testdata
+ image: docker.stackable.tech/stackable/tools:0.2.0-stackable0.3.0
+ command: ["bash", "-c", "echo 'Waiting for all kafka brokers to be ready' && kubectl wait --for=condition=ready --timeout=30m pod -l app.kubernetes.io/instance=kafka -l app.kubernetes.io/name=kafka"]
+ containers:
+ - name: create-spark-ingestion-job
+ image: docker.stackable.tech/stackable/tools:0.2.0-stackable0.3.0
+ command: ["bash", "-c", "echo 'Submitting Spark job' && kubectl apply -f /tmp/manifest/spark-ingestion-job.yaml"]
+ volumeMounts:
+ - name: manifest
+ mountPath: /tmp/manifest
+ volumes:
+ - name: manifest
+ configMap:
+ name: create-spark-ingestion-job-manifest
+ restartPolicy: OnFailure
+ backoffLimit: 50
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+ name: create-spark-ingestion-job-manifest
+data:
+ spark-ingestion-job.yaml: |
+ ---
+ apiVersion: spark.stackable.tech/v1alpha1
+ kind: SparkApplication
+ metadata:
+ name: spark-ingest-into-warehouse
+ spec:
+ version: "1.0"
+ sparkImage: docker.stackable.tech/sbernauer/pyspark-k8s-with-iceberg:latest3 # docker.stackable.tech/stackable/pyspark-k8s:3.3.0-stackable0.2.0
+ mode: cluster
+ mainApplicationFile: local:///stackable/spark/jobs/spark-ingest-into-warehouse.py
+ # deps:
+ # packages:
+ # - org.apache.iceberg:iceberg-spark-runtime-3.3_2.12:0.14.1
+ # - org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0
+ sparkConf:
+ spark.hadoop.fs.s3a.endpoint: http://minio:9000
+ spark.hadoop.fs.s3a.path.style.access: "true"
+ spark.hadoop.fs.s3a.access.key: trino
+ spark.hadoop.fs.s3a.secret.key: trinotrino
+ spark.sql.extensions: org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions
+ spark.sql.catalog.warehouse: org.apache.iceberg.spark.SparkCatalog
+ spark.sql.catalog.warehouse.type: hive
+ spark.sql.catalog.warehouse.uri: thrift://hive-iceberg:9083
+ volumes:
+ - name: script
+ configMap:
+ name: write-iceberg-table-script
+ job:
+ resources:
+ cpu:
+ min: "100m"
+ max: "1"
+ driver:
+ resources:
+ cpu:
+ min: "1"
+ max: "1"
+ memory:
+ limit: "2Gi"
+ volumeMounts:
+ - name: script
+ mountPath: /stackable/spark/jobs
+ executor:
+ instances: 4
+ resources:
+ cpu:
+ min: "2"
+ max: "4"
+ memory:
+ limit: "12Gi"
+ volumeMounts:
+ - name: script
+ mountPath: /stackable/spark/jobs
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+ name: write-iceberg-table-script
+data:
+ spark-ingest-into-warehouse.py: |
+ from pyspark.sql import SparkSession
+ from pyspark.sql.types import StructType, StructField, StringType, LongType, ShortType, FloatType, DoubleType, BooleanType, TimestampType, MapType, ArrayType
+ from pyspark.sql.functions import col, from_json, expr
+ import time
+ from datetime import datetime, timedelta
+
+ spark = SparkSession.builder.appName("spark-ingest-into-warehouse").getOrCreate()
+ # spark.sparkContext.setLogLevel("DEBUG")
+
+ spark.sql("CREATE SCHEMA IF NOT EXISTS warehouse.water_levels LOCATION 's3a://warehouse/water-levels/'")
+ spark.sql("CREATE SCHEMA IF NOT EXISTS warehouse.smart_city LOCATION 's3a://warehouse/smart-city/'")
+
+ # Todo add PARTITIONED BY (days(timestamp))
+ # Currently fails with org.apache.spark.sql.AnalysisException: days(timestamp) ASC NULLS FIRST is not currently supported
+ # Don't forget to add option("fanout-enabled", "true") to iceberg sink as well
+ # see https://github.com/apache/iceberg/issues/5625
+ spark.sql("CREATE TABLE IF NOT EXISTS warehouse.water_levels.measurements (station_uuid string, timestamp timestamp, value float) USING iceberg")
+ spark.sql("CREATE TABLE IF NOT EXISTS warehouse.water_levels.stations (uuid string, number bigint, short_name string, long_name string, km float, agency string, latitude double, longitude double, water_short_name string, water_long_name string) USING iceberg")
+ spark.sql("CREATE TABLE IF NOT EXISTS warehouse.smart_city.shared_bikes_bike_status (bike_id string, vehicle_type_id string, latitude double, longitude double, is_reserved boolean, is_disabled boolean, last_reported timestamp) USING iceberg")
+ spark.sql("CREATE TABLE IF NOT EXISTS warehouse.smart_city.shared_bikes_station_information (station_id string, name string, latitude double, longitude double) USING iceberg")
+ spark.sql("CREATE TABLE IF NOT EXISTS warehouse.smart_city.shared_bikes_station_status (station_id string, num_bikes_available short, is_installed boolean, is_renting boolean, is_returning boolean, vehicle_types_available map, last_reported timestamp) USING iceberg")
+
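+ # Every ingestion stream below follows the same pattern: define the schema of the JSON records produced by NiFi, read the Kafka topic as a stream, flatten the JSON and append the result to the corresponding Iceberg table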
+ schema = StructType([
+ StructField("station_uuid", StringType(), True),
+ StructField("timestamp", TimestampType(), True),
+ StructField("value", FloatType(), True),
+ ])
+ spark \
+ .readStream \
+ .format("kafka") \
+ .option("kafka.bootstrap.servers", "kafka:9092") \
+ .option("subscribe", "water_levels_measurements") \
+ .option("startingOffsets", "earliest") \
+ .option("maxOffsetsPerTrigger", 50000000) \
+ .load() \
+ .selectExpr("cast(key as string)", "cast(value as string)") \
+ .withColumn("json", from_json(col("value"), schema)) \
+ .select("json.station_uuid", "json.timestamp", "json.value") \
+ .writeStream \
+ .queryName("ingest water_level measurements") \
+ .format("iceberg") \
+ .outputMode("append") \
+ .trigger(processingTime='5 minutes') \
+ .option("path", "warehouse.water_levels.measurements") \
+ .option("checkpointLocation", "s3a://warehouse/water-levels/measurements/checkpoints") \
+ .start()
+
+ schema = StructType([ \
+ StructField("uuid", StringType(), True), \
+ StructField("number", StringType(), True), \
+ StructField("shortname", StringType(), True), \
+ StructField("longname", StringType(), True), \
+ StructField("km", FloatType(), True), \
+ StructField("agency", StringType(), True), \
+ StructField("latitude", DoubleType(), True), \
+ StructField("longitude", DoubleType(), True), \
+ StructField("water", \
+ StructType([StructField("shortname", StringType(), True), StructField("longname", StringType(), True)]), \
+ True), \
+ ])
+ spark \
+ .readStream \
+ .format("kafka") \
+ .option("kafka.bootstrap.servers", "kafka:9092") \
+ .option("subscribe", "water_levels_stations") \
+ .option("startingOffsets", "earliest") \
+ .option("maxOffsetsPerTrigger", 10000) \
+ .load() \
+ .selectExpr("cast(key as string)", "cast(value as string)") \
+ .withColumn("json", from_json(col("value"), schema)) \
+ .selectExpr( \
+ "json.uuid", \
+ "cast(json.number as bigint) as number", \
+ "json.shortname as short_name", \
+ "json.longname as long_name", \
+ "json.km", "json.agency", \
+ "json.latitude", \
+ "json.longitude", \
+ "json.water.shortname as water_short_name", \
+ "json.water.longname as water_long_name" \
+ ) \
+ .writeStream \
+ .queryName("ingest water_level stations") \
+ .format("iceberg") \
+ .outputMode("append") \
+ .trigger(processingTime='2 minutes') \
+ .option("path", "warehouse.water_levels.stations") \
+ .option("checkpointLocation", "s3a://warehouse/water-levels/stations/checkpoints") \
+ .start()
+
+ schema = StructType([ \
+ StructField("station_id", StringType(), True), \
+ StructField("lat", DoubleType(), True), \
+ StructField("lon", DoubleType(), True), \
+ StructField("name", StringType(), True), \
+ ])
+ spark \
+ .readStream \
+ .format("kafka") \
+ .option("kafka.bootstrap.servers", "kafka:9092") \
+ .option("subscribe", "shared_bikes_station_information") \
+ .option("startingOffsets", "earliest") \
+ .option("maxOffsetsPerTrigger", 10000) \
+ .load() \
+ .selectExpr("cast(key as string)", "cast(value as string)") \
+ .withColumn("json", from_json(col("value"), schema)) \
+ .selectExpr("json.station_id", "json.name as name", "json.lat as latitude", "json.lon as longitude") \
+ .writeStream \
+ .queryName("ingest smart_city shared_bikes_station_information") \
+ .format("iceberg") \
+ .outputMode("append") \
+ .trigger(processingTime='2 minutes') \
+ .option("path", "warehouse.smart_city.shared_bikes_station_information") \
+ .option("checkpointLocation", "s3a://warehouse/smart-city/shared_bikes_station_information/checkpoints") \
+ .start()
+
+ schema = StructType([ \
+ StructField("station_id", StringType(), True), \
+ StructField("is_installed", BooleanType(), True), \
+ StructField("last_reported", TimestampType(), True), \
+ StructField("num_bikes_available", ShortType(), True), \
+ StructField("is_renting", BooleanType(), True), \
+ StructField("is_returning", BooleanType(), True), \
+ StructField("vehicle_types_available", ArrayType(StructType([StructField("count", ShortType(), True), StructField("vehicle_type_id", StringType(), True)]), True), True), \
+ ])
+ spark \
+ .readStream \
+ .format("kafka") \
+ .option("kafka.bootstrap.servers", "kafka:9092") \
+ .option("subscribe", "shared_bikes_station_status") \
+ .option("startingOffsets", "earliest") \
+ .option("maxOffsetsPerTrigger", 10000) \
+ .load() \
+ .selectExpr("cast(key as string)", "cast(value as string)") \
+ .withColumn("json", from_json(col("value"), schema)) \
+ .selectExpr( \
+ "json.station_id", \
+ "json.num_bikes_available", \
+ "json.is_installed", \
+ "json.is_renting", \
+ "json.is_returning", \
+ "map_from_arrays(json.vehicle_types_available.vehicle_type_id, json.vehicle_types_available.count) as vehicle_types_available", \
+ "json.last_reported" \
+ ) \
+ .writeStream \
+ .queryName("ingest smart_city shared_bikes_station_status") \
+ .format("iceberg") \
+ .outputMode("append") \
+ .trigger(processingTime='2 minutes') \
+ .option("path", "warehouse.smart_city.shared_bikes_station_status") \
+ .option("checkpointLocation", "s3a://warehouse/smart-city/shared_bikes_station_status/checkpoints") \
+ .start()
+
+ schema = StructType([ \
+ StructField("lat", DoubleType(), True), \
+ StructField("lon", DoubleType(), True), \
+ StructField("bike_id", StringType(), True), \
+ StructField("is_reserved", BooleanType(), True), \
+ StructField("is_disabled", BooleanType(), True), \
+ StructField("vehicle_type_id", StringType(), True), \
+ StructField("last_reported", TimestampType(), True), \
+ ])
+ spark \
+ .readStream \
+ .format("kafka") \
+ .option("kafka.bootstrap.servers", "kafka:9092") \
+ .option("subscribe", "shared_bikes_bike_status") \
+ .option("startingOffsets", "earliest") \
+ .option("maxOffsetsPerTrigger", 10000) \
+ .load() \
+ .selectExpr("cast(key as string)", "cast(value as string)") \
+ .withColumn("json", from_json(col("value"), schema)) \
+ .selectExpr("json.bike_id", "json.vehicle_type_id", "json.lat as latitude", "json.lon as longitude", "json.is_reserved", "json.is_disabled", "json.last_reported") \
+ .writeStream \
+ .queryName("ingest smart_city shared_bikes_bike_status") \
+ .format("iceberg") \
+ .outputMode("append") \
+ .trigger(processingTime='2 minutes') \
+ .option("path", "warehouse.smart_city.shared_bikes_bike_status") \
+ .option("checkpointLocation", "s3a://warehouse/smart-city/shared_bikes_bike_status/checkpoints") \
+ .start()
+
+ # key: table name
+ # value: compaction strategy
+ tables_to_compact = {
+ # "water_levels.measurements": ", strategy => 'sort', sort_order => 'station_uuid ASC NULLS LAST,timestamp DESC NULLS LAST'",
+ "water_levels.measurements": "",
+ "water_levels.stations": "",
+ "warehouse.smart_city.shared_bikes_station_information": "",
+ "warehouse.smart_city.shared_bikes_station_status": "",
+ "warehouse.smart_city.shared_bikes_bike_status": "",
+ }
+
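+ # Table maintenance loop: regularly expire old snapshots, remove orphaned files and compact small data files using Iceberg's stored procedures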
+ while True:
+ expire_before = (datetime.now() - timedelta(hours=4)).strftime("%Y-%m-%d %H:%M:%S")
+ for table, table_compaction_strategy in tables_to_compact.items():
+ print(f"[{table}] Expiring snapshots older than 4 hours ({expire_before})")
+ spark.sql(f"CALL warehouse.system.expire_snapshots(table => '{table}', older_than => TIMESTAMP '{expire_before}', retain_last => 50, stream_results => true)")
+
+ print(f"[{table}] Removing orphaned files")
+ spark.sql(f"CALL warehouse.system.remove_orphan_files(table => '{table}')")
+
+ print(f"[{table}] Starting compaction")
+ spark.sql(f"CALL warehouse.system.rewrite_data_files(table => '{table}'{table_compaction_strategy})")
+ print(f"[{table}] Finished compaction")
+
+ print("All tables compacted. Waiting 25min before scheduling next run...")
+ time.sleep(25 * 60) # Assuming a maintenance run takes roughly 5 minutes, this schedules one run about every 30 minutes
diff --git a/demos/data-warehouse-iceberg-trino-spark/create-trino-tables.yaml b/demos/data-warehouse-iceberg-trino-spark/create-trino-tables.yaml
new file mode 100644
index 00000000..5b9baaff
--- /dev/null
+++ b/demos/data-warehouse-iceberg-trino-spark/create-trino-tables.yaml
@@ -0,0 +1,575 @@
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+ name: create-tables-in-trino
+spec:
+ template:
+ spec:
+ serviceAccountName: demo-serviceaccount
+ initContainers:
+ - name: wait-for-testdata
+ image: docker.stackable.tech/stackable/tools:0.2.0-stackable0.3.0
+ command: ["bash", "-c", "echo 'Waiting for job load-test-data to finish' && kubectl wait --for=condition=complete --timeout=30m job/load-test-data"]
+ containers:
+ - name: create-tables-in-trino
+ image: docker.stackable.tech/stackable/testing-tools:0.1.0-stackable0.1.0
+ command: ["bash", "-c", "python -u /tmp/script/script.py"]
+ volumeMounts:
+ - name: script
+ mountPath: /tmp/script
+ volumes:
+ - name: script
+ configMap:
+ name: create-tables-in-trino-script
+ restartPolicy: OnFailure
+ backoffLimit: 50
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+ name: create-tables-in-trino-script
+data:
+ script.py: |
+ import sys
+ import trino
+
+ if not sys.warnoptions:
+ import warnings
+ warnings.simplefilter("ignore")
+
+ def get_connection():
+ connection = trino.dbapi.connect(
+ host="trino-coordinator",
+ port=8443,
+ user="demo",
+ http_scheme='https',
+ auth=trino.auth.BasicAuthentication("demo", "demo"),
+ )
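+ # The Trino endpoint serves a certificate that is not trusted by default (presumably self-signed), so certificate verification is disabled below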
+ connection._http_session.verify = False
+ return connection
+
+ def run_query(connection, query):
+ print(f"[DEBUG] Executing query {query}")
+ cursor = connection.cursor()
+ cursor.execute(query)
+ return cursor.fetchall()
+
+ def run_query_and_assert_more_than_one_row(connection, query):
+ rows = run_query(connection, query)[0][0]
+ assert rows > 0
+
+ connection = get_connection()
+
+ assert run_query(connection, "CREATE SCHEMA IF NOT EXISTS staging.house_sales WITH (location = 's3a://staging/house-sales/')")[0][0] is True
+ assert run_query(connection, "CREATE SCHEMA IF NOT EXISTS staging.earthquakes WITH (location = 's3a://staging/earthquakes/')")[0][0] is True
+ assert run_query(connection, "CREATE SCHEMA IF NOT EXISTS staging.smart_city WITH (location = 's3a://staging/smart-city/')")[0][0] is True
+ assert run_query(connection, "CREATE SCHEMA IF NOT EXISTS staging.taxi WITH (location = 's3a://staging/taxi/')")[0][0] is True
+ assert run_query(connection, "CREATE SCHEMA IF NOT EXISTS warehouse.house_sales WITH (location = 's3a://warehouse/house-sales/')")[0][0] is True
+ assert run_query(connection, "CREATE SCHEMA IF NOT EXISTS warehouse.earthquakes WITH (location = 's3a://warehouse/earthquakes/')")[0][0] is True
+ assert run_query(connection, "CREATE SCHEMA IF NOT EXISTS warehouse.smart_city WITH (location = 's3a://warehouse/smart-city/')")[0][0] is True
+ assert run_query(connection, "CREATE SCHEMA IF NOT EXISTS warehouse.taxi WITH (location = 's3a://warehouse/taxi/')")[0][0] is True
+
+ assert run_query(connection, """
+ CREATE TABLE IF NOT EXISTS staging.house_sales.house_sales (
+ transaction_id varchar,
+ price varchar,
+ date_of_transfer varchar,
+ postcode varchar,
+ property_type varchar,
+ new_build varchar,
+ tenure varchar,
+ paon varchar,
+ saon varchar,
+ street varchar,
+ locality varchar,
+ city varchar,
+ district varchar,
+ county varchar,
+ ppd_category varchar,
+ record_status varchar
+ )
+ WITH (
+ external_location = 's3a://staging/house-sales/house-sales/',
+ format = 'csv',
+ csv_escape = '\\',
+ csv_quote = '"',
+ csv_separator = ','
+ )
+ """)[0][0] is True
+
+ assert run_query(connection, """
+ CREATE TABLE IF NOT EXISTS staging.house_sales.postcode_geo_lookup (
+ postcode varchar,
+ status varchar,
+ usertype varchar,
+ easting varchar,
+ northing varchar,
+ positional_quality_indicator varchar,
+ country varchar,
+ latitude varchar,
+ longitude varchar,
+ postcode_no_space varchar,
+ postcode_fixed_width_seven varchar,
+ postcode_fixed_width_eight varchar,
+ postcode_area varchar,
+ postcode_district varchar,
+ postcode_sector varchar,
+ outcode varchar,
+ incode varchar
+ )
+ WITH (
+ external_location = 's3a://staging/house-sales/postcode-geo-lookup/',
+ format = 'csv',
+ csv_escape = '\\',
+ csv_quote = '"',
+ csv_separator = ','
+ )
+ """)[0][0] is True
+
+ assert run_query(connection, """
+ CREATE TABLE IF NOT EXISTS staging.earthquakes.earthquakes (
+ timestamp varchar,
+ latitude varchar,
+ longitude varchar,
+ depth varchar,
+ mag varchar,
+ mag_type varchar,
+ nst varchar,
+ gap varchar,
+ dmin varchar,
+ rms varchar,
+ net varchar,
+ id varchar,
+ updated varchar,
+ place varchar,
+ type varchar,
+ horizontal_error varchar,
+ depth_error varchar,
+ mag_error varchar,
+ mag_nst varchar,
+ status varchar,
+ location_source varchar,
+ mag_source varchar
+ )
+ WITH (
+ external_location = 's3a://staging/earthquakes/earthquakes/',
+ format = 'csv',
+ csv_escape = '\\',
+ csv_quote = '"',
+ csv_separator = ',',
+ skip_header_line_count = 1
+ )
+ """)[0][0] is True
+
+ assert run_query(connection, """
+ CREATE TABLE IF NOT EXISTS staging.smart_city.e_charging_stations (
+ operator varchar,
+ street varchar,
+ street_number varchar,
+ address_supplement varchar,
+ zip_code varchar,
+ city varchar,
+ state varchar,
+ state_iso_3166_2 varchar,
+ district varchar,
+ latitude varchar,
+ longitude varchar,
+ date_of_installation varchar,
+ power varchar,
+ speed_type varchar,
+ number_of_charging_ports varchar,
+ port_1_types varchar,
+ port_1_power varchar,
+ port_1_pubkey varchar,
+ port_2_types varchar,
+ port_2_power varchar,
+ port_2_pubkey varchar,
+ port_3_types varchar,
+ port_3_power varchar,
+ port_3_pubkey varchar,
+ port_4_types varchar,
+ port_4_power varchar,
+ port_4_pubkey varchar
+ )
+ WITH (
+ external_location = 's3a://staging/smart-city/e-charging-stations/',
+ format = 'csv',
+ csv_escape = '\\',
+ csv_quote = '"',
+ csv_separator = ',',
+ skip_header_line_count = 1
+ )
+ """)[0][0] is True
+
+ assert run_query(connection, """
+ CREATE TABLE IF NOT EXISTS staging.taxi.taxi_zone_lookup (
+ location_id VARCHAR,
+ borough VARCHAR,
+ zone VARCHAR,
+ service_zone VARCHAR
+ ) WITH (
+ external_location = 's3a://staging/taxi/taxi-zone-lookup/',
+ format = 'csv',
+ csv_escape = '\\',
+ csv_quote = '"',
+ csv_separator = ',',
+ skip_header_line_count = 1
+ )
+ """)[0][0] is True
+
+ assert run_query(connection, """
+ CREATE TABLE IF NOT EXISTS staging.taxi.payment_type_lookup (
+ payment_type_id VARCHAR,
+ payment_type VARCHAR
+ ) WITH (
+ external_location = 's3a://staging/taxi/payment-type-lookup/',
+ format = 'csv',
+ csv_escape = '\\',
+ csv_quote = '"',
+ csv_separator = ',',
+ skip_header_line_count = 1
+ )
+ """)[0][0] is True
+
+ assert run_query(connection, """
+ CREATE TABLE IF NOT EXISTS staging.taxi.rate_code_lookup (
+ rate_code_id VARCHAR,
+ rate_code VARCHAR
+ ) WITH (
+ external_location = 's3a://staging/taxi/rate-code-lookup/',
+ format = 'csv',
+ csv_escape = '\\',
+ csv_quote = '"',
+ csv_separator = ',',
+ skip_header_line_count = 1
+ )
+ """)[0][0] is True
+
+ assert run_query(connection, """
+ CREATE TABLE IF NOT EXISTS staging.taxi.yellow_tripdata (
+ VendorID BIGINT,
+ tpep_pickup_datetime TIMESTAMP,
+ tpep_dropoff_datetime TIMESTAMP,
+ passenger_count DOUBLE,
+ trip_distance DOUBLE,
+ RatecodeID DOUBLE,
+ store_and_fwd_flag VARCHAR,
+ PULocationID BIGINT,
+ DOLocationID BIGINT,
+ payment_type BIGINT,
+ fare_amount DOUBLE,
+ extra DOUBLE,
+ mta_tax DOUBLE,
+ tip_amount DOUBLE,
+ tolls_amount DOUBLE,
+ improvement_surcharge DOUBLE,
+ total_amount DOUBLE,
+ congestion_surcharge DOUBLE,
+ airport_fee DOUBLE
+ ) WITH (
+ external_location = 's3a://staging/taxi/yellow-tripdata/',
+ format = 'parquet'
+ )
+ """)[0][0] is True
+
+ assert run_query(connection, """
+ CREATE TABLE IF NOT EXISTS staging.taxi.green_tripdata (
+ VendorID BIGINT,
+ lpep_pickup_datetime TIMESTAMP,
+ lpep_dropoff_datetime TIMESTAMP,
+ store_and_fwd_flag VARCHAR,
+ RatecodeID DOUBLE,
+ PULocationID BIGINT,
+ DOLocationID BIGINT,
+ passenger_count DOUBLE,
+ trip_distance DOUBLE,
+ fare_amount DOUBLE,
+ extra DOUBLE,
+ mta_tax DOUBLE,
+ tip_amount DOUBLE,
+ tolls_amount DOUBLE,
+ ehail_fee INTEGER,
+ improvement_surcharge DOUBLE,
+ total_amount DOUBLE,
+ payment_type DOUBLE,
+ trip_type DOUBLE,
+ congestion_surcharge DOUBLE
+ ) WITH (
+ external_location = 's3a://staging/taxi/green-tripdata/',
+ format = 'parquet'
+ )
+ """)[0][0] is True
+
+ assert run_query(connection, """
+ CREATE TABLE IF NOT EXISTS staging.taxi.fhvhv_tripdata (
+ hvfhs_license_num VARCHAR,
+ dispatching_base_num VARCHAR,
+ originating_base_num VARCHAR,
+ request_datetime TIMESTAMP,
+ on_scene_datetime TIMESTAMP,
+ pickup_datetime TIMESTAMP,
+ dropoff_datetime TIMESTAMP,
+ PULocationID BIGINT,
+ DOLocationID BIGINT,
+ trip_miles DOUBLE,
+ trip_time BIGINT,
+ base_passenger_fare DOUBLE,
+ tolls DOUBLE,
+ bcf DOUBLE,
+ sales_tax DOUBLE,
+ congestion_surcharge DOUBLE,
+ airport_fee DOUBLE,
+ tips DOUBLE,
+ driver_pay DOUBLE,
+ shared_request_flag VARCHAR,
+ shared_match_flag VARCHAR,
+ access_a_ride_flag VARCHAR,
+ wav_request_flag VARCHAR,
+ wav_match_flag VARCHAR
+ ) WITH (
+ external_location = 's3a://staging/taxi/fhvhv-tripdata/',
+ format = 'parquet'
+ )
+ """)[0][0] is True
+
+ assert run_query(connection, """
+ create table if not exists warehouse.house_sales.house_sales with (
+ partitioning = ARRAY['year(date_of_transfer)']
+ ) as
+ select
+ trim(leading '{' FROM trim(trailing '}' FROM transaction_id)) as transaction_id,
+ cast(price as bigint) as price,
+ cast(date_parse(date_of_transfer, '%Y-%m-%d %H:%i') as date) as date_of_transfer,
+ s.postcode,
+ property_type,
+ case new_build when 'Y' then true else false end as new_build,
+ tenure,
+ paon,
+ street,
+ locality,
+ city,
+ district,
+ county,
+ ppd_category,
+ record_status,
+ l.status,
+ l.usertype,
+ l.country,
+ l.postcode_no_space,
+ cast(l.easting as bigint) as easting,
+ cast(l.northing as bigint) as northing,
+ cast(l.longitude as double) as longitude,
+ cast(l.latitude as double) as latitude
+ from staging.house_sales.house_sales as s
+ left join staging.house_sales.postcode_geo_lookup as l on s.postcode = l.postcode
+ """)[0][0] >= 0
+
+ assert run_query(connection, """
+ create table if not exists warehouse.earthquakes.earthquakes as
+ select
+ cast(date_parse(timestamp, '%Y-%m-%dT%H:%i:%s.%fZ') as timestamp(6)) as timestamp,
+ cast(latitude as double) as latitude,
+ cast(longitude as double) as longitude,
+ try_cast(depth as double) as depth,
+ cast(mag as double) as mag,
+ mag_type,
+ try_cast(nst as bigint) as nst,
+ try_cast(gap as double) as gap,
+ try_cast(dmin as double) as dmin,
+ try_cast(rms as double) as rms,
+ net,
+ id,
+ cast(date_parse(updated, '%Y-%m-%dT%H:%i:%s.%fZ') as timestamp(6)) as updated,
+ place,
+ type,
+ try_cast(horizontal_error as double) as horizontal_error,
+ try_cast(depth_error as double) as depth_error,
+ try_cast(mag_error as double) as mag_error,
+ try_cast(mag_nst as double) as mag_nst,
+ status,
+ location_source,
+ mag_source
+ from staging.earthquakes.earthquakes
+ where length(timestamp) = 24
+ """)[0][0] >= 0
+
+ assert run_query(connection, """
+ create table if not exists warehouse.smart_city.e_charging_stations as
+ select
+ operator,
+ street,
+ street_number,
+ address_supplement,
+ zip_code,
+ city,
+ state,
+ state_iso_3166_2,
+ district,
+ try_cast(replace(latitude, ',', '.') as double) as latitude,
+ try_cast(replace(longitude, ',', '.') as double) as longitude,
+ cast(date_parse(date_of_installation, '%Y/%m/%d') as date) as date_of_installation,
+ cast(replace(power, ',', '.') as double) as power,
+ speed_type,
+ cast(number_of_charging_ports as bigint) as number_of_charging_ports,
+ port_1_types,
+ try_cast(replace(port_1_power, ',', '.') as double) as port_1_power,
+ port_1_pubkey,
+ port_2_types,
+ try_cast(replace(port_2_power, ',', '.') as double) as port_2_power,
+ port_2_pubkey,
+ port_3_types,
+ try_cast(replace(port_3_power, ',', '.') as double) as port_3_power,
+ port_3_pubkey,
+ port_4_types,
+ try_cast(replace(port_4_power, ',', '.') as double) as port_4_power,
+ port_4_pubkey
+ from staging.smart_city.e_charging_stations
+ """)[0][0] >= 0
+
+ # We need to let Trino recognize that the lookup tables are small, so that it uses a replicated rather than a partitioned join
+ assert run_query(connection, """
+ analyze staging.taxi.payment_type_lookup
+ """)[0][0] >= 0
+
+ assert run_query(connection, """
+ analyze staging.taxi.rate_code_lookup
+ """)[0][0] >= 0
+
+ assert run_query(connection, """
+ analyze staging.taxi.taxi_zone_lookup
+ """)[0][0] >= 0
+
+ assert run_query(connection, """
+ create table if not exists warehouse.taxi.yellow_tripdata with (
+ partitioning = ARRAY['month(tpep_pickup_datetime)']
+ )
+ as select
+ VendorID as vendor_id,
+ cast(tpep_pickup_datetime as timestamp(6)) as tpep_pickup_datetime,
+ cast(tpep_dropoff_datetime as timestamp(6)) as tpep_dropoff_datetime,
+ cast(passenger_count as BIGINT) as passenger_count,
+ trip_distance,
+ r.rate_code,
+ store_and_fwd_flag,
+ z_pickup.borough as pickup_borough,
+ z_pickup.zone as pickup_zone,
+ z_pickup.service_zone as pickup_service_zone,
+ z_dropoff.borough as dropoff_borough,
+ z_dropoff.zone as dropoff_zone,
+ z_dropoff.service_zone as dropoff_service_zone,
+ p.payment_type as payment_type,
+ fare_amount,
+ extra,
+ mta_tax,
+ tip_amount,
+ tolls_amount,
+ improvement_surcharge,
+ total_amount,
+ congestion_surcharge,
+ airport_fee
+ from staging.taxi.yellow_tripdata as t
+ left join staging.taxi.taxi_zone_lookup as z_pickup on t.pulocationid = cast(z_pickup.location_id as bigint)
+ left join staging.taxi.taxi_zone_lookup as z_dropoff on t.dolocationid = cast(z_dropoff.location_id as bigint)
+ left join staging.taxi.payment_type_lookup as p on t.payment_type = cast(p.payment_type_id as bigint)
+ left join staging.taxi.rate_code_lookup as r on t.ratecodeid = cast(r.rate_code_id as bigint)
+ where tpep_pickup_datetime >= date '2015-01-01' and tpep_pickup_datetime <= now() -- We have to remove some invalid records
+ """)[0][0] >= 0
+
+ assert run_query(connection, """
+ create or replace materialized view warehouse.taxi.yellow_tripdata_daily_agg as
+ select
+ date_trunc('day', tpep_pickup_datetime) as day,
+ pickup_borough,
+ pickup_zone,
+ dropoff_borough,
+ dropoff_zone,
+ payment_type,
+ count(*) as trips,
+ avg(total_amount) as avg_total_amount,
+ sum(total_amount) as sum_total_amount,
+ avg(airport_fee) as avg_airport_fee,
+ sum(airport_fee) as sum_airport_fee,
+ avg(trip_distance) as avg_trip_distance,
+ sum(trip_distance) as sum_trip_distance
+ from warehouse.taxi.yellow_tripdata
+ group by 1, 2, 3, 4, 5, 6
+ """)[0][0] == True
+
+ assert run_query(connection, """
+ create or replace materialized view warehouse.taxi.yellow_tripdata_monthly_agg as
+ select
+ date_trunc('month', day) as month,
+ pickup_borough,
+ pickup_zone,
+ dropoff_borough,
+ dropoff_zone,
+ payment_type,
+ sum(trips) as trips,
+ sum(sum_total_amount) as sum_total_amount,
+ sum(sum_airport_fee) as sum_airport_fee,
+ sum(sum_trip_distance) as sum_trip_distance
+ from warehouse.taxi.yellow_tripdata_daily_agg
+ group by 1, 2, 3, 4, 5, 6
+ """)[0][0] == True
+
+ assert run_query(connection, """
+ REFRESH MATERIALIZED VIEW warehouse.taxi.yellow_tripdata_daily_agg
+ """)[0][0] >= 0
+
+ assert run_query(connection, """
+ REFRESH MATERIALIZED VIEW warehouse.taxi.yellow_tripdata_monthly_agg
+ """)[0][0] >= 0
+
+ # At this point Spark should have created the warehouse.smart_city.shared_bikes_* tables
+
+ assert run_query(connection, """
+ create or replace view warehouse.smart_city.shared_bikes_station_status_joined as
+ select i.name, s.*, i.latitude, i.longitude from warehouse.smart_city.shared_bikes_station_status as s
+ left join warehouse.smart_city.shared_bikes_station_information as i on s.station_id = i.station_id
+ """)[0][0] == True
diff --git a/demos/data-warehouse-iceberg-trino-spark/load-test-data.yaml b/demos/data-warehouse-iceberg-trino-spark/load-test-data.yaml
new file mode 100644
index 00000000..02fde6f2
--- /dev/null
+++ b/demos/data-warehouse-iceberg-trino-spark/load-test-data.yaml
@@ -0,0 +1,15 @@
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+ name: load-test-data
+spec:
+ template:
+ spec:
+ containers:
+ - name: load-test-data
+ image: "bitnami/minio:2022-debian-10"
+ # Please try to order the load jobs from small to large datasets
+ command: ["bash", "-c", "mc --insecure alias set minio http://minio:9000/ demo demodemo && cd /tmp && curl -O https://repo.stackable.tech/repository/misc/datasets/open-postcode-geo/open-postcode-geo.csv && mc cp open-postcode-geo.csv minio/staging/house-sales/postcode-geo-lookup/ && rm open-postcode-geo.csv && for year in {2005..2021}; do curl -O https://repo.stackable.tech/repository/misc/datasets/uk-house-sales/uk-house-sales-$year.csv && mc cp uk-house-sales-$year.csv minio/staging/house-sales/house-sales/ && rm uk-house-sales-$year.csv; done && curl -O https://repo.stackable.tech/repository/misc/earthquake-data/earthquakes_1950_to_2022.csv && mc cp earthquakes_1950_to_2022.csv minio/staging/earthquakes/earthquakes/ && rm earthquakes_1950_to_2022.csv && curl -O https://repo.stackable.tech/repository/misc/datasets/e-charging-stations/e-charging-stations-2022-08.csv && mc cp e-charging-stations-2022-08.csv minio/staging/smart-city/e-charging-stations/ && rm e-charging-stations-2022-08.csv && curl -O https://repo.stackable.tech/repository/misc/ny-taxi-data/taxi_zone_lookup.csv && mc cp taxi_zone_lookup.csv minio/staging/taxi/taxi-zone-lookup/ && rm taxi_zone_lookup.csv && curl -O https://repo.stackable.tech/repository/misc/ny-taxi-data/rate_code_lookup.csv && mc cp rate_code_lookup.csv minio/staging/taxi/rate-code-lookup/ && rm rate_code_lookup.csv && curl -O https://repo.stackable.tech/repository/misc/ny-taxi-data/payment_type_lookup.csv && mc cp payment_type_lookup.csv minio/staging/taxi/payment-type-lookup/ && rm payment_type_lookup.csv && for month in 2021-01 2021-02 2021-03 2021-04 2021-05 2021-06 2021-07 2021-08 2021-09 2021-10 2021-11 2021-12 2022-01 2022-02 2022-03 2022-04 2022-05 2022-06; do curl -O https://repo.stackable.tech/repository/misc/ny-taxi-data/green_tripdata_$month.parquet && mc cp green_tripdata_$month.parquet minio/staging/taxi/green-tripdata/ && rm green_tripdata_$month.parquet; done && for month in 2015-01 2015-02 2015-03 2015-04 2015-05 2015-06 2015-07 2015-08 2015-09 2015-10 2015-11 2015-12 2016-01 2016-02 2016-03 2016-04 2016-05 2016-06 2016-07 2016-08 2016-09 2016-10 2016-11 2016-12 2017-01 2017-02 2017-03 2017-04 2017-05 2017-06 2017-07 2017-08 2017-09 2017-10 2017-11 2017-12 2018-01 2018-02 2018-03 2018-04 2018-05 2018-06 2018-07 2018-08 2018-09 2018-10 2018-11 2018-12 2019-01 2019-02 2019-03 2019-04 2019-05 2019-06 2019-07 2019-08 2019-09 2019-10 2019-11 2019-12 2020-01 2020-02 2020-03 2020-04 2020-05 2020-06 2020-07 2020-08 2020-09 2020-10 2020-11 2020-12 2021-01 2021-02 2021-03 2021-04 2021-05 2021-06 2021-07 2021-08 2021-09 2021-10 2021-11 2021-12 2022-01 2022-02 2022-03 2022-04 2022-05 2022-06; do curl -O https://repo.stackable.tech/repository/misc/ny-taxi-data/yellow_tripdata_$month.parquet && mc cp yellow_tripdata_$month.parquet minio/staging/taxi/yellow-tripdata/ && rm yellow_tripdata_$month.parquet; done && for month in 2020-09 2020-10 2020-11 2020-12 2021-01 2021-02 2021-03 2021-04 2021-05 2021-06 2021-07 2021-08 2021-09 2021-10 2021-11 2021-12 2022-01 2022-02 2022-03 2022-04 2022-05 2022-06; do curl -O https://repo.stackable.tech/repository/misc/ny-taxi-data/fhvhv_tripdata_$month.parquet && mc cp fhvhv_tripdata_$month.parquet minio/staging/taxi/fhvhv-tripdata/ && rm fhvhv_tripdata_$month.parquet; done"]
+ restartPolicy: OnFailure
+ backoffLimit: 50
diff --git a/demos/data-warehouse-iceberg-trino-spark/serviceaccount.yaml b/demos/data-warehouse-iceberg-trino-spark/serviceaccount.yaml
new file mode 100644
index 00000000..cb3ea626
--- /dev/null
+++ b/demos/data-warehouse-iceberg-trino-spark/serviceaccount.yaml
@@ -0,0 +1,50 @@
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+ name: demo-serviceaccount
+ namespace: default
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+ name: demo-clusterrolebinding
+subjects:
+ - kind: ServiceAccount
+ name: demo-serviceaccount
+ namespace: default
+roleRef:
+ kind: ClusterRole
+ name: demo-clusterrole
+ apiGroup: rbac.authorization.k8s.io
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+ name: demo-clusterrole
+rules:
+ - apiGroups:
+ - ""
+ resources:
+ - pods
+ verbs:
+ - get
+ - list
+ - watch
+ - apiGroups:
+ - batch
+ resources:
+ - jobs
+ verbs:
+ - get
+ - list
+ - watch
+ - apiGroups:
+ - spark.stackable.tech
+ resources:
+ - sparkapplications
+ verbs:
+ - get
+ - list
+ - watch
+ - create
diff --git a/demos/data-warehouse-iceberg-trino-spark/setup-superset.yaml b/demos/data-warehouse-iceberg-trino-spark/setup-superset.yaml
new file mode 100644
index 00000000..ecd78e02
--- /dev/null
+++ b/demos/data-warehouse-iceberg-trino-spark/setup-superset.yaml
@@ -0,0 +1,81 @@
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+ name: setup-superset
+spec:
+ template:
+ spec:
+ containers:
+ - name: setup-superset
+ image: docker.stackable.tech/stackable/testing-tools:0.1.0-stackable0.1.0
+ command: ["bash", "-c", "curl -o superset-assets.zip https://raw.githubusercontent.com/stackabletech/stackablectl/demo-data-warehouse-iceberg-trino-spark/demos/data-warehouse-iceberg-trino-spark/superset-assets.zip && python -u /tmp/script/script.py"]
+ volumeMounts:
+ - name: script
+ mountPath: /tmp/script
+ volumes:
+ - name: script
+ configMap:
+ name: setup-superset-script
+ restartPolicy: OnFailure
+ backoffLimit: 50
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+ name: setup-superset-script
+data:
+ script.py: |
+ import logging
+ import requests
+
+ base_url = "http://superset-external:8088" # For local testing / development replace this, afterwards change it back to http://superset-external:8088
+ username = "admin"
+ password = "admin"
+
+ logging.basicConfig(level=logging.INFO)
+ logging.info("Starting setup of Superset")
+
+ logging.info("Getting access token from /api/v1/security/login")
+ session = requests.session()
+ access_token = session.post(f"{base_url}/api/v1/security/login", json={"username": username, "password": password, "provider": "db", "refresh": True}).json()['access_token']
+ # print(f"access_token: {access_token}")
+
+ logging.info("Getting csrf token from /api/v1/security/csrf_token")
+ csrf_token = session.get(f"{base_url}/api/v1/security/csrf_token", headers={"Authorization": f"Bearer {access_token}"}).json()["result"]
+ # print(f"csrf_token: {csrf_token}")
+
+ headers = {
+ "accept": "application/json",
+ "Authorization": f"Bearer {access_token}",
+ "X-CSRFToken": csrf_token,
+ }
+
+ # To retrieve all of the assets (datasources, datasets, charts and dashboards), uncomment and run the following commands
+ # logging.info("Exporting all assets")
+ # result = session.get(f"{base_url}/api/v1/assets/export", headers=headers)
+ # assert result.status_code == 200
+ # with open("superset-assets.zip", "wb") as f:
+ # f.write(result.content)
+
+
+ #########################
+ # IMPORTANT
+ #########################
+ # This will overwrite the druid database that was created from the DruidConnection object.
+ # This is intentional, as the datasets in the assets point to the uuid of the database object.
+ # So we cannot use the provisioned one, which would have a random uuid assigned.
+ #########################
+ logging.info("Importing all assets")
+ files = {
+ "bundle": ("superset-assets.zip", open("superset-assets.zip", "rb")),
+ }
+ data = {
+ "passwords": '{"databases/Trino_warehouse.yaml": "demo"}'
+ }
+ result = session.post(f"{base_url}/api/v1/assets/import", headers=headers, files=files, data=data)
+ print(result)
+ print(result.text)
+ assert result.status_code == 200
+
+ logging.info("Finished setup of Superset")
diff --git a/demos/data-warehouse-iceberg-trino-spark/superset-assets.zip b/demos/data-warehouse-iceberg-trino-spark/superset-assets.zip
new file mode 100644
index 00000000..f2e02f0c
Binary files /dev/null and b/demos/data-warehouse-iceberg-trino-spark/superset-assets.zip differ
diff --git a/demos/demos-v1.yaml b/demos/demos-v1.yaml
index 60154dd1..a14ac6cf 100644
--- a/demos/demos-v1.yaml
+++ b/demos/demos-v1.yaml
@@ -32,7 +32,7 @@ demos:
- superset
- minio
- s3
- - earthquake
+ - earthquakes
manifests:
- plainYaml: https://raw.githubusercontent.com/stackabletech/stackablectl/main/demos/nifi-kafka-druid-earthquake-data/create-nifi-ingestion-job.yaml
- plainYaml: https://raw.githubusercontent.com/stackabletech/stackablectl/main/demos/nifi-kafka-druid-earthquake-data/create-druid-ingestion-job.yaml
@@ -67,3 +67,26 @@ demos:
- plainYaml: https://raw.githubusercontent.com/stackabletech/stackablectl/main/demos/trino-taxi-data/load-test-data.yaml
- plainYaml: https://raw.githubusercontent.com/stackabletech/stackablectl/main/demos/trino-taxi-data/create-table-in-trino.yaml
- plainYaml: https://raw.githubusercontent.com/stackabletech/stackablectl/main/demos/trino-taxi-data/setup-superset.yaml
+ data-warehouse-iceberg-trino-spark:
+ description: Data warehouse using an Iceberg-based warehouse on S3, Trino as the query engine, Spark for streaming ingest and Superset for data visualization. Multiple data sources such as taxi data, water levels in Germany, earthquakes, e-charging stations and more are loaded.
+ documentation: https://docs.stackable.tech/stackablectl/stable/demos/data-warehouse-iceberg-trino-spark.html
+ stackableStack: data-warehouse-iceberg-trino-spark
+ labels:
+ - iceberg
+ - trino
+ - spark
+ - superset
+ - kafka
+ - nifi
+ - minio
+ - s3
+ - ny-taxi-data
+ - water-levels
+ - earthquakes
+ manifests:
+ - plainYaml: demos/data-warehouse-iceberg-trino-spark/serviceaccount.yaml
+ - plainYaml: demos/data-warehouse-iceberg-trino-spark/load-test-data.yaml
+ - plainYaml: demos/data-warehouse-iceberg-trino-spark/create-trino-tables.yaml
+ - plainYaml: demos/data-warehouse-iceberg-trino-spark/create-nifi-ingestion-job.yaml
+ - plainYaml: demos/data-warehouse-iceberg-trino-spark/create-spark-ingestion-job.yaml
+ - plainYaml: demos/data-warehouse-iceberg-trino-spark/setup-superset.yaml
diff --git a/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/dbeaver_1.png b/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/dbeaver_1.png
new file mode 100644
index 00000000..1dfb8e12
Binary files /dev/null and b/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/dbeaver_1.png differ
diff --git a/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/dbeaver_2.png b/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/dbeaver_2.png
new file mode 100644
index 00000000..3dbf0c89
Binary files /dev/null and b/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/dbeaver_2.png differ
diff --git a/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/dbeaver_3.png b/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/dbeaver_3.png
new file mode 100644
index 00000000..1dab5dd1
Binary files /dev/null and b/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/dbeaver_3.png differ
diff --git a/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/minio_1.png b/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/minio_1.png
new file mode 100644
index 00000000..95f50815
Binary files /dev/null and b/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/minio_1.png differ
diff --git a/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/minio_2.png b/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/minio_2.png
new file mode 100644
index 00000000..52742780
Binary files /dev/null and b/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/minio_2.png differ
diff --git a/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/minio_3.png b/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/minio_3.png
new file mode 100644
index 00000000..c879fd01
Binary files /dev/null and b/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/minio_3.png differ
diff --git a/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/minio_4.png b/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/minio_4.png
new file mode 100644
index 00000000..0ff200c8
Binary files /dev/null and b/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/minio_4.png differ
diff --git a/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/minio_5.png b/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/minio_5.png
new file mode 100644
index 00000000..f067af26
Binary files /dev/null and b/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/minio_5.png differ
diff --git a/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/nifi_1.png b/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/nifi_1.png
new file mode 100644
index 00000000..da238a03
Binary files /dev/null and b/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/nifi_1.png differ
diff --git a/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/nifi_2.png b/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/nifi_2.png
new file mode 100644
index 00000000..6ff2f5ea
Binary files /dev/null and b/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/nifi_2.png differ
diff --git a/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/overview.png b/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/overview.png
new file mode 100644
index 00000000..331c53d4
Binary files /dev/null and b/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/overview.png differ
diff --git a/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/spark_1.png b/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/spark_1.png
new file mode 100644
index 00000000..d2eddf86
Binary files /dev/null and b/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/spark_1.png differ
diff --git a/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/spark_2.png b/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/spark_2.png
new file mode 100644
index 00000000..4777652e
Binary files /dev/null and b/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/spark_2.png differ
diff --git a/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/spark_3.png b/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/spark_3.png
new file mode 100644
index 00000000..d10ba708
Binary files /dev/null and b/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/spark_3.png differ
diff --git a/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/superset_1.png b/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/superset_1.png
new file mode 100644
index 00000000..71ba38a9
Binary files /dev/null and b/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/superset_1.png differ
diff --git a/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/superset_2.png b/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/superset_2.png
new file mode 100644
index 00000000..2c963477
Binary files /dev/null and b/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/superset_2.png differ
diff --git a/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/superset_3.png b/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/superset_3.png
new file mode 100644
index 00000000..aecaef45
Binary files /dev/null and b/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/superset_3.png differ
diff --git a/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/superset_4.png b/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/superset_4.png
new file mode 100644
index 00000000..37fb0b15
Binary files /dev/null and b/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/superset_4.png differ
diff --git a/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/superset_5.png b/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/superset_5.png
new file mode 100644
index 00000000..b5fb7830
Binary files /dev/null and b/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/superset_5.png differ
diff --git a/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/superset_6.png b/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/superset_6.png
new file mode 100644
index 00000000..56a7857d
Binary files /dev/null and b/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/superset_6.png differ
diff --git a/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/superset_7.png b/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/superset_7.png
new file mode 100644
index 00000000..4102cd75
Binary files /dev/null and b/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/superset_7.png differ
diff --git a/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/superset_8.png b/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/superset_8.png
new file mode 100644
index 00000000..ba815341
Binary files /dev/null and b/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/superset_8.png differ
diff --git a/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/superset_9.png b/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/superset_9.png
new file mode 100644
index 00000000..c4174484
Binary files /dev/null and b/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/superset_9.png differ
diff --git a/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/topics.png b/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/topics.png
new file mode 100644
index 00000000..ee9107d3
Binary files /dev/null and b/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/topics.png differ
diff --git a/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/trino_1.png b/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/trino_1.png
new file mode 100644
index 00000000..ad9477ff
Binary files /dev/null and b/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/trino_1.png differ
diff --git a/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/trino_2.png b/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/trino_2.png
new file mode 100644
index 00000000..23c5f860
Binary files /dev/null and b/docs/modules/ROOT/images/demo-data-warehouse-iceberg-trino-spark/trino_2.png differ
diff --git a/docs/modules/ROOT/nav.adoc b/docs/modules/ROOT/nav.adoc
index 80b14dce..87ede21e 100644
--- a/docs/modules/ROOT/nav.adoc
+++ b/docs/modules/ROOT/nav.adoc
@@ -8,6 +8,7 @@
** xref:commands/stack.adoc[]
* xref:demos/index.adoc[]
** xref:demos/airflow-scheduled-job.adoc[]
+** xref:demos/data-warehouse-iceberg-trino-spark.adoc[]
** xref:demos/hbase-hdfs-load-cycling-data.adoc[]
** xref:demos/nifi-kafka-druid-earthquake-data.adoc[]
** xref:demos/nifi-kafka-druid-water-level-data.adoc[]
diff --git a/docs/modules/ROOT/pages/demos/airflow-scheduled-job.adoc b/docs/modules/ROOT/pages/demos/airflow-scheduled-job.adoc
index f243d67a..23e090f5 100644
--- a/docs/modules/ROOT/pages/demos/airflow-scheduled-job.adoc
+++ b/docs/modules/ROOT/pages/demos/airflow-scheduled-job.adoc
@@ -2,7 +2,7 @@
[NOTE]
====
-This guide assumes you already have the demo `airflow-scheduled-job` installed.
+This guide assumes that you already have the demo `airflow-scheduled-job` installed.
If you don't have it installed please follow the xref:commands/demo.adoc#_install_demo[documentation on how to install a demo].
To put it simply you have to run `stackablectl demo install airflow-scheduled-job`.
====
diff --git a/docs/modules/ROOT/pages/demos/data-warehouse-iceberg-trino-spark.adoc b/docs/modules/ROOT/pages/demos/data-warehouse-iceberg-trino-spark.adoc
new file mode 100644
index 00000000..4e7890a0
--- /dev/null
+++ b/docs/modules/ROOT/pages/demos/data-warehouse-iceberg-trino-spark.adoc
@@ -0,0 +1,275 @@
+= data-warehouse-iceberg-trino-spark
+
+[WARNING]
+====
+This demo uses a significant amount of resources. It will most likely not run on your workstation.
+It was developed and tested on 10 nodes, each with 4 cores (8 threads), 20GB RAM and 30GB HDD.
+Additionally, persistent volumes with a total size of approximately 1TB will be created.
+A smaller version of this demo might be created in the future.
+====
+
+[NOTE]
+====
+This guide assumes that you already have the demo `data-warehouse-iceberg-trino-spark` installed.
+If you don't have it installed please follow the xref:commands/demo.adoc#_install_demo[documentation on how to install a demo].
+To put it simply you have to run `stackablectl demo install data-warehouse-iceberg-trino-spark`.
+====
+
+This demo will
+
+* Install the required Stackable operators
+* Spin up the following data products
+** *Trino*: A fast distributed SQL query engine for big data analytics that helps you explore your data universe. This demo uses it to enable SQL access to the data
+** *Spark*: A multi-language engine for executing data engineering, data science, and machine learning. This demo uses it to stream data from Kafka into the warehouse
+** *MinIO*: A S3 compatible object store. This demo uses it as persistent storage to store all the data used
+** *Kafka*: A distributed event streaming platform for high-performance data pipelines, streaming analytics and data integration. This demos uses it as an event streaming platform to stream the data in near real-time
+** *NiFi*: An easy-to-use, powerful system to process and distribute data. This demo uses it to fetch multiple online real-time data sources and ingest the data into Kafka
+** *Hive metastore*: A service that stores metadata related to Apache Hive and other services. This demo uses it as metadata storage for Trino and Spark
+** *Open policy agent* (OPA): An open source, general-purpose policy engine that unifies policy enforcement across the stack. This demo uses it as the authorizer for Trino, which decides which user is able to query which data.
+** *Superset*: A modern data exploration and visualization platform. This demo utilizes Superset to retrieve data from Trino via SQL queries and build dashboards on top of that data
+* Copy multiple data sources in CSV and Parquet format into the S3 staging area
+* Let Trino copy the data from the staging area into the warehouse area. During the copy, transformations such as validating, casting, parsing timestamps and enriching the data by joining lookup tables are performed (see the sketch after this list)
+* Simultaneously start a NiFi workflow, which fetches datasets in real-time via the internet and ingests the data as JSON records into Kafka
+* Start a Spark Structured Streaming job, which streams the data out of Kafka into the warehouse
+* Create Superset dashboards for visualization of the different datasets
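+
+The staging-to-warehouse copy mentioned above boils down to `INSERT INTO ... SELECT` statements executed by Trino.
+The following is only a minimal sketch of that pattern; the catalog, schema, table and column names are illustrative and not taken from the demo:
+
+[source,sql]
+----
+-- hypothetical staging-to-warehouse copy; all names are illustrative
+insert into warehouse.house_sales.house_sales
+select
+    cast(s.price as bigint) as price,                      -- casting
+    cast(s.date_of_transfer as date) as date_of_transfer,  -- parsing timestamps
+    s.postcode,
+    l.county                                               -- enrichment via a lookup table
+from staging.house_sales.house_sales_raw as s
+left join staging.house_sales.postcode_lookup as l
+    on s.postcode = l.postcode
+where s.price is not null                                  -- simple validation
+----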
+
+You can see the deployed products as well as their relationship in the following diagram:
+
+image::demo-data-warehouse-iceberg-trino-spark/overview.png[]
+
+== Apache Iceberg
+As Apache Iceberg states on its https://iceberg.apache.org/docs/latest/[website]:
+
+> Apache Iceberg is an open table format for huge analytic datasets. Iceberg adds tables to compute engines including Spark, Trino, PrestoDB, Flink, Hive and Impala using a high-performance table format that works just like a SQL table.
+
+This demo uses Iceberg because it works nicely with object storage and offers both a Trino and a Spark integration.
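+
+In Trino, an Iceberg table in the `warehouse` catalog can be created with a plain `CREATE TABLE` statement.
+This is only a sketch of what such a table definition can look like; the schema, table and column names as well as the partitioning are assumptions for illustration:
+
+[source,sql]
+----
+-- hypothetical Iceberg table definition; all names are illustrative
+create table warehouse.water_levels.measurements (
+    station_uuid varchar,
+    measured_at  timestamp(6) with time zone,
+    value        double
+)
+with (
+    format = 'PARQUET',
+    partitioning = ARRAY['day(measured_at)']
+)
+----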
+
+== List deployed Stackable services
+To list the installed Stackable services run the following command:
+
+[source,console]
+----
+$ stackablectl services list --all-namespaces
+ PRODUCT NAME NAMESPACE ENDPOINTS EXTRA INFOS
+
+ hive hive default hive 212.227.224.138:31022
+ metrics 212.227.224.138:30459
+
+ hive hive-iceberg default hive 212.227.233.131:31511
+ metrics 212.227.233.131:30003
+
+ kafka kafka default metrics 217.160.118.190:32160
+ kafka 217.160.118.190:31736
+
+ nifi nifi default https https://217.160.120.117:31499 Admin user: admin, password: adminadmin
+
+ opa opa default http http://217.160.222.211:31767
+
+ superset superset default external-superset http://212.227.233.47:32393 Admin user: admin, password: admin
+
+ trino trino default coordinator-metrics 212.227.224.138:30610
+ coordinator-https https://212.227.224.138:30876
+
+ zookeeper zookeeper default zk 212.227.224.138:32321
+
+ minio minio default http http://217.160.222.211:32031 Third party service
+ console-http http://217.160.222.211:31429 Admin user: admin, password: adminadmin
+----
+
+[NOTE]
+====
+When a product instance has not finished starting yet, the service will have no endpoint.
+Starting all the product instances might take a considerable amount of time depending on your internet connectivity.
+In case the product is not ready yet a warning might be shown.
+====
+
+== MinIO
+=== List buckets
+The S3 provided by MinIO is used as persistent storage to store all the data used.
+Open the `minio` endpoint `console-http` retrieved by `stackablectl services list` in your browser (http://217.160.222.211:31429 in this case).
+
+image::demo-data-warehouse-iceberg-trino-spark/minio_1.png[]
+
+Log in with the username `admin` and password `adminadmin`.
+
+image::demo-data-warehouse-iceberg-trino-spark/minio_2.png[]
+
+Here you can see the two buckets contained in the S3:
+
+1. `staging`: The demo loads static datasets into this area. The data is stored in different formats, such as CSV and Parquet. It contains actual data tables as well as lookup tables.
+2. `warehouse`: This bucket is where the cleaned and/or aggregated data resides. The data is stored in the https://iceberg.apache.org/[Apache Iceberg] table format.
+
+=== Inspect warehouse
+Click on the blue button `Browse` on the bucket `warehouse`.
+
+image::demo-data-warehouse-iceberg-trino-spark/minio_3.png[]
+
+You can see multiple folders (called prefixes in S3) - each containing a different dataset.
+
+Click on the folder `house-sales`, then on the folder starting with `house-sales-*` and then on `data`.
+
+image::demo-data-warehouse-iceberg-trino-spark/minio_4.png[]
+
+As you can see, the table `house-sales` is partitioned by day.
+Go ahead and click on any folder.
+
+image::demo-data-warehouse-iceberg-trino-spark/minio_5.png[]
+
+You can see that Trino has placed a single file here containing all the house sales of that particular year.
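+
+If you prefer SQL over clicking through the bucket, the partition layout can also be inspected via the Iceberg metadata tables in Trino.
+The following sketch assumes a schema and table both named `house_sales`; adjust the names to what your installation actually contains:
+
+[source,sql]
+----
+-- hypothetical partition inspection via the Iceberg metadata table
+select "partition", record_count, file_count
+from warehouse.house_sales."house_sales$partitions"
+----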
+
+== NiFi
+
+NiFi is used to fetch multiple data sources from the internet and ingest the data into Kafka in near real-time.
+Some data sources are statically downloaded (e.g. as CSV) and others are dynamically fetched via APIs such as REST APIs.
+This includes the following data sources:
+
+* https://www.pegelonline.wsv.de/webservice/guideRestapi[Water level measurements in Germany] (real-time)
+* https://mobidata-bw.de/dataset/bikesh[Shared bikes in Germany] (real-time)
+* https://www.gov.uk/government/statistical-data-sets/price-paid-data-downloads[House sales in UK] (static)
+* https://www.usgs.gov/programs/earthquake-hazards/earthquakes[Registered earthquakes worldwide] (static)
+* https://mobidata-bw.de/dataset/e-ladesaulen[E-charging stations in Germany] (static)
+* https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page[New York taxi data] (static)
+
+=== View ingestion jobs
+You can have a look at the ingestion job running in NiFi by opening the given `nifi` endpoint `https` from your `stackablectl services list` command output (https://217.160.120.117:31499 in this case).
+If you get a warning regarding the self-signed certificate generated by the xref:secret-operator::index.adoc[Secret Operator] (e.g. `Warning: Potential Security Risk Ahead`), you have to tell your browser to trust the website and continue.
+
+image::demo-data-warehouse-iceberg-trino-spark/nifi_1.png[]
+
+Log in with the username `admin` and password `adminadmin`.
+
+image::demo-data-warehouse-iceberg-trino-spark/nifi_2.png[]
+
+As you can see, the NiFi workflow consists of lots of components.
+You can zoom in by using your mouse and mouse wheel.
+On the left side there are two strands, which
+
+1. Fetch the list of known water-level stations and ingest them into Kafka
+2. Continuously run a loop fetching the measurements of the last 30 for every measuring station and ingesting the measurements into Kafka
+
+On the right side there are three strands, which
+
+1. Fetch the current shared bike stations information
+2. Fetch the current shared bike stations status
+3. Fetch the current shared bike status
+
+For details on the NiFi workflow ingesting water-level data please read the xref:demos/nifi-kafka-druid-water-level-data.adoc#_nifi[nifi-kafka-druid-water-level-data documentation on NiFi].
+
+== Spark
+
+https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html[Spark Structured Streaming] is used to stream data from Kafka into the warehouse.
+
+To access the Spark WebUI, run the following command to port-forward port 4040 to your local machine:
+
+[source,console]
+----
+kubectl port-forward $(kubectl get pod -o name | grep 'spark-ingest-into-warehouse-.*-driver') 4040
+----
+
+Afterwards you can reach the web interface at http://localhost:4040.
+
+image::demo-data-warehouse-iceberg-trino-spark/spark_1.png[]
+
+The UI shows the most recent Spark jobs.
+Each running Structured Streaming job creates lots of Spark jobs internally.
+
+Click on the tab `Structured Streaming` to see the running streaming jobs.
+
+image::demo-data-warehouse-iceberg-trino-spark/spark_2.png[]
+
+Five streaming jobs are currently running.
+The job with the highest throughput is the `ingest water_level measurements` job.
+Click on the `Run ID` highlighted in blue.
+
+image::demo-data-warehouse-iceberg-trino-spark/spark_3.png[]
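+
+If you want to double-check that the streaming job is actually writing records into the warehouse, you can run a simple count query via Trino or Superset (both are described below).
+The schema and table names in this sketch are assumptions for illustration only:
+
+[source,sql]
+----
+-- hypothetical sanity check; schema and table names are illustrative
+select count(*) as measurements
+from warehouse.water_levels.measurements
+----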
+
+== Trino
+Trino is used to enable SQL access to the data.
+
+=== View WebUI
+Open up the given `trino` endpoint `coordinator-https` from your `stackablectl services list` command output (https://212.227.224.138:30876 in this case).
+
+image::demo-data-warehouse-iceberg-trino-spark/trino_1.png[]
+
+Log in with the username `admin` and password `admin`.
+
+image::demo-data-warehouse-iceberg-trino-spark/trino_2.png[]
+
+=== Connect with DBeaver
+https://dbeaver.io/[DBeaver] is a free, multi-platform database tool that can be used to connect to Trino.
+Please have a look at the trino-operator documentation on how to connect DBeaver to Trino.
+
+image::demo-data-warehouse-iceberg-trino-spark/dbeaver_1.png[]
+
+image::demo-data-warehouse-iceberg-trino-spark/dbeaver_2.png[]
+You need to set the setting `TLS` to `true`.
+Additionally, you need to add the setting `SSLVerification` and set it to `NONE`.
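+
+For reference, with the standard Trino JDBC driver these two settings correspond to the URL parameters `SSL=true` and `SSLVerification=NONE`, so the resulting connection URL looks roughly like the following sketch (host and port taken from your `stackablectl services list` output; exact property names can differ between driver versions):
+
+[source]
+----
+jdbc:trino://212.227.224.138:30876?SSL=true&SSLVerification=NONE
+----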
+
+image::demo-data-warehouse-iceberg-trino-spark/dbeaver_3.png[]
+
+Here you can see all the available Trino catalogs.
+
+* `staging`: The staging area containing raw data in various data formats such as CSV or Parquet
+* `system`: Internal catalog to retrieve Trino internals
+* `tpcds`: https://trino.io/docs/current/connector/tpcds.html[TPCDS connector] providing a set of schemas to support the http://www.tpc.org/tpcds/[TPC Benchmark™ DS]
+* `tpch`: https://trino.io/docs/current/connector/tpch.html[TPCH connector] providing a set of schemas to support the http://www.tpc.org/tpch/[TPC Benchmark™ H]
+* `warehouse`: The warehouse area containing the enriched data in an efficiently queryable form
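+
+Once connected, you can get a quick overview of what is available with standard metadata statements, for example (the `house_sales` schema is the one used again in the Superset section below):
+
+[source,sql]
+----
+show catalogs;
+show schemas from warehouse;
+show tables from warehouse.house_sales;
+----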
+
+== Superset
+Superset provides the ability to execute SQL queries and build dashboards.
+Open the `superset` endpoint `external-superset` in your browser (http://212.227.233.47:32393 in this case).
+
+image::demo-data-warehouse-iceberg-trino-spark/superset_1.png[]
+
+Log in with the username `admin` and password `admin`.
+
+image::demo-data-warehouse-iceberg-trino-spark/superset_2.png[]
+
+=== View dashboard
+The demo has created dashboards to visualize the different data sources.
+To view the dashboards, click on the tab `Dashboards` at the top.
+
+image::demo-data-warehouse-iceberg-trino-spark/superset_3.png[]
+
+Click on the dashboard called `House sales`.
+It might take some time until the dashboard renders all the included charts.
+
+image::demo-data-warehouse-iceberg-trino-spark/superset_4.png[]
+
+Another dashboard to look at is `Earthquakes`.
+
+image::demo-data-warehouse-iceberg-trino-spark/superset_5.png[]
+
+Another dashboard to look at is `Taxi trips`.
+
+image::demo-data-warehouse-iceberg-trino-spark/superset_6.png[]
+
+There are multiple other dashboards you can explore on your own.
+
+=== View charts
+
+The dashboards consist of multiple charts.
+To list the charts click on the tab `Charts` at the top.
+
+=== Execute arbitrary SQL statements
+Within Superset you can not only create dashboards but also run arbitrary SQL statements.
+On the top click on the tab `SQL Lab` -> `SQL Editor`.
+
+image::demo-data-warehouse-iceberg-trino-spark/superset_7.png[]
+
+On the left select the database `Trino warehouse`, the schema `house_sales` and set `See table schema` to `house_sales`.
+
+image::demo-data-warehouse-iceberg-trino-spark/superset_8.png[]
+
+In the textbox on the right, enter the desired SQL statement.
+If you do not want to make one up, you can use the following:
+
+[source,sql]
+----
+select city, sum(price) as sales
+from house_sales
+group by 1
+order by 2 desc
+----
+
+image::demo-data-warehouse-iceberg-trino-spark/superset_9.png[]
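+
+If you want to keep exploring, you can vary the query.
+The following sketch only uses the `city` and `price` columns already shown above and lists the ten cities with the highest average sale price:
+
+[source,sql]
+----
+select city, avg(price) as avg_price, count(*) as sales
+from house_sales
+group by 1
+order by 2 desc
+limit 10
+----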
diff --git a/docs/modules/ROOT/pages/demos/hbase-hdfs-load-cycling-data.adoc b/docs/modules/ROOT/pages/demos/hbase-hdfs-load-cycling-data.adoc
index 8842206e..ae42eb72 100644
--- a/docs/modules/ROOT/pages/demos/hbase-hdfs-load-cycling-data.adoc
+++ b/docs/modules/ROOT/pages/demos/hbase-hdfs-load-cycling-data.adoc
@@ -2,7 +2,7 @@
[NOTE]
====
-This guide assumes you already have the demo `hbase-hdfs-load-cycling-data` installed.
+This guide assumes that you already have the demo `hbase-hdfs-load-cycling-data` installed.
If you don't have it installed please follow the xref:commands/demo.adoc#_install_demo[documentation on how to install a demo].
To put it simply you have to run `stackablectl demo install hbase-hdfs-load-cycling-data`.
====
@@ -55,7 +55,7 @@ PRODUCT NAME NAMESPACE ENDPOINTS
[NOTE]
====
When a product instance has not finished starting yet, the service will have no endpoint.
-Starting all of the product instances might take a considerable amount of time depending on your internet connectivity.
+Starting all the product instances might take a considerable amount of time depending on your internet connectivity.
In case the product is not ready yet a warning might be shown.
====
diff --git a/docs/modules/ROOT/pages/demos/nifi-kafka-druid-earthquake-data.adoc b/docs/modules/ROOT/pages/demos/nifi-kafka-druid-earthquake-data.adoc
index 32f169b1..ccfa9bda 100644
--- a/docs/modules/ROOT/pages/demos/nifi-kafka-druid-earthquake-data.adoc
+++ b/docs/modules/ROOT/pages/demos/nifi-kafka-druid-earthquake-data.adoc
@@ -2,7 +2,7 @@
[NOTE]
====
-This guide assumes you already have the demo `nifi-kafka-druid-earthquake-data` installed.
+This guide assumes that you already have the demo `nifi-kafka-druid-earthquake-data` installed.
If you don't have it installed please follow the xref:commands/demo.adoc#_install_demo[documentation on how to install a demo].
To put it simply you have to run `stackablectl demo install nifi-kafka-druid-earthquake-data`.
====
@@ -55,7 +55,7 @@ $ stackablectl services list --all-namespaces
[NOTE]
====
When a product instance has not finished starting yet, the service will have no endpoint.
-Starting all of the product instances might take a considerable amount of time depending on your internet connectivity.
+Starting all the product instances might take a considerable amount of time depending on your internet connectivity.
In case the product is not ready yet a warning might be shown.
====
diff --git a/docs/modules/ROOT/pages/demos/nifi-kafka-druid-water-level-data.adoc b/docs/modules/ROOT/pages/demos/nifi-kafka-druid-water-level-data.adoc
index af6b4036..ef475ed6 100644
--- a/docs/modules/ROOT/pages/demos/nifi-kafka-druid-water-level-data.adoc
+++ b/docs/modules/ROOT/pages/demos/nifi-kafka-druid-water-level-data.adoc
@@ -2,7 +2,7 @@
[NOTE]
====
-This guide assumes you already have the demo `nifi-kafka-druid-water-level-data` installed.
+This guide assumes that you already have the demo `nifi-kafka-druid-water-level-data` installed.
If you don't have it installed please follow the xref:commands/demo.adoc#_install_demo[documentation on how to install a demo].
To put it simply you have to run `stackablectl demo install nifi-kafka-druid-water-level-data`.
====
@@ -57,7 +57,7 @@ $ stackablectl services list --all-namespaces
[NOTE]
====
When a product instance has not finished starting yet, the service will have no endpoint.
-Starting all of the product instances might take a considerable amount of time depending on your internet connectivity.
+Starting all the product instances might take a considerable amount of time depending on your internet connectivity.
In case the product is not ready yet a warning might be shown.
====
diff --git a/docs/modules/ROOT/pages/demos/trino-taxi-data.adoc b/docs/modules/ROOT/pages/demos/trino-taxi-data.adoc
index 9d1f5ecf..af6f1967 100644
--- a/docs/modules/ROOT/pages/demos/trino-taxi-data.adoc
+++ b/docs/modules/ROOT/pages/demos/trino-taxi-data.adoc
@@ -2,7 +2,7 @@
[NOTE]
====
-This guide assumes you already have the demo `trino-taxi-data` installed.
+This guide assumes that you already have the demo `trino-taxi-data` installed.
If you don't have it installed please follow the xref:commands/demo.adoc#_install_demo[documentation on how to install a demo].
To put it simply you have to run `stackablectl demo install trino-taxi-data`.
====
@@ -15,7 +15,7 @@ This demo will
** *Trino*: A fast distributed SQL query engine for big data analytics that helps you explore your data universe. This demo uses it to enable SQL access to the data
** *MinIO*: A S3 compatible object store. This demo uses it as persistent storage to store all the data used
** *Hive metastore*: A service that stores metadata related to Apache Hive and other services. This demo uses it as metadata storage for Trino
-** *Open policy agent* (OPA): A open source, general-purpose policy engine that unifies policy enforcement across the stack. This demo uses it as the authorizer for Trino, which decides which user is able to query which data.
+** *Open policy agent* (OPA): An open source, general-purpose policy engine that unifies policy enforcement across the stack. This demo uses it as the authorizer for Trino, which decides which user is able to query which data.
* Load testdata into S3. It contains 2.5 years of https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page[New York City taxi trips]
* Make data accessible via SQL in Trino
* Create Superset dashboards for visualization of the data
@@ -50,7 +50,7 @@ $ stackablectl services list --all-namespaces
[NOTE]
====
When a product instance has not finished starting yet, the service will have no endpoint.
-Starting all of the product instances might take an considerable amount of time depending on your internet connectivity.
+Starting all the product instances might take a considerable amount of time depending on your internet connectivity.
In case the product is not ready yet a warning might be shown.
====
diff --git a/releases.yaml b/releases.yaml
index 2f6e3241..11f9d15b 100644
--- a/releases.yaml
+++ b/releases.yaml
@@ -1,40 +1,34 @@
---
releases:
- This version of stackablectl has been deprecated, please update:
- releaseDate: 2022-06-02
- description:
- products:
- commons:
- operatorVersion: 0.1.0
- 22.06:
- releaseDate: 2022-06-30
- description: First official release of the Stackable Data Platform
+ 22.09-latest-trino-spark:
+ releaseDate: 2022-09-09
+ description: Second release focusing on security and OpenShift support
products:
airflow:
- operatorVersion: 0.4.0
+ operatorVersion: 0.5.0
commons:
- operatorVersion: 0.2.0
+ operatorVersion: 0.3.0
druid:
- operatorVersion: 0.6.0
+ operatorVersion: 0.7.0
hbase:
- operatorVersion: 0.3.0
- hdfs:
operatorVersion: 0.4.0
+ hdfs:
+ operatorVersion: 0.5.0
hive:
- operatorVersion: 0.6.0
+ operatorVersion: 0.7.0
kafka:
- operatorVersion: 0.6.0
+ operatorVersion: 0.7.0
nifi:
- operatorVersion: 0.6.0
+ operatorVersion: 0.7.0
opa:
- operatorVersion: 0.9.0
+ operatorVersion: 0.10.0
secret:
operatorVersion: 0.5.0
spark-k8s:
- operatorVersion: 0.3.0
+ operatorVersion: 0.6.0-nightly # Needed for resources
superset:
- operatorVersion: 0.5.0
+ operatorVersion: 0.6.0
trino:
- operatorVersion: 0.4.0
+ operatorVersion: 0.8.0-nightly # Needed for iceberg
zookeeper:
- operatorVersion: 0.10.0
+ operatorVersion: 0.11.0
diff --git a/stacks/data-warehouse-iceberg-trino-spark/hive-metastores.yaml b/stacks/data-warehouse-iceberg-trino-spark/hive-metastores.yaml
new file mode 100644
index 00000000..53f7637c
--- /dev/null
+++ b/stacks/data-warehouse-iceberg-trino-spark/hive-metastores.yaml
@@ -0,0 +1,69 @@
+---
+apiVersion: hive.stackable.tech/v1alpha1
+kind: HiveCluster
+metadata:
+ name: hive
+spec:
+ version: 3.1.3-stackable0.1.0
+ s3:
+ reference: hive
+ metastore:
+ roleGroups:
+ default:
+ replicas: 1
+ config:
+ database:
+ connString: jdbc:postgresql://postgresql-hive:5432/hive
+ user: hive
+ password: hive
+ dbType: postgres
+---
+apiVersion: hive.stackable.tech/v1alpha1
+kind: HiveCluster
+metadata:
+ name: hive-iceberg
+spec:
+ version: 3.1.3-stackable0.1.0
+ s3:
+ reference: hive
+ metastore:
+ roleGroups:
+ default:
+ replicas: 1
+ config:
+ database:
+ connString: jdbc:postgresql://postgresql-hive-iceberg:5432/hive
+ user: hive
+ password: hive
+ dbType: postgres
+---
+apiVersion: s3.stackable.tech/v1alpha1
+kind: S3Connection
+metadata:
+ name: hive
+spec:
+ host: minio
+ port: 9000
+ accessStyle: Path
+ credentials:
+ secretClass: hive-s3-credentials
+---
+apiVersion: secrets.stackable.tech/v1alpha1
+kind: SecretClass
+metadata:
+ name: hive-s3-credentials
+spec:
+ backend:
+ k8sSearch:
+ searchNamespace:
+ pod: {}
+---
+apiVersion: v1
+kind: Secret
+metadata:
+ name: hive-s3-credentials
+ labels:
+ secrets.stackable.tech/class: hive-s3-credentials
+stringData:
+ accessKey: hive
+ secretKey: hivehive
diff --git a/stacks/data-warehouse-iceberg-trino-spark/kafka.yaml b/stacks/data-warehouse-iceberg-trino-spark/kafka.yaml
new file mode 100644
index 00000000..61743173
--- /dev/null
+++ b/stacks/data-warehouse-iceberg-trino-spark/kafka.yaml
@@ -0,0 +1,42 @@
+---
+apiVersion: zookeeper.stackable.tech/v1alpha1
+kind: ZookeeperZnode
+metadata:
+ name: kafka-znode
+spec:
+ clusterRef:
+ name: zookeeper
+---
+apiVersion: kafka.stackable.tech/v1alpha1
+kind: KafkaCluster
+metadata:
+ name: kafka
+spec:
+ version: 3.2.0-stackable0.1.0
+ zookeeperConfigMapName: kafka-znode
+ config:
+ tls: null
+ brokers:
+ config:
+ resources:
+ storage:
+ logDirs:
+ capacity: 50Gi
+ cpu:
+ min: "1500m"
+ max: "3"
+ memory:
+ limit: 2Gi
+ roleGroups:
+ default:
+ replicas: 5
+ configOverrides:
+ server.properties:
+ num.partitions: "27"
+ # We have
+ # 5 brokers
+ # and 1 topic (with 27 partitions) with large volume of data
+ # 50Gi pvc * 5 brokers = 250Gi total disk size.
+          # Let's aim for 50% => 0.5 * 250Gi = 125Gi / 27 partitions = 4.6Gi
+          log.segment.bytes: "50000000" # 50MB
+ log.retention.bytes: "4000000000" # 4 GB. Should keep between 4.0 and 4.5GB
diff --git a/stacks/data-warehouse-iceberg-trino-spark/nifi.yaml b/stacks/data-warehouse-iceberg-trino-spark/nifi.yaml
new file mode 100644
index 00000000..67d97a08
--- /dev/null
+++ b/stacks/data-warehouse-iceberg-trino-spark/nifi.yaml
@@ -0,0 +1,54 @@
+---
+apiVersion: nifi.stackable.tech/v1alpha1
+kind: NifiCluster
+metadata:
+ name: nifi
+spec:
+ version: 1.16.3-stackable0.1.0
+ zookeeperConfigMapName: nifi-znode
+ config:
+ authentication:
+ method:
+ singleUser:
+ adminCredentialsSecret: nifi-admin-credentials
+ sensitiveProperties:
+ keySecret: nifi-sensitive-property-key
+ autoGenerate: true
+ nodes:
+ config:
+ resources:
+ memory:
+ limit: '6Gi'
+ cpu:
+ min: "2"
+ max: "4"
+ storage:
+ contentRepo:
+ capacity: "10Gi"
+ databaseRepo:
+ capacity: "5Gi"
+ flowfileRepo:
+ capacity: "5Gi"
+ provenanceRepo:
+ capacity: "5Gi"
+ stateRepo:
+ capacity: "5Gi"
+ roleGroups:
+ default:
+ replicas: 2
+---
+apiVersion: v1
+kind: Secret
+metadata:
+ name: nifi-admin-credentials
+stringData:
+ username: admin
+ password: adminadmin
+---
+apiVersion: zookeeper.stackable.tech/v1alpha1
+kind: ZookeeperZnode
+metadata:
+ name: nifi-znode
+spec:
+ clusterRef:
+ name: zookeeper
diff --git a/stacks/data-warehouse-iceberg-trino-spark/trino.yaml b/stacks/data-warehouse-iceberg-trino-spark/trino.yaml
new file mode 100644
index 00000000..1df906f2
--- /dev/null
+++ b/stacks/data-warehouse-iceberg-trino-spark/trino.yaml
@@ -0,0 +1,165 @@
+---
+apiVersion: trino.stackable.tech/v1alpha1
+kind: TrinoCluster
+metadata:
+ name: trino
+spec:
+ version: 396-stackable0.1.0
+ catalogLabelSelector:
+ matchLabels:
+ trino: trino
+ authentication:
+ method:
+ multiUser:
+ userCredentialsSecret:
+ name: trino-users
+ opa:
+ configMapName: opa
+ package: trino
+ coordinators:
+ config:
+ queryMaxMemory: 10TB
+ resources:
+ cpu:
+ min: "1"
+ max: "4"
+ memory:
+ limit: '6Gi'
+ roleGroups:
+ default:
+ replicas: 1
+ workers:
+ config:
+ queryMaxMemoryPerNode: 5GB
+ resources:
+ cpu:
+ min: "2"
+ max: "4"
+ memory:
+ limit: '10Gi'
+ roleGroups:
+ default:
+ replicas: 4
+---
+apiVersion: v1
+kind: Secret
+metadata:
+ name: trino-users
+type: kubernetes.io/opaque
+stringData:
+ # admin:admin
+ admin: $2y$10$89xReovvDLacVzRGpjOyAOONnayOgDAyIS2nW9bs5DJT98q17Dy5i
+ # demo:demo
+ demo: $2y$10$mMRoIKfWtAuycEQnKiDCeOlCSYiWkvbs0WsMFLkaSnNO0ZnFKVRXm
+---
+apiVersion: trino.stackable.tech/v1alpha1
+kind: TrinoCatalog
+metadata:
+ name: staging
+ labels:
+ trino: trino
+spec:
+ connector:
+ hive:
+ metastore:
+ configMap: hive
+ s3:
+ reference: trino
+---
+apiVersion: trino.stackable.tech/v1alpha1
+kind: TrinoCatalog
+metadata:
+ name: warehouse
+ labels:
+ trino: trino
+spec:
+ connector:
+ iceberg:
+ metastore:
+ configMap: hive-iceberg
+ s3:
+ reference: trino
+---
+apiVersion: trino.stackable.tech/v1alpha1
+kind: TrinoCatalog
+metadata:
+ name: tpcds
+ labels:
+ trino: trino
+spec:
+ connector:
+ tpcds: {}
+---
+apiVersion: trino.stackable.tech/v1alpha1
+kind: TrinoCatalog
+metadata:
+ name: tpch
+ labels:
+ trino: trino
+spec:
+ connector:
+ tpch: {}
+---
+apiVersion: s3.stackable.tech/v1alpha1
+kind: S3Connection
+metadata:
+ name: trino
+spec:
+ host: minio
+ port: 9000
+ accessStyle: Path
+ credentials:
+ secretClass: trino-s3-credentials
+---
+apiVersion: secrets.stackable.tech/v1alpha1
+kind: SecretClass
+metadata:
+ name: trino-s3-credentials
+spec:
+ backend:
+ k8sSearch:
+ searchNamespace:
+ pod: {}
+---
+apiVersion: v1
+kind: Secret
+metadata:
+ name: trino-s3-credentials
+ labels:
+ secrets.stackable.tech/class: trino-s3-credentials
+stringData:
+ accessKey: trino
+ secretKey: trinotrino
+---
+apiVersion: opa.stackable.tech/v1alpha1
+kind: OpaCluster
+metadata:
+ name: opa
+spec:
+ version: 0.41.0-stackable0.1.0
+ servers:
+ roleGroups:
+ default:
+ selector:
+ matchLabels:
+ kubernetes.io/os: linux
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+ name: trino-opa-bundle
+ labels:
+ opa.stackable.tech/bundle: "trino"
+data:
+ trino.rego: |
+ package trino
+
+ default allow = false
+
+ allow {
+ input.context.identity.user == "admin"
+ }
+
+ allow {
+ input.context.identity.user == "demo"
+ }
diff --git a/stacks/data-warehouse-iceberg-trino-spark/zookeeper.yaml b/stacks/data-warehouse-iceberg-trino-spark/zookeeper.yaml
new file mode 100644
index 00000000..ff0c094c
--- /dev/null
+++ b/stacks/data-warehouse-iceberg-trino-spark/zookeeper.yaml
@@ -0,0 +1,11 @@
+---
+apiVersion: zookeeper.stackable.tech/v1alpha1
+kind: ZookeeperCluster
+metadata:
+ name: zookeeper
+spec:
+ version: 3.8.0-stackable0.7.1
+ servers:
+ roleGroups:
+ default:
+ replicas: 3
diff --git a/stacks/stacks-v1.yaml b/stacks/stacks-v1.yaml
index 542423f0..f552a76c 100644
--- a/stacks/stacks-v1.yaml
+++ b/stacks/stacks-v1.yaml
@@ -6,7 +6,7 @@ _templates:
repo:
name: minio
url: https://charts.min.io/
- version: 4.0.2
+ version: 4.0.15
options:
rootUser: root
rootPassword: rootroot
@@ -35,7 +35,7 @@ _templates:
repo:
name: minio
url: https://charts.min.io/
- version: 4.0.5
+ version: 4.0.15
options:
rootUser: root
rootPassword: rootroot
@@ -57,6 +57,46 @@ _templates:
policy: public
resources:
requests:
+ cpu: 500m
+ memory: 1Gi
+ service:
+ type: NodePort
+ nodePort: null
+ consoleService:
+ type: NodePort
+ nodePort: null
+ - helmChart: &template-minio-distributed
+ releaseName: minio
+ name: minio
+ repo:
+ name: minio
+ url: https://charts.min.io/
+ version: 4.0.15
+ options:
+ rootUser: admin
+ rootPassword: adminadmin
+ mode: distributed
+ replicas: 5
+ persistence:
+ size: 250Gi
+ users:
+ - accessKey: trino
+ secretKey: trinotrino
+ policy: readwrite
+ - accessKey: hive
+ secretKey: hivehive
+ policy: readwrite
+ - accessKey: demo
+ secretKey: demodemo
+ policy: readwrite
+ buckets:
+ - name: staging
+ policy: public
+ - name: warehouse
+ policy: public
+ resources:
+ requests:
+ cpu: 500m
memory: 2Gi
service:
type: NodePort
@@ -76,6 +116,18 @@ _templates:
postgresqlUsername: hive
postgresqlPassword: hive
postgresqlDatabase: hive
+ - helmChart: &template-postgresql-hive-iceberg
+ releaseName: postgresql-hive-iceberg
+ name: postgresql
+ repo:
+ name: bitnami
+ url: https://charts.bitnami.com/bitnami/
+ version: 10.16.2
+ options:
+ # Old version (10) of helm-charts has old way of setting credentials
+ postgresqlUsername: hive
+ postgresqlPassword: hive
+ postgresqlDatabase: hive
- helmChart: &template-postgresql-superset
releaseName: postgresql-superset
name: postgresql
@@ -123,6 +175,29 @@ stacks:
- helmChart: *template-postgresql-airflow
- helmChart: *template-redis-airflow
- plainYaml: https://raw.githubusercontent.com/stackabletech/stackablectl/main/stacks/airflow/airflow.yaml
+ data-warehouse-iceberg-trino-spark:
+ description: Data warehouse using Iceberg warehouse on S3, Trino as query engine, Spark for streaming ingest and Superset for data visualization
+ stackableRelease: 22.09-latest-trino-spark
+ labels:
+ - iceberg
+ - trino
+ - spark
+ - superset
+ - kafka
+ - nifi
+ - minio
+ - s3
+ manifests:
+ - helmChart: *template-minio-distributed
+ - helmChart: *template-postgresql-hive
+ - helmChart: *template-postgresql-hive-iceberg
+ - helmChart: *template-postgresql-superset
+ - plainYaml: stacks/data-warehouse-iceberg-trino-spark/hive-metastores.yaml
+ - plainYaml: stacks/data-warehouse-iceberg-trino-spark/trino.yaml
+ - plainYaml: stacks/data-warehouse-iceberg-trino-spark/zookeeper.yaml
+ - plainYaml: stacks/data-warehouse-iceberg-trino-spark/kafka.yaml
+ - plainYaml: stacks/data-warehouse-iceberg-trino-spark/nifi.yaml
+ - plainYaml: https://raw.githubusercontent.com/stackabletech/stackablectl/main/stacks/kafka-druid-superset-s3/superset.yaml # Reuse
hdfs-hbase:
description: HBase cluster using HDFS as underlying storage
stackableRelease: 22.09