This repository was archived by the owner on Feb 16, 2024. It is now read-only.

Commit 1c6dc29: Improve water-level demo (#126)
## Description

Run with `stackablectl --additional-demos-file demos/demos-v1.yaml --additional-stacks-file stacks/stacks-v1.yaml demo install nifi-kafka-druid-water-level-data`. Tested the demo with 2,500,000,000 records.

Hi all, here is a short summary of the observations from the water-level demo:

**NiFi** uses a content-repo PVC but keeps it at ~50% usage => should be fine forever.

Actions:
* Increase the content repo from 5 to 10 GB, better safe than sorry. I was able to crash it by using large queues and stalling processors.

**Kafka** uses a PVC (currently 15 GB) => should work fine for ~1 week.

Actions:
* Look into retention settings (low priority, as it should work for ~1 week) so that it works forever.

**Druid** uses S3 for deep storage (S3 has 15 GB). But currently it also caches *everything* locally at the historical, because we set `druid.segmentCache.locations=[{"path"\:"/stackable/var/druid/segment-cache","maxSize"\:"300g"}]` (hardcoded in https://github.com/stackabletech/druid-operator/blob/45525033f5f3f52e0997a9b4d79ebe9090e9e0a0/deploy/config-spec/properties.yaml#L725). This does *not* really affect the demo, as 100,000,000 records (let's call it ~1 week of data) take ~400 MB.

I think the main problem with the demo is that queries take > 5 minutes to complete and Superset shows timeouts. The historical pod suspiciously uses exactly one core of CPU, and the queries are really slow for a "big data" system IMHO. This could be because either Druid is only using a single core, or because we don't set any resources (yet!) and the node does not have more cores available. Going to research that.

Actions:
* Created stackabletech/druid-operator#306
* In the meantime, configure an override in the demo: `druid.segmentCache.locations=[{"path"\:"/stackable/var/druid/segment-cache","maxSize"\:"3g","freeSpacePercent":"5.0"}]`
* Research the slow query performance
* Have a look at the queries the Superset dashboard executes and optimize them
* Maybe we should bump the druid-operator version in the demo (e.g. create a release 22.09-druid which is basically 22.09 with a newer druid-operator version). That way we get stable resources.
* Enable Druid auto compaction to reduce the number of segments
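A back-of-envelope check of the numbers above (a rough sketch using only the figures from this description, not measurements):

```python
# Figures from the commit description: ~100,000,000 records (~1 week of data)
# correspond to ~400 MB of Druid segment data.
records_per_week = 100_000_000
segment_bytes_per_week = 400 * 1024**2  # ~400 MB

bytes_per_record = segment_bytes_per_week / records_per_week
print(f"~{bytes_per_record:.1f} bytes per record in Druid segments")

# With the 3g segment-cache override from the action items, roughly this many
# weeks of segments fit on a historical before eviction kicks in:
cache_bytes = 3 * 1024**3
weeks_in_cache = cache_bytes / segment_bytes_per_week
print(f"~{weeks_in_cache:.0f} weeks fit into a 3g segment cache")
```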
1 parent ade7cf8 commit 1c6dc29

File tree

6 files changed, +36 -11 lines


demos/demos-v1.yaml

Lines changed: 3 additions & 3 deletions

```diff
@@ -50,9 +50,9 @@ demos:
       - s3
       - water-levels
     manifests:
-      - plainYaml: https://raw.githubusercontent.com/stackabletech/stackablectl/main/demos/nifi-kafka-druid-water-level-data/create-nifi-ingestion-job.yaml # TODO
-      - plainYaml: https://raw.githubusercontent.com/stackabletech/stackablectl/main/demos/nifi-kafka-druid-water-level-data/create-druid-ingestion-job.yaml # TODO
-      - plainYaml: https://raw.githubusercontent.com/stackabletech/stackablectl/main/demos/nifi-kafka-druid-water-level-data/setup-superset.yaml # TODO
+      - plainYaml: https://raw.githubusercontent.com/stackabletech/stackablectl/main/demos/nifi-kafka-druid-water-level-data/create-nifi-ingestion-job.yaml
+      - plainYaml: https://raw.githubusercontent.com/stackabletech/stackablectl/main/demos/nifi-kafka-druid-water-level-data/create-druid-ingestion-job.yaml
+      - plainYaml: https://raw.githubusercontent.com/stackabletech/stackablectl/main/demos/nifi-kafka-druid-water-level-data/setup-superset.yaml
   trino-taxi-data:
     description: Demo loading 2.5 years of New York taxi data into S3 bucket, creating a Trino table and a Superset dashboard
     documentation: https://docs.stackable.tech/stackablectl/stable/demos/trino-taxi-data.html
```

demos/nifi-kafka-druid-water-level-data/create-druid-ingestion-job.yaml

Lines changed: 21 additions & 4 deletions

```diff
@@ -9,7 +9,7 @@ spec:
       containers:
         - name: create-druid-ingestion-job
           image: docker.stackable.tech/stackable/testing-tools:0.1.0-stackable0.1.0
-          command: ["bash", "-c", "curl -X POST -H 'Content-Type: application/json' -d @/tmp/ingestion-job-spec/stations-ingestion-job-spec.json http://druid-coordinator:8081/druid/indexer/v1/supervisor && curl -X POST -H 'Content-Type: application/json' -d @/tmp/ingestion-job-spec/measurements-ingestion-job-spec.json http://druid-coordinator:8081/druid/indexer/v1/supervisor"]
+          command: ["bash", "-c", "curl -X POST -H 'Content-Type: application/json' -d @/tmp/ingestion-job-spec/stations-ingestion-job-spec.json http://druid-coordinator:8081/druid/indexer/v1/supervisor && curl -X POST -H 'Content-Type: application/json' -d @/tmp/ingestion-job-spec/measurements-ingestion-job-spec.json http://druid-coordinator:8081/druid/indexer/v1/supervisor && curl -X POST -H 'Content-Type: application/json' -d @/tmp/ingestion-job-spec/measurements-compaction-job-spec.json http://druid-coordinator:8081/druid/coordinator/v1/config/compaction"]
           volumeMounts:
             - name: ingestion-job-spec
               mountPath: /tmp/ingestion-job-spec
@@ -65,9 +65,9 @@ data:
         },
         "dimensionsSpec": {
           "dimensions": [
+            "uuid",
             "water_longname",
             "water_shortname",
-            "uuid",
             {
               "type": "long",
               "name": "number"
@@ -124,11 +124,11 @@ data:
         "transformSpec": {},
         "dimensionsSpec": {
           "dimensions": [
+            "station_uuid",
             {
               "type": "long",
               "name": "value"
-            },
-            "station_uuid"
+            }
           ]
         },
         "granularitySpec": {
@@ -139,3 +139,20 @@ data:
         }
       }
     }
+  measurements-compaction-job-spec.json: |
+    {
+      "dataSource": "measurements",
+      "skipOffsetFromLatest": "PT1H",
+      "granularitySpec": {
+        "segmentGranularity": "DAY"
+      },
+      "tuningConfig": {
+        "partitionsSpec": {
+          "type": "range",
+          "partitionDimensions": [
+            "station_uuid"
+          ],
+          "targetRowsPerSegment": 5000000
+        }
+      }
+    }
```
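The added curl call submits an auto-compaction config to the Druid coordinator. A minimal Python sketch of the same request (the endpoint and payload mirror the curl call above; actually sending it assumes in-cluster network access to `druid-coordinator:8081`):

```python
import json
import urllib.request

# Auto-compaction config for the "measurements" datasource, matching
# measurements-compaction-job-spec.json from the diff above.
compaction_spec = {
    "dataSource": "measurements",
    "skipOffsetFromLatest": "PT1H",  # leave the most recent hour uncompacted
    "granularitySpec": {"segmentGranularity": "DAY"},
    "tuningConfig": {
        "partitionsSpec": {
            "type": "range",
            "partitionDimensions": ["station_uuid"],
            "targetRowsPerSegment": 5_000_000,
        }
    },
}

def submit(coordinator: str = "http://druid-coordinator:8081") -> int:
    """POST the spec to the coordinator's compaction-config endpoint."""
    req = urllib.request.Request(
        f"{coordinator}/druid/coordinator/v1/config/compaction",
        data=json.dumps(compaction_spec).encode(),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(req) as resp:  # only reachable inside the cluster
        return resp.status
```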
One binary file (4.25 KB) changed, not shown.

stacks/kafka-druid-superset-s3/druid.yaml

Lines changed: 6 additions & 1 deletion

```diff
@@ -35,7 +35,12 @@ spec:
   historicals:
     roleGroups:
       default:
-        replicas: 1
+        replicas: 2
+        configOverrides:
+          runtime.properties:
+            druid.processing.numThreads: "4" # As we are on 22.09 we can't set any resources requests or limits
+            # See https://github.com/stackabletech/druid-operator/issues/306
+            druid.segmentCache.locations: '[{"path":"/stackable/var/druid/segment-cache","maxSize":"8g","freeSpacePercent":"5.0"}]'
   middleManagers:
     roleGroups:
       default:
```
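A quick sanity check on the new historical settings (a sketch using only values from this diff): two replicas, each with an 8g segment cache, give 16 GiB of total cache, comfortably above the ~400 MB of segments that one week of data produces per the commit description.

```python
import json

# The segmentCache override from the diff above.
locations = json.loads(
    '[{"path":"/stackable/var/druid/segment-cache","maxSize":"8g","freeSpacePercent":"5.0"}]'
)

def to_bytes(size: str) -> int:
    """Parse Druid-style sizes like '8g' into bytes (helper for illustration)."""
    units = {"g": 1024**3, "m": 1024**2, "k": 1024}
    return int(size[:-1]) * units[size[-1].lower()]

replicas = 2
per_replica = sum(to_bytes(loc["maxSize"]) for loc in locations)
total = replicas * per_replica
print(f"total segment cache: {total / 1024**3:.0f} GiB")
```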

stacks/kafka-druid-superset-s3/kafka.yaml

Lines changed: 3 additions & 0 deletions

```diff
@@ -30,3 +30,6 @@ spec:
   roleGroups:
     default:
       replicas: 1
+      configOverrides:
+        server.properties:
+          log.retention.bytes: "4294967296" # 4Gi, as this is for every partition and the demos/users might add multiple topics
```
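As the comment in the diff notes, `log.retention.bytes` applies per partition, so broker disk usage scales with the partition count. A rough check against the 15 GiB Kafka PVC (the partition counts here are assumptions for illustration):

```python
retention_bytes = 4_294_967_296  # the configured value, exactly 4 GiB
pvc_bytes = 15 * 1024**3         # the Kafka PVC mentioned in the description

for partitions in (1, 2, 3, 4):
    # Worst case: every partition fills its retention limit.
    worst_case = partitions * retention_bytes
    fits = worst_case <= pvc_bytes
    print(f"{partitions} partition(s): up to {worst_case / 1024**3:.0f} GiB "
          f"({'fits' if fits else 'exceeds PVC'})")
```

So a fourth partition-sized topic would already overrun the PVC in the worst case, which is why the comment warns about users adding multiple topics.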

stacks/nifi-kafka-druid-superset-s3/nifi.yaml

Lines changed: 3 additions & 3 deletions

```diff
@@ -18,17 +18,17 @@ spec:
     config:
       resources:
         memory:
-          limit: '4Gi'
+          limit: '6Gi'
         cpu:
           min: "500m"
           max: "4"
         storage:
           contentRepo:
-            capacity: "5Gi"
+            capacity: "10Gi"
           databaseRepo:
             capacity: "5Gi"
           flowfileRepo:
-            capacity: "10Gi"
+            capacity: "5Gi"
           provenanceRepo:
             capacity: "5Gi"
           stateRepo:
```
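The NiFi change shifts PVC capacity rather than adding it: the content repo doubles while the flowfile repo shrinks by the same amount. A small check of the repo totals (stateRepo's capacity is cut off in the diff, so it is excluded here):

```python
# Capacities in Gi, taken from the diff above.
before = {"contentRepo": 5, "databaseRepo": 5, "flowfileRepo": 10, "provenanceRepo": 5}
after = {"contentRepo": 10, "databaseRepo": 5, "flowfileRepo": 5, "provenanceRepo": 5}

print("total before:", sum(before.values()), "Gi")
print("total after: ", sum(after.values()), "Gi")
```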
