From 96fd207a751c3e8a038fc649e5db8232046bde80 Mon Sep 17 00:00:00 2001
From: Florian Bacher <florian.bacher@dynatrace.com>
Date: Fri, 29 Mar 2024 09:51:13 +0100
Subject: [PATCH 1/3] poc: integrate KeptnMetrics into Flagger analysis

Signed-off-by: Florian Bacher <florian.bacher@dynatrace.com>
---
 examples/flagger/README.md                   | 32 +++++++
 examples/flagger/assets/canary.yaml          | 69 ++++++++++++++++
 examples/flagger/assets/deployment.yaml      | 87 ++++++++++++++++++++
 examples/flagger/assets/hpa.yaml             | 21 +++++
 examples/flagger/assets/ingress-gateway.yaml | 15 ++++
 examples/flagger/assets/keptnmetric.yaml     | 10 +++
 examples/flagger/assets/metric-template.yaml | 11 +++
 7 files changed, 245 insertions(+)
 create mode 100644 examples/flagger/README.md
 create mode 100644 examples/flagger/assets/canary.yaml
 create mode 100644 examples/flagger/assets/deployment.yaml
 create mode 100644 examples/flagger/assets/hpa.yaml
 create mode 100644 examples/flagger/assets/ingress-gateway.yaml
 create mode 100644 examples/flagger/assets/keptnmetric.yaml
 create mode 100644 examples/flagger/assets/metric-template.yaml

diff --git a/examples/flagger/README.md b/examples/flagger/README.md
new file mode 100644
index 0000000000..029b722b50
--- /dev/null
+++ b/examples/flagger/README.md
@@ -0,0 +1,32 @@
+# PoC: Integration with Flagger
+
+This example shows an integration of Keptn Metrics
+into a Flagger Canary.
+In this example, we are making use of the Prometheus endpoint provided
+by Keptn (i.e. the metrics-operator), which serves the values of all `KeptnMetrics`.
+
+This way, we are able to use a Flagger `MetricTemplate` of type `prometheus`,
+which retrieves the value from a Prometheus instance that has access to the `KeptnMetrics`.
+
+The example is based on the [Istio Canary Deployments tutorial](https://docs.flagger.app/tutorials/istio-progressive-delivery)
+provided in the Flagger docs.
+
+The difference from the tutorial is that instead of using the `request-duration` metric
+provided by Istio via Prometheus, we are referring to a `KeptnMetric` called `response-time`.
+The Flagger metrics provider is in this case still `prometheus`.
+
+An interesting next step would be to contribute to Flagger by adding
+a `keptn` metrics provider to their [provider implementations](https://github.com/fluxcd/flagger/tree/main/pkg/metrics/providers).
+This would also open up the possibility to use Keptn `Analyses` in Flagger, which might be a
+valuable addition that benefits both projects.
+
+In terms of observability, we do get the OpenTelemetry traces generated by Keptn out of the box
+if the relevant annotations are present in the deployment managed by Flagger.
+
+The addition of pre-/post-deployment tasks using Keptn is also possible,
+but here Flagger provides a similar concept via [Webhooks](https://docs.flagger.app/usage/webhooks),
+which are naturally more tailored to Flagger, as they also allow intermediate checks to run after the
+pods for the canary deployment have been started, e.g. to decide if more traffic should be sent to the canary.
+This is something Keptn does not provide, as we operate on pre-/post-deployment of the deployment, but
+are not aware of the canary increments of Flagger.
+ 
\ No newline at end of file
diff --git a/examples/flagger/assets/canary.yaml b/examples/flagger/assets/canary.yaml
new file mode 100644
index 0000000000..51c3fe9f35
--- /dev/null
+++ b/examples/flagger/assets/canary.yaml
@@ -0,0 +1,69 @@
+apiVersion: flagger.app/v1beta1
+kind: Canary
+metadata:
+  name: podinfo
+  namespace: test
+spec:
+  # deployment reference
+  targetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: podinfo
+  # the maximum time in seconds for the canary deployment
+  # to make progress before it is rolled back (default 600s)
+  progressDeadlineSeconds: 60
+  # HPA reference (optional)
+  autoscalerRef:
+    apiVersion: autoscaling/v2
+    kind: HorizontalPodAutoscaler
+    name: podinfo
+  service:
+    # service port number
+    port: 9898
+    # container port number or name (optional)
+    targetPort: 9898
+    # Istio traffic policy (optional)
+    trafficPolicy:
+      tls:
+        # use ISTIO_MUTUAL when mTLS is enabled
+        mode: DISABLE
+    # Istio retry policy (optional)
+    retries:
+      attempts: 3
+      perTryTimeout: 1s
+      retryOn: "gateway-error,connect-failure,refused-stream"
+  analysis:
+    # schedule interval (default 60s)
+    interval: 1m
+    # max number of failed metric checks before rollback
+    threshold: 5
+    # max traffic percentage routed to canary
+    # percentage (0-100)
+    maxWeight: 50
+    # canary increment step
+    # percentage (0-100)
+    stepWeight: 10
+    metrics:
+      - name: response-time
+        templateRef:  # points at the MetricTemplate in metric-template.yaml
+          name: response-time
+          namespace: keptn-system
+        # maximum req duration P99
+        # milliseconds
+        thresholdRange:
+          max: 0.1
+        interval: 30s
+    # testing (optional)
+    webhooks:
+      - name: acceptance-test
+        type: pre-rollout
+        url: http://flagger-loadtester.test/
+        timeout: 30s
+        metadata:
+          type: bash
+          cmd: "curl -sd 'test' http://podinfo-canary:9898/token | grep token"
+      - name: load-test
+        url: http://flagger-loadtester.test/
+        timeout: 5s
+        metadata:
+          cmd: "hey -z 1m -q 10 -c 2 http://podinfo-canary.test:9898/"
diff --git a/examples/flagger/assets/deployment.yaml b/examples/flagger/assets/deployment.yaml
new file mode 100644
index 0000000000..180f244c9e
--- /dev/null
+++ b/examples/flagger/assets/deployment.yaml
@@ -0,0 +1,87 @@
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: test  # namespace also used by canary.yaml and hpa.yaml
+  annotations:
+    keptn.sh/lifecycle-toolkit: enabled  # presumably opts this namespace into Keptn - confirm
+  labels:
+    istio-injection: enabled  # presumably turns on Istio sidecar injection - confirm
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: podinfo  # referenced by targetRef (canary.yaml) and scaleTargetRef (hpa.yaml)
+  namespace: test
+  labels:
+    app: podinfo
+spec:
+  minReadySeconds: 5
+  revisionHistoryLimit: 5
+  progressDeadlineSeconds: 60
+  strategy:
+    rollingUpdate:
+      maxUnavailable: 1
+    type: RollingUpdate
+  selector:
+    matchLabels:
+      app: podinfo
+  template:
+    metadata:
+      annotations:
+        prometheus.io/scrape: "true"
+        prometheus.io/port: "9797"  # same port as --port-metrics / http-metrics below
+      labels:
+        app: podinfo
+        app.kubernetes.io/name: podinfo
+    spec:
+      containers:
+        - name: podinfod
+          image: ghcr.io/stefanprodan/podinfo:6.0.0
+          imagePullPolicy: IfNotPresent
+          ports:
+            - name: http
+              containerPort: 9898
+              protocol: TCP
+            - name: http-metrics
+              containerPort: 9797
+              protocol: TCP
+            - name: grpc
+              containerPort: 9999
+              protocol: TCP
+          command:
+            - ./podinfo
+            - --port=9898  # matches containerPort "http"
+            - --port-metrics=9797  # matches containerPort "http-metrics"
+            - --grpc-port=9999  # matches containerPort "grpc"
+            - --grpc-service-name=podinfo
+            - --level=info
+            - --random-delay=false
+            - --random-error=false
+          env:
+            - name: PODINFO_UI_COLOR
+              value: "#34577c"
+          livenessProbe:
+            exec:
+              command:
+                - podcli
+                - check
+                - http
+                - localhost:9898/healthz  # probes the port set via --port
+            initialDelaySeconds: 5
+            timeoutSeconds: 5
+          readinessProbe:
+            exec:
+              command:
+                - podcli
+                - check
+                - http
+                - localhost:9898/readyz  # probes the port set via --port
+            initialDelaySeconds: 5
+            timeoutSeconds: 5
+          resources:
+            limits:
+              cpu: 2000m
+              memory: 512Mi
+            requests:
+              cpu: 100m  # the "requested CPU (100m)" that hpa.yaml's utilization target refers to
+              memory: 64Mi
diff --git a/examples/flagger/assets/hpa.yaml b/examples/flagger/assets/hpa.yaml
new file mode 100644
index 0000000000..672992c8ef
--- /dev/null
+++ b/examples/flagger/assets/hpa.yaml
@@ -0,0 +1,21 @@
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: podinfo  # referenced by autoscalerRef in canary.yaml
+  namespace: test
+spec:
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: podinfo  # the Deployment from deployment.yaml
+  minReplicas: 2
+  maxReplicas: 4
+  metrics:
+    - type: Resource
+      resource:
+        name: cpu
+        target:
+          type: Utilization
+          # scale up if usage is above
+          # 99% of the requested CPU (100m)
+          averageUtilization: 99
diff --git a/examples/flagger/assets/ingress-gateway.yaml b/examples/flagger/assets/ingress-gateway.yaml
new file mode 100644
index 0000000000..9b46088dd1
--- /dev/null
+++ b/examples/flagger/assets/ingress-gateway.yaml
@@ -0,0 +1,15 @@
+apiVersion: networking.istio.io/v1alpha3
+kind: Gateway
+metadata:
+  name: public-gateway  # NOTE(review): canary.yaml's service section lists no gateways - confirm this is used
+  namespace: istio-system
+spec:
+  selector:
+    istio: ingressgateway  # selects workloads carrying this label
+  servers:
+    - port:
+        number: 80
+        name: http
+        protocol: HTTP
+      hosts:
+        - "*"  # must stay quoted: a bare * starts a YAML alias
diff --git a/examples/flagger/assets/keptnmetric.yaml b/examples/flagger/assets/keptnmetric.yaml
new file mode 100644
index 0000000000..f9f0898a62
--- /dev/null
+++ b/examples/flagger/assets/keptnmetric.yaml
@@ -0,0 +1,10 @@
+apiVersion: metrics.keptn.sh/v1beta1
+kind: KeptnMetric
+metadata:
+  name: response-time  # the KeptnMetric the README refers to as `response-time`
+  namespace: keptn-system
+spec:
+  provider:
+    name: my-prometheus-provider  # NOTE(review): no provider manifest with this name is part of these assets - must already exist
+  query: "histogram_quantile(0.8, sum by(le) (rate(http_server_request_latency_seconds_bucket{status_code='200', job='simple-go-backend'}[5m])))"  # NOTE(review): scoped to job 'simple-go-backend', not the podinfo workload - confirm
+  fetchIntervalSeconds: 10  # presumably the query is re-evaluated every 10 seconds - confirm
diff --git a/examples/flagger/assets/metric-template.yaml b/examples/flagger/assets/metric-template.yaml
new file mode 100644
index 0000000000..90b55ded2f
--- /dev/null
+++ b/examples/flagger/assets/metric-template.yaml
@@ -0,0 +1,11 @@
+apiVersion: flagger.app/v1beta1
+kind: MetricTemplate
+metadata:
+  name: response-time  # referenced by analysis.metrics[].templateRef in canary.yaml
+  namespace: keptn-system
+spec:
+  provider:
+    type: prometheus
+    address: http://prometheus-k8s.monitoring:9090
+  query: |
+    response_time{namespace="keptn-system"}

From 1020bbb71c1f4805b01f4dcf5abebf1d93664c79 Mon Sep 17 00:00:00 2001
From: Florian Bacher <florian.bacher@dynatrace.com>
Date: Mon, 8 Apr 2024 07:05:32 +0200
Subject: [PATCH 2/3] add analysis example

Signed-off-by: Florian Bacher <florian.bacher@dynatrace.com>
---
 .../flagger/assets/analysisdefinition.yaml    | 29 +++++++++++++++++++
 examples/flagger/assets/metric-template.yaml  |  6 ++--
 2 files changed, 32 insertions(+), 3 deletions(-)
 create mode 100644 examples/flagger/assets/analysisdefinition.yaml

diff --git a/examples/flagger/assets/analysisdefinition.yaml b/examples/flagger/assets/analysisdefinition.yaml
new file mode 100644
index 0000000000..58bedeb01d
--- /dev/null
+++ b/examples/flagger/assets/analysisdefinition.yaml
@@ -0,0 +1,29 @@
+apiVersion: metrics.keptn.sh/v1beta1
+kind: AnalysisDefinition
+metadata:
+  name: response-time-analysis
+  namespace: simple-go
+spec:
+  objectives:
+    - analysisValueTemplateRef:
+        name: response-time-p95  # the AnalysisValueTemplate defined later in this file
+      keyObjective: false
+      target:
+        failure:
+          greaterThan:
+            fixedValue: 30M  # NOTE(review): if parsed as a quantity, `M` is mega and `m` is milli - confirm the intended unit
+      weight: 1
+  totalScore:
+    passPercentage: 100
+    warningPercentage: 75
+---
+apiVersion: metrics.keptn.sh/v1beta1
+kind: AnalysisValueTemplate
+metadata:
+  name: response-time-p95  # referenced by analysisValueTemplateRef in the AnalysisDefinition
+  namespace: simple-go
+spec:
+  provider:
+    name: my-provider  # NOTE(review): no provider manifest with this name is part of these assets - must already exist
+  query: "histogram_quantile(0.95, sum by(le) (rate(http_server_request_latency_seconds_bucket{job='{{.workload}}'}[1m])))"  # {{.workload}} is presumably filled from the analysis arguments - confirm
+
diff --git a/examples/flagger/assets/metric-template.yaml b/examples/flagger/assets/metric-template.yaml
index 90b55ded2f..a9d38a6c9e 100644
--- a/examples/flagger/assets/metric-template.yaml
+++ b/examples/flagger/assets/metric-template.yaml
@@ -5,7 +5,7 @@ metadata:
   namespace: keptn-system
 spec:
   provider:
-    type: prometheus
-    address: http://prometheus-k8s.monitoring:9090
+    type: keptn  # NOTE(review): needs a Flagger build that ships a keptn metrics provider - confirm
+    address: ""  # the query below must name the AnalysisDefinition from analysisdefinition.yaml (simple-go/response-time-analysis)
   query: |
-    response_time{namespace="keptn-system"}
+    analysis/simple-go/response-time-analysis/1m/workload=simple-go-service

From 86755e30dc882f3dbd9249c5f7e7a7e166cf4985 Mon Sep 17 00:00:00 2001
From: Florian Bacher <florian.bacher@dynatrace.com>
Date: Tue, 16 Apr 2024 09:41:04 +0200
Subject: [PATCH 3/3] adapt canary.yaml

Signed-off-by: Florian Bacher <florian.bacher@dynatrace.com>
---
 examples/flagger/assets/canary.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/flagger/assets/canary.yaml b/examples/flagger/assets/canary.yaml
index 51c3fe9f35..c66de85264 100644
--- a/examples/flagger/assets/canary.yaml
+++ b/examples/flagger/assets/canary.yaml
@@ -51,7 +51,7 @@ spec:
         # maximum req duration P99
         # milliseconds
         thresholdRange:
-          max: 0.1
+          min: 1.0  # NOTE(review): presumably 1.0 means the Keptn analysis passed; the "maximum req duration" comment above looks stale - confirm
         interval: 30s
     # testing (optional)
     webhooks: