keptn · bacherfl · Mar 29, 2024 · Apr 8, 2024 · Apr 16, 2024 · StackScribe
@@ -0,0 +1,32 @@
+# PoC: Integration with Flagger
+
+This example shows an integration of Keptn Metrics
+into a Flagger Canary.
+In this example, we are making use of the Prometheus endpoint provided
+by Keptn (i.e. the metrics-operator), which serves the values of all `KeptnMetrics`.
+
+This way, we are able to use a Flagger `MetricTemplate` of type `prometheus`,
-This way, we are able to use a Flagger `MetricTemplate` of type `prometheus`,
+This enables us to use a Flagger `MetricTemplate` of type `prometheus`,
-This way, we are able to use a Flagger `MetricTemplate` of type `prometheus`,
+This enables us to use a Flagger `MetricTemplate` of type `prometheus`,
+which retrieves the value from a Prometheus instance that has access to the `KeptnMetrics`.
-which retrieves the value from a Prometheus instance that has access to the `KeptnMetrics`.
+which retrieves the value from a Prometheus instance that has access to the `KeptnMetrics` resource.
-which retrieves the value from a Prometheus instance that has access to the `KeptnMetrics`.
+which retrieves the value from a Prometheus instance that has access to the `KeptnMetrics` resource.
+
+The example is based on the [Istio Canary Deployments tutorial](https://docs.flagger.app/tutorials/istio-progressive-delivery)
+provided in the Flagger docs.
+
+The difference to the tutorial is that instead of using the `request-duration` duration
-The difference to the tutorial is that instead of using the `request-duration` duration
+The difference from the tutorial is that, instead of using the `request-duration` duration
-The difference to the tutorial is that instead of using the `request-duration` duration
+The difference from the tutorial is that, instead of using the `request-duration` metric
+provided by Istio via Prometheus, we are referring to a `KeptnMetric` called `response-time`.
+The Flagger metrics provider is in this case still `prometheus`.
+
+An interesting idea would be to contribute to Flagger by adding
+a `keptn` metrics provider to their [provider implementations](https://github.com/fluxcd/flagger/tree/main/pkg/metrics/providers).
+This would also open up the possibility of using Keptn `Analyses` in Flagger, which might be a
+valuable addition that benefits both projects.
+
+In terms of observability, we do get the OpenTelemetry traces generated by Keptn out of the box
+if the relevant annotations are present in the deployment managed by Flagger.
+
+The addition of pre-/post-deployment tasks using Keptn is also possible,
+but here Flagger provides a similar concept via [Webhooks](https://docs.flagger.app/usage/webhooks),
+which are naturally more tailored to Flagger, as they also allow intermediate checks to run after the
+pods for the canary deployment have been started, e.g. to decide if more traffic should be sent to the canary.
+This is something Keptn does not provide, as we operate on the pre-/post-deployment phases of the deployment, but
+are not aware of the canary increments of Flagger.
+
@@ -0,0 +1,29 @@
+apiVersion: metrics.keptn.sh/v1beta1
+kind: AnalysisDefinition
+metadata:
+  name: response-time-analysis
+  namespace: simple-go
+spec:
+  objectives:
+    - analysisValueTemplateRef:
+        name: response-time-p95
+      keyObjective: false
+      target:
+        failure:
+          greaterThan:
+            # NOTE(review): if this is parsed as a Kubernetes quantity, `30M` means 30 mega,
+            # while the referenced query reads a `*_latency_seconds_bucket` histogram —
+            # confirm that `30m` (milli) was not intended
+            fixedValue: 30M
+      weight: 1
+  totalScore:
+    passPercentage: 100
+    warningPercentage: 75
+---
+# AnalysisValueTemplate that computes the 95th-percentile request latency for the
+# job passed in through the `workload` template variable.
+apiVersion: metrics.keptn.sh/v1beta1
+kind: AnalysisValueTemplate
+metadata:
+  name: response-time-p95
+  namespace: simple-go
+spec:
+  provider:
+    name: my-provider
+  # the range selector has to be closed with `]` (`[1m[` is not valid PromQL);
+  # the value is quoted because it contains a `{{ ... }}` template expression
+  query: "histogram_quantile(0.95, sum by(le) (rate(http_server_request_latency_seconds_bucket{job='{{.workload}}'}[1m])))"
+
@@ -0,0 +1,69 @@
+apiVersion: flagger.app/v1beta1
+kind: Canary
+metadata:
+  name: podinfo
+  namespace: test
+spec:
+  # deployment reference
+  targetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: podinfo
+  # the maximum time in seconds for the canary deployment
+  # to make progress before it is rolled back (default 600s)
+  progressDeadlineSeconds: 60
+  # HPA reference (optional)
+  autoscalerRef:
+    apiVersion: autoscaling/v2
+    kind: HorizontalPodAutoscaler
+    name: podinfo
+  service:
+    # service port number
+    port: 9898
+    # container port number or name (optional)
+    targetPort: 9898
+    # Istio traffic policy (optional)
+    trafficPolicy:
+      tls:
+        # use ISTIO_MUTUAL when mTLS is enabled
+        mode: DISABLE
+    # Istio retry policy (optional)
+    retries:
+      attempts: 3
+      perTryTimeout: 1s
+      retryOn: "gateway-error,connect-failure,refused-stream"
+  analysis:
+    # schedule interval (default 60s)
+    interval: 1m
+    # max number of failed metric checks before rollback
+    threshold: 5
+    # max traffic percentage routed to canary
+    # percentage (0-100)
+    maxWeight: 50
+    # canary increment step
+    # percentage (0-100)
+    stepWeight: 10
+    metrics:
+      - name: response-time
+        templateRef:
+          name: response-time
+          namespace: keptn-system
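+        # NOTE(review): the referenced MetricTemplate queries a Keptn analysis, so the
+        # tutorial's "P99 in milliseconds" description looks stale — confirm what `min: 1.0` asserts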
+        thresholdRange:
+          min: 1.0
+        interval: 30s
+    # testing (optional)
+    webhooks:
+      - name: acceptance-test
+        type: pre-rollout
+        url: http://flagger-loadtester.test/
+        timeout: 30s
+        metadata:
+          type: bash
+          cmd: "curl -sd 'test' http://podinfo-canary:9898/token | grep token"
+      - name: load-test
+        url: http://flagger-loadtester.test/
+        timeout: 5s
+        metadata:
+          cmd: "hey -z 1m -q 10 -c 2 http://podinfo-canary.test:9898/"
@@ -0,0 +1,87 @@
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: test
+  annotations:
+    keptn.sh/lifecycle-toolkit: enabled
+  labels:
+    istio-injection: enabled
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: podinfo
+  namespace: test
+  labels:
+    app: podinfo
+spec:
+  minReadySeconds: 5
+  revisionHistoryLimit: 5
+  progressDeadlineSeconds: 60
+  strategy:
+    rollingUpdate:
+      maxUnavailable: 1
+    type: RollingUpdate
+  selector:
+    matchLabels:
+      app: podinfo
+  template:
+    metadata:
+      annotations:
+        prometheus.io/scrape: "true"
+        prometheus.io/port: "9797"
+      labels:
+        app: podinfo
+        app.kubernetes.io/name: podinfo
+    spec:
+      containers:
+        - name: podinfod
+          image: ghcr.io/stefanprodan/podinfo:6.0.0
+          imagePullPolicy: IfNotPresent
+          ports:
+            - name: http
+              containerPort: 9898
+              protocol: TCP
+            - name: http-metrics
+              containerPort: 9797
+              protocol: TCP
+            - name: grpc
+              containerPort: 9999
+              protocol: TCP
+          command:
+            - ./podinfo
+            - --port=9898
+            - --port-metrics=9797
+            - --grpc-port=9999
+            - --grpc-service-name=podinfo
+            - --level=info
+            - --random-delay=false
+            - --random-error=false
+          env:
+            - name: PODINFO_UI_COLOR
+              value: "#34577c"
+          livenessProbe:
+            exec:
+              command:
+                - podcli
+                - check
+                - http
+                - localhost:9898/healthz
+            initialDelaySeconds: 5
+            timeoutSeconds: 5
+          readinessProbe:
+            exec:
+              command:
+                - podcli
+                - check
+                - http
+                - localhost:9898/readyz
+            initialDelaySeconds: 5
+            timeoutSeconds: 5
+          resources:
+            limits:
+              cpu: 2000m
+              memory: 512Mi
+            requests:
+              cpu: 100m
+              memory: 64Mi
@@ -0,0 +1,21 @@
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: podinfo
+  namespace: test
+spec:
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: podinfo
+  minReplicas: 2
+  maxReplicas: 4
+  metrics:
+    - type: Resource
+      resource:
+        name: cpu
+        target:
+          type: Utilization
+          # scale up if usage is above
+          # 99% of the requested CPU (100m)
+          averageUtilization: 99
@@ -0,0 +1,15 @@
+apiVersion: networking.istio.io/v1alpha3
+kind: Gateway
+metadata:
+  name: public-gateway
+  namespace: istio-system
+spec:
+  selector:
+    istio: ingressgateway
+  servers:
+    - port:
+        number: 80
+        name: http
+        protocol: HTTP
+      hosts:
+        - "*"
@@ -0,0 +1,10 @@
+apiVersion: metrics.keptn.sh/v1beta1
+kind: KeptnMetric
+metadata:
+  name: response-time
+  namespace: keptn-system
+spec:
+  provider:
+    name: my-prometheus-provider
+  query: "histogram_quantile(0.8, sum by(le) (rate(http_server_request_latency_seconds_bucket{status_code='200', job='simple-go-backend'}[5m])))"
+  fetchIntervalSeconds: 10
@@ -0,0 +1,11 @@
+apiVersion: flagger.app/v1beta1
+kind: MetricTemplate
+metadata:
+  name: response-time
+  namespace: keptn-system
+spec:
+  provider:
+    # NOTE(review): the README in this change says the Flagger provider is still
+    # `prometheus`, but `keptn` is configured here — confirm which one is intended
+    type: keptn
+    address: ""
+  # NOTE(review): the query names `my-analysis-definition`, while the AnalysisDefinition
+  # added in this change is called `response-time-analysis` — confirm the reference
+  query: |
+    analysis/simple-go/my-analysis-definition/1m/workload=simple-go-service