From 96fd207a751c3e8a038fc649e5db8232046bde80 Mon Sep 17 00:00:00 2001 From: Florian Bacher Date: Fri, 29 Mar 2024 09:51:13 +0100 Subject: [PATCH 1/3] poc: integrate KeptnMetrics into Flagger analysis Signed-off-by: Florian Bacher --- examples/flagger/README.md | 32 +++++++ examples/flagger/assets/canary.yaml | 69 ++++++++++++++++ examples/flagger/assets/deployment.yaml | 87 ++++++++++++++++++++ examples/flagger/assets/hpa.yaml | 21 +++++ examples/flagger/assets/ingress-gateway.yaml | 15 ++++ examples/flagger/assets/keptnmetric.yaml | 10 +++ examples/flagger/assets/metric-template.yaml | 11 +++ 7 files changed, 245 insertions(+) create mode 100644 examples/flagger/README.md create mode 100644 examples/flagger/assets/canary.yaml create mode 100644 examples/flagger/assets/deployment.yaml create mode 100644 examples/flagger/assets/hpa.yaml create mode 100644 examples/flagger/assets/ingress-gateway.yaml create mode 100644 examples/flagger/assets/keptnmetric.yaml create mode 100644 examples/flagger/assets/metric-template.yaml diff --git a/examples/flagger/README.md b/examples/flagger/README.md new file mode 100644 index 0000000000..029b722b50 --- /dev/null +++ b/examples/flagger/README.md @@ -0,0 +1,32 @@ +# PoC: Integration with Flagger + +This example shows an integration of Keptn Metrics +into a Flagger Canary. +In this example, we are making use of the Prometheus endpoint provided +by Keptn (i.e. the metrics-operator), which serves the values of all `KeptnMetrics`. + +This way, we are able to use a Flagger `MetricTemplate` of type `prometheus`, +which retrieves the value from a Prometheus instance that has access to the `KeptnMetrics`. + +The example is based on the [Istio Canary Deployments tutorial](https://docs.flagger.app/tutorials/istio-progressive-delivery) +provided in the Flagger docs. 
+ +The difference from the tutorial is that instead of using the `request-duration` metric +provided by Istio via Prometheus, we are referring to a `KeptnMetric` called `response-time`. +The Flagger metrics provider is in this case still `prometheus`. + +An interesting idea would be to contribute to Flagger by adding +a `keptn` metrics provider to their [provider implementations](https://github.com/fluxcd/flagger/tree/main/pkg/metrics/providers). +This would also open up the possibility to use Keptn `Analyses` in Flagger, which might be a +valuable addition that benefits both projects. + +In terms of observability, we do get the OpenTelemetry traces generated by Keptn out of the box +if the relevant annotations are present in the deployment managed by Flagger. + +The addition of pre-/post-deployment tasks using Keptn is also possible, +but here Flagger provides a similar concept via [Webhooks](https://docs.flagger.app/usage/webhooks), +which are naturally more tailored to Flagger as they also allow intermediate checks after the +pods for the canary deployment have been started, e.g. to decide if more traffic should be sent to the canary. +This is something Keptn does not provide, as we operate on pre-/post-deployment of the deployment, but +are not aware of the canary increments of Flagger. 
+ \ No newline at end of file diff --git a/examples/flagger/assets/canary.yaml b/examples/flagger/assets/canary.yaml new file mode 100644 index 0000000000..51c3fe9f35 --- /dev/null +++ b/examples/flagger/assets/canary.yaml @@ -0,0 +1,69 @@ +apiVersion: flagger.app/v1beta1 +kind: Canary +metadata: + name: podinfo + namespace: test +spec: + # deployment reference + targetRef: + apiVersion: apps/v1 + kind: Deployment + name: podinfo + # the maximum time in seconds for the canary deployment + # to make progress before it is rolled back (default 600s) + progressDeadlineSeconds: 60 + # HPA reference (optional) + autoscalerRef: + apiVersion: autoscaling/v2 + kind: HorizontalPodAutoscaler + name: podinfo + service: + # service port number + port: 9898 + # container port number or name (optional) + targetPort: 9898 + # Istio traffic policy (optional) + trafficPolicy: + tls: + # use ISTIO_MUTUAL when mTLS is enabled + mode: DISABLE + # Istio retry policy (optional) + retries: + attempts: 3 + perTryTimeout: 1s + retryOn: "gateway-error,connect-failure,refused-stream" + analysis: + # schedule interval (default 60s) + interval: 1m + # max number of failed metric checks before rollback + threshold: 5 + # max traffic percentage routed to canary + # percentage (0-100) + maxWeight: 50 + # canary increment step + # percentage (0-100) + stepWeight: 10 + metrics: + - name: response-time + templateRef: + name: response-time + namespace: keptn-system + # maximum req duration P99 + # milliseconds + thresholdRange: + max: 0.1 + interval: 30s + # testing (optional) + webhooks: + - name: acceptance-test + type: pre-rollout + url: http://flagger-loadtester.test/ + timeout: 30s + metadata: + type: bash + cmd: "curl -sd 'test' http://podinfo-canary:9898/token | grep token" + - name: load-test + url: http://flagger-loadtester.test/ + timeout: 5s + metadata: + cmd: "hey -z 1m -q 10 -c 2 http://podinfo-canary.test:9898/" diff --git a/examples/flagger/assets/deployment.yaml 
b/examples/flagger/assets/deployment.yaml new file mode 100644 index 0000000000..180f244c9e --- /dev/null +++ b/examples/flagger/assets/deployment.yaml @@ -0,0 +1,87 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: test + annotations: + keptn.sh/lifecycle-toolkit: enabled + labels: + istio-injection: enabled +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: podinfo + namespace: test + labels: + app: podinfo +spec: + minReadySeconds: 5 + revisionHistoryLimit: 5 + progressDeadlineSeconds: 60 + strategy: + rollingUpdate: + maxUnavailable: 1 + type: RollingUpdate + selector: + matchLabels: + app: podinfo + template: + metadata: + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "9797" + labels: + app: podinfo + app.kubernetes.io/name: podinfo + spec: + containers: + - name: podinfod + image: ghcr.io/stefanprodan/podinfo:6.0.0 + imagePullPolicy: IfNotPresent + ports: + - name: http + containerPort: 9898 + protocol: TCP + - name: http-metrics + containerPort: 9797 + protocol: TCP + - name: grpc + containerPort: 9999 + protocol: TCP + command: + - ./podinfo + - --port=9898 + - --port-metrics=9797 + - --grpc-port=9999 + - --grpc-service-name=podinfo + - --level=info + - --random-delay=false + - --random-error=false + env: + - name: PODINFO_UI_COLOR + value: "#34577c" + livenessProbe: + exec: + command: + - podcli + - check + - http + - localhost:9898/healthz + initialDelaySeconds: 5 + timeoutSeconds: 5 + readinessProbe: + exec: + command: + - podcli + - check + - http + - localhost:9898/readyz + initialDelaySeconds: 5 + timeoutSeconds: 5 + resources: + limits: + cpu: 2000m + memory: 512Mi + requests: + cpu: 100m + memory: 64Mi diff --git a/examples/flagger/assets/hpa.yaml b/examples/flagger/assets/hpa.yaml new file mode 100644 index 0000000000..672992c8ef --- /dev/null +++ b/examples/flagger/assets/hpa.yaml @@ -0,0 +1,21 @@ +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: podinfo + namespace: test +spec: + 
scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: podinfo + minReplicas: 2 + maxReplicas: 4 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + # scale up if usage is above + # 99% of the requested CPU (100m) + averageUtilization: 99 diff --git a/examples/flagger/assets/ingress-gateway.yaml b/examples/flagger/assets/ingress-gateway.yaml new file mode 100644 index 0000000000..9b46088dd1 --- /dev/null +++ b/examples/flagger/assets/ingress-gateway.yaml @@ -0,0 +1,15 @@ +apiVersion: networking.istio.io/v1alpha3 +kind: Gateway +metadata: + name: public-gateway + namespace: istio-system +spec: + selector: + istio: ingressgateway + servers: + - port: + number: 80 + name: http + protocol: HTTP + hosts: + - "*" \ No newline at end of file diff --git a/examples/flagger/assets/keptnmetric.yaml b/examples/flagger/assets/keptnmetric.yaml new file mode 100644 index 0000000000..f9f0898a62 --- /dev/null +++ b/examples/flagger/assets/keptnmetric.yaml @@ -0,0 +1,10 @@ +apiVersion: metrics.keptn.sh/v1beta1 +kind: KeptnMetric +metadata: + name: response-time + namespace: keptn-system +spec: + provider: + name: my-prometheus-provider + query: "histogram_quantile(0.8, sum by(le) (rate(http_server_request_latency_seconds_bucket{status_code='200', job='simple-go-backend'}[5m])))" + fetchIntervalSeconds: 10 diff --git a/examples/flagger/assets/metric-template.yaml b/examples/flagger/assets/metric-template.yaml new file mode 100644 index 0000000000..90b55ded2f --- /dev/null +++ b/examples/flagger/assets/metric-template.yaml @@ -0,0 +1,11 @@ +apiVersion: flagger.app/v1beta1 +kind: MetricTemplate +metadata: + name: response-time + namespace: keptn-system +spec: + provider: + type: prometheus + address: http://prometheus-k8s.monitoring:9090 + query: | + response_time{namespace="keptn-system"} From 1020bbb71c1f4805b01f4dcf5abebf1d93664c79 Mon Sep 17 00:00:00 2001 From: Florian Bacher Date: Mon, 8 Apr 2024 07:05:32 +0200 Subject: [PATCH 2/3] add 
analysis example Signed-off-by: Florian Bacher --- .../flagger/assets/analysisdefinition.yaml | 29 +++++++++++++++++++ examples/flagger/assets/metric-template.yaml | 6 ++-- 2 files changed, 32 insertions(+), 3 deletions(-) create mode 100644 examples/flagger/assets/analysisdefinition.yaml diff --git a/examples/flagger/assets/analysisdefinition.yaml b/examples/flagger/assets/analysisdefinition.yaml new file mode 100644 index 0000000000..58bedeb01d --- /dev/null +++ b/examples/flagger/assets/analysisdefinition.yaml @@ -0,0 +1,29 @@ +apiVersion: metrics.keptn.sh/v1beta1 +kind: AnalysisDefinition +metadata: + name: response-time-analysis + namespace: simple-go +spec: + objectives: + - analysisValueTemplateRef: + name: response-time-p95 + keyObjective: false + target: + failure: + greaterThan: + fixedValue: 30M + weight: 1 + totalScore: + passPercentage: 100 + warningPercentage: 75 +--- +apiVersion: metrics.keptn.sh/v1beta1 +kind: AnalysisValueTemplate +metadata: + name: response-time-p95 + namespace: simple-go +spec: + provider: + name: my-provider + query: "histogram_quantile(0.95, sum by(le) (rate(http_server_request_latency_seconds_bucket{job='{{.workload}}'}[1m])))" + diff --git a/examples/flagger/assets/metric-template.yaml b/examples/flagger/assets/metric-template.yaml index 90b55ded2f..a9d38a6c9e 100644 --- a/examples/flagger/assets/metric-template.yaml +++ b/examples/flagger/assets/metric-template.yaml @@ -5,7 +5,7 @@ metadata: namespace: keptn-system spec: provider: - type: prometheus - address: http://prometheus-k8s.monitoring:9090 + type: keptn + address: "" query: | - response_time{namespace="keptn-system"} + analysis/simple-go/response-time-analysis/1m/workload=simple-go-service From 86755e30dc882f3dbd9249c5f7e7a7e166cf4985 Mon Sep 17 00:00:00 2001 From: Florian Bacher Date: Tue, 16 Apr 2024 09:41:04 +0200 Subject: [PATCH 3/3] adapt canary.yaml Signed-off-by: Florian Bacher --- examples/flagger/assets/canary.yaml | 2 +- 1 file changed, 1 insertion(+), 1
deletion(-) diff --git a/examples/flagger/assets/canary.yaml b/examples/flagger/assets/canary.yaml index 51c3fe9f35..c66de85264 100644 --- a/examples/flagger/assets/canary.yaml +++ b/examples/flagger/assets/canary.yaml @@ -51,7 +51,7 @@ spec: # maximum req duration P99 # milliseconds thresholdRange: - max: 0.1 + min: 1.0 interval: 30s # testing (optional) webhooks: