diff --git a/examples/flagger/README.md b/examples/flagger/README.md new file mode 100644 index 0000000000..029b722b50 --- /dev/null +++ b/examples/flagger/README.md @@ -0,0 +1,32 @@ +# PoC: Integration with Flagger + +This example shows an integration of Keptn Metrics +into a Flagger Canary. +In this example, we are making use of the Prometheus endpoint provided +by Keptn (i.e. the metrics-operator), which serves the values of all `KeptnMetrics`. + +This way, we are able to use a Flagger `MetricTemplate` of type `prometheus`, +which retrieves the value from a Prometheus instance that has access to the `KeptnMetrics`. + +The example is based on the [Istio Canary Deployments tutorial](https://docs.flagger.app/tutorials/istio-progressive-delivery) +provided in the Flagger docs. + +The difference to the tutorial is that instead of using the `request-duration` metric +provided by Istio via Prometheus, we are referring to a `KeptnMetric` called `response-time`. +The Flagger metrics provider is in this case still `prometheus`. + +What could be an interesting idea would be to contribute to Flagger by adding +a `keptn` metrics provider to their [provider implementations](https://github.com/fluxcd/flagger/tree/main/pkg/metrics/providers). +This would also open up the possibility to use Keptn `Analyses` in Flagger, which might be a +valuable addition that benefits both projects. + +In terms of observability, we do get the OpenTelemetry traces generated by Keptn out of the box +if the relevant annotations are present in the deployment managed by Flagger. + +The addition of pre-/post-deployment tasks using Keptn is also possible, +but here Flagger provides a similar concept via [Webhooks](https://docs.flagger.app/usage/webhooks), +which are naturally more tailored to Flagger as they also allow performing intermediate checks after the +pods for the canary deployment have been started, e.g. to decide if more traffic should be sent to the canary.
+This is something Keptn does not provide, as we operate on pre-/post-deployment of the deployment, but +are not aware of the canary increments of Flagger. + \ No newline at end of file diff --git a/examples/flagger/assets/analysisdefinition.yaml b/examples/flagger/assets/analysisdefinition.yaml new file mode 100644 index 0000000000..58bedeb01d --- /dev/null +++ b/examples/flagger/assets/analysisdefinition.yaml @@ -0,0 +1,29 @@ +apiVersion: metrics.keptn.sh/v1beta1 +kind: AnalysisDefinition +metadata: + name: response-time-analysis + namespace: simple-go +spec: + objectives: + - analysisValueTemplateRef: + name: response-time-p95 + keyObjective: false + target: + failure: + greaterThan: + fixedValue: 30M + weight: 1 + totalScore: + passPercentage: 100 + warningPercentage: 75 +--- +apiVersion: metrics.keptn.sh/v1beta1 +kind: AnalysisValueTemplate +metadata: + name: response-time-p95 + namespace: simple-go +spec: + provider: + name: my-provider + query: histogram_quantile(0.95, sum by(le) (rate(http_server_request_latency_seconds_bucket{job='{{.workload}}'}[1m]))) + diff --git a/examples/flagger/assets/canary.yaml b/examples/flagger/assets/canary.yaml new file mode 100644 index 0000000000..c66de85264 --- /dev/null +++ b/examples/flagger/assets/canary.yaml @@ -0,0 +1,69 @@ +apiVersion: flagger.app/v1beta1 +kind: Canary +metadata: + name: podinfo + namespace: test +spec: + # deployment reference + targetRef: + apiVersion: apps/v1 + kind: Deployment + name: podinfo + # the maximum time in seconds for the canary deployment + # to make progress before it is rolled back (default 600s) + progressDeadlineSeconds: 60 + # HPA reference (optional) + autoscalerRef: + apiVersion: autoscaling/v2 + kind: HorizontalPodAutoscaler + name: podinfo + service: + # service port number + port: 9898 + # container port number or name (optional) + targetPort: 9898 + # Istio traffic policy (optional) + trafficPolicy: + tls: + # use ISTIO_MUTUAL when mTLS is enabled + mode: DISABLE + # Istio 
retry policy (optional) + retries: + attempts: 3 + perTryTimeout: 1s + retryOn: "gateway-error,connect-failure,refused-stream" + analysis: + # schedule interval (default 60s) + interval: 1m + # max number of failed metric checks before rollback + threshold: 5 + # max traffic percentage routed to canary + # percentage (0-100) + maxWeight: 50 + # canary increment step + # percentage (0-100) + stepWeight: 10 + metrics: + - name: response-time + templateRef: + name: response-time + namespace: keptn-system + # accepted range for the response-time + # KeptnMetric value + thresholdRange: + min: 1.0 + interval: 30s + # testing (optional) + webhooks: + - name: acceptance-test + type: pre-rollout + url: http://flagger-loadtester.test/ + timeout: 30s + metadata: + type: bash + cmd: "curl -sd 'test' http://podinfo-canary:9898/token | grep token" + - name: load-test + url: http://flagger-loadtester.test/ + timeout: 5s + metadata: + cmd: "hey -z 1m -q 10 -c 2 http://podinfo-canary.test:9898/" diff --git a/examples/flagger/assets/deployment.yaml b/examples/flagger/assets/deployment.yaml new file mode 100644 index 0000000000..180f244c9e --- /dev/null +++ b/examples/flagger/assets/deployment.yaml @@ -0,0 +1,87 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: test + annotations: + keptn.sh/lifecycle-toolkit: enabled + labels: + istio-injection: enabled +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: podinfo + namespace: test + labels: + app: podinfo +spec: + minReadySeconds: 5 + revisionHistoryLimit: 5 + progressDeadlineSeconds: 60 + strategy: + rollingUpdate: + maxUnavailable: 1 + type: RollingUpdate + selector: + matchLabels: + app: podinfo + template: + metadata: + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "9797" + labels: + app: podinfo + app.kubernetes.io/name: podinfo + spec: + containers: + - name: podinfod + image: ghcr.io/stefanprodan/podinfo:6.0.0 + imagePullPolicy: IfNotPresent + ports: + - name: http + containerPort: 9898 + protocol: TCP + - 
name: http-metrics + containerPort: 9797 + protocol: TCP + - name: grpc + containerPort: 9999 + protocol: TCP + command: + - ./podinfo + - --port=9898 + - --port-metrics=9797 + - --grpc-port=9999 + - --grpc-service-name=podinfo + - --level=info + - --random-delay=false + - --random-error=false + env: + - name: PODINFO_UI_COLOR + value: "#34577c" + livenessProbe: + exec: + command: + - podcli + - check + - http + - localhost:9898/healthz + initialDelaySeconds: 5 + timeoutSeconds: 5 + readinessProbe: + exec: + command: + - podcli + - check + - http + - localhost:9898/readyz + initialDelaySeconds: 5 + timeoutSeconds: 5 + resources: + limits: + cpu: 2000m + memory: 512Mi + requests: + cpu: 100m + memory: 64Mi diff --git a/examples/flagger/assets/hpa.yaml b/examples/flagger/assets/hpa.yaml new file mode 100644 index 0000000000..672992c8ef --- /dev/null +++ b/examples/flagger/assets/hpa.yaml @@ -0,0 +1,21 @@ +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: podinfo + namespace: test +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: podinfo + minReplicas: 2 + maxReplicas: 4 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + # scale up if usage is above + # 99% of the requested CPU (100m) + averageUtilization: 99 diff --git a/examples/flagger/assets/ingress-gateway.yaml b/examples/flagger/assets/ingress-gateway.yaml new file mode 100644 index 0000000000..9b46088dd1 --- /dev/null +++ b/examples/flagger/assets/ingress-gateway.yaml @@ -0,0 +1,15 @@ +apiVersion: networking.istio.io/v1alpha3 +kind: Gateway +metadata: + name: public-gateway + namespace: istio-system +spec: + selector: + istio: ingressgateway + servers: + - port: + number: 80 + name: http + protocol: HTTP + hosts: + - "*" \ No newline at end of file diff --git a/examples/flagger/assets/keptnmetric.yaml b/examples/flagger/assets/keptnmetric.yaml new file mode 100644 index 0000000000..f9f0898a62 --- /dev/null +++ 
b/examples/flagger/assets/keptnmetric.yaml @@ -0,0 +1,10 @@ +apiVersion: metrics.keptn.sh/v1beta1 +kind: KeptnMetric +metadata: + name: response-time + namespace: keptn-system +spec: + provider: + name: my-prometheus-provider + query: "histogram_quantile(0.8, sum by(le) (rate(http_server_request_latency_seconds_bucket{status_code='200', job='simple-go-backend'}[5m])))" + fetchIntervalSeconds: 10 diff --git a/examples/flagger/assets/metric-template.yaml b/examples/flagger/assets/metric-template.yaml new file mode 100644 index 0000000000..a9d38a6c9e --- /dev/null +++ b/examples/flagger/assets/metric-template.yaml @@ -0,0 +1,11 @@ +apiVersion: flagger.app/v1beta1 +kind: MetricTemplate +metadata: + name: response-time + namespace: keptn-system +spec: + provider: + type: keptn + address: "" + query: | + analysis/simple-go/my-analysis-definition/1m/workload=simple-go-service