diff --git a/dashboards/apiserver.libsonnet b/dashboards/apiserver.libsonnet index 3e63842d9..a91215d94 100644 --- a/dashboards/apiserver.libsonnet +++ b/dashboards/apiserver.libsonnet @@ -82,7 +82,7 @@ local singlestat = grafana.singlestat; format='s', description='How many seconds is the 99th percentile for reading (LIST|GET) a given resource?', ) - .addTarget(prometheus.target('cluster_quantile:apiserver_request_slo_duration_seconds:histogram_quantile{verb="read", %(clusterLabel)s="$cluster"}' % $._config, legendFormat='{{ resource }}')); + .addTarget(prometheus.target('cluster_quantile:apiserver_request_sli_duration_seconds:histogram_quantile{verb="read", %(clusterLabel)s="$cluster"}' % $._config, legendFormat='{{ resource }}')); local writeAvailability = singlestat.new( @@ -130,7 +130,7 @@ local singlestat = grafana.singlestat; format='s', description='How many seconds is the 99th percentile for writing (POST|PUT|PATCH|DELETE) a given resource?', ) - .addTarget(prometheus.target('cluster_quantile:apiserver_request_slo_duration_seconds:histogram_quantile{verb="write", %(clusterLabel)s="$cluster"}' % $._config, legendFormat='{{ resource }}')); + .addTarget(prometheus.target('cluster_quantile:apiserver_request_sli_duration_seconds:histogram_quantile{verb="write", %(clusterLabel)s="$cluster"}' % $._config, legendFormat='{{ resource }}')); local workQueueAddRate = graphPanel.new( diff --git a/rules/kube_apiserver.libsonnet b/rules/kube_apiserver.libsonnet index be20c25c3..fa49d1ad4 100644 --- a/rules/kube_apiserver.libsonnet +++ b/rules/kube_apiserver.libsonnet @@ -5,7 +5,7 @@ kubeApiserverReadSelector: 'verb=~"LIST|GET"', kubeApiserverWriteSelector: 'verb=~"POST|PUT|PATCH|DELETE"', kubeApiserverNonStreamingSelector: 'subresource!~"proxy|attach|log|exec|portforward"', - // These are buckets that exist on the apiserver_request_slo_duration_seconds_bucket histogram. + // These are buckets that exist on the apiserver_request_sli_duration_seconds_bucket histogram. // They are what the Kubernetes SIG Scalability is using to measure availability of Kubernetes clusters. // If you want to change these, make sure the "le" buckets exist on the histogram! kubeApiserverReadResourceLatency: '1', @@ -31,18 +31,18 @@ ( ( # too slow - sum by (%(clusterLabel)s) (rate(apiserver_request_slo_duration_seconds_count{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,%(kubeApiserverNonStreamingSelector)s}[%(window)s])) + sum by (%(clusterLabel)s) (rate(apiserver_request_sli_duration_seconds_count{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,%(kubeApiserverNonStreamingSelector)s}[%(window)s])) - ( ( - sum by (%(clusterLabel)s) (rate(apiserver_request_slo_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,%(kubeApiserverNonStreamingSelector)s,scope=~"resource|",le="%(kubeApiserverReadResourceLatency)s"}[%(window)s])) + sum by (%(clusterLabel)s) (rate(apiserver_request_sli_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,%(kubeApiserverNonStreamingSelector)s,scope=~"resource|",le="%(kubeApiserverReadResourceLatency)s"}[%(window)s])) or vector(0) ) + - sum by (%(clusterLabel)s) (rate(apiserver_request_slo_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,%(kubeApiserverNonStreamingSelector)s,scope="namespace",le="%(kubeApiserverReadNamespaceLatency)s"}[%(window)s])) + sum by (%(clusterLabel)s) (rate(apiserver_request_sli_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,%(kubeApiserverNonStreamingSelector)s,scope="namespace",le="%(kubeApiserverReadNamespaceLatency)s"}[%(window)s])) + - sum by (%(clusterLabel)s) (rate(apiserver_request_slo_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,%(kubeApiserverNonStreamingSelector)s,scope="cluster",le="%(kubeApiserverReadClusterLatency)s"}[%(window)s])) + sum by (%(clusterLabel)s) (rate(apiserver_request_sli_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,%(kubeApiserverNonStreamingSelector)s,scope="cluster",le="%(kubeApiserverReadClusterLatency)s"}[%(window)s])) ) ) + @@ -79,9 +79,9 @@ ( ( # too slow - sum by (%(clusterLabel)s) (rate(apiserver_request_slo_duration_seconds_count{%(kubeApiserverSelector)s,%(kubeApiserverWriteSelector)s,%(kubeApiserverNonStreamingSelector)s}[%(window)s])) + sum by (%(clusterLabel)s) (rate(apiserver_request_sli_duration_seconds_count{%(kubeApiserverSelector)s,%(kubeApiserverWriteSelector)s,%(kubeApiserverNonStreamingSelector)s}[%(window)s])) - - sum by (%(clusterLabel)s) (rate(apiserver_request_slo_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverWriteSelector)s,%(kubeApiserverNonStreamingSelector)s,le="%(kubeApiserverWriteLatency)s"}[%(window)s])) + sum by (%(clusterLabel)s) (rate(apiserver_request_sli_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverWriteSelector)s,%(kubeApiserverNonStreamingSelector)s,le="%(kubeApiserverWriteLatency)s"}[%(window)s])) ) + sum by (%(clusterLabel)s) (rate(apiserver_request_total{%(kubeApiserverSelector)s,%(kubeApiserverWriteSelector)s,code=~"5.."}[%(window)s])) @@ -114,9 +114,9 @@ rules: [ { - record: 'cluster_quantile:apiserver_request_slo_duration_seconds:histogram_quantile', + record: 'cluster_quantile:apiserver_request_sli_duration_seconds:histogram_quantile', expr: ||| - histogram_quantile(0.99, sum by (%s, le, resource) (rate(apiserver_request_slo_duration_seconds_bucket{%s}[5m]))) > 0 + histogram_quantile(0.99, sum by (%s, le, resource) (rate(apiserver_request_sli_duration_seconds_bucket{%s}[5m]))) > 0 ||| % [$._config.clusterLabel, std.join(',', [$._config.kubeApiserverSelector, verb.selector, $._config.kubeApiserverNonStreamingSelector])], labels: { verb: verb.type, @@ -149,27 +149,27 @@ for verb in verbs ] + [ { - record: 'cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase1h', + record: 'cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase1h', expr: ||| - sum by (%(clusterLabel)s, verb, scope) (increase(apiserver_request_slo_duration_seconds_count{%(kubeApiserverSelector)s}[1h])) + sum by (%(clusterLabel)s, verb, scope) (increase(apiserver_request_sli_duration_seconds_count{%(kubeApiserverSelector)s}[1h])) ||| % $._config, }, { - record: 'cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase%s' % SLODays, + record: 'cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase%s' % SLODays, expr: ||| - sum by (%s, verb, scope) (avg_over_time(cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase1h[%s]) * 24 * %s) + sum by (%s, verb, scope) (avg_over_time(cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase1h[%s]) * 24 * %s) ||| % [$._config.clusterLabel, SLODays, $._config.SLOs.apiserver.days], }, { - record: 'cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase1h', + record: 'cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase1h', expr: ||| - sum by (%(clusterLabel)s, verb, scope, le) (increase(apiserver_request_slo_duration_seconds_bucket[1h])) + sum by (%(clusterLabel)s, verb, scope, le) (increase(apiserver_request_sli_duration_seconds_bucket[1h])) ||| % $._config, }, { - record: 'cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase%s' % SLODays, + record: 'cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%s' % SLODays, expr: ||| - sum by (%s, verb, scope, le) (avg_over_time(cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase1h[%s]) * 24 * %s) + sum by (%s, verb, scope, le) (avg_over_time(cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase1h[%s]) * 24 * %s) ||| % [$._config.clusterLabel, SLODays, $._config.SLOs.apiserver.days], }, { @@ -178,24 +178,24 @@ 1 - ( ( # write too slow - sum by (%(clusterLabel)s) (cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase%(SLODays)s{%(kubeApiserverWriteSelector)s}) + sum by (%(clusterLabel)s) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase%(SLODays)s{%(kubeApiserverWriteSelector)s}) - - sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverWriteSelector)s,le="%(kubeApiserverWriteLatency)s"}) + sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverWriteSelector)s,le="%(kubeApiserverWriteLatency)s"}) ) + ( # read too slow - sum by (%(clusterLabel)s) (cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase%(SLODays)s{%(kubeApiserverReadSelector)s}) + sum by (%(clusterLabel)s) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase%(SLODays)s{%(kubeApiserverReadSelector)s}) - ( ( - sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope=~"resource|",le="%(kubeApiserverReadResourceLatency)s"}) + sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope=~"resource|",le="%(kubeApiserverReadResourceLatency)s"}) or vector(0) ) + - sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope="namespace",le="%(kubeApiserverReadNamespaceLatency)s"}) + sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope="namespace",le="%(kubeApiserverReadNamespaceLatency)s"}) + - sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope="cluster",le="%(kubeApiserverReadClusterLatency)s"}) + sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope="cluster",le="%(kubeApiserverReadClusterLatency)s"}) ) ) + # errors @@ -212,19 +212,19 @@ record: 'apiserver_request:availability%s' % SLODays, expr: ||| 1 - ( - sum by (%(clusterLabel)s) (cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase%(SLODays)s{%(kubeApiserverReadSelector)s}) + sum by (%(clusterLabel)s) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase%(SLODays)s{%(kubeApiserverReadSelector)s}) - ( # too slow ( - sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope=~"resource|",le="%(kubeApiserverReadResourceLatency)s"}) + sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope=~"resource|",le="%(kubeApiserverReadResourceLatency)s"}) or vector(0) ) + - sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope="namespace",le="%(kubeApiserverReadNamespaceLatency)s"}) + sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope="namespace",le="%(kubeApiserverReadNamespaceLatency)s"}) + - sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope="cluster",le="%(kubeApiserverReadClusterLatency)s"}) + sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope="cluster",le="%(kubeApiserverReadClusterLatency)s"}) ) + # errors @@ -243,9 +243,9 @@ 1 - ( ( # too slow - sum by (%(clusterLabel)s) (cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase%(SLODays)s{%(kubeApiserverWriteSelector)s}) + sum by (%(clusterLabel)s) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase%(SLODays)s{%(kubeApiserverWriteSelector)s}) - - sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverWriteSelector)s,le="%(kubeApiserverWriteLatency)s"}) + sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverWriteSelector)s,le="%(kubeApiserverWriteLatency)s"}) ) + # errors