diff --git a/alerts/apps_alerts.libsonnet b/alerts/apps_alerts.libsonnet index 4f36f1b33..fd0e34b20 100644 --- a/alerts/apps_alerts.libsonnet +++ b/alerts/apps_alerts.libsonnet @@ -133,6 +133,45 @@ }, 'for': '10m', }, + { + alert: 'KubeCronJobRunning', + expr: ||| + time() - kube_cronjob_next_schedule_time{%(kubeStateMetricsSelector)s} > 3600 + ||| % $._config, + 'for': '1h', + labels: { + severity: 'warning', + }, + annotations: { + message: 'CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete.', + }, + }, + { + alert: 'KubeJobCompletion', + expr: ||| + kube_job_spec_completions{%(kubeStateMetricsSelector)s} - kube_job_status_succeeded{%(kubeStateMetricsSelector)s} > 0 + ||| % $._config, + 'for': '1h', + labels: { + severity: 'warning', + }, + annotations: { + message: 'Job {{ $labels.namespace }}/{{ $labels.job }} is taking more than 1h to complete.', + }, + }, + { + alert: 'KubeJobFailed', + expr: ||| + kube_job_status_failed{%(kubeStateMetricsSelector)s} > 0 + ||| % $._config, + 'for': '1h', + labels: { + severity: 'warning', + }, + annotations: { + message: 'Job {{ $labels.namespace }}/{{ $labels.job }} failed to complete.', + }, + }, ], }, ], diff --git a/runbook.md b/runbook.md index 74e5d8f0a..b12e2e378 100644 --- a/runbook.md +++ b/runbook.md @@ -49,9 +49,26 @@ This page collects this repositories alerts and begins the process of describing ##### Alert Name: "KubeDaemonSetNotScheduled" + *Message*: `A number of pods of daemonset {{$labels.namespace}}/{{$labels.daemonset}} are not scheduled.` + *Severity*: warning + ##### Alert Name: "KubeDaemonSetMisScheduled" + *Message*: `A number of pods of daemonset {{$labels.namespace}}/{{$labels.daemonset}} are running where they are not supposed to run.` + *Severity*: warning + +##### Alert Name: "KubeCronJobRunning" ++ *Message*: `CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete.` ++ *Severity*: warning ++ *Action*: Check the cronjob 
using `kubectl describe cronjob ` and look at the pod logs using `kubectl logs ` for further information. + +##### Alert Name: "KubeJobCompletion" ++ *Message*: `Job {{ $labels.namespace }}/{{ $labels.job }} is taking more than 1h to complete.` ++ *Severity*: warning ++ *Action*: Check the job using `kubectl describe job ` and look at the pod logs using `kubectl logs ` for further information. + +##### Alert Name: "KubeJobFailed" ++ *Message*: `Job {{ $labels.namespace }}/{{ $labels.job }} failed to complete.` ++ *Severity*: warning ++ *Action*: Check the job using `kubectl describe job ` and look at the pod logs using `kubectl logs ` for further information. + ### Group Name: "kubernetes-resources" ##### Alert Name: "KubeCPUOvercommit" + *Message*: `Overcommited CPU resource requests on Pods, cannot tolerate node failure.`