From 66726137ae1e00ec0ccdf0b2d55ffe3c07d0848b Mon Sep 17 00:00:00 2001
From: Francois Andrieu
Date: Mon, 23 Jan 2023 14:32:37 +0100
Subject: [PATCH] websites: add alerts for pod/job/build errors

Add a PrometheusRule to the websites app (staging only) that alerts on
failed jobs and builds, and on pods that are pending, restarting,
crash-looping or OOM-killed.

Add an AlertmanagerConfig to the openshift/project role that sends
alerts by email to the app owners at fedoraproject.org.

---
 playbooks/openshift-apps/websites.yml         |  6 ++
 .../websites/files/prometheusRules.yml        | 66 +++++++++++++++++++
 roles/openshift/project/tasks/main.yml        |  7 ++
 .../project/templates/alertmanager.yml        | 19 ++++++
 4 files changed, 98 insertions(+)
 create mode 100644 roles/openshift-apps/websites/files/prometheusRules.yml
 create mode 100644 roles/openshift/project/templates/alertmanager.yml

diff --git a/playbooks/openshift-apps/websites.yml b/playbooks/openshift-apps/websites.yml
index a15b461072..89842b6908 100644
--- a/playbooks/openshift-apps/websites.yml
+++ b/playbooks/openshift-apps/websites.yml
@@ -99,3 +99,9 @@
     template: deployment.yml
     objectname: deployment.yml
     when: env == "staging"
+
+  - role: openshift/object
+    app: websites
+    file: prometheusRules.yml
+    objectname: prometheusRules.yml
+    when: env == "staging"
diff --git a/roles/openshift-apps/websites/files/prometheusRules.yml b/roles/openshift-apps/websites/files/prometheusRules.yml
new file mode 100644
index 0000000000..436870b4ef
--- /dev/null
+++ b/roles/openshift-apps/websites/files/prometheusRules.yml
@@ -0,0 +1,66 @@
+# Alerting rules shipped with the websites app: failed jobs and builds, and
+# pods that are stuck pending, restarting, crash-looping or OOM-killed.
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: alerts
+spec:
+  groups:
+  - name: jobFailed
+    rules:
+    - alert: JobFailed
+      annotations:
+        description: Job {{$labels.namespace}}/{{$labels.job_name}} has failed.
+        summary: At least one job has failed.
+      expr: kube_job_failed > 0
+      labels:
+        severity: warning
+  - name: BuildFailed
+    rules:
+    - alert: BuildFailed
+      annotations:
+        description: Build {{$labels.namespace}}/{{$labels.buildconfig}} ({{$labels.build}}) has failed.
+        summary: Build {{$labels.buildconfig}} has failed.
+      expr: openshift_build_status_phase_total{build_phase="failed"} > 0
+      labels:
+        severity: warning
+  - name: PodFailing
+    rules:
+    - alert: PodPending
+      annotations:
+        description: Pod {{$labels.namespace}}/{{$labels.pod}} is in pending state for more than 10m.
+        summary: Pod {{$labels.pod}} is in pending state.
+      expr: kube_pod_status_phase{phase="Pending"} > 0
+      for: 10m
+      labels:
+        severity: warning
+    - alert: PodRestarted
+      annotations:
+        description: Container {{$labels.container}} in Pod {{$labels.namespace}}/{{$labels.pod}} has restarted.
+        summary: A container in pod {{$labels.pod}} has restarted.
+      # increase() is PromQL shorthand for rate() times the window length
+      # in seconds, so this counts restarts over the last 10 minutes.
+      expr: increase(kube_pod_container_status_restarts_total[10m]) > 0
+      labels:
+        severity: warning
+    - alert: PodCrashLoop
+      annotations:
+        description: Container {{$labels.container}} in Pod {{$labels.namespace}}/{{$labels.pod}} has restarted {{ printf "%.2f" $value }} times in the last 15 minutes.
+        summary: Pod {{$labels.pod}} is in CrashLoop state.
+      # More than two restarts per 15 minutes, sustained for 15 minutes.
+      expr: increase(kube_pod_container_status_restarts_total[15m]) > 2
+      for: 15m
+      labels:
+        severity: warning
+    - alert: PodOOMKilled
+      annotations:
+        description: >-
+          Container {{$labels.container}} in Pod {{$labels.namespace}}/{{$labels.pod}}
+          ran out of memory and has been killed.
+        summary: A container in pod {{$labels.pod}} has been OOMKilled.
+      # NOTE(review): this gauge presumably keeps reporting OOMKilled until
+      # the container next terminates for a different reason, so the alert
+      # may stay active long after the kill - confirm this is intended.
+      expr: kube_pod_container_status_last_terminated_reason{reason="OOMKilled"} > 0
+      labels:
+        severity: warning
diff --git a/roles/openshift/project/tasks/main.yml b/roles/openshift/project/tasks/main.yml
index fcf2acba49..e764a7b526 100644
--- a/roles/openshift/project/tasks/main.yml
+++ b/roles/openshift/project/tasks/main.yml
@@ -57,6 +57,13 @@
     objectname: appowners.yml
     template_fullpath: "{{roles_path}}/openshift/project/templates/appowners.yml"
 
+- name: alertmanager.yml
+  include_role:
+    name: openshift/object
+  vars:
+    objectname: alertmanager.yml
+    template_fullpath: "{{roles_path}}/openshift/project/templates/alertmanager.yml"
+
 - name: egresspolicy.yml
   include_role:
     name: openshift/object
diff --git a/roles/openshift/project/templates/alertmanager.yml b/roles/openshift/project/templates/alertmanager.yml
new file mode 100644
index 0000000000..0065c6bb6f
--- /dev/null
+++ b/roles/openshift/project/templates/alertmanager.yml
@@ -0,0 +1,19 @@
+# Alertmanager configuration created in the app's namespace: sends alerts
+# by email to the app owners, grouped by alert name and namespace.
+apiVersion: monitoring.coreos.com/v1beta1
+kind: AlertmanagerConfig
+metadata:
+  name: appowners-alerts
+  namespace: "{{app}}"
+spec:
+  receivers:
+  - emailConfigs:
+    - sendResolved: true
+      # One <owner>@fedoraproject.org address per app owner, comma-separated.
+      to: "{{ appowners | product(['fedoraproject.org']) | map('join', '@') | join(',') }}"
+    name: default
+  route:
+    groupBy:
+    - alertname
+    - namespace
+    receiver: default