Merge "Prometheus: Update pod container status alerts"
This commit is contained in:
commit
d1b77b2bea
@ -526,6 +526,12 @@ conf:
|
||||
service_description: "Daemonset_not-scheduled"
|
||||
check_command: check_prom_alert!daemonsets_not_scheduled!CRITICAL- Daemonset {daemonset} is missing to be scheduled in some nodes!OK- All daemonset scheduling is as desired
|
||||
check_interval: 60
|
||||
- check_daemonset_unavailable:
|
||||
use: notifying_service
|
||||
hostgroup_name: prometheus-hosts
|
||||
service_description: "Daemonset_pods-unavailable"
|
||||
check_command: check_prom_alert!daemonset_pods_unavailable!CRITICAL- Daemonset {daemonset} has pods unavailable!OK- All daemonset pods available
|
||||
check_interval: 60
|
||||
- check_deployment_replicas_unavailable:
|
||||
use: notifying_service
|
||||
hostgroup_name: prometheus-hosts
|
||||
@ -562,6 +568,18 @@ conf:
|
||||
service_description: "Pod_status-error-image-pull"
|
||||
check_command: check_prom_alert!pod_status_error_image_pull!CRITICAL- Pod {pod} in namespace {namespace} has been in error status of ErrImagePull for more than 10 minutes!OK- No pods in error status
|
||||
check_interval: 60
|
||||
- check_pod_status_error_image_pull_backoff:
|
||||
use: notifying_service
|
||||
hostgroup_name: prometheus-hosts
|
||||
service_description: "Pod_status-error-image-pull-backoff"
|
||||
check_command: check_prom_alert!pod_status_error_image_pull_backoff!CRITICAL- Pod {pod} in namespace {namespace} has been in error status of ImagePullBackOff for more than 10 minutes!OK- No pods in error status
|
||||
check_interval: 60
|
||||
- check_pod_status_error_container_config_error:
|
||||
use: notifying_service
|
||||
hostgroup_name: prometheus-hosts
|
||||
service_description: "Pod_status-error-container-config-error"
|
||||
check_command: check_prom_alert!pod_error_config_error!CRITICAL- Pod {pod} in namespace {namespace} has been in error status of CreateContainerConfigError for more than 10 minutes!OK- No pods in error status
|
||||
check_interval: 60
|
||||
- check_pod_error_crash_loop_back_off:
|
||||
use: notifying_service
|
||||
hostgroup_name: prometheus-hosts
|
||||
|
@ -1300,6 +1300,14 @@ conf:
|
||||
annotations:
|
||||
description: '{{ $value }} of Daemonset {{$labels.daemonset}} scheduled which is less than desired number'
|
||||
summary: 'Less than desired number of daemonsets scheduled'
|
||||
- alert: daemonset_pods_unavailable
|
||||
expr: kube_daemonset_status_number_unavailable > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: 'Daemonset {{$labels.daemonset}} currently has pods unavailable'
|
||||
summary: 'Daemonset pods unavailable, due to one of many reasons'
|
||||
- alert: deployment_replicas_unavailable
|
||||
expr: kube_deployment_status_replicas_unavailable > 0
|
||||
for: 10m
|
||||
@ -1340,13 +1348,13 @@ conf:
|
||||
annotations:
|
||||
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an Image pull error for more than 10 minutes'
|
||||
summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
|
||||
- alert: pod_status_error_image_pull
|
||||
expr: kube_pod_container_status_waiting_reason {reason="ErrImagePull"} == 1
|
||||
- alert: pod_status_error_image_pull_backoff
|
||||
expr: kube_pod_container_status_waiting_reason {reason="ImagePullBackOff"} == 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an Image pull error for more than 10 minutes'
|
||||
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an ImagePullBackOff error for more than 10 minutes'
|
||||
summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
|
||||
- alert: pod_error_crash_loop_back_off
|
||||
expr: kube_pod_container_status_waiting_reason {reason="CrashLoopBackOff"} == 1
|
||||
@ -1356,6 +1364,14 @@ conf:
|
||||
annotations:
|
||||
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has a CrashLoopBackOff error for more than 10 minutes'
|
||||
summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
|
||||
- alert: pod_error_config_error
|
||||
expr: kube_pod_container_status_waiting_reason {reason="CreateContainerConfigError"} == 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has a CreateContainerConfigError error for more than 10 minutes'
|
||||
summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
|
||||
- alert: replicaset_missing_replicas
|
||||
expr: kube_replicaset_spec_replicas - kube_replicaset_status_ready_replicas > 0
|
||||
for: 10m
|
||||
|
Loading…
x
Reference in New Issue
Block a user