Merge "Prometheus: Update pod container status alerts"
This commit is contained in:
commit
d1b77b2bea
@ -526,6 +526,12 @@ conf:
|
|||||||
service_description: "Daemonset_not-scheduled"
|
service_description: "Daemonset_not-scheduled"
|
||||||
check_command: check_prom_alert!daemonsets_not_scheduled!CRITICAL- Daemonset {daemonset} is missing to be scheduled in some nodes!OK- All daemonset scheduling is as desired
|
check_command: check_prom_alert!daemonsets_not_scheduled!CRITICAL- Daemonset {daemonset} is missing to be scheduled in some nodes!OK- All daemonset scheduling is as desired
|
||||||
check_interval: 60
|
check_interval: 60
|
||||||
|
- check_daemonset_unavailable:
|
||||||
|
use: notifying_service
|
||||||
|
hostgroup_name: prometheus-hosts
|
||||||
|
service_description: "Daemonset_pods-unavailable"
|
||||||
|
check_command: check_prom_alert!daemonset_pods_unavailable!CRITICAL- Daemonset {daemonset} has pods unavailable!OK- All daemonset pods available
|
||||||
|
check_interval: 60
|
||||||
- check_deployment_replicas_unavailable:
|
- check_deployment_replicas_unavailable:
|
||||||
use: notifying_service
|
use: notifying_service
|
||||||
hostgroup_name: prometheus-hosts
|
hostgroup_name: prometheus-hosts
|
||||||
@ -562,6 +568,18 @@ conf:
|
|||||||
service_description: "Pod_status-error-image-pull"
|
service_description: "Pod_status-error-image-pull"
|
||||||
check_command: check_prom_alert!pod_status_error_image_pull!CRITICAL- Pod {pod} in namespace {namespace} has been in error status of ErrImagePull for more than 10 minutes!OK- No pods in error status
|
check_command: check_prom_alert!pod_status_error_image_pull!CRITICAL- Pod {pod} in namespace {namespace} has been in error status of ErrImagePull for more than 10 minutes!OK- No pods in error status
|
||||||
check_interval: 60
|
check_interval: 60
|
||||||
|
- check_pod_status_error_image_pull_backoff:
|
||||||
|
use: notifying_service
|
||||||
|
hostgroup_name: prometheus-hosts
|
||||||
|
service_description: "Pod_status-error-image-pull-backoff"
|
||||||
|
check_command: check_prom_alert!pod_status_error_image_pull_backoff!CRITICAL- Pod {pod} in namespace {namespace} has been in error status of ImagePullBackOff for more than 10 minutes!OK- No pods in error status
|
||||||
|
check_interval: 60
|
||||||
|
- check_pod_status_error_container_config_error:
|
||||||
|
use: notifying_service
|
||||||
|
hostgroup_name: prometheus-hosts
|
||||||
|
service_description: "Pod_status-error-container-config-error"
|
||||||
|
check_command: check_prom_alert!pod_error_config_error!CRITICAL- Pod {pod} in namespace {namespace} has been in error status of CreateContainerConfigError for more than 10 minutes!OK- No pods in error status
|
||||||
|
check_interval: 60
|
||||||
- check_pod_error_crash_loop_back_off:
|
- check_pod_error_crash_loop_back_off:
|
||||||
use: notifying_service
|
use: notifying_service
|
||||||
hostgroup_name: prometheus-hosts
|
hostgroup_name: prometheus-hosts
|
||||||
|
@ -1300,6 +1300,14 @@ conf:
|
|||||||
annotations:
|
annotations:
|
||||||
description: '{{ $value }} of Daemonset {{$labels.daemonset}} scheduled which is less than desired number'
|
description: '{{ $value }} of Daemonset {{$labels.daemonset}} scheduled which is less than desired number'
|
||||||
summary: 'Less than desired number of daemonsets scheduled'
|
summary: 'Less than desired number of daemonsets scheduled'
|
||||||
|
- alert: daemonset_pods_unavailable
|
||||||
|
expr: kube_daemonset_status_number_unavailable > 0
|
||||||
|
for: 10m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
description: 'Daemonset {{$labels.daemonset}} currently has pods unavailable'
|
||||||
|
summary: 'Daemonset pods unavailable, due to one of many reasons'
|
||||||
- alert: deployment_replicas_unavailable
|
- alert: deployment_replicas_unavailable
|
||||||
expr: kube_deployment_status_replicas_unavailable > 0
|
expr: kube_deployment_status_replicas_unavailable > 0
|
||||||
for: 10m
|
for: 10m
|
||||||
@ -1340,13 +1348,13 @@ conf:
|
|||||||
annotations:
|
annotations:
|
||||||
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an Image pull error for more than 10 minutes'
|
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an Image pull error for more than 10 minutes'
|
||||||
summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
|
summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
|
||||||
- alert: pod_status_error_image_pull
|
- alert: pod_status_error_image_pull_backoff
|
||||||
expr: kube_pod_container_status_waiting_reason {reason="ErrImagePull"} == 1
|
expr: kube_pod_container_status_waiting_reason {reason="ImagePullBackOff"} == 1
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: page
|
severity: page
|
||||||
annotations:
|
annotations:
|
||||||
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an Image pull error for more than 10 minutes'
|
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an ImagePullBackOff error for more than 10 minutes'
|
||||||
summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
|
summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
|
||||||
- alert: pod_error_crash_loop_back_off
|
- alert: pod_error_crash_loop_back_off
|
||||||
expr: kube_pod_container_status_waiting_reason {reason="CrashLoopBackOff"} == 1
|
expr: kube_pod_container_status_waiting_reason {reason="CrashLoopBackOff"} == 1
|
||||||
@ -1356,6 +1364,14 @@ conf:
|
|||||||
annotations:
|
annotations:
|
||||||
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has a CrashLoopBackOff error for more than 10 minutes'
|
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has a CrashLoopBackOff error for more than 10 minutes'
|
||||||
summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
|
summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
|
||||||
|
- alert: pod_error_config_error
|
||||||
|
expr: kube_pod_container_status_waiting_reason {reason="CreateContainerConfigError"} == 1
|
||||||
|
for: 10m
|
||||||
|
labels:
|
||||||
|
severity: page
|
||||||
|
annotations:
|
||||||
|
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has a CreateContainerConfigError error for more than 10 minutes'
|
||||||
|
summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
|
||||||
- alert: replicaset_missing_replicas
|
- alert: replicaset_missing_replicas
|
||||||
expr: kube_replicaset_spec_replicas - kube_replicaset_status_ready_replicas > 0
|
expr: kube_replicaset_spec_replicas - kube_replicaset_status_ready_replicas > 0
|
||||||
for: 10m
|
for: 10m
|
||||||
|
Loading…
x
Reference in New Issue
Block a user