PVC monitoring: Add alerting rules and service check for PVCs
This adds a basic check for capacity utilization of persistent volume claims. To accomplish this, it adds an alerting rule to Prometheus that fires once a persistent volume claim's usage has stayed above 80% of its capacity for 5 minutes. In addition, a service check is added to the Nagios chart that queries Prometheus to check whether the alert for that threshold is firing for any of the volume claims. Change-Id: I862c860ac479a715733202f679bb151885d7aa7c
This commit is contained in:
parent
9f1cfbacd8
commit
561780f347
@ -378,6 +378,12 @@ conf:
|
||||
service_description: "Deployment_replicas-unavailable"
|
||||
check_command: check_prom_alert!deployment_replicas_unavailable!CRITICAL- Deployment {deployment} has less than desired replicas!OK- All deployments have desired replicas
|
||||
check_interval: 60
|
||||
# Nagios service check for the Prometheus alert
# `volume_claim_capacity_high_utilization`.
# NOTE(review): the check_command fields appear to be
# <command>!<alert name>!<CRITICAL message>!<OK message>, judging by the
# neighbouring check_prom_alert entries - confirm against the command
# definition. The alert name must match the Prometheus rule name exactly.
- check_volume_claim_high_utilization:
    use: notifying_service
    hostgroup_name: prometheus-hosts
    service_description: "Volume_claim_high_utilization"
    check_command: check_prom_alert!volume_claim_capacity_high_utilization!CRITICAL- Volume claim {persistentvolumeclaim} has exceeded 80% utilization!OK- All volume claims less than 80% utilization
    check_interval: 60
|
||||
- check_deployment_rollingupdate_replicas_unavailable:
|
||||
use: notifying_service
|
||||
hostgroup_name: prometheus-hosts
|
||||
|
@ -900,6 +900,14 @@ conf:
|
||||
annotations:
|
||||
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has a container terminated for more than 10 minutes'
|
||||
summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
|
||||
# Fires when a persistent volume claim has used more than 80% of its
# capacity continuously for 5 minutes.
- alert: volume_claim_capacity_high_utilization
  # Use *used* bytes over capacity. The previous expression divided
  # *available* bytes by capacity, which fired when a volume was more than
  # 80% free, i.e. the inverse of the intended condition.
  expr: (kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes) > 0.80
  for: 5m
  labels:
    severity: page
  annotations:
    description: 'volume claim {{$labels.persistentvolumeclaim}} usage has exceeded 80% of total capacity'
    summary: '{{$labels.persistentvolumeclaim}} usage has exceeded 80% of total capacity.'
|
||||
basic_linux:
|
||||
groups:
|
||||
- name: basic_linux.rules
|
||||
|
Loading…
Reference in New Issue
Block a user