PVC monitoring: Add alerting rules and service check for PVCs

This adds a basic check for capacity utilization for persistent
volume claims. To accomplish this, it adds a basic alerting rule
to Prometheus that fires once a persistent volume's usage has
exceeded 80% for 5 minutes.  In addition, there is a service
check added to the nagios chart that will query Prometheus to
check if the alarm for that threshold is firing for any of the
volume claims.

Change-Id: I862c860ac479a715733202f679bb151885d7aa7c
This commit is contained in:
Steve Wilkerson 2018-06-11 10:46:41 -05:00
parent 9f1cfbacd8
commit 561780f347
2 changed files with 14 additions and 0 deletions

View File

@ -378,6 +378,12 @@ conf:
service_description: "Deployment_replicas-unavailable"
check_command: check_prom_alert!deployment_replicas_unavailable!CRITICAL- Deployment {deployment} has less than desired replicas!OK- All deployments have desired replicas
check_interval: 60
# Service check that reports on the Prometheus alert named
# volume_claim_capacity_high_utilization via check_prom_alert.
# The check_command arguments are '!'-separated: alert name, the message
# used for the CRITICAL state, and the message used for the OK state.
- check_volume_claim_high_utilization:
    use: notifying_service
    hostgroup_name: prometheus-hosts
    service_description: "Volume_claim_high_utilization"
    check_command: check_prom_alert!volume_claim_capacity_high_utilization!CRITICAL- Volume claim {persistentvolumeclaim} has exceeded 80% utilization!OK- All volume claims less than 80% utilization
    check_interval: 60
- check_deployment_rollingupdate_replicas_unavailable:
use: notifying_service
hostgroup_name: prometheus-hosts

View File

@ -900,6 +900,14 @@ conf:
annotations:
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has a container terminated for more than 10 minutes'
summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
# Fires when a persistent volume claim has been more than 80% full for
# 5 minutes. The ratio must be built from *used* bytes: dividing the
# available bytes by capacity and testing "> 0.80" would instead fire for
# volumes that are more than 80% empty.
- alert: volume_claim_capacity_high_utilization
  expr: (kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes) > 0.80
  for: 5m
  labels:
    severity: page
  annotations:
    description: 'volume claim {{$labels.persistentvolumeclaim}} usage has exceeded 80% of total capacity'
    summary: '{{$labels.persistentvolumeclaim}} usage has exceeded 80% of total capacity.'
basic_linux:
groups:
- name: basic_linux.rules