From 561780f347cfd56e0219e9d59a5e076a31dc4994 Mon Sep 17 00:00:00 2001
From: Steve Wilkerson
Date: Mon, 11 Jun 2018 10:46:41 -0500
Subject: [PATCH] PVC monitoring: Add alerting rules and service check for PVCs

This adds a basic check for capacity utilization for persistent volume
claims. To accomplish this, it adds a basic alerting rule to prometheus
that becomes active once a persistent volume claim's usage exceeds 80%
of its capacity, and fires after that state has held for 5 minutes.
Utilization is computed as used bytes divided by capacity bytes, so the
rule matches volumes that are filling up rather than volumes that are
mostly free.

In addition, there is a service check added to the nagios chart that
will query Prometheus to check if the alarm for that threshold is
firing for any of the volume claims.

Change-Id: I862c860ac479a715733202f679bb151885d7aa7c
---
 nagios/values.yaml     | 6 ++++++
 prometheus/values.yaml | 8 ++++++++
 2 files changed, 14 insertions(+)

diff --git a/nagios/values.yaml b/nagios/values.yaml
index 212d007fa..d98cbb6cc 100644
--- a/nagios/values.yaml
+++ b/nagios/values.yaml
@@ -378,6 +378,12 @@ conf:
           service_description: "Deployment_replicas-unavailable"
           check_command: check_prom_alert!deployment_replicas_unavailable!CRITICAL- Deployment {deployment} has less than desired replicas!OK- All deployments have desired replicas
           check_interval: 60
+      - check_volume_claim_high_utilization:
+          use: notifying_service
+          hostgroup_name: prometheus-hosts
+          service_description: "Volume_claim_high_utilization"
+          check_command: check_prom_alert!volume_claim_capacity_high_utilization!CRITICAL- Volume claim {persistentvolumeclaim} has exceed 80% utilization!OK- All volume claims less than 80% utilization
+          check_interval: 60
       - check_deployment_rollingupdate_replicas_unavailable:
           use: notifying_service
           hostgroup_name: prometheus-hosts
diff --git a/prometheus/values.yaml b/prometheus/values.yaml
index 0c1ae2909..7fc98bf91 100644
--- a/prometheus/values.yaml
+++ b/prometheus/values.yaml
@@ -900,6 +900,14 @@ conf:
             annotations:
               description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has a container terminated for more than 10 minutes'
               summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
+          - alert: volume_claim_capacity_high_utilization
+            expr: (kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes) > 0.80
+            for: 5m
+            labels:
+              severity: page
+            annotations:
+              description: 'volume claim {{$labels.persistentvolumeclaim}} usage has exceeded 80% of total capacity'
+              summary: '{{$labels.persistentvolumeclaim}} usage has exceeded 80% of total capacity.'
       basic_linux:
         groups:
         - name: basic_linux.rules