From 561780f347cfd56e0219e9d59a5e076a31dc4994 Mon Sep 17 00:00:00 2001
From: Steve Wilkerson <wilkers.steve@gmail.com>
Date: Mon, 11 Jun 2018 10:46:41 -0500
Subject: [PATCH] PVC monitoring: Add alerting rules and service check for PVCs

This adds a basic check for capacity utilization for persistent
volume claims. To accomplish this, it adds a basic alerting rule
to prometheus that fires once a persistent volume's usage has
exceeded 80% for 5 minutes.  In addition, there is a service
check added to the nagios chart that will query Prometheus to
check if the alarm for that threshold is firing for any of the
volume claims.

Change-Id: I862c860ac479a715733202f679bb151885d7aa7c
---
 nagios/values.yaml     | 6 ++++++
 prometheus/values.yaml | 8 ++++++++
 2 files changed, 14 insertions(+)

diff --git a/nagios/values.yaml b/nagios/values.yaml
index 212d007fa..d98cbb6cc 100644
--- a/nagios/values.yaml
+++ b/nagios/values.yaml
@@ -378,6 +378,12 @@ conf:
           service_description: "Deployment_replicas-unavailable"
           check_command: check_prom_alert!deployment_replicas_unavailable!CRITICAL- Deployment {deployment} has less than desired replicas!OK- All deployments have desired replicas
           check_interval: 60
+      - check_volume_claim_high_utilization:  # queries Prometheus for the volume_claim_capacity_high_utilization alert
+          use: notifying_service
+          hostgroup_name: prometheus-hosts
+          service_description: "Volume_claim_high_utilization"
+          check_command: check_prom_alert!volume_claim_capacity_high_utilization!CRITICAL- Volume claim {persistentvolumeclaim} has exceeded 80% utilization!OK- All volume claims less than 80% utilization
+          check_interval: 60
       - check_deployment_rollingupdate_replicas_unavailable:
           use: notifying_service
           hostgroup_name: prometheus-hosts
diff --git a/prometheus/values.yaml b/prometheus/values.yaml
index 0c1ae2909..7fc98bf91 100644
--- a/prometheus/values.yaml
+++ b/prometheus/values.yaml
@@ -900,6 +900,14 @@ conf:
             annotations:
               description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has a container terminated for more than 10 minutes'
               summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
+          - alert: volume_claim_capacity_high_utilization  # PVC used/capacity ratio above 80%
+            expr: (kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes) > 0.80
+            for: 5m  # ratio must stay above the threshold for 5 minutes before firing
+            labels:
+              severity: page
+            annotations:
+              description: 'volume claim {{$labels.persistentvolumeclaim}} usage has exceeded 80% of total capacity'
+              summary: '{{$labels.persistentvolumeclaim}} usage has exceeded 80% of total capacity.'
       basic_linux:
         groups:
         - name: basic_linux.rules