From f37865d6a03f478aaaada748f85b8e9d5d82ad72 Mon Sep 17 00:00:00 2001 From: Steven Fitzpatrick Date: Fri, 8 Nov 2019 14:00:12 -0600 Subject: [PATCH] Prometheus: Ceph Alerts Scalar/Vector Conversion This change updates the prometheus alerting rules to use ranged vectors in their expressions, to avoid situations where missed scrapes would cause scalar metrics to "go stale" - resetting the alert timer. Only the ceph alerts are affected by this change. Change-Id: Ib47866d12616aaa808e6a09c58aa4352e338a152 Co-Authored-By: Meghan Heisler --- nagios/values.yaml | 10 +++++- prometheus/values_overrides/ceph.yaml | 47 +++++++++++++++++---------- 2 files changed, 38 insertions(+), 19 deletions(-) diff --git a/nagios/values.yaml b/nagios/values.yaml index 30cbe721b..ba8c31e0d 100644 --- a/nagios/values.yaml +++ b/nagios/values.yaml @@ -990,7 +990,15 @@ conf: } define service { - check_command check_prom_alert!ceph_mon_quorum_low!CRITICAL- ceph monitor quorum does not exist!OK- ceph monitor quorum exists + check_command check_prom_alert!ceph_monitor_quorum_low!CRITICAL- ceph monitor quorum does not exist!OK- ceph monitor quorum exists + check_interval 60 + hostgroup_name prometheus-hosts + service_description CEPH_quorum + use notifying_service + } + + define service { + check_command check_prom_alert!ceph_monitor_quorum_absent!CRITICAL- ceph monitor quorum does not exist!OK- ceph monitor quorum exists check_interval 60 hostgroup_name prometheus-hosts service_description CEPH_quorum diff --git a/prometheus/values_overrides/ceph.yaml b/prometheus/values_overrides/ceph.yaml index 91e8e98d7..233f3237d 100644 --- a/prometheus/values_overrides/ceph.yaml +++ b/prometheus/values_overrides/ceph.yaml @@ -3,7 +3,17 @@ conf: rules: ceph: groups: - - name: ceph.rules + - name: ceph.recording_rules + rules: + - record: ceph_cluster_usage_percent + expr: 100 * (ceph_cluster_total_used_bytes / ceph_cluster_total_bytes) + - record: ceph_placement_group_degrade_percent + expr: 100 * 
(ceph_pg_degraded / ceph_pg_total) + - record: ceph_osd_down_percent + expr: 100 * (count(ceph_osd_up == 0) / count(ceph_osd_metadata)) + - record: ceph_osd_out_percent + expr: 100 * (count(ceph_osd_in == 0) / count(ceph_osd_metadata)) + - name: ceph.alerting_rules rules: - alert: prom_exporter_ceph_unavailable expr: absent(ceph_health_status) @@ -14,14 +24,13 @@ conf: description: Ceph exporter is not collecting metrics or is not available for past 10 minutes title: Ceph exporter is not collecting metrics or is not available - alert: no_active_ceph_mgr - expr: count(up{job="ceph-mgr"} == 1) == 0 - for: 5m + expr: avg_over_time(up{job="ceph-mgr"}[5m]) == 0 labels: severity: warning annotations: description: 'no ceph active mgr is present or all ceph mgr are down' summary: 'no ceph active mgt is present' - - alert: ceph_mon_quorum_low + - alert: ceph_monitor_quorum_low expr: ceph_mon_quorum_count < 3 for: 5m labels: @@ -29,43 +38,45 @@ conf: annotations: description: 'ceph monitor quorum has been less than 3 for more than 5 minutes' summary: 'ceph high availability is at risk' + - alert: ceph_monitor_quorum_absent + expr: absent(avg_over_time(ceph_mon_quorum_status[5m])) + labels: + severity: page + annotations: + description: 'ceph monitor quorum has been gone for more than 5 minutes' + summary: 'ceph high availability is at risk' - alert: ceph_cluster_usage_high - expr: 100* ceph_cluster_total_used_bytes/ceph_cluster_total_bytes > 80 - for: 5m + expr: avg_over_time(ceph_cluster_usage_percent[5m]) > 80 labels: severity: page annotations: description: 'ceph cluster capacity usage more than 80 percent' summary: 'ceph cluster usage is more than 80 percent' - alert: ceph_placement_group_degrade_pct_high - expr: 100 * sum(ceph_pg_degraded)/sum(ceph_osd_numpg) > 80 - for: 5m + expr: avg_over_time(ceph_placement_group_degrade_percent[5m]) > 80 labels: severity: critical annotations: description: 'ceph placement group degradation is more than 80 percent' summary: 'ceph 
placement groups degraded' - alert: ceph_osd_down_pct_high - expr: 100 * count(ceph_osd_up==0)/count(ceph_osd_metadata) > 80 - for: 5m + expr: avg_over_time(ceph_osd_down_percent[5m]) > 80 labels: severity: critical annotations: description: 'ceph OSDs down percent is more than 80 percent' summary: 'ceph OSDs down percent is high' - alert: ceph_osd_down - expr: ceph_osd_up == 0 - for: 1m + expr: avg_over_time(ceph_osd_up[5m]) == 0 labels: severity: critical annotations: - description: 'ceph OSD {{ $labels.ceph_daemon }} is down in instance {{ $labels.instance }}' - summary: 'ceph OSD {{ $labels.ceph_daemon }} is down in instance {{ $labels.instance }}' + description: 'ceph OSD {{ $labels.ceph_daemon }} is down in instance {{ $labels.instance }}.' + summary: 'ceph OSD {{ $labels.ceph_daemon }} is down in instance {{ $labels.instance }}.' - alert: ceph_osd_out - expr: ceph_osd_in == 0 - for: 5m + expr: avg_over_time(ceph_osd_in[5m]) == 0 labels: severity: page annotations: - description: 'ceph OSD {{ $labels.ceph_daemon }} is out in instance {{ $labels.instance }}' - summary: 'ceph OSD {{ $labels.ceph_daemon }} is out in instance {{ $labels.instance }}' + description: 'ceph OSD {{ $labels.ceph_daemon }} is out in instance {{ $labels.instance }}.' + summary: 'ceph OSD {{ $labels.ceph_daemon }} is out in instance {{ $labels.instance }}.'