Merge "Prometheus: Ceph Alerts Scalar/Vector Conversion"
This commit is contained in:
commit
57ad8ad603
@ -990,7 +990,15 @@ conf:
|
|||||||
}
|
}
|
||||||
|
|
||||||
define service {
|
define service {
|
||||||
check_command check_prom_alert!ceph_mon_quorum_low!CRITICAL- ceph monitor quorum does not exist!OK- ceph monitor quorum exists
|
check_command check_prom_alert!ceph_monitor_quorum_low!CRITICAL- ceph monitor quorum does not exist!OK- ceph monitor quorum exists
|
||||||
|
check_interval 60
|
||||||
|
hostgroup_name prometheus-hosts
|
||||||
|
service_description CEPH_quorum
|
||||||
|
use notifying_service
|
||||||
|
}
|
||||||
|
|
||||||
|
define service {
|
||||||
|
check_command check_prom_alert!ceph_monitor_quorum_absent!CRITICAL- ceph monitor quorum does not exist!OK- ceph monitor quorum exists
|
||||||
check_interval 60
|
check_interval 60
|
||||||
hostgroup_name prometheus-hosts
|
hostgroup_name prometheus-hosts
|
||||||
service_description CEPH_quorum
|
service_description CEPH_quorum
|
||||||
|
@ -3,7 +3,17 @@ conf:
|
|||||||
rules:
|
rules:
|
||||||
ceph:
|
ceph:
|
||||||
groups:
|
groups:
|
||||||
- name: ceph.rules
|
- name: ceph.recording_rules
|
||||||
|
rules:
|
||||||
|
- record: ceph_cluster_usage_percent
|
||||||
|
expr: 100 * (ceph_cluster_total_used_bytes / ceph_cluster_total_bytes)
|
||||||
|
- record: ceph_placement_group_degrade_percent
|
||||||
|
expr: 100 * (ceph_pg_degraded / ceph_pg_total)
|
||||||
|
- record: ceph_osd_down_percent
|
||||||
|
expr: 100 * (count(ceph_osd_up == 0) / count(ceph_osd_metadata))
|
||||||
|
- record: ceph_osd_out_percent
|
||||||
|
expr: 100 * (count(ceph_osd_in == 0) / count(ceph_osd_metadata))
|
||||||
|
- name: ceph.alerting_rules
|
||||||
rules:
|
rules:
|
||||||
- alert: prom_exporter_ceph_unavailable
|
- alert: prom_exporter_ceph_unavailable
|
||||||
expr: absent(ceph_health_status)
|
expr: absent(ceph_health_status)
|
||||||
@ -14,14 +24,13 @@ conf:
|
|||||||
description: Ceph exporter is not collecting metrics or is not available for past 10 minutes
|
description: Ceph exporter is not collecting metrics or is not available for past 10 minutes
|
||||||
title: Ceph exporter is not collecting metrics or is not available
|
title: Ceph exporter is not collecting metrics or is not available
|
||||||
- alert: no_active_ceph_mgr
|
- alert: no_active_ceph_mgr
|
||||||
expr: count(up{job="ceph-mgr"} == 1) == 0
|
expr: avg_over_time(up{job="ceph-mgr"}[5m]) == 0
|
||||||
for: 5m
|
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
description: 'no ceph active mgr is present or all ceph mgr are down'
|
description: 'no ceph active mgr is present or all ceph mgr are down'
|
||||||
summary: 'no ceph active mgt is present'
|
summary: 'no ceph active mgt is present'
|
||||||
- alert: ceph_mon_quorum_low
|
- alert: ceph_monitor_quorum_low
|
||||||
expr: ceph_mon_quorum_count < 3
|
expr: ceph_mon_quorum_count < 3
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
@ -29,43 +38,45 @@ conf:
|
|||||||
annotations:
|
annotations:
|
||||||
description: 'ceph monitor quorum has been less than 3 for more than 5 minutes'
|
description: 'ceph monitor quorum has been less than 3 for more than 5 minutes'
|
||||||
summary: 'ceph high availability is at risk'
|
summary: 'ceph high availability is at risk'
|
||||||
|
- alert: ceph_monitor_quorum_absent
|
||||||
|
expr: absent(avg_over_time(ceph_mon_quorum_status[5m]))
|
||||||
|
labels:
|
||||||
|
severity: page
|
||||||
|
annotations:
|
||||||
|
description: 'ceph monitor quorum has been gone for more than 5 minutes'
|
||||||
|
summary: 'ceph high availability is at risk'
|
||||||
- alert: ceph_cluster_usage_high
|
- alert: ceph_cluster_usage_high
|
||||||
expr: 100* ceph_cluster_total_used_bytes/ceph_cluster_total_bytes > 80
|
expr: avg_over_time(ceph_cluster_usage_percent[5m]) > 80
|
||||||
for: 5m
|
|
||||||
labels:
|
labels:
|
||||||
severity: page
|
severity: page
|
||||||
annotations:
|
annotations:
|
||||||
description: 'ceph cluster capacity usage more than 80 percent'
|
description: 'ceph cluster capacity usage more than 80 percent'
|
||||||
summary: 'ceph cluster usage is more than 80 percent'
|
summary: 'ceph cluster usage is more than 80 percent'
|
||||||
- alert: ceph_placement_group_degrade_pct_high
|
- alert: ceph_placement_group_degrade_pct_high
|
||||||
expr: 100 * sum(ceph_pg_degraded)/sum(ceph_osd_numpg) > 80
|
expr: avg_over_time(ceph_placement_group_degrade_percent[5m]) > 80
|
||||||
for: 5m
|
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
description: 'ceph placement group degradation is more than 80 percent'
|
description: 'ceph placement group degradation is more than 80 percent'
|
||||||
summary: 'ceph placement groups degraded'
|
summary: 'ceph placement groups degraded'
|
||||||
- alert: ceph_osd_down_pct_high
|
- alert: ceph_osd_down_pct_high
|
||||||
expr: 100 * count(ceph_osd_up==0)/count(ceph_osd_metadata) > 80
|
expr: avg_over_time(ceph_osd_down_percent[5m]) > 80
|
||||||
for: 5m
|
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
description: 'ceph OSDs down percent is more than 80 percent'
|
description: 'ceph OSDs down percent is more than 80 percent'
|
||||||
summary: 'ceph OSDs down percent is high'
|
summary: 'ceph OSDs down percent is high'
|
||||||
- alert: ceph_osd_down
|
- alert: ceph_osd_down
|
||||||
expr: ceph_osd_up == 0
|
expr: avg_over_time(ceph_osd_up[5m]) == 0
|
||||||
for: 1m
|
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
description: 'ceph OSD {{ $labels.ceph_daemon }} is down in instance {{ $labels.instance }}'
|
description: 'ceph OSD {{ $labels.ceph_daemon }} is down in instance {{ $labels.instance }}.'
|
||||||
summary: 'ceph OSD {{ $labels.ceph_daemon }} is down in instance {{ $labels.instance }}'
|
summary: 'ceph OSD {{ $labels.ceph_daemon }} is down in instance {{ $labels.instance }}.'
|
||||||
- alert: ceph_osd_out
|
- alert: ceph_osd_out
|
||||||
expr: ceph_osd_in == 0
|
expr: avg_over_time(ceph_osd_in[5m]) == 0
|
||||||
for: 5m
|
|
||||||
labels:
|
labels:
|
||||||
severity: page
|
severity: page
|
||||||
annotations:
|
annotations:
|
||||||
description: 'ceph OSD {{ $labels.ceph_daemon }} is out in instance {{ $labels.instance }}'
|
description: 'ceph OSD {{ $labels.ceph_daemon }} is out in instance {{ $labels.instance }}.'
|
||||||
summary: 'ceph OSD {{ $labels.ceph_daemon }} is out in instance {{ $labels.instance }}'
|
summary: 'ceph OSD {{ $labels.ceph_daemon }} is out in instance {{ $labels.instance }}.'
|
||||||
|
Loading…
Reference in New Issue
Block a user