Merge "Prometheus alerts, nagios defn - rabbitmq,mariadb,ES"
This commit is contained in:
commit
1cc4cec5fe
@ -543,6 +543,111 @@ conf:
|
||||
service_description: Calico_datapane_failures_high
|
||||
check_command: check_prom_alert!calico_datapane_failures_high_1h!CRITICAL- Felix instance {instance} has seen high dataplane failures within the last hour!OK- datapane failures are none or low
|
||||
hostgroup_name: prometheus-hosts
|
||||
- check_rabbitmq_network_partitions_detected:
|
||||
use: generic-service
|
||||
service_description: Rabbitmq_network-partitions-exist
|
||||
check_command: check_prom_alert!rabbitmq_network_pratitions_detected!CRITICAL- Rabbitmq instance {instance} has network partitions!OK- no network partitions detected in rabbitmq
|
||||
hostgroup_name: prometheus-hosts
|
||||
- check_rabbitmq_available:
|
||||
use: generic-service
|
||||
service_description: Rabbitmq_up
|
||||
check_command: check_prom_alert!rabbitmq_down!CRITICAL- Rabbitmq instance {instance} is down!OK- rabbitmq is available
|
||||
hostgroup_name: prometheus-hosts
|
||||
- check_rabbitmq_fd_usage:
|
||||
use: generic-service
|
||||
service_description: Rabbitmq_file-descriptor-usage
|
||||
check_command: check_prom_alert!rabbitmq_file_descriptor_usage_high!CRITICAL- Rabbitmq instance {instance} has file descriptor usage more than 80 percent!OK- rabbitmq file descriptor usage is normal
|
||||
hostgroup_name: prometheus-hosts
|
||||
- check_rabbitmq_node_disk_alarm:
|
||||
use: generic-service
|
||||
service_description: Rabbitmq_node-disk-alarm
|
||||
check_command: check_prom_alert!rabbitmq_node_disk_free_alarm!CRITICAL- Rabbitmq instance {instance} has a disk usage alarm!OK- rabbitmq node disk has no alarms
|
||||
hostgroup_name: prometheus-hosts
|
||||
- check_rabbitmq_node_memory_alarm:
|
||||
use: generic-service
|
||||
service_description: Rabbitmq_node-memory-alarm
|
||||
check_command: check_prom_alert!rabbitmq_node_memory_alarm!CRITICAL- Rabbitmq instance {instance} has a memory alarm!OK- rabbitmq node memory has no alarms
|
||||
hostgroup_name: prometheus-hosts
|
||||
- check_rabbitmq_availability:
|
||||
use: generic-service
|
||||
service_description: Rabbitmq_high-availability
|
||||
check_command: check_prom_alert!rabbitmq_less_than_3_nodes!CRITICAL- Rabbitmq has less than 3 nodes to serve!OK- rabbitmq has at least 3 nodes serving
|
||||
hostgroup_name: prometheus-hosts
|
||||
- check_queue_message_return_percent:
|
||||
use: generic-service
|
||||
service_description: Rabbitmq_message-return-percent
|
||||
check_command: check_prom_alert!rabbitmq_queue_messages_returned_high!CRITICAL- Rabbitmq has high percent of messages being returned!OK- rabbitmq messages are consumed and low or no returns exist.
|
||||
hostgroup_name: prometheus-hosts
|
||||
- check_queue_consumer_util:
|
||||
use: generic-service
|
||||
service_description: Rabbitmq_consumer-utilization
|
||||
check_command: check_prom_alert!rabbitmq_consumers_low_utilization!CRITICAL- Rabbitmq consumer message consumption rate is slow!OK- rabbitmq message consumption speed is normal
|
||||
hostgroup_name: prometheus-hosts
|
||||
- check_queue_load:
|
||||
use: generic-service
|
||||
service_description: Rabbitmq_rabbitmq-queue-health
|
||||
# The last '!'-separated field is the OK-state text (pattern shared by the sibling checks); it must describe the healthy condition.
check_command: check_prom_alert!rabbitmq_high_message_load!CRITICAL- Rabbitmq unacknowledged message count is high!OK- rabbitmq unacknowledged message count is normal
|
||||
hostgroup_name: prometheus-hosts
|
||||
- check_es_high_process_open_file_count:
|
||||
use: generic-service
|
||||
service_description: ES_high-process-open-file-count
|
||||
check_command: check_prom_alert!es_high_process_open_files_count!CRITICAL- Elasticsearch {host} has high process open file count!OK- Elasticsearch process open file count is normal.
|
||||
hostgroup_name: prometheus-hosts
|
||||
- check_es_high_process_cpu_percent:
|
||||
use: generic-service
|
||||
service_description: ES_high-process-cpu-percent
|
||||
check_command: check_prom_alert!es_high_process_cpu_percent!CRITICAL- Elasticsearch {instance} has high process CPU percent!OK- Elasticsearch process cpu usage is normal.
|
||||
hostgroup_name: prometheus-hosts
|
||||
- check_es_fs_usage:
|
||||
use: generic-service
|
||||
service_description: ES_high-filesystem-usage
|
||||
check_command: check_prom_alert!es_fs_usage_high!CRITICAL- Elasticsearch {instance} has high filesystem usage!OK- Elasticsearch filesystem usage is normal.
|
||||
hostgroup_name: prometheus-hosts
|
||||
- check_es_unassigned_shards:
|
||||
use: generic-service
|
||||
service_description: ES_unassigned-shards
|
||||
check_command: check_prom_alert!es_unassigned_shards!CRITICAL- Elasticsearch has unassigned shards!OK- Elasticsearch has no unassigned shards.
|
||||
hostgroup_name: prometheus-hosts
|
||||
- check_es_cluster_health_timedout:
|
||||
use: generic-service
|
||||
service_description: ES_cluster-health-timedout
|
||||
check_command: check_prom_alert!es_cluster_health_timed_out!CRITICAL- Elasticsearch Cluster health status call timedout!OK- Elasticsearch cluster health is retrievable.
|
||||
hostgroup_name: prometheus-hosts
|
||||
- check_es_cluster_health_status:
|
||||
use: generic-service
|
||||
service_description: ES_cluster-health-status
|
||||
check_command: check_prom_alert!es_cluster_health_status_alert!CRITICAL- Elasticsearch Cluster is not green. One or more shards or replicas are unallocated!OK- Elasticsearch cluster health is green.
|
||||
hostgroup_name: prometheus-hosts
|
||||
- check_es_cluster_number_nodes_running:
|
||||
use: generic-service
|
||||
service_description: ES_cluster-running-node-count
|
||||
check_command: check_prom_alert!es_cluster_health_too_few_nodes_running!CRITICAL- Elasticsearch Cluster has < 3 nodes running!OK- Elasticsearch cluster has 3 or more nodes running.
|
||||
hostgroup_name: prometheus-hosts
|
||||
- check_es_cluster_number_data_nodes_running:
|
||||
use: generic-service
|
||||
service_description: ES_cluster-running-data-node-count
|
||||
check_command: check_prom_alert!es_cluster_health_too_few_data_nodes_running!CRITICAL- Elasticsearch Cluster has < 3 data nodes running!OK- Elasticsearch cluster has 3 or more data nodes running.
|
||||
hostgroup_name: prometheus-hosts
|
||||
- check_mariadb_table_lock_waits:
|
||||
use: generic-service
|
||||
service_description: Mariadb_table-lock-waits-high
|
||||
check_command: check_prom_alert!mariadb_table_lock_wait_high!CRITICAL- Mariadb has high number of table lock waits!OK- No issues found with table lock waits.
|
||||
hostgroup_name: prometheus-hosts
|
||||
- check_mariadb_node_ready:
|
||||
use: generic-service
|
||||
service_description: Mariadb_node-ready
|
||||
check_command: check_prom_alert!mariadb_node_not_ready!CRITICAL- Mariadb {instance} is not ready!OK- All galera cluster nodes are ready.
|
||||
hostgroup_name: prometheus-hosts
|
||||
- check_mariadb_node_out_of_sync:
|
||||
use: generic-service
|
||||
service_description: Mariadb_node-synchronized
|
||||
check_command: check_prom_alert!mariadb_galera_node_out_of_sync!CRITICAL- Mariadb {instance} is out of sync!OK- All galera cluster nodes are in sync
|
||||
hostgroup_name: prometheus-hosts
|
||||
- check_mariadb_innodb_replication_lag:
|
||||
use: generic-service
|
||||
service_description: Mariadb_innodb-replication-lag
|
||||
check_command: check_prom_alert!mariadb_innodb_replication_fallen_behind!CRITICAL- Innodb replication has fallen behind and not recovering!OK- innodb replication lag is nominal.
|
||||
hostgroup_name: prometheus-hosts
|
||||
- check_filespace_mounts-usage-rate-fullin4hrs:
|
||||
use: notifying_service
|
||||
hostgroup_name: base-os
|
||||
|
@ -1309,3 +1309,183 @@ conf:
|
||||
annotations:
|
||||
description: 'Felix instance {{ $labels.instance }} has seen {{ $value }} iptable restore errors within the last hour'
|
||||
summary: 'A high number of iptable restore errors within Felix are happening'
|
||||
rabbitmq:
|
||||
groups:
|
||||
- name: rabbitmq.rules
|
||||
rules:
|
||||
- alert: rabbitmq_network_pratitions_detected
|
||||
expr: min(partitions) by(instance) > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: 'RabbitMQ at {{ $labels.instance }} has {{ $value }} partitions'
|
||||
summary: 'RabbitMQ Network partitions detected'
|
||||
- alert: rabbitmq_down
|
||||
expr: min(rabbitmq_up) by(instance) != 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: 'RabbitMQ Server instance {{ $labels.instance }} is down'
|
||||
summary: 'The RabbitMQ Server instance at {{ $labels.instance }} has been down the last 10 mins'
|
||||
- alert: rabbitmq_file_descriptor_usage_high
|
||||
expr: fd_used * 100 /fd_total > 80
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: 'RabbitMQ Server instance {{ $labels.instance }} has high file descriptor usage of {{ $value }} percent.'
|
||||
summary: 'RabbitMQ file descriptors usage is high for last 10 mins'
|
||||
- alert: rabbitmq_node_disk_free_alarm
|
||||
expr: node_disk_free_alarm > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: 'RabbitMQ Server instance {{ $labels.instance }} has low disk free space available.'
|
||||
summary: 'RabbitMQ disk space usage is high'
|
||||
- alert: rabbitmq_node_memory_alarm
|
||||
expr: node_mem_alarm > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: 'RabbitMQ Server instance {{ $labels.instance }} has low free memory.'
|
||||
summary: 'RabbitMQ memory usage is high'
|
||||
- alert: rabbitmq_less_than_3_nodes
|
||||
expr: running < 3
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: 'RabbitMQ Server has less than 3 nodes running.'
|
||||
summary: 'RabbitMQ server is at risk of losing data'
|
||||
- alert: rabbitmq_queue_messages_returned_high
|
||||
expr: queue_messages_returned_total/queue_messages_published_total * 100 > 50
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: 'RabbitMQ Server is returning more than 50 percent of messages received.'
|
||||
summary: 'RabbitMQ server is returning more than 50 percent of messages received.'
|
||||
- alert: rabbitmq_consumers_low_utilization
|
||||
expr: queue_consumer_utilisation < .4
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: 'RabbitMQ consumers message consumption speed is low'
|
||||
summary: 'RabbitMQ consumers message consumption speed is low'
|
||||
- alert: rabbitmq_high_message_load
|
||||
expr: queue_messages_total > 17000 or increase(queue_messages_total[5m]) > 4000
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: 'RabbitMQ has high message load. Total Queue depth > 17000 or growth more than 4000 messages.'
|
||||
summary: 'RabbitMQ has high message load'
|
||||
elasticsearch:
|
||||
groups:
|
||||
- name: elasticsearch.rules
|
||||
rules:
|
||||
- alert: es_high_process_open_files_count
|
||||
expr: sum(elasticsearch_process_open_files_count) by (host) > 64000
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: 'Elasticsearch at {{ $labels.host }} has more than 64000 process open file count.'
|
||||
summary: 'Elasticsearch has a very high process open file count.'
|
||||
- alert: es_high_process_cpu_percent
|
||||
expr: elasticsearch_process_cpu_percent > 95
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: 'Elasticsearch at {{ $labels.instance }} has high process cpu percent of {{ $value }}.'
|
||||
summary: 'Elasticsearch process cpu usage is more than 95 percent.'
|
||||
- alert: es_fs_usage_high
|
||||
expr: (100 * (elasticsearch_filesystem_data_size_bytes - elasticsearch_filesystem_data_free_bytes) / elasticsearch_filesystem_data_size_bytes) > 80
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: 'Elasticsearch at {{ $labels.instance }} has filesystem usage of {{ $value }}.'
|
||||
summary: 'Elasticsearch filesystem usage is high.'
|
||||
- alert: es_unassigned_shards
|
||||
expr: elasticsearch_cluster_health_unassigned_shards > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: 'Elasticsearch has {{ $value }} unassigned shards.'
|
||||
summary: 'Elasticsearch has unassigned shards and hence an unhealthy cluster state.'
|
||||
- alert: es_cluster_health_timed_out
|
||||
expr: elasticsearch_cluster_health_timed_out > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: 'Elasticsearch cluster health status call timedout {{ $value }} times.'
|
||||
summary: 'Elasticsearch cluster health status calls are timing out.'
|
||||
- alert: es_cluster_health_status_alert
|
||||
# NOTE(review): assumes elasticsearch_exporter semantics, where this metric is exported once per
# `color` label with value 1 for the current state. Excluding green keeps a healthy cluster from
# matching `> 0`; series without a `color` label still match, so a numeric-status exporter keeps working.
expr: elasticsearch_cluster_health_status{color!="green"} > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: 'Elasticsearch cluster health status is not green. One or more shards or replicas are unallocated.'
|
||||
summary: 'Elasticsearch cluster health status is not green.'
|
||||
- alert: es_cluster_health_too_few_nodes_running
|
||||
expr: elasticsearch_cluster_health_number_of_nodes < 3
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: 'There are only {{$value}} < 3 ElasticSearch nodes running'
|
||||
summary: 'ElasticSearch running on less than 3 nodes'
|
||||
- alert: es_cluster_health_too_few_data_nodes_running
|
||||
expr: elasticsearch_cluster_health_number_of_data_nodes < 3
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: 'There are only {{$value}} < 3 ElasticSearch data nodes running'
|
||||
summary: 'ElasticSearch running on less than 3 data nodes'
|
||||
mariadb:
|
||||
groups:
|
||||
- name: mariadb.rules
|
||||
rules:
|
||||
- alert: mariadb_table_lock_wait_high
|
||||
expr: 100 * mysql_global_status_table_locks_waited/(mysql_global_status_table_locks_waited + mysql_global_status_table_locks_immediate) > 30
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: 'Mariadb has high table lock waits of {{ $value }} percentage'
|
||||
summary: 'Mariadb table lock waits are high'
|
||||
- alert: mariadb_node_not_ready
|
||||
expr: mysql_global_status_wsrep_ready != 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: '{{$labels.job}} on {{$labels.instance}} is not ready.'
|
||||
summary: 'Galera cluster node not ready'
|
||||
- alert: mariadb_galera_node_out_of_sync
|
||||
expr: mysql_global_status_wsrep_local_state != 4 AND mysql_global_variables_wsrep_desync == 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: '{{$labels.job}} on {{$labels.instance}} is not in sync ({{$value}} != 4)'
|
||||
summary: 'Galera cluster node out of sync'
|
||||
- alert: mariadb_innodb_replication_fallen_behind
|
||||
expr: (mysql_global_variables_innodb_replication_delay > 30) AND on (instance) (predict_linear(mysql_global_variables_innodb_replication_delay[5m], 60*2) > 0)
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: 'The mysql innodb replication has fallen behind and is not recovering'
|
||||
summary: 'MySQL innodb replication is lagging'
|
||||
|
Loading…
x
Reference in New Issue
Block a user