Merge "Nagios: Add support for arbitrary object definitions via overrides"
commit 9632d8719f
@@ -28,7 +28,10 @@ data:
 {{- end }}
 {{- include "helm-toolkit.snippets.values_template_renderer" (dict "envAll" $envAll "template" .Values.conf.nagios.nagios.template "key" "nagios.cfg" "format" "Secret") | indent 2 }}
 {{- include "helm-toolkit.snippets.values_template_renderer" (dict "envAll" $envAll "template" .Values.conf.nagios.cgi.template "key" "cgi.cfg" "format" "Secret") | indent 2 }}
-{{- include "helm-toolkit.snippets.values_template_renderer" (dict "envAll" $envAll "template" .Values.conf.nagios.objects.template "key" "nagios_objects.cfg" "format" "Secret") | indent 2 }}
+{{- range $objectType, $config := $envAll.Values.conf.nagios.objects }}
+{{- $objectFile := printf "%s.cfg" $objectType -}}
+{{- include "helm-toolkit.snippets.values_template_renderer" (dict "envAll" $envAll "template" $config.template "key" $objectFile "format" "Secret") | indent 2 }}
+{{- end }}
 #NOTE(portdirect): this must be last, to work round helm ~2.7 bug.
 {{- include "helm-toolkit.snippets.values_template_renderer" (dict "envAll" $envAll "template" .Values.conf.httpd "key" "httpd.conf" "format" "Secret") | indent 2 }}
 {{- end }}
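With this hunk, every key under conf.nagios.objects is rendered into its own Secret entry named <key>.cfg, so operators can ship arbitrary Nagios object definitions through values overrides. A minimal sketch of such an override (the `custom` key and its object definition are illustrative, not part of this change):

    conf:
      nagios:
        objects:
          custom:
            template: |
              define hostgroup {
                hostgroup_name custom-hosts
                alias Custom Hosts
              }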
@@ -203,10 +203,13 @@ spec:
               mountPath: /opt/nagios/etc/cgi.cfg
               subPath: cgi.cfg
               readOnly: true
+{{- $objectKeys := keys $envAll.Values.conf.nagios.objects -}}
+{{- range $objectType := $objectKeys }}
             - name: nagios-etc
-              mountPath: /opt/nagios/etc/nagios_objects.cfg
-              subPath: nagios_objects.cfg
+              mountPath: /opt/nagios/etc/{{$objectType}}.cfg
+              subPath: {{$objectType}}.cfg
               readOnly: true
+{{- end }}
             - name: nagios-bin
               mountPath: /tmp/nagios-readiness.sh
               subPath: nagios-readiness.sh
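The loop above mounts one file per object key from the nagios-etc Secret. For the default keys introduced later in this change (base, kubernetes, node, ceph), the rendered container spec would carry mounts along these lines (a sketch of the rendered output; sprig's keys gives no guaranteed ordering):

    - name: nagios-etc
      mountPath: /opt/nagios/etc/base.cfg
      subPath: base.cfg
      readOnly: true
    - name: nagios-etc
      mountPath: /opt/nagios/etc/kubernetes.cfg
      subPath: kubernetes.cfg
      readOnly: true

and likewise for node.cfg and ceph.cfg.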
@@ -392,6 +392,7 @@ conf:
       primary_target: 127.0.0.1:3904/events
       secondary_target: 127.0.0.1:3904/events
     objects:
+      base:
       template: |
         define host {
           address 127.0.0.1
@@ -487,121 +488,6 @@ conf:
             command_name check_prom_alert
           }
 
-          define command {
-            command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_filesystem_full_in_4h' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Mountpoint {mountpoint} will be full in four hours' --ok_message 'OK- All mountpoints usage rate is normal'
-            command_name check_filespace_mounts-usage-rate-fullin4hrs
-          }
-
-          define command {
-            command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_filesystem_full_80percent' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Mountpoint {mountpoint} is more than 80 pecent full' --ok_message 'OK- All mountpoints usage is normal'
-            command_name check_filespace_mounts-usage
-          }
-
-          define command {
-            command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_load1_90percent' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Node load average has been more than 90% for the pash hour' --ok_message 'OK- Node load average is normal'
-            command_name check_node_loadavg
-          }
-
-          define command {
-            command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_cpu_util_90percent' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Node CPU utilization has been more than 90% for the pash hour' --ok_message 'OK- Node cpu utilization is normal'
-            command_name check_node_cpu_util
-          }
-
-          define command {
-            command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_network_conntrack_usage_80percent' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Node network connections are more than 90% in use' --ok_message 'OK- Network connection utilization is normal'
-            command_name check_network_connections
-          }
-
-          define command {
-            command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_memory_load' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Node memory usage is more than 85%' --ok_message 'OK- Node memory usage is less than 85%'
-            command_name check_memory_usage
-          }
-
-          define command {
-            command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_disk_write_latency' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Disk write latency is high' --ok_message 'OK- Node disk write latency is normal'
-            command_name check_disk_write_latency
-          }
-
-          define command {
-            command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_disk_read_latency' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Disk read latency is high' --ok_message 'OK- Node disk read latency is normal'
-            command_name check_disk_read_latency
-          }
-
-          define command {
-            command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_entropy_available_low' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- System has low entropy availability' --ok_message 'OK- System entropy availability is sufficient'
-            command_name check_entropy_availability
-          }
-
-          define command {
-            command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_filedescriptors_full_in_3h' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- at current consumption rate no free file descriptors will be available in 3hrs.' --ok_message 'OK- System file descriptor consumption is ok.'
-            command_name check_filedescriptor_usage_rate
-          }
-
-          define command {
-            command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_hwmon_high_cpu_temp' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- CPU temperature is 90 percent of critical temperature.' --ok_message 'OK- CPU temperatures are normal.'
-            command_name check_hwmon_high_cpu_temp
-          }
-
-          define command {
-            command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_network_drop_rcv' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusally high drop in network reception.' --ok_message 'OK- network packet receive drops not high.'
-            command_name check_network_receive_drop_high
-          }
-
-          define command {
-            command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_network_drop_send' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusally high drop in network transmission.' --ok_message 'OK- network packet tramsmit drops not high.'
-            command_name check_network_transmit_drop_high
-          }
-
-          define command {
-            command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_network_drop_send' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusally high error rate in network reception.' --ok_message 'OK- network reception errors not high.'
-            command_name check_network_receive_errors_high
-          }
-
-          define command {
-            command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_network_drop_send' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusally high error rate in network transmission.' --ok_message 'OK- network transmission errors not high.'
-            command_name check_network_transmit_errors_high
-          }
-
-          define command {
-            command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_vmstat_paging_rate_high' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Memory paging rate over 5 minutes is high.' --ok_message 'OK- Memory paging rate over 5 minutes is ok.'
-            command_name check_vmstat_paging_rate
-          }
-
-          define command {
-            command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_xfs_block_allocation_high' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- XFS block allocation is more than 80 percent of available.' --ok_message 'OK- XFS block allocation is less than 80 percent of available.'
-            command_name check_xfs_block_allocation
-          }
-
-          define command {
-            command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_network_bond_slaves_down' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- {master} is missing slave interfaces.' --ok_message 'OK- Network bonds have slave interfaces functional.'
-            command_name check_network_bond_status
-          }
-
-          define command {
-            command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_numa_memory_used' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- NUMA memory usage is more than 80 percent of available.' --ok_message 'OK- NUMA memory usage is normal.'
-            command_name check_numa_memory_usage
-          }
-
-          define command {
-            command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_ntp_clock_skew_high' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- NTP clock skew is more than 2 seconds.' --ok_message 'OK- NTP clock skew is less than 2 seconds.'
-            command_name check_ntp_sync
-          }
-
-          define command {
-            command_line $USER1$/check_exporter_health_metric.py --exporter_namespace "ceph" --label_selector "application=ceph,component=manager" --health_metric ceph_health_status --critical 2 --warning 1
-            command_name check_ceph_health
-          }
-
-          define command {
-            command_line $USER1$/query_elasticsearch.py $USER9$ '$ARG1$' '$ARG2$' '$ARG3$' '$ARG4$' '$ARG5$' --simple_query '$ARG6$' --simple_query_fields '$ARG7$' --match '$ARG8$' --range '$ARG9$'
-            command_name check_es_query
-          }
-
-          define command {
-            command_line $USER1$/query_elasticsearch.py $USER9$ '$ARG1$' '$ARG2$' '$ARG3$' '$ARG4$' '$ARG5$' --simple_query '$ARG6$' --simple_query_fields '$ARG7$' --query_file '/opt/nagios/etc/objects/query_es_clauses.json' --query_clause '$ARG8$' --match '$ARG9$' --range '$ARG10$'
-            command_name check_es_query_w_file
-          }
-
           define service {
             check_interval 60
             contact_groups snmp_and_http_notifying_contact_group
@@ -613,13 +499,20 @@ conf:
             retry_interval 30
             use generic-service
           }
-
+      kubernetes:
+        template: |
+          define service {
+            check_command check_prom_alert!prom_exporter_calico_unavailable!CRITICAL- Calico exporter is not collecting metrics for alerting!OK- Calico exporter metrics are available.
+            hostgroup_name prometheus-hosts
+            service_description Prometheus-exporter_Calico
+            use generic-service
+          }
+
           define service {
-            check_command check_ceph_health
-            check_interval 300
-            hostgroup_name base-os
-            service_description CEPH_health
-            use notifying_service
+            check_command check_prom_alert!prom_exporter_kube_state_metrics_unavailable!CRITICAL- kube-state-metrics exporter is not collecting metrics for alerting!OK- kube-state-metrics exporter metrics are available.
+            hostgroup_name prometheus-hosts
+            service_description Prometheus-exporter_Kube-state-metrics
+            use generic-service
           }
 
           define service {
@@ -766,213 +659,6 @@ conf:
             use notifying_service
           }
 
-          define service {
-            check_command check_prom_alert!os_glance_api_availability!CRITICAL- Glance API at {url} is not available!OK- Glance API is available
-            check_interval 60
-            hostgroup_name prometheus-hosts
-            service_description API_glance
-            use notifying_service
-          }
-
-          define service {
-            check_command check_prom_alert!os_nova_api_availability!CRITICAL- Nova API at {url} is not available!OK- Nova API is available
-            check_interval 60
-            hostgroup_name prometheus-hosts
-            service_description API_nova
-            use notifying_service
-          }
-
-          define service {
-            check_command check_prom_alert!os_keystone_api_availability!CRITICAL- Keystone API at {url} is not available!OK- Keystone API is available
-            check_interval 60
-            hostgroup_name prometheus-hosts
-            service_description API_keystone
-            use notifying_service
-          }
-
-          define service {
-            check_command check_prom_alert!os_neutron_api_availability!CRITICAL- Neutron API at {url} is not available!OK- Neutron API is available
-            check_interval 60
-            hostgroup_name prometheus-hosts
-            service_description API_neutron
-            use notifying_service
-          }
-
-          define service {
-            check_command check_prom_alert!os_neutron_metadata_agent_availability!CRITICAL- Some Neutron metadata agents are not available!OK- All the neutron metadata agents are up
-            check_interval 60
-            hostgroup_name prometheus-hosts
-            service_description Service_neutron-metadata-agent
-            use notifying_service
-          }
-
-          define service {
-            check_command check_prom_alert!os_neutron_openvswitch_agent_availability!CRITICAL- Some Neutron openvswitch agents are not available!OK- All the neutron openvswitch agents are up
-            check_interval 60
-            hostgroup_name prometheus-hosts
-            service_description Service_neutron-openvswitch-agent
-            use notifying_service
-          }
-
-          define service {
-            check_command check_prom_alert!os_neutron_dhcp_agent_availability!CRITICAL- Some Neutron dhcp agents are not available!OK- All the neutron dhcp agents are up
-            check_interval 60
-            hostgroup_name prometheus-hosts
-            service_description Service_neutron-dhcp-agent
-            use notifying_service
-          }
-
-          define service {
-            check_command check_prom_alert!os_neutron_l3_agent_availability!CRITICAL- Some Neutron dhcp agents are not available!OK- All the neutron l3 agents are up
-            check_interval 60
-            hostgroup_name prometheus-hosts
-            service_description Service_neutron-l3-agent
-            use notifying_service
-          }
-
-          define service {
-            check_command check_prom_alert!os_swift_api_availability!CRITICAL- Swift API at {url} is not available!OK- Swift API is available
-            check_interval 60
-            hostgroup_name prometheus-hosts
-            service_description API_swift
-            use notifying_service
-          }
-
-          define service {
-            check_command check_prom_alert!os_cinder_api_availability!CRITICAL- Cinder API at {url} is not available!OK- Cinder API is available
-            hostgroup_name prometheus-hosts
-            service_description API_cinder
-            use notifying_service
-          }
-
-          define service {
-            check_command check_prom_alert!os_heat_api_availability!CRITICAL- Heat API at {url} is not available!OK- Heat API is available
-            check_interval 60
-            hostgroup_name prometheus-hosts
-            service_description API_heat
-            use notifying_service
-          }
-
-          define service {
-            check_command check_prom_alert!os_cinder_api_availability!CRITICAL- Cinder API at {url} is not available!OK- Cinder API is available
-            check_interval 60
-            hostgroup_name prometheus-hosts
-            service_description API_cinder
-            use notifying_service
-          }
-
-          define service {
-            check_command check_prom_alert!os_cinder_scheduler_availability!CRITICAL- Cinder scheduler is not available!OK- Cinder scheduler is available
-            check_interval 60
-            hostgroup_name prometheus-hosts
-            service_description Service_cinder-scheduler
-            use notifying_service
-          }
-
-          define service {
-            check_command check_prom_alert!os_nova_compute_down!CRITICAL- nova-compute services are down on certain hosts!OK- nova-compute services are up on all hosts
-            check_interval 60
-            hostgroup_name prometheus-hosts
-            service_description Service_nova-compute
-            use notifying_service
-          }
-
-          define service {
-            check_command check_prom_alert!os_nova_conductor_down!CRITICAL- nova-conductor services are down on certain hosts!OK- nova-conductor services are up on all hosts
-            check_interval 60
-            hostgroup_name prometheus-hosts
-            service_description Service_nova-conductor
-            use notifying_service
-          }
-
-          define service {
-            check_command check_prom_alert!os_nova_consoleauth_down!CRITICAL- nova-consoleauth services are down on certain hosts!OK- nova-consoleauth services are up on all hosts
-            check_interval 60
-            hostgroup_name prometheus-hosts
-            service_description Service_nova-consoleauth
-            use notifying_service
-          }
-
-          define service {
-            check_command check_prom_alert!openstack_nova_scheduler_down!CRITICAL- nova-scheduler services are down on certain hosts!OK- nova-scheduler services are up on all hosts
-            check_interval 60
-            hostgroup_name prometheus-hosts
-            service_description Service_nova-scheduler
-            use notifying_service
-          }
-
-          define service {
-            check_command check_prom_alert!os_vm_vcpu_usage_high!CRITICAL- vcpu usage for openstack VMs is more than 80 percent of available!OK- Openstack VMs vcpu usage is less than 80 percent of available.
-            check_interval 60
-            hostgroup_name prometheus-hosts
-            service_description OS-Total-Quota_VCPU-usage
-            use notifying_service
-          }
-
-          define service {
-            check_command check_prom_alert!os_vm_ram_usage_high!CRITICAL- RAM usage for openstack VMs is more than 80 percent of available!OK- Openstack VMs RAM usage is less than 80 percent of available.
-            check_interval 60
-            hostgroup_name prometheus-hosts
-            service_description OS-Total-Quota_RAM-usage
-            use notifying_service
-          }
-
-          define service {
-            check_command check_prom_alert!os_vm_disk_usage_high!CRITICAL- Disk usage for openstack VMs is more than 80 percent of available!OK- Openstack VMs Disk usage is less than 80 percent of available.
-            check_interval 60
-            hostgroup_name prometheus-hosts
-            service_description OS-Total-Quota_Disk-usage
-            use notifying_service
-          }
-
-          define service {
-            check_command check_prom_alert!ceph_monitor_quorum_low!CRITICAL- ceph monitor quorum does not exist!OK- ceph monitor quorum exists
-            check_interval 60
-            hostgroup_name prometheus-hosts
-            service_description CEPH_quorum
-            use notifying_service
-          }
-
-          define service {
-            check_command check_prom_alert!ceph_cluster_usage_high!CRITICAL- ceph cluster storage is more than 80 percent!OK- ceph storage is less than 80 percent
-            check_interval 60
-            hostgroup_name prometheus-hosts
-            service_description CEPH_storage-usage
-            use notifying_service
-          }
-
-          define service {
-            check_command check_prom_alert!ceph_placement_group_degrade_pct_high!CRITICAL- ceph cluster PGs down are more than 80 percent!OK- ceph PG degradation is less than 80 percent
-            check_interval 60
-            hostgroup_name prometheus-hosts
-            service_description CEPH_PGs-degradation
-            use notifying_service
-          }
-
-          define service {
-            check_command check_prom_alert!ceph_osd_down!CRITICAL- One or more CEPH OSDs are down for more than 5 minutes!OK- All the CEPH OSDs are up
-            check_interval 60
-            hostgroup_name prometheus-hosts
-            service_description CEPH_OSDs-down
-            use notifying_service
-          }
-
-          define service {
-            check_command check_prom_alert!ceph_monitor_clock_skew_high!CRITICAL- CEPH clock skew is more than 2 seconds!OK- CEPH clock skew is less than 2 seconds
-            check_interval 60
-            hostgroup_name prometheus-hosts
-            service_description CEPH_Clock-skew
-            use notifying_service
-          }
-
-          define service {
-            check_command check_prom_alert!fluentd_not_running!CRITICAL- fluentd is not running on {instance}!OK- Flunetd is working on all nodes
-            check_interval 60
-            hostgroup_name prometheus-hosts
-            service_description Fluentd_status
-            use notifying_service
-          }
-
           define service {
             check_command check_prom_alert_with_labels!etcd_HighNumberOfFailedHTTPRequests!method="DELETE"!CRITICAL- ETCD {instance} has a high HTTP DELETE operations failure!OK- ETCD at {instance} has low or no failures for HTTP DELETE
             check_interval 60
@@ -1031,210 +717,8 @@ conf:
             service_description Calico_datapane_failures_high
             use notifying_service
           }
-
-          define service {
-            check_command check_prom_alert!rabbitmq_network_pratitions_detected!CRITICAL- Rabbitmq instance {instance} has network partitions!OK- no network partitions detected in rabbitmq
-            hostgroup_name prometheus-hosts
-            service_description Rabbitmq_network-partitions-exist
-            use generic-service
-          }
-
-          define service {
-            check_command check_prom_alert!rabbitmq_down!CRITICAL- Rabbitmq instance {instance} is down!OK- rabbitmq is available
-            hostgroup_name prometheus-hosts
-            service_description Rabbitmq_up
-            use generic-service
-          }
-
-          define service {
-            check_command check_prom_alert!rabbitmq_file_descriptor_usage_high!CRITICAL- Rabbitmq instance {instance} has file desciptor usage more than 80 percent!OK- rabbitmq file descriptor usage is normal
-            hostgroup_name prometheus-hosts
-            service_description Rabbitmq_file-descriptor-usage
-            use generic-service
-          }
-
-          define service {
-            check_command check_prom_alert!rabbitmq_node_disk_free_alarm!CRITICAL- Rabbitmq instance {instance} has a disk usage alarm!OK- rabbitmq node disk has no alarms
-            hostgroup_name prometheus-hosts
-            service_description Rabbitmq_node-disk-alarm
-            use generic-service
-          }
-
-          define service {
-            check_command check_prom_alert!rabbitmq_node_memory_alarm!CRITICAL- Rabbitmq instance {instance} has a memory alarm!OK- rabbitmq node memory has no alarms
-            hostgroup_name prometheus-hosts
-            service_description Rabbitmq_node-memory-alarm
-            use generic-service
-          }
-
-          define service {
-            check_command check_prom_alert!rabbitmq_less_than_3_nodes!CRITICAL- Rabbitmq has less than 3 nodes to serve!OK- rabbitmq has atleast 3 nodes serving
-            hostgroup_name prometheus-hosts
-            service_description Rabbitmq_high-availability
-            use generic-service
-          }
-
-          define service {
-            check_command check_prom_alert!rabbitmq_queue_messages_returned_high!CRITICAL- Rabbitmq has high percent of messages being returned!OK- rabbitmq messages are consumed and low or no returns exist.
-            hostgroup_name prometheus-hosts
-            service_description Rabbitmq_message-return-percent
-            use generic-service
-          }
-
-          define service {
-            check_command check_prom_alert!rabbitmq_consumers_low_utilization!CRITICAL- Rabbitmq consumer message consumption rate is slow!OK- rabbitmq message consumption speed is normal
-            hostgroup_name prometheus-hosts
-            service_description Rabbitmq_consumer-utilization
-            use generic-service
-          }
-
-          define service {
-            check_command check_prom_alert!rabbitmq_high_message_load!CRITICAL- Rabbitmq unacknowledged message count is high!OK- rabbitmq unacknowledged message count is high
-            hostgroup_name prometheus-hosts
-            service_description Rabbitmq_rabbitmq-queue-health
-            use generic-service
-          }
-
-          define service {
-            check_command check_prom_alert!es_high_process_open_files_count!CRITICAL- Elasticsearch {host} has high process open file count!OK- Elasticsearch process open file count is normal.
-            hostgroup_name prometheus-hosts
-            service_description ES_high-process-open-file-count
-            use generic-service
-          }
-
-          define service {
-            check_command check_prom_alert!es_high_process_cpu_percent!CRITICAL- Elasticsearch {instance} has high process CPU percent!OK- Elasticsearch process cpu usage is normal.
-            hostgroup_name prometheus-hosts
-            service_description ES_high-process-cpu-percent
-            use generic-service
-          }
-
-          define service {
-            check_command check_prom_alert!es_fs_usage_high!CRITICAL- Elasticsearch {instance} has high filesystem usage!OK- Elasticsearch filesystem usage is normal.
-            hostgroup_name prometheus-hosts
-            service_description ES_high-filesystem-usage
-            use generic-service
-          }
-
-          define service {
-            check_command check_prom_alert!es_unassigned_shards!CRITICAL- Elasticsearch has unassinged shards!OK- Elasticsearch has no unassigned shards.
-            hostgroup_name prometheus-hosts
-            service_description ES_unassigned-shards
-            use generic-service
-          }
-
-          define service {
-            check_command check_prom_alert!es_cluster_health_timed_out!CRITICAL- Elasticsearch Cluster health status call timedout!OK- Elasticsearch cluster health is retrievable.
-            hostgroup_name prometheus-hosts
-            service_description ES_cluster-health-timedout
-            use generic-service
-          }
-
-          define service {
-            check_command check_prom_alert!es_cluster_health_status_alert!CRITICAL- Elasticsearch cluster health status is not green. One or more shards or replicas are unallocated!OK- Elasticsearch cluster health is green.
-            hostgroup_name prometheus-hosts
-            service_description ES_cluster-health-status
-            use generic-service
-          }
-
-          define service {
-            check_command check_prom_alert!es_cluster_health_too_few_nodes_running!CRITICAL- Elasticsearch Cluster has < 3 nodes running!OK- Elasticsearch cluster has 3 or more nodes running.
-            hostgroup_name prometheus-hosts
-            service_description ES_cluster-running-node-count
-            use generic-service
-          }
-
-          define service {
-            check_command check_prom_alert!es_cluster_health_too_few_data_nodes_running!CRITICAL- Elasticsearch Cluster has < 3 data nodes running!OK- Elasticsearch cluster has 3 or more data nodes running.
-            hostgroup_name prometheus-hosts
-            service_description ES_cluster-running-data-node-count
-            use generic-service
-          }
-
-          define service {
-            check_command check_prom_alert!mariadb_table_lock_wait_high!CRITICAL- Mariadb has high number of table lock waits!OK- No issues found with table lock waits.
-            hostgroup_name prometheus-hosts
-            service_description Mariadb_table-lock-waits-high
-            use generic-service
-          }
-
-          define service {
-            check_command check_prom_alert!mariadb_node_not_ready!CRITICAL- Mariadb {instance} is not ready!OK- All galera cluster nodes are ready.
-            hostgroup_name prometheus-hosts
-            service_description Mariadb_node-ready
-            use generic-service
-          }
-
-          define service {
-            check_command check_prom_alert!mariadb_galera_node_out_of_sync!CRITICAL- Mariadb {instance} is out of sync!OK- All galera cluster nodes are in sync
-            hostgroup_name prometheus-hosts
-            service_description Mariadb_node-synchronized
-            use generic-service
-          }
-
-          define service {
-            check_command check_prom_alert!mariadb_innodb_replication_fallen_behind!CRITICAL- Innodb replication has fallen behind and not recovering!OK- innodb replication lag is nominal.
-            hostgroup_name prometheus-hosts
-            service_description Mariadb_innodb-replication-lag
-            use generic-service
-          }
-
-          define service {
-            check_command check_prom_alert!pg_replication_fallen_behind!CRITICAL- Postgres Replication lag is over 2 minutes!OK- postgresql replication lag is nominal.
-            hostgroup_name prometheus-hosts
-            service_description Postgresql_replication-lag
-            use generic-service
-          }
-
-          define service {
-            check_command check_prom_alert!pg_connections_too_high!CRITICAL- Postgres has more than 95% of available connections in use.!OK- postgresql open connections are within bounds.
-            hostgroup_name prometheus-hosts
-            service_description Postgresql_connections
-            use generic-service
-          }
-
-          define service {
-            check_command check_prom_alert!pg_deadlocks_detected!CRITICAL- Postgres server is experiencing deadlocks!OK- postgresql is not showing any deadlocks.
-            hostgroup_name prometheus-hosts
-            service_description Postgresql_deadlocks
-            use generic-service
-          }
-
-          define service {
-            check_command check_prom_alert!prom_exporter_ceph_unavailable!CRITICAL- CEPH exporter is not collecting metrics for alerting!OK- CEPH exporter metrics are available.
-            hostgroup_name prometheus-hosts
-            service_description Prometheus-exporter_CEPH
-            use generic-service
-          }
-
-          define service {
-            check_command check_prom_alert!prom_exporter_openstack_unavailable!CRITICAL- Openstack exporter is not collecting metrics for alerting!OK- Openstack exporter metrics are available.
-            hostgroup_name prometheus-hosts
-            service_description Prometheus-exporter_Openstack
-            use generic-service
-          }
-
-          define service {
-            check_command check_prom_alert!prom_exporter_mariadb_unavailable!CRITICAL- MariaDB exporter is not collecting metrics for alerting!OK- MariaDB exporter metrics are available.
-            hostgroup_name prometheus-hosts
-            service_description Prometheus-exporter_MariaDB
-            use generic-service
-          }
-
-          define service {
-            check_command check_prom_alert!prom_exporter_kube_state_metrics_unavailable!CRITICAL- kube-state-metrics exporter is not collecting metrics for alerting!OK- kube-state-metrics exporter metrics are available.
-            hostgroup_name prometheus-hosts
-            service_description Prometheus-exporter_Kube-state-metrics
-            use generic-service
-          }
-
-          define service {
-            check_command check_prom_alert!prom_exporter_postgresql_unavailable!CRITICAL- Postgresql exporter is not collecting metrics for alerting!OK- Postgresql exporter metrics are available.
-            hostgroup_name prometheus-hosts
-            service_description Prometheus-exporter_Postgresql
-            use generic-service
-          }
-
+      node:
+        template: |
           define service {
             check_command check_prom_alert!prom_exporter_node_unavailable!CRITICAL- Node exporter is not collecting metrics for alerting!OK- Node exporter metrics are available.
             hostgroup_name prometheus-hosts
@@ -1242,25 +726,104 @@ conf:
             use generic-service
           }
 
-          define service {
-            check_command check_prom_alert!prom_exporter_calico_unavailable!CRITICAL- Calico exporter is not collecting metrics for alerting!OK- Calico exporter metrics are available.
-            hostgroup_name prometheus-hosts
-            service_description Prometheus-exporter_Calico
-            use generic-service
-          }
-
-          define service {
-            check_command check_prom_alert!prom_exporter_elasticsearch_unavailable!CRITICAL- Elasticsearch exporter is not collecting metrics for alerting!OK- Elasticsearch exporter metrics are available.
-            hostgroup_name prometheus-hosts
-            service_description Prometheus-exporter_Elasticsearch
-            use generic-service
-          }
-
-          define service {
-            check_command check_prom_alert!prom_exporter_fluentd_unavailable!CRITICAL- Fluentd exporter is not collecting metrics for alerting!OK- Fluentd exporter metrics are available.
-            hostgroup_name prometheus-hosts
-            service_description Prometheus-exporter_Fluentd
-            use generic-service
+          define command {
+            command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_filesystem_full_in_4h' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Mountpoint {mountpoint} will be full in four hours' --ok_message 'OK- All mountpoints usage rate is normal'
+            command_name check_filespace_mounts-usage-rate-fullin4hrs
+          }
+
+          define command {
+            command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_filesystem_full_80percent' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Mountpoint {mountpoint} is more than 80 pecent full' --ok_message 'OK- All mountpoints usage is normal'
+            command_name check_filespace_mounts-usage
+          }
+
+          define command {
+            command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_load1_90percent' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Node load average has been more than 90% for the pash hour' --ok_message 'OK- Node load average is normal'
+            command_name check_node_loadavg
+          }
+
+          define command {
+            command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_cpu_util_90percent' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Node CPU utilization has been more than 90% for the pash hour' --ok_message 'OK- Node cpu utilization is normal'
+            command_name check_node_cpu_util
+          }
+
+          define command {
+            command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_network_conntrack_usage_80percent' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Node network connections are more than 90% in use' --ok_message 'OK- Network connection utilization is normal'
+            command_name check_network_connections
+          }
+
+          define command {
+            command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_memory_load' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Node memory usage is more than 85%' --ok_message 'OK- Node memory usage is less than 85%'
+            command_name check_memory_usage
+          }
+
+          define command {
+            command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_disk_write_latency' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Disk write latency is high' --ok_message 'OK- Node disk write latency is normal'
+            command_name check_disk_write_latency
+          }
+
+          define command {
+            command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_disk_read_latency' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Disk read latency is high' --ok_message 'OK- Node disk read latency is normal'
+            command_name check_disk_read_latency
+          }
+
+          define command {
+            command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_entropy_available_low' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- System has low entropy availability' --ok_message 'OK- System entropy availability is sufficient'
+            command_name check_entropy_availability
+          }
+
+          define command {
+            command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_filedescriptors_full_in_3h' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- at current consumption rate no free file descriptors will be available in 3hrs.' --ok_message 'OK- System file descriptor consumption is ok.'
+            command_name check_filedescriptor_usage_rate
+          }
+
+          define command {
+            command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_hwmon_high_cpu_temp' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- CPU temperature is 90 percent of critical temperature.' --ok_message 'OK- CPU temperatures are normal.'
+            command_name check_hwmon_high_cpu_temp
+          }
+
+          define command {
+            command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_network_drop_rcv' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusally high drop in network reception.' --ok_message 'OK- network packet receive drops not high.'
+            command_name check_network_receive_drop_high
+          }
+
+          define command {
+            command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_network_drop_send' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusally high drop in network transmission.' --ok_message 'OK- network packet tramsmit drops not high.'
+            command_name check_network_transmit_drop_high
+          }
+
+          define command {
+            command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_network_drop_send' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusally high error rate in network reception.' --ok_message 'OK- network reception errors not high.'
+            command_name check_network_receive_errors_high
+          }
+
+          define command {
+            command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_network_drop_send' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusally high error rate in network transmission.' --ok_message 'OK- network transmission errors not high.'
+            command_name check_network_transmit_errors_high
+          }
+
+          define command {
+            command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_vmstat_paging_rate_high' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Memory paging rate over 5 minutes is high.' --ok_message 'OK- Memory paging rate over 5 minutes is ok.'
+            command_name check_vmstat_paging_rate
+          }
+
+          define command {
+            command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_xfs_block_allocation_high' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- XFS block allocation is more than 80 percent of available.' --ok_message 'OK- XFS block allocation is less than 80 percent of available.'
+            command_name check_xfs_block_allocation
+          }
+
+          define command {
+            command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_network_bond_slaves_down' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- {master} is missing slave interfaces.' --ok_message 'OK- Network bonds have slave interfaces functional.'
+            command_name check_network_bond_status
+          }
+
+          define command {
+            command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_numa_memory_used' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- NUMA memory usage is more than 80 percent of available.' --ok_message 'OK- NUMA memory usage is normal.'
+            command_name check_numa_memory_usage
+          }
+
+          define command {
+            command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_ntp_clock_skew_high' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- NTP clock skew is more than 2 seconds.' --ok_message 'OK- NTP clock skew is less than 2 seconds.'
+            command_name check_ntp_sync
           }
 
           define service {
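Services inside the node template invoke these commands through check_command; a minimal sketch of the pairing (the hostgroup and service_description here are illustrative, not quoted from the commit):

    define service {
      check_command check_memory_usage
      hostgroup_name base-os
      service_description Memory_usage
      use notifying_service
    }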
@@ -1404,6 +967,67 @@ conf:
             service_description NTP_sync
             use notifying_service
           }
+      ceph:
+        template: |
+          define service {
+            check_command check_prom_alert!prom_exporter_ceph_unavailable!CRITICAL- CEPH exporter is not collecting metrics for alerting!OK- CEPH exporter metrics are available.
+            hostgroup_name prometheus-hosts
+            service_description Prometheus-exporter_CEPH
+            use generic-service
+          }
+
+          define command {
+            command_line $USER1$/check_exporter_health_metric.py --exporter_api $USER10$ --health_metric ceph_health_status --critical 2 --warning 1
+            command_name check_ceph_health
+          }
+
+          define service {
+            check_command check_ceph_health
+            check_interval 300
+            hostgroup_name base-os
+            service_description CEPH_health
+            use notifying_service
+          }
+
+          define service {
+            check_command check_prom_alert!ceph_monitor_quorum_low!CRITICAL- ceph monitor quorum does not exist!OK- ceph monitor quorum exists
+            check_interval 60
+            hostgroup_name prometheus-hosts
+            service_description CEPH_quorum
+            use notifying_service
+          }
+
+          define service {
+            check_command check_prom_alert!ceph_cluster_usage_high!CRITICAL- ceph cluster storage is more than 80 percent!OK- ceph storage is less than 80 percent
+            check_interval 60
+            hostgroup_name prometheus-hosts
+            service_description CEPH_storage-usage
+            use notifying_service
+          }
+
+          define service {
+            check_command check_prom_alert!ceph_placement_group_degrade_pct_high!CRITICAL- ceph cluster PGs down are more than 80 percent!OK- ceph PG degradation is less than 80 percent
+            check_interval 60
+            hostgroup_name prometheus-hosts
+            service_description CEPH_PGs-degradation
+            use notifying_service
+          }
+
+          define service {
+            check_command check_prom_alert!ceph_osd_down!CRITICAL- One or more CEPH OSDs are down for more than 5 minutes!OK- All the CEPH OSDs are up
+            check_interval 60
+            hostgroup_name prometheus-hosts
+            service_description CEPH_OSDs-down
+            use notifying_service
+          }
+
+          define service {
+            check_command check_prom_alert!ceph_monitor_clock_skew_high!CRITICAL- CEPH clock skew is more than 2 seconds!OK- CEPH clock skew is less than 2 seconds
+            check_interval 60
+            hostgroup_name prometheus-hosts
+            service_description CEPH_Clock-skew
+            use notifying_service
+          }
     nagios:
       template: |
         accept_passive_host_checks=1
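Taken together, the values tree now carries one template per object class, each rendered to its own <key>.cfg, so overrides can add or replace keys without touching the defaults. In outline (template bodies elided; the placeholder comments summarize the content shown in the hunks above):

    conf:
      nagios:
        objects:
          base:
            template: |
              # host, hostgroup, contact, command and core service definitions
          kubernetes:
            template: |
              # calico / kube-state-metrics exporter availability checks
          node:
            template: |
              # node-exporter alert commands and their services
          ceph:
            template: |
              # ceph exporter check, check_ceph_health command, ceph services
        nagios:
          template: |
            # the main nagios.cfg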
@@ -1416,7 +1040,10 @@ conf:
         bare_update_check=0
         cached_host_check_horizon=15
         cached_service_check_horizon=15
-        cfg_file=/opt/nagios/etc/nagios_objects.cfg
+        {{- $objectKeys := keys .Values.conf.nagios.objects -}}
+        {{- range $object := $objectKeys }}
+        cfg_file=/opt/nagios/etc/{{$object}}.cfg
+        {{- end }}
         cfg_file=/opt/nagios/etc/objects/commands.cfg
         cfg_file=/opt/nagios/etc/objects/contacts.cfg
        cfg_file=/opt/nagios/etc/objects/timeperiods.cfg
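For the four default object keys, the rendered nagios.cfg would then pick up one cfg_file entry per key alongside the static entries, roughly as follows (a sketch of the rendered output; sprig's keys does not guarantee ordering):

    cfg_file=/opt/nagios/etc/base.cfg
    cfg_file=/opt/nagios/etc/ceph.cfg
    cfg_file=/opt/nagios/etc/kubernetes.cfg
    cfg_file=/opt/nagios/etc/node.cfg
    cfg_file=/opt/nagios/etc/objects/commands.cfg
    cfg_file=/opt/nagios/etc/objects/contacts.cfg
    cfg_file=/opt/nagios/etc/objects/timeperiods.cfg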
nagios/values_overrides/elasticsearch-objects.yaml (new file, 93 lines)
@@ -0,0 +1,93 @@
+conf:
+  nagios:
+    objects:
+      fluent:
+        template: |
+          define service {
+            check_command check_prom_alert!fluentd_not_running!CRITICAL- fluentd is not running on {instance}!OK- Flunetd is working on all nodes
+            check_interval 60
+            hostgroup_name prometheus-hosts
+            service_description Fluentd_status
+            use notifying_service
+          }
+
+          define service {
+            check_command check_prom_alert!prom_exporter_fluentd_unavailable!CRITICAL- Fluentd exporter is not collecting metrics for alerting!OK- Fluentd exporter metrics are available.
+            hostgroup_name prometheus-hosts
+            service_description Prometheus-exporter_Fluentd
+            use generic-service
+          }
+      elasticsearch:
+        template: |
+          define command {
+            command_line $USER1$/query_elasticsearch.py $USER9$ '$ARG1$' '$ARG2$' '$ARG3$' '$ARG4$' '$ARG5$' --simple_query '$ARG6$' --simple_query_fields '$ARG7$' --match '$ARG8$' --range '$ARG9$'
+            command_name check_es_query
+          }
+
+          define command {
+            command_line $USER1$/query_elasticsearch.py $USER9$ '$ARG1$' '$ARG2$' '$ARG3$' '$ARG4$' '$ARG5$' --simple_query '$ARG6$' --simple_query_fields '$ARG7$' --query_file '/opt/nagios/etc/objects/query_es_clauses.json' --query_clause '$ARG8$' --match '$ARG9$' --range '$ARG10$'
+            command_name check_es_query_w_file
+          }
+
+          define service {
+            check_command check_prom_alert!prom_exporter_elasticsearch_unavailable!CRITICAL- Elasticsearch exporter is not collecting metrics for alerting!OK- Elasticsearch exporter metrics are available.
+            hostgroup_name prometheus-hosts
+            service_description Prometheus-exporter_Elasticsearch
+            use generic-service
+          }
+
+          define service {
+            check_command check_prom_alert!es_high_process_open_files_count!CRITICAL- Elasticsearch {host} has high process open file count!OK- Elasticsearch process open file count is normal.
+            hostgroup_name prometheus-hosts
+            service_description ES_high-process-open-file-count
+            use generic-service
+          }
+
+          define service {
+            check_command check_prom_alert!es_high_process_cpu_percent!CRITICAL- Elasticsearch {instance} has high process CPU percent!OK- Elasticsearch process cpu usage is normal.
+            hostgroup_name prometheus-hosts
+            service_description ES_high-process-cpu-percent
+            use generic-service
+          }
+
+          define service {
+            check_command check_prom_alert!es_fs_usage_high!CRITICAL- Elasticsearch {instance} has high filesystem usage!OK- Elasticsearch filesystem usage is normal.
+            hostgroup_name prometheus-hosts
+            service_description ES_high-filesystem-usage
+            use generic-service
+          }
+
+          define service {
+            check_command check_prom_alert!es_unassigned_shards!CRITICAL- Elasticsearch has unassinged shards!OK- Elasticsearch has no unassigned shards.
+            hostgroup_name prometheus-hosts
+            service_description ES_unassigned-shards
+            use generic-service
+          }
+
+          define service {
+            check_command check_prom_alert!es_cluster_health_timed_out!CRITICAL- Elasticsearch Cluster health status call timedout!OK- Elasticsearch cluster health is retrievable.
+            hostgroup_name prometheus-hosts
+            service_description ES_cluster-health-timedout
+            use generic-service
+          }
+
+          define service {
+            check_command check_prom_alert!es_cluster_health_status_alert!CRITICAL- Elasticsearch cluster health status is not green. One or more shards or replicas are unallocated!OK- Elasticsearch cluster health is green.
+            hostgroup_name prometheus-hosts
+            service_description ES_cluster-health-status
+            use generic-service
+          }
+
+          define service {
+            check_command check_prom_alert!es_cluster_health_too_few_nodes_running!CRITICAL- Elasticsearch Cluster has < 3 nodes running!OK- Elasticsearch cluster has 3 or more nodes running.
+            hostgroup_name prometheus-hosts
+            service_description ES_cluster-running-node-count
+            use generic-service
+          }
+
+          define service {
+            check_command check_prom_alert!es_cluster_health_too_few_data_nodes_running!CRITICAL- Elasticsearch Cluster has < 3 data nodes running!OK- Elasticsearch cluster has 3 or more data nodes running.
+            hostgroup_name prometheus-hosts
+            service_description ES_cluster-running-data-node-count
+            use generic-service
+          }
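Applying one of these override files is plain Helm usage; for example (the release name and chart path are illustrative):

    helm upgrade --install nagios ./nagios \
      --values nagios/values_overrides/elasticsearch-objects.yaml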
270
nagios/values_overrides/openstack-objects.yaml
Normal file
270
nagios/values_overrides/openstack-objects.yaml
Normal file
@ -0,0 +1,270 @@
conf:
  nagios:
    objects:
      mariadb:
        template: |
          define service {
            check_command check_prom_alert!prom_exporter_mariadb_unavailable!CRITICAL- MariaDB exporter is not collecting metrics for alerting!OK- MariaDB exporter metrics are available.
            hostgroup_name prometheus-hosts
            service_description Prometheus-exporter_MariaDB
            use generic-service
          }

          define service {
            check_command check_prom_alert!mariadb_table_lock_wait_high!CRITICAL- MariaDB has a high number of table lock waits!OK- No issues found with table lock waits.
            hostgroup_name prometheus-hosts
            service_description Mariadb_table-lock-waits-high
            use generic-service
          }

          define service {
            check_command check_prom_alert!mariadb_node_not_ready!CRITICAL- MariaDB {instance} is not ready!OK- All galera cluster nodes are ready.
            hostgroup_name prometheus-hosts
            service_description Mariadb_node-ready
            use generic-service
          }

          define service {
            check_command check_prom_alert!mariadb_galera_node_out_of_sync!CRITICAL- MariaDB {instance} is out of sync!OK- All galera cluster nodes are in sync
            hostgroup_name prometheus-hosts
            service_description Mariadb_node-synchronized
            use generic-service
          }

          define service {
            check_command check_prom_alert!mariadb_innodb_replication_fallen_behind!CRITICAL- InnoDB replication has fallen behind and is not recovering!OK- InnoDB replication lag is nominal.
            hostgroup_name prometheus-hosts
            service_description Mariadb_innodb-replication-lag
            use generic-service
          }
      rabbitmq:
        template: |
          define service {
            check_command check_prom_alert!rabbitmq_network_pratitions_detected!CRITICAL- RabbitMQ instance {instance} has network partitions!OK- No network partitions detected in RabbitMQ
            hostgroup_name prometheus-hosts
            service_description Rabbitmq_network-partitions-exist
            use generic-service
          }

          define service {
            check_command check_prom_alert!rabbitmq_down!CRITICAL- RabbitMQ instance {instance} is down!OK- RabbitMQ is available
            hostgroup_name prometheus-hosts
            service_description Rabbitmq_up
            use generic-service
          }

          define service {
            check_command check_prom_alert!rabbitmq_file_descriptor_usage_high!CRITICAL- RabbitMQ instance {instance} has file descriptor usage above 80 percent!OK- RabbitMQ file descriptor usage is normal
            hostgroup_name prometheus-hosts
            service_description Rabbitmq_file-descriptor-usage
            use generic-service
          }

          define service {
            check_command check_prom_alert!rabbitmq_node_disk_free_alarm!CRITICAL- RabbitMQ instance {instance} has a disk usage alarm!OK- RabbitMQ node disk has no alarms
            hostgroup_name prometheus-hosts
            service_description Rabbitmq_node-disk-alarm
            use generic-service
          }

          define service {
            check_command check_prom_alert!rabbitmq_node_memory_alarm!CRITICAL- RabbitMQ instance {instance} has a memory alarm!OK- RabbitMQ node memory has no alarms
            hostgroup_name prometheus-hosts
            service_description Rabbitmq_node-memory-alarm
            use generic-service
          }

          define service {
            check_command check_prom_alert!rabbitmq_less_than_3_nodes!CRITICAL- RabbitMQ has fewer than 3 nodes to serve!OK- RabbitMQ has at least 3 nodes serving
            hostgroup_name prometheus-hosts
            service_description Rabbitmq_high-availability
            use generic-service
          }

          define service {
            check_command check_prom_alert!rabbitmq_queue_messages_returned_high!CRITICAL- RabbitMQ has a high percentage of messages being returned!OK- RabbitMQ messages are consumed and few or no returns exist.
            hostgroup_name prometheus-hosts
            service_description Rabbitmq_message-return-percent
            use generic-service
          }

          define service {
            check_command check_prom_alert!rabbitmq_consumers_low_utilization!CRITICAL- RabbitMQ consumer message consumption rate is slow!OK- RabbitMQ message consumption speed is normal
            hostgroup_name prometheus-hosts
            service_description Rabbitmq_consumer-utilization
            use generic-service
          }

          define service {
            check_command check_prom_alert!rabbitmq_high_message_load!CRITICAL- RabbitMQ unacknowledged message count is high!OK- RabbitMQ unacknowledged message count is normal
            hostgroup_name prometheus-hosts
            service_description Rabbitmq_rabbitmq-queue-health
            use generic-service
          }
      openstack:
        template: |
          define service {
            check_command check_prom_alert!os_glance_api_availability!CRITICAL- Glance API at {url} is not available!OK- Glance API is available
            check_interval 60
            hostgroup_name prometheus-hosts
            service_description API_glance
            use notifying_service
          }

          define service {
            check_command check_prom_alert!os_nova_api_availability!CRITICAL- Nova API at {url} is not available!OK- Nova API is available
            check_interval 60
            hostgroup_name prometheus-hosts
            service_description API_nova
            use notifying_service
          }

          define service {
            check_command check_prom_alert!os_keystone_api_availability!CRITICAL- Keystone API at {url} is not available!OK- Keystone API is available
            check_interval 60
            hostgroup_name prometheus-hosts
            service_description API_keystone
            use notifying_service
          }

          define service {
            check_command check_prom_alert!os_neutron_api_availability!CRITICAL- Neutron API at {url} is not available!OK- Neutron API is available
            check_interval 60
            hostgroup_name prometheus-hosts
            service_description API_neutron
            use notifying_service
          }

          define service {
            check_command check_prom_alert!os_neutron_metadata_agent_availability!CRITICAL- Some Neutron metadata agents are not available!OK- All the neutron metadata agents are up
            check_interval 60
            hostgroup_name prometheus-hosts
            service_description Service_neutron-metadata-agent
            use notifying_service
          }

          define service {
            check_command check_prom_alert!os_neutron_openvswitch_agent_availability!CRITICAL- Some Neutron openvswitch agents are not available!OK- All the neutron openvswitch agents are up
            check_interval 60
            hostgroup_name prometheus-hosts
            service_description Service_neutron-openvswitch-agent
            use notifying_service
          }

          define service {
            check_command check_prom_alert!os_neutron_dhcp_agent_availability!CRITICAL- Some Neutron dhcp agents are not available!OK- All the neutron dhcp agents are up
            check_interval 60
            hostgroup_name prometheus-hosts
            service_description Service_neutron-dhcp-agent
            use notifying_service
          }

          define service {
            check_command check_prom_alert!os_neutron_l3_agent_availability!CRITICAL- Some Neutron L3 agents are not available!OK- All the neutron l3 agents are up
            check_interval 60
            hostgroup_name prometheus-hosts
            service_description Service_neutron-l3-agent
            use notifying_service
          }

          define service {
            check_command check_prom_alert!os_swift_api_availability!CRITICAL- Swift API at {url} is not available!OK- Swift API is available
            check_interval 60
            hostgroup_name prometheus-hosts
            service_description API_swift
            use notifying_service
          }

          define service {
            check_command check_prom_alert!os_cinder_api_availability!CRITICAL- Cinder API at {url} is not available!OK- Cinder API is available
            check_interval 60
            hostgroup_name prometheus-hosts
            service_description API_cinder
            use notifying_service
          }

          define service {
            check_command check_prom_alert!os_heat_api_availability!CRITICAL- Heat API at {url} is not available!OK- Heat API is available
            check_interval 60
            hostgroup_name prometheus-hosts
            service_description API_heat
            use notifying_service
          }

          define service {
            check_command check_prom_alert!os_cinder_scheduler_availability!CRITICAL- Cinder scheduler is not available!OK- Cinder scheduler is available
            check_interval 60
            hostgroup_name prometheus-hosts
            service_description Service_cinder-scheduler
            use notifying_service
          }

          define service {
            check_command check_prom_alert!os_nova_compute_down!CRITICAL- nova-compute services are down on certain hosts!OK- nova-compute services are up on all hosts
            check_interval 60
            hostgroup_name prometheus-hosts
            service_description Service_nova-compute
            use notifying_service
          }

          define service {
            check_command check_prom_alert!os_nova_conductor_down!CRITICAL- nova-conductor services are down on certain hosts!OK- nova-conductor services are up on all hosts
            check_interval 60
            hostgroup_name prometheus-hosts
            service_description Service_nova-conductor
            use notifying_service
          }

          define service {
            check_command check_prom_alert!os_nova_consoleauth_down!CRITICAL- nova-consoleauth services are down on certain hosts!OK- nova-consoleauth services are up on all hosts
            check_interval 60
            hostgroup_name prometheus-hosts
            service_description Service_nova-consoleauth
            use notifying_service
          }

          define service {
            check_command check_prom_alert!openstack_nova_scheduler_down!CRITICAL- nova-scheduler services are down on certain hosts!OK- nova-scheduler services are up on all hosts
            check_interval 60
            hostgroup_name prometheus-hosts
            service_description Service_nova-scheduler
            use notifying_service
          }

          define service {
            check_command check_prom_alert!os_vm_vcpu_usage_high!CRITICAL- vcpu usage for OpenStack VMs is more than 80 percent of available!OK- OpenStack VMs vcpu usage is less than 80 percent of available.
            check_interval 60
            hostgroup_name prometheus-hosts
            service_description OS-Total-Quota_VCPU-usage
            use notifying_service
          }

          define service {
            check_command check_prom_alert!os_vm_ram_usage_high!CRITICAL- RAM usage for OpenStack VMs is more than 80 percent of available!OK- OpenStack VMs RAM usage is less than 80 percent of available.
            check_interval 60
            hostgroup_name prometheus-hosts
            service_description OS-Total-Quota_RAM-usage
            use notifying_service
          }

          define service {
            check_command check_prom_alert!os_vm_disk_usage_high!CRITICAL- Disk usage for OpenStack VMs is more than 80 percent of available!OK- OpenStack VMs Disk usage is less than 80 percent of available.
            check_interval 60
            hostgroup_name prometheus-hosts
            service_description OS-Total-Quota_Disk-usage
            use notifying_service
          }

          define service {
            check_command check_prom_alert!prom_exporter_openstack_unavailable!CRITICAL- OpenStack exporter is not collecting metrics for alerting!OK- OpenStack exporter metrics are available.
            hostgroup_name prometheus-hosts
            service_description Prometheus-exporter_Openstack
            use generic-service
          }
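The overrides above all follow one convention: each key under conf.nagios.objects becomes its own object definition file, and each check_prom_alert invocation packs four !-separated arguments (plugin, Prometheus alert name, CRITICAL message, OK message). A minimal sketch of extending this with a site-specific object type follows; the ceph key and the prom_exporter_ceph_unavailable alert name are hypothetical illustrations, not part of this change, and the alert name would need to match a real Prometheus alerting rule in the target deployment.

# Hypothetical override sketch: a new key under conf.nagios.objects is
# rendered to its own object file (here, ceph.cfg) alongside the others.
conf:
  nagios:
    objects:
      ceph:
        template: |
          define service {
            check_command check_prom_alert!prom_exporter_ceph_unavailable!CRITICAL- Ceph exporter is not collecting metrics!OK- Ceph exporter metrics are available.
            hostgroup_name prometheus-hosts
            service_description Prometheus-exporter_Ceph
            use generic-service
          }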
nagios/values_overrides/postgresql-objects.yaml
Normal file
@ -0,0 +1,32 @@
conf:
  nagios:
    objects:
      postgresql:
        template: |
          define service {
            check_command check_prom_alert!prom_exporter_postgresql_unavailable!CRITICAL- PostgreSQL exporter is not collecting metrics for alerting!OK- PostgreSQL exporter metrics are available.
            hostgroup_name prometheus-hosts
            service_description Prometheus-exporter_Postgresql
            use generic-service
          }

          define service {
            check_command check_prom_alert!pg_replication_fallen_behind!CRITICAL- Postgres replication lag is over 2 minutes!OK- PostgreSQL replication lag is nominal.
            hostgroup_name prometheus-hosts
            service_description Postgresql_replication-lag
            use generic-service
          }

          define service {
            check_command check_prom_alert!pg_connections_too_high!CRITICAL- Postgres has more than 95% of available connections in use.!OK- PostgreSQL open connections are within bounds.
            hostgroup_name prometheus-hosts
            service_description Postgresql_connections
            use generic-service
          }

          define service {
            check_command check_prom_alert!pg_deadlocks_detected!CRITICAL- Postgres server is experiencing deadlocks!OK- PostgreSQL is not showing any deadlocks.
            hostgroup_name prometheus-hosts
            service_description Postgresql_deadlocks
            use generic-service
          }
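Because each objects key is rendered to a separate file, a quick way to confirm an override took effect is to list the rendered .cfg files inside the running pod. A sketch, reusing the namespace and pod labels from the deploy script that follows; the assumption that the per-type files land directly under /opt/nagios/etc/ (e.g. postgresql.cfg) mirrors where the chart mounts its other rendered configs, but is not verified by this script.

# Sketch: spot-check that per-type object files were rendered and mounted.
NAGIOS_POD=$(kubectl -n osh-infra get pods \
  -l='application=nagios,component=monitoring' \
  --output=jsonpath='{.items[0].metadata.name}')
kubectl exec $NAGIOS_POD -n osh-infra -c nagios -- ls /opt/nagios/etc/
kubectl exec $NAGIOS_POD -n osh-infra -c nagios -- cat /opt/nagios/etc/postgresql.cfg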
tools/deployment/common/nagios.sh
Executable file
@ -0,0 +1,45 @@
#!/bin/bash

# Copyright 2017 The Openstack-Helm Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

set -xe

#NOTE: Lint and package chart
make nagios

#NOTE: Deploy command
tee /tmp/nagios.yaml << EOF
conf:
  nagios:
    query_es_clauses:
      test_es_query:
        hello: world
EOF
helm upgrade --install nagios ./nagios \
  --namespace=osh-infra \
  --values=/tmp/nagios.yaml \
  --values=nagios/values_overrides/openstack-objects.yaml \
  --values=nagios/values_overrides/postgresql-objects.yaml \
  --values=nagios/values_overrides/elasticsearch-objects.yaml

#NOTE: Wait for deploy
./tools/deployment/common/wait-for-pods.sh osh-infra

#NOTE: Validate Deployment info
helm status nagios

#NOTE: Verify elasticsearch query clauses are functional by execing into pod
NAGIOS_POD=$(kubectl -n osh-infra get pods -l='application=nagios,component=monitoring' --output=jsonpath='{.items[0].metadata.name}')
kubectl exec $NAGIOS_POD -n osh-infra -c nagios -- cat /opt/nagios/etc/objects/query_es_clauses.json | python -m json.tool
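Beyond checking that the rendered files exist, Nagios itself can validate the combined object configuration before reload; nagios -v is the standard pre-flight verification mode and exits non-zero on malformed or duplicate object definitions. A sketch, reusing $NAGIOS_POD from the script above; the assumption that the nagios binary is on PATH in the container and that the main config lives at /opt/nagios/etc/nagios.cfg reflects the image layout, not anything asserted by this change.

# Sketch: have Nagios verify the full config, including override object files.
kubectl exec $NAGIOS_POD -n osh-infra -c nagios -- \
  nagios -v /opt/nagios/etc/nagios.cfg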
@ -1,32 +0,0 @@
#!/bin/bash

# Copyright 2017 The Openstack-Helm Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

set -xe

#NOTE: Lint and package chart
make nagios

#NOTE: Deploy command
helm upgrade --install nagios ./nagios \
  --namespace=osh-infra

#NOTE: Wait for deploy
./tools/deployment/common/wait-for-pods.sh osh-infra

#NOTE: Validate Deployment info
helm status nagios

helm test nagios
tools/deployment/osh-infra-monitoring/120-nagios.sh
Symbolic link
@ -0,0 +1 @@
../common/nagios.sh