87ff958fb8
This updates the Prometheus pod container status alerts. This ensures there are alerts defined for ImagePullBackOff, ErrImagePull, and CreateContainerConfigError errors. This also updates the Nagios service checks to include correct checks for those alerts Change-Id: I91544e7dff8c6aac8c79cd8aa7d8f7bc03adaa9a
1213 lines
61 KiB
YAML
1213 lines
61 KiB
YAML
# Copyright 2017 The Openstack-Helm Authors.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
# Default values for nagios.
|
|
# This is a YAML-formatted file.
|
|
# Declare variables to be passed into your templates.
|
|
|
|
images:
|
|
tags:
|
|
apache_proxy: docker.io/httpd:2.4
|
|
nagios: quay.io/attcomdev/nagios:410fcb08d2586e98e18ced317dab4157eb27456e
|
|
dep_check: quay.io/stackanetes/kubernetes-entrypoint:v0.2.1
|
|
image_repo_sync: docker.io/docker:17.07.0
|
|
pull_policy: IfNotPresent
|
|
local_registry:
|
|
active: false
|
|
exclude:
|
|
- dep_check
|
|
- image_repo_sync
|
|
|
|
labels:
|
|
nagios:
|
|
node_selector_key: openstack-control-plane
|
|
node_selector_value: enabled
|
|
job:
|
|
node_selector_key: openstack-control-plane
|
|
node_selector_value: enabled
|
|
|
|
dependencies:
|
|
dynamic:
|
|
common:
|
|
jobs:
|
|
- nagios-image-repo-sync
|
|
services:
|
|
- service: local_image_registry
|
|
endpoint: node
|
|
static:
|
|
image_repo_sync:
|
|
services:
|
|
- service: local_image_registry
|
|
endpoint: internal
|
|
nagios:
|
|
services: null
|
|
|
|
secrets:
|
|
nagios:
|
|
admin: nagios-admin-creds
|
|
tls:
|
|
nagios:
|
|
nagios:
|
|
public: nagios-tls-public
|
|
|
|
endpoints:
|
|
cluster_domain_suffix: cluster.local
|
|
local_image_registry:
|
|
name: docker-registry
|
|
namespace: docker-registry
|
|
hosts:
|
|
default: localhost
|
|
internal: docker-registry
|
|
node: localhost
|
|
host_fqdn_override:
|
|
default: null
|
|
port:
|
|
registry:
|
|
node: 5000
|
|
monitoring:
|
|
name: prometheus
|
|
auth:
|
|
admin:
|
|
username: admin
|
|
password: changeme
|
|
hosts:
|
|
default: prom-metrics
|
|
public: prometheus
|
|
host_fqdn_override:
|
|
default: null
|
|
path:
|
|
default: null
|
|
scheme:
|
|
default: http
|
|
port:
|
|
http:
|
|
default: 80
|
|
nagios:
|
|
name: nagios
|
|
namespace: null
|
|
auth:
|
|
admin:
|
|
username: nagiosadmin
|
|
password: password
|
|
hosts:
|
|
default: nagios-metrics
|
|
public: nagios
|
|
host_fqdn_override:
|
|
default: null
|
|
# NOTE(srwilkers): this chart supports TLS for fqdn over-ridden public
|
|
# endpoints using the following format:
|
|
# public:
|
|
# host: null
|
|
# tls:
|
|
# crt: null
|
|
# key: null
|
|
path:
|
|
default: null
|
|
scheme:
|
|
default: http
|
|
port:
|
|
nagios:
|
|
default: 8000
|
|
http:
|
|
default: 80
|
|
ldap:
|
|
hosts:
|
|
default: ldap
|
|
auth:
|
|
admin:
|
|
bind: "cn=admin,dc=cluster,dc=local"
|
|
password: password
|
|
host_fqdn_override:
|
|
default: null
|
|
path:
|
|
default: "/ou=People,dc=cluster,dc=local"
|
|
scheme:
|
|
default: ldap
|
|
port:
|
|
ldap:
|
|
default: 389
|
|
elasticsearch:
|
|
name: elasticsearch
|
|
namespace: null
|
|
auth:
|
|
admin:
|
|
username: admin
|
|
password: changeme
|
|
hosts:
|
|
default: elasticsearch-logging
|
|
host_fqdn_override:
|
|
default: null
|
|
path:
|
|
default: /
|
|
scheme:
|
|
default: http
|
|
port:
|
|
http:
|
|
default: 80
|
|
ceph_mgr:
|
|
namespace: null
|
|
hosts:
|
|
default: ceph-mgr
|
|
host_fqdn_override:
|
|
default: null
|
|
port:
|
|
mgr:
|
|
default: 7000
|
|
metrics:
|
|
default: 9283
|
|
scheme:
|
|
default: http
|
|
|
|
network:
|
|
nagios:
|
|
ingress:
|
|
public: true
|
|
classes:
|
|
namespace: "nginx"
|
|
cluster: "nginx-cluster"
|
|
annotations:
|
|
nginx.ingress.kubernetes.io/rewrite-target: /
|
|
nginx.ingress.kubernetes.io/affinity: cookie
|
|
nginx.ingress.kubernetes.io/session-cookie-name: kube-ingress-session-nagios
|
|
nginx.ingress.kubernetes.io/session-cookie-hash: sha1
|
|
node_port:
|
|
enabled: false
|
|
port: 30925
|
|
|
|
pod:
|
|
lifecycle:
|
|
upgrades:
|
|
revision_history: 3
|
|
pod_replacement_strategy: RollingUpdate
|
|
rolling_update:
|
|
max_unavailable: 1
|
|
max_surge: 3
|
|
termination_grace_period:
|
|
nagios:
|
|
timeout: 30
|
|
replicas:
|
|
nagios: 1
|
|
resources:
|
|
enabled: false
|
|
nagios:
|
|
limits:
|
|
memory: "1024Mi"
|
|
cpu: "2000m"
|
|
requests:
|
|
memory: "128Mi"
|
|
cpu: "100m"
|
|
apache_proxy:
|
|
limits:
|
|
memory: "1024Mi"
|
|
cpu: "2000m"
|
|
requests:
|
|
memory: "128Mi"
|
|
cpu: "100m"
|
|
jobs:
|
|
image_repo_sync:
|
|
limits:
|
|
memory: "1024Mi"
|
|
cpu: "2000m"
|
|
requests:
|
|
memory: "128Mi"
|
|
cpu: "100m"
|
|
|
|
manifests:
|
|
configmap_bin: true
|
|
configmap_etc: true
|
|
deployment: true
|
|
ingress: true
|
|
job_image_repo_sync: true
|
|
network_policy: false
|
|
secret_nagios: true
|
|
secret_ingress_tls: true
|
|
service: true
|
|
service_ingress: true
|
|
|
|
conf:
|
|
httpd: |
|
|
ServerRoot "/usr/local/apache2"
|
|
|
|
Listen 80
|
|
|
|
LoadModule mpm_event_module modules/mod_mpm_event.so
|
|
LoadModule authn_file_module modules/mod_authn_file.so
|
|
LoadModule authn_core_module modules/mod_authn_core.so
|
|
LoadModule authz_host_module modules/mod_authz_host.so
|
|
LoadModule authz_groupfile_module modules/mod_authz_groupfile.so
|
|
LoadModule authz_user_module modules/mod_authz_user.so
|
|
LoadModule authz_core_module modules/mod_authz_core.so
|
|
LoadModule access_compat_module modules/mod_access_compat.so
|
|
LoadModule auth_basic_module modules/mod_auth_basic.so
|
|
LoadModule ldap_module modules/mod_ldap.so
|
|
LoadModule authnz_ldap_module modules/mod_authnz_ldap.so
|
|
LoadModule reqtimeout_module modules/mod_reqtimeout.so
|
|
LoadModule filter_module modules/mod_filter.so
|
|
LoadModule proxy_html_module modules/mod_proxy_html.so
|
|
LoadModule log_config_module modules/mod_log_config.so
|
|
LoadModule env_module modules/mod_env.so
|
|
LoadModule headers_module modules/mod_headers.so
|
|
LoadModule setenvif_module modules/mod_setenvif.so
|
|
LoadModule version_module modules/mod_version.so
|
|
LoadModule proxy_module modules/mod_proxy.so
|
|
LoadModule proxy_connect_module modules/mod_proxy_connect.so
|
|
LoadModule proxy_http_module modules/mod_proxy_http.so
|
|
LoadModule proxy_balancer_module modules/mod_proxy_balancer.so
|
|
LoadModule slotmem_shm_module modules/mod_slotmem_shm.so
|
|
LoadModule slotmem_plain_module modules/mod_slotmem_plain.so
|
|
LoadModule unixd_module modules/mod_unixd.so
|
|
LoadModule status_module modules/mod_status.so
|
|
LoadModule autoindex_module modules/mod_autoindex.so
|
|
|
|
<IfModule unixd_module>
|
|
User daemon
|
|
Group daemon
|
|
</IfModule>
|
|
|
|
<Directory />
|
|
AllowOverride none
|
|
Require all denied
|
|
</Directory>
|
|
|
|
<Files ".ht*">
|
|
Require all denied
|
|
</Files>
|
|
|
|
ErrorLog /dev/stderr
|
|
|
|
LogLevel warn
|
|
|
|
<IfModule log_config_module>
|
|
LogFormat "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"" combined
|
|
LogFormat "%h %l %u %t \"%r\" %>s %b" common
|
|
|
|
<IfModule logio_module>
|
|
LogFormat "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\" %I %O" combinedio
|
|
</IfModule>
|
|
|
|
CustomLog /dev/stdout common
|
|
|
|
CustomLog /dev/stdout combined
|
|
</IfModule>
|
|
|
|
<Directory "/usr/local/apache2/cgi-bin">
|
|
AllowOverride None
|
|
Options None
|
|
Require all granted
|
|
</Directory>
|
|
|
|
<IfModule headers_module>
|
|
RequestHeader unset Proxy early
|
|
</IfModule>
|
|
|
|
<IfModule proxy_html_module>
|
|
Include conf/extra/proxy-html.conf
|
|
</IfModule>
|
|
|
|
<VirtualHost *:80>
|
|
<Location />
|
|
ProxyPass http://localhost:{{ tuple "nagios" "internal" "nagios" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}/
|
|
ProxyPassReverse http://localhost:{{ tuple "nagios" "internal" "nagios" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}/
|
|
</Location>
|
|
<Proxy *>
|
|
AuthName "Nagios"
|
|
AuthType Basic
|
|
AuthBasicProvider file ldap
|
|
AuthUserFile /usr/local/apache2/conf/.htpasswd
|
|
AuthLDAPBindDN {{ .Values.endpoints.ldap.auth.admin.bind }}
|
|
AuthLDAPBindPassword {{ .Values.endpoints.ldap.auth.admin.password }}
|
|
AuthLDAPURL {{ tuple "ldap" "default" "ldap" . | include "helm-toolkit.endpoints.keystone_endpoint_uri_lookup" | quote }}
|
|
Require valid-user
|
|
</Proxy>
|
|
</VirtualHost>
|
|
nagios:
|
|
contacts:
|
|
- notifying_contact:
|
|
name: notifying_contact
|
|
contact_name: notifying_contact
|
|
alias: notifying contact
|
|
service_notification_period: 24x7
|
|
host_notification_period: 24x7
|
|
service_notification_options: w,u,c,r,f,s
|
|
host_notification_options: d,u,r,f,s
|
|
register: 0
|
|
- snmp_notifying_contact:
|
|
use: notifying_contact
|
|
name: snmp_notifying_contact
|
|
contact_name: snmp_notifying_contact
|
|
alias: snmp contact
|
|
service_notification_commands: send_service_snmp_trap
|
|
host_notification_commands: send_host_snmp_trap
|
|
- http_notifying_contact:
|
|
use: notifying_contact
|
|
name: http_notifying_contact
|
|
contact_name: http_notifying_contact
|
|
alias: HTTP contact
|
|
service_notification_commands: send_service_http_post
|
|
host_notification_commands: send_host_http_post
|
|
contactgroups:
|
|
- snmp_and_http_notifying_contact_group:
|
|
contactgroup_name: snmp_and_http_notifying_contact_group
|
|
alias: SNMP and HTTP notifying group
|
|
members: snmp_notifying_contact,http_notifying_contact
|
|
hosts:
|
|
- prometheus:
|
|
use: linux-server
|
|
host_name: prometheus
|
|
alias: "Prometheus Monitoring"
|
|
address: 127.0.0.1
|
|
hostgroups: prometheus-hosts
|
|
check_command: check-prometheus-host-alive
|
|
host_groups:
|
|
- prometheus-hosts:
|
|
hostgroup_name: prometheus-hosts
|
|
alias: "Prometheus Virtual Host"
|
|
- all:
|
|
hostgroup_name: all
|
|
alias: "all"
|
|
- base-os:
|
|
hostgroup_name: base-os
|
|
alias: "base-os"
|
|
commands:
|
|
- send_service_snmp_trap:
|
|
command_name: send_service_snmp_trap
|
|
command_line: "$USER1$/send_service_trap.sh '$USER8$' '$HOSTNAME$' '$SERVICEDESC$' $SERVICESTATEID$ '$SERVICEOUTPUT$' '$USER4$' '$USER5$'"
|
|
- send_host_snmp_trap:
|
|
command_name: send_host_snmp_trap
|
|
command_line: "$USER1$/send_host_trap.sh '$USER8$' '$HOSTNAME$' $HOSTSTATEID$ '$HOSTOUTPUT$' '$USER4$' '$USER5$'"
|
|
- send_service_http_post:
|
|
command_name: send_service_http_post
|
|
command_line: "$USER1$/send_http_post_event.py --type service --hostname '$HOSTNAME$' --servicedesc '$SERVICEDESC$' --state_id $SERVICESTATEID$ --output '$SERVICEOUTPUT$' --monitoring_hostname '$HOSTNAME$' --primary_url '$USER6$' --secondary_url '$USER7$'"
|
|
- send_host_http_post:
|
|
command_name: send_host_http_post
|
|
command_line: "$USER1$/send_http_post_event.py --type host --hostname '$HOSTNAME$' --state_id $HOSTSTATEID$ --output '$HOSTOUTPUT$' --monitoring_hostname '$HOSTNAME$' --primary_url '$USER6$' --secondary_url '$USER7$'"
|
|
- check_prometheus_host_alive:
|
|
command_name: check-prometheus-host-alive
|
|
command_line: "$USER1$/check_rest_get_api.py --url $USER2$ --warning_response_seconds 5 --critical_response_seconds 10"
|
|
- check_prom_alert_with_labels:
|
|
command_name: check_prom_alert_with_labels
|
|
command_line: "$USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname '$ARG1$' --labels_csv '$ARG2$' --msg_format '$ARG3$' --ok_message '$ARG4$'"
|
|
- check_prom_alert:
|
|
command_name: check_prom_alert
|
|
command_line: "$USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname '$ARG1$' --msg_format '$ARG2$' --ok_message '$ARG3$'"
|
|
- check_filespace_mounts-usage-rate-fullin4hrs:
|
|
command_name: check_filespace_mounts-usage-rate-fullin4hrs
|
|
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_filesystem_full_in_4h' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Mountpoint {mountpoint} will be full in four hours' --ok_message 'OK- All mountpoints usage rate is normal'
|
|
- check_filespace_mounts-usage:
|
|
command_name: check_filespace_mounts-usage
|
|
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_filesystem_full_80percent' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Mountpoint {mountpoint} is more than 80 pecent full' --ok_message 'OK- All mountpoints usage is normal'
|
|
- check_node_loadavg:
|
|
command_name: check_node_loadavg
|
|
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_load1_90percent' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Node load average has been more than 90% for the pash hour' --ok_message 'OK- Node load average is normal'
|
|
- check_node_cpu_util:
|
|
command_name: check_node_cpu_util
|
|
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_cpu_util_90percent' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Node CPU utilization has been more than 90% for the pash hour' --ok_message 'OK- Node cpu utilization is normal'
|
|
- check_network_connections:
|
|
command_name: check_network_connections
|
|
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_network_conntrack_usage_80percent' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Node network connections are more than 90% in use' --ok_message 'OK- Network connection utilization is normal'
|
|
- check_memory_usage:
|
|
command_name: check_memory_usage
|
|
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_memory_load' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Node memory usage is more than 85%' --ok_message 'OK- Node memory usage is less than 85%'
|
|
- check_disk_write_latency:
|
|
command_name: check_disk_write_latency
|
|
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_disk_write_latency' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Disk write latency is high' --ok_message 'OK- Node disk write latency is normal'
|
|
- check_disk_read_latency:
|
|
command_name: check_disk_read_latency
|
|
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_disk_read_latency' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Disk read latency is high' --ok_message 'OK- Node disk read latency is normal'
|
|
- check_entropy_availability:
|
|
command_name: check_entropy_availability
|
|
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_entropy_available_low' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- System has low entropy availability' --ok_message 'OK- System entropy availability is sufficient'
|
|
- check_filedescriptor_usage_rate:
|
|
command_name: check_filedescriptor_usage_rate
|
|
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_filedescriptors_full_in_3h' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- at current consumption rate no free file descriptors will be available in 3hrs.' --ok_message 'OK- System file descriptor consumption is ok.'
|
|
- check_hwmon_high_cpu_temp:
|
|
command_name: check_hwmon_high_cpu_temp
|
|
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_hwmon_high_cpu_temp' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- CPU temperature is 90 percent of critical temperature.' --ok_message 'OK- CPU temperatures are normal.'
|
|
- check_network_receive_drop_high:
|
|
command_name: check_network_receive_drop_high
|
|
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_network_drop_rcv' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusally high drop in network reception.' --ok_message 'OK- network packet receive drops not high.'
|
|
- check_network_transmit_drop_high:
|
|
command_name: check_network_transmit_drop_high
|
|
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_network_drop_send' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusally high drop in network transmission.' --ok_message 'OK- network packet tramsmit drops not high.'
|
|
- check_network_receive_errors_high:
|
|
command_name: check_network_receive_errors_high
|
|
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_network_drop_send' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusally high error rate in network reception.' --ok_message 'OK- network reception errors not high.'
|
|
- check_network_transmit_errors_high:
|
|
command_name: check_network_transmit_errors_high
|
|
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_network_drop_send' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusally high error rate in network transmission.' --ok_message 'OK- network transmission errors not high.'
|
|
- check_vmstat_paging_rate:
|
|
command_name: check_vmstat_paging_rate
|
|
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_vmstat_paging_rate_high' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Memory paging rate over 5 minutes is high.' --ok_message 'OK- Memory paging rate over 5 minutes is ok.'
|
|
- check_xfs_block_allocation:
|
|
command_name: check_xfs_block_allocation
|
|
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_xfs_block_allocation_high' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- XFS block allocation is more than 80 percent of available.' --ok_message 'OK- XFS block allocation is less than 80 percent of available.'
|
|
- check_network_bond_status:
|
|
command_name: check_network_bond_status
|
|
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_network_bond_slaves_down' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- {master} is missing slave interfaces.' --ok_message 'OK- Network bonds have slave interfaces functional.'
|
|
- check_numa_memory_usage:
|
|
command_name: check_numa_memory_usage
|
|
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_numa_memory_used' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- NUMA memory usage is more than 80 percent of available.' --ok_message 'OK- NUMA memory usage is normal.'
|
|
- check_ntp_sync:
|
|
command_name: check_ntp_sync
|
|
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_ntp_clock_skew_high' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- NTP clock skew is more than 2 seconds.' --ok_message 'OK- NTP clock skew is less than 2 seconds.'
|
|
- check_ceph_health:
|
|
command_name: check_ceph_health
|
|
command_line: $USER1$/check_exporter_health_metric.py --exporter_api $USER10$ --health_metric ceph_health_status --critical 2 --warning 1
|
|
- check_prometheus_hosts:
|
|
command_name: check_prometheus_hosts
|
|
command_line: $USER1$/check_update_prometheus_hosts.py --prometheus_api $USER2$ --object_file_loc /opt/nagios/etc/objects/prometheus_discovery_objects.cfg
|
|
- check_es_query:
|
|
command_name: check_es_query
|
|
command_line: $USER1$/query_elasticsearch.py $USER9$ '$ARG1$' '$ARG2$' '$ARG3$' '$ARG4$' '$ARG5$' --simple_query '$ARG6$' --simple_query_fields '$ARG7$' --match '$ARG8$' --range '$ARG9$'
|
|
- check_es_query_w_file:
|
|
command_name: check_es_query_w_file
|
|
command_line: $USER1$/query_elasticsearch.py $USER9$ '$ARG1$' '$ARG2$' '$ARG3$' '$ARG4$' '$ARG5$' --simple_query '$ARG6$' --simple_query_fields '$ARG7$' --query_file '/opt/nagios/etc/objects/query_es_clauses.json' --query_clause '$ARG8$' --match '$ARG9$' --range '$ARG10$'
|
|
services:
|
|
- notifying_service:
|
|
name: notifying_service
|
|
use: generic-service
|
|
flap_detection_enabled: 0
|
|
process_perf_data: 0
|
|
contact_groups: snmp_and_http_notifying_contact_group
|
|
check_interval: 60
|
|
notification_interval: 120
|
|
retry_interval: 30
|
|
register: 0
|
|
- check_ceph_health:
|
|
use: notifying_service
|
|
hostgroup_name: base-os
|
|
service_description: "CEPH_health"
|
|
check_command: check_ceph_health
|
|
check_interval: 300
|
|
- check_hosts_health:
|
|
use: generic-service
|
|
hostgroup_name: prometheus-hosts
|
|
service_description: "Nodes_health"
|
|
check_command: check_prom_alert!K8SNodesNotReady!CRITICAL- One or more nodes are not ready.
|
|
check_interval: 60
|
|
- check_prometheus_replicas:
|
|
use: notifying_service
|
|
hostgroup_name: prometheus-hosts
|
|
service_description: "Prometheus_replica-count"
|
|
check_command: check_prom_alert_with_labels!replicas_unavailable_statefulset!statefulset="prometheus"!statefulset {statefulset} has lesser than configured replicas
|
|
check_interval: 60
|
|
- check_alertmanager_replicas:
|
|
use: notifying_service
|
|
hostgroup_name: prometheus-hosts
|
|
service_description: "PrometheusAlertmanager_replica-count"
|
|
check_command: check_prom_alert_with_labels!replicas_unavailable_statefulset!statefulset="alertmanager"!statefulset {statefulset} has lesser than configured replicas
|
|
check_interval: 60
|
|
- check_statefulset_replicas:
|
|
use: notifying_service
|
|
hostgroup_name: prometheus-hosts
|
|
service_description: "Statefulset_replica-count"
|
|
check_command: check_prom_alert!replicas_unavailable_statefulset!CRITICAL- statefulset {statefulset} has lesser than configured replicas!OK- All statefulsets have configured amount of replicas
|
|
check_interval: 60
|
|
- check_daemonset_misscheduled:
|
|
use: notifying_service
|
|
hostgroup_name: prometheus-hosts
|
|
service_description: "Daemonset_misscheduled"
|
|
check_command: check_prom_alert!daemonsets_misscheduled!CRITICAL- Daemonset {daemonset} is incorrectly scheudled!OK- No daemonset misscheduling detected
|
|
check_interval: 60
|
|
- check_daemonset_not-scheduled:
|
|
use: notifying_service
|
|
hostgroup_name: prometheus-hosts
|
|
service_description: "Daemonset_not-scheduled"
|
|
check_command: check_prom_alert!daemonsets_not_scheduled!CRITICAL- Daemonset {daemonset} is missing to be scheduled in some nodes!OK- All daemonset scheduling is as desired
|
|
check_interval: 60
|
|
- check_daemonset_unavailable:
|
|
use: notifying_service
|
|
hostgroup_name: prometheus-hosts
|
|
service_description: "Daemonset_pods-unavailable"
|
|
check_command: check_prom_alert!daemonset_pods_unavailable!CRITICAL- Daemonset {daemonset} has pods unavailable!OK- All daemonset pods available
|
|
check_interval: 60
|
|
- check_deployment_replicas_unavailable:
|
|
use: notifying_service
|
|
hostgroup_name: prometheus-hosts
|
|
service_description: "Deployment_replicas-unavailable"
|
|
check_command: check_prom_alert!deployment_replicas_unavailable!CRITICAL- Deployment {deployment} has less than desired replicas!OK- All deployments have desired replicas
|
|
check_interval: 60
|
|
- check_volume_claim_high_utilization:
|
|
use: notifying_service
|
|
hostgroup_name: prometheus-hosts
|
|
service_description: "Volume_claim_high_utilization"
|
|
check_command: check_prom_alert!volume_claim_capacity_high_utilization!CRITICAL- Volume claim {persistentvolumeclaim} has exceed 80% utilization!OK- All volume claims less than 80% utilization
|
|
check_interval: 60
|
|
- check_deployment_rollingupdate_replicas_unavailable:
|
|
use: notifying_service
|
|
hostgroup_name: prometheus-hosts
|
|
service_description: "RollingUpdate_Deployment-replicas-unavailable"
|
|
check_command: check_prom_alert!rollingupdate_deployment_replica_less_than_spec_max_unavailable!CRITICAL- Deployment {deployment} has less than desired replicas during a rolling update!OK- All deployments have desired replicas
|
|
check_interval: 60
|
|
- check_job_status_failed:
|
|
use: notifying_service
|
|
hostgroup_name: prometheus-hosts
|
|
service_description: "Job_status-failed"
|
|
check_command: check_prom_alert!job_status_failed!CRITICAL- Job {exported_job} has failed!OK- No Job failures
|
|
check_interval: 60
|
|
- check_pod_status_pending:
|
|
use: notifying_service
|
|
hostgroup_name: prometheus-hosts
|
|
service_description: "Pod_status-pending"
|
|
check_command: check_prom_alert!pod_status_pending!CRITICAL- Pod {pod} in namespace {namespace} has been in pending status for more than 10 minutes!OK- No pods in pending status
|
|
check_interval: 60
|
|
- check_pod_status_error_image_pull:
|
|
use: notifying_service
|
|
hostgroup_name: prometheus-hosts
|
|
service_description: "Pod_status-error-image-pull"
|
|
check_command: check_prom_alert!pod_status_error_image_pull!CRITICAL- Pod {pod} in namespace {namespace} has been in errpr status of ErrImagePull for more than 10 minutes!OK- No pods in error status
|
|
check_interval: 60
|
|
- check_pod_status_error_image_pull_backoff:
|
|
use: notifying_service
|
|
hostgroup_name: prometheus-hosts
|
|
service_description: "Pod_status-error-image-pull"
|
|
check_command: check_prom_alert! pod_status_error_image_pull_backoff!CRITICAL- Pod {pod} in namespace {namespace} has been in errpr status of ImagePullBackOff for more than 10 minutes!OK- No pods in error status
|
|
check_interval: 60
|
|
- check_pod_status_error_container_config_error:
|
|
use: notifying_service
|
|
hostgroup_name: prometheus-hosts
|
|
service_description: "Pod_status-error-image-pull"
|
|
check_command: check_prom_alert! pod_error_config_error!CRITICAL- Pod {pod} in namespace {namespace} has been in errpr status of CreateContainerConfigError for more than 10 minutes!OK- No pods in error status
|
|
check_interval: 60
|
|
- check_pod_error_crash_loop_back_off:
|
|
use: notifying_service
|
|
hostgroup_name: prometheus-hosts
|
|
service_description: "Pod_status-crashLoopBackOff"
|
|
check_command: check_prom_alert!pod_error_crash_loop_back_off!CRITICAL- Pod {pod} in namespace {namespace} has been in error status of CrashLoopBackOff for more than 10 minutes!OK- No pods in crashLoopBackOff status
|
|
check_interval: 60
|
|
- check_replicaset_missing_replicas:
|
|
use: notifying_service
|
|
hostgroup_name: prometheus-hosts
|
|
service_description: "Replicaset_missing-replicas"
|
|
check_command: check_prom_alert!replicaset_missing_replicas!CRITICAL- Replicaset {replicaset} is missing replicas!OK- No replicas missing from replicaset
|
|
check_interval: 60
|
|
- check_pod_container_terminated:
|
|
use: notifying_service
|
|
hostgroup_name: prometheus-hosts
|
|
service_description: "Pod_status-container-terminated"
|
|
check_command: check_prom_alert!pod_container_terminated!CRITICAL- pod {pod} in namespace {namespace} has a container in terminated state!OK- pod container status looks good
|
|
check_interval: 60
|
|
- check_glance_api:
|
|
use: notifying_service
|
|
hostgroup_name: prometheus-hosts
|
|
service_description: "API_glance"
|
|
check_command: check_prom_alert!os_glance_api_availability!CRITICAL- Glance API at {url} is not available!OK- Glance API is available
|
|
check_interval: 60
|
|
- check_nova_api:
|
|
use: notifying_service
|
|
hostgroup_name: prometheus-hosts
|
|
service_description: "API_nova"
|
|
check_command: check_prom_alert!os_nova_api_availability!CRITICAL- Nova API at {url} is not available!OK- Nova API is available
|
|
check_interval: 60
|
|
- check_keystone_api:
|
|
use: notifying_service
|
|
hostgroup_name: prometheus-hosts
|
|
service_description: "API_keystone"
|
|
check_command: check_prom_alert!os_keystone_api_availability!CRITICAL- Keystone API at {url} is not available!OK- Keystone API is available
|
|
check_interval: 60
|
|
- check_neutron_api:
|
|
use: notifying_service
|
|
hostgroup_name: prometheus-hosts
|
|
service_description: "API_neutron"
|
|
check_command: check_prom_alert!os_neutron_api_availability!CRITICAL- Neutron API at {url} is not available!OK- Neutron API is available
|
|
check_interval: 60
|
|
- check_neutron_metadata_agent:
|
|
use: notifying_service
|
|
hostgroup_name: prometheus-hosts
|
|
service_description: "Service_neutron-metadata-agent"
|
|
check_command: check_prom_alert!os_neutron_metadata_agent_availability!CRITICAL- Some Neutron metadata agents are not available!OK- All the neutron metadata agents are up
|
|
check_interval: 60
|
|
- check_neutron_openvswitch_agent:
|
|
use: notifying_service
|
|
hostgroup_name: prometheus-hosts
|
|
service_description: "Service_neutron-openvswitch-agent"
|
|
check_command: check_prom_alert!os_neutron_openvswitch_agent_availability!CRITICAL- Some Neutron openvswitch agents are not available!OK- All the neutron openvswitch agents are up
|
|
check_interval: 60
|
|
- check_neutron_dhcp_agent:
|
|
use: notifying_service
|
|
hostgroup_name: prometheus-hosts
|
|
service_description: "Service_neutron-dhcp-agent"
|
|
check_command: check_prom_alert!os_neutron_dhcp_agent_availability!CRITICAL- Some Neutron dhcp agents are not available!OK- All the neutron dhcp agents are up
|
|
check_interval: 60
|
|
- check_neutron_l3_agent:
|
|
use: notifying_service
|
|
hostgroup_name: prometheus-hosts
|
|
service_description: "Service_neutron-l3-agent"
|
|
check_command: check_prom_alert!os_neutron_l3_agent_availability!CRITICAL- Some Neutron dhcp agents are not available!OK- All the neutron l3 agents are up
|
|
check_interval: 60
|
|
- check_swift_api:
|
|
use: notifying_service
|
|
hostgroup_name: prometheus-hosts
|
|
service_description: "API_swift"
|
|
check_command: check_prom_alert!os_swift_api_availability!CRITICAL- Swift API at {url} is not available!OK- Swift API is available
|
|
check_interval: 60
|
|
- check_cinder_api:
|
|
use: notifying_service
|
|
hostgroup_name: prometheus-hosts
|
|
service_description: "API_cinder"
|
|
check_command: check_prom_alert!os_cinder_api_availability!CRITICAL- Cinder API at {url} is not available!OK- Cinder API is available
|
|
- check_glance_api:
|
|
use: notifying_service
|
|
hostgroup_name: prometheus-hosts
|
|
service_description: "API_heat"
|
|
check_command: check_prom_alert!os_heat_api_availability!CRITICAL- Heat API at {url} is not available!OK- Heat API is available
|
|
check_interval: 60
|
|
- check_cinder_api:
|
|
use: notifying_service
|
|
hostgroup_name: prometheus-hosts
|
|
service_description: "API_cinder"
|
|
check_command: check_prom_alert!os_cinder_api_availability!CRITICAL- Cinder API at {url} is not available!OK- Cinder API is available
|
|
check_interval: 60
|
|
- check_service_cinder_scheduler:
|
|
use: notifying_service
|
|
hostgroup_name: prometheus-hosts
|
|
service_description: "Service_cinder-scheduler"
|
|
check_command: check_prom_alert!os_cinder_scheduler_availability!CRITICAL- Cinder scheduler is not available!OK- Cinder scheduler is available
|
|
check_interval: 60
|
|
- check_service_nova_compute:
|
|
use: notifying_service
|
|
hostgroup_name: prometheus-hosts
|
|
service_description: "Service_nova-compute"
|
|
check_command: check_prom_alert!os_nova_compute_down!CRITICAL- nova-compute services are down on certain hosts!OK- nova-compute services are up on all hosts
|
|
check_interval: 60
|
|
- check_service_nova_conductor:
|
|
use: notifying_service
|
|
hostgroup_name: prometheus-hosts
|
|
service_description: "Service_nova-conductor"
|
|
check_command: check_prom_alert!os_nova_conductor_down!CRITICAL- nova-conductor services are down on certain hosts!OK- nova-conductor services are up on all hosts
|
|
check_interval: 60
|
|
- check_service_nova_consoleauth:
|
|
use: notifying_service
|
|
hostgroup_name: prometheus-hosts
|
|
service_description: "Service_nova-consoleauth"
|
|
check_command: check_prom_alert!os_nova_consoleauth_down!CRITICAL- nova-consoleauth services are down on certain hosts!OK- nova-consoleauth services are up on all hosts
|
|
check_interval: 60
|
|
- check_service_nova_scheduler:
|
|
use: notifying_service
|
|
hostgroup_name: prometheus-hosts
|
|
service_description: "Service_nova-scheduler"
|
|
check_command: check_prom_alert!openstack_nova_scheduler_down!CRITICAL- nova-scheduler services are down on certain hosts!OK- nova-scheduler services are up on all hosts
|
|
check_interval: 60
|
|
- check_os_vm_vcpu_usage:
|
|
use: notifying_service
|
|
hostgroup_name: prometheus-hosts
|
|
service_description: "OS-Total-Quota_VCPU-usage"
|
|
check_command: check_prom_alert!os_vm_vcpu_usage_high!CRITICAL- vcpu usage for openstack VMs is more than 80 percent of available!OK- Openstack VMs vcpu usage is less than 80 percent of available.
|
|
check_interval: 60
|
|
- check_os_vm_ram_usage:
|
|
use: notifying_service
|
|
hostgroup_name: prometheus-hosts
|
|
service_description: "OS-Total-Quota_RAM-usage"
|
|
check_command: check_prom_alert!os_vm_ram_usage_high!CRITICAL- RAM usage for openstack VMs is more than 80 percent of available!OK- Openstack VMs RAM usage is less than 80 percent of available.
|
|
check_interval: 60
|
|
- check_os_vm_disk_usage:
|
|
use: notifying_service
|
|
hostgroup_name: prometheus-hosts
|
|
service_description: "OS-Total-Quota_Disk-usage"
|
|
check_command: check_prom_alert!os_vm_disk_usage_high!CRITICAL- Disk usage for openstack VMs is more than 80 percent of available!OK- Openstack VMs Disk usage is less than 80 percent of available.
|
|
check_interval: 60
|
|
- check_ceph_monitor_quorum:
|
|
use: notifying_service
|
|
hostgroup_name: prometheus-hosts
|
|
service_description: "CEPH_quorum"
|
|
check_command: check_prom_alert!ceph_monitor_quorum_low!CRITICAL- ceph monitor quorum does not exist!OK- ceph monitor quorum exists
|
|
check_interval: 60
|
|
- check_ceph_storage_usage:
|
|
use: notifying_service
|
|
hostgroup_name: prometheus-hosts
|
|
service_description: "CEPH_storage-usage"
|
|
check_command: check_prom_alert!ceph_cluster_usage_high!CRITICAL- ceph cluster storage is more than 80 percent!OK- ceph storage is less than 80 percent
|
|
check_interval: 60
|
|
- check_ceph_pgs_degradation:
|
|
use: notifying_service
|
|
hostgroup_name: prometheus-hosts
|
|
service_description: "CEPH_PGs-degradation"
|
|
check_command: check_prom_alert!ceph_placement_group_degrade_pct_high!CRITICAL- ceph cluster PGs down are more than 80 percent!OK- ceph PG degradation is less than 80 percent
|
|
check_interval: 60
|
|
- check_ceph_osds_down:
|
|
use: notifying_service
|
|
hostgroup_name: prometheus-hosts
|
|
service_description: "CEPH_OSDs-down"
|
|
check_command: check_prom_alert!ceph_osd_down_pct_high!CRITICAL- CEPH OSDs down are more than 80 percent!OK- CEPH OSDs down is less than 80 percent
|
|
check_interval: 60
|
|
- check_ceph_monitor_clock_skew:
|
|
use: notifying_service
|
|
hostgroup_name: prometheus-hosts
|
|
service_description: "CEPH_Clock-skew"
|
|
check_command: check_prom_alert!ceph_monitor_clock_skew_high!CRITICAL- CEPH clock skew is more than 2 seconds!OK- CEPH clock skew is less than 2 seconds
|
|
check_interval: 60
|
|
- check_fluentd_up:
|
|
use: notifying_service
|
|
hostgroup_name: prometheus-hosts
|
|
service_description: "Fluentd_status"
|
|
check_command: check_prom_alert!fluentd_not_running!CRITICAL- fluentd is not running on {instance}!OK- Flunetd is working on all nodes
|
|
check_interval: 60
|
|
- check_etcd_high_http_deletes_failed:
|
|
use: notifying_service
|
|
hostgroup_name: prometheus-hosts
|
|
service_description: ETCD_high-http-delete-failures
|
|
check_command: check_prom_alert_with_labels!etcd_HighNumberOfFailedHTTPRequests!method="DELETE"!CRITICAL- ETCD {instance} has a high HTTP DELETE operations failure!OK- ETCD at {instance} has low or no failures for HTTP DELETE
|
|
check_interval: 60
|
|
- check_etcd_high_http_get_failed:
|
|
use: notifying_service
|
|
hostgroup_name: prometheus-hosts
|
|
service_description: ETCD_high-http-get-failures
|
|
check_command: check_prom_alert_with_labels!etcd_HighNumberOfFailedHTTPRequests!method=~"GET|QGET"!CRITICAL- ETCD {instance} has a high HTTP GET operations failure!OK- ETCD at {instance} has low or no failures for HTTP GET
|
|
check_interval: 60
|
|
- check_etcd_high_http_updates_failed:
|
|
use: notifying_service
|
|
hostgroup_name: prometheus-hosts
|
|
service_description: ETCD_high-http-update-failures
|
|
check_command: check_prom_alert_with_labels!etcd_HighNumberOfFailedHTTPRequests!method="PUT"!CRITICAL- ETCD {instance} has a high HTTP PUT operations failure!OK- ETCD at {instance} has low or no failures for HTTP PUT
|
|
check_interval: 60
|
|
- check_felix_iptables_save_errors:
|
|
use: notifying_service
|
|
service_description: Calico_iptables-save-errors
|
|
check_command: check_prom_alert!calico_iptable_save_errors_high_1h!CRITICAL- Felix instance {instance} has seen high iptable save errors within the last hour!OK- iptables save errors are none or low
|
|
hostgroup_name: prometheus-hosts
|
|
- check_felix_ipset_errors:
|
|
use: notifying_service
|
|
service_description: Calico_ipset-errors
|
|
check_command: check_prom_alert!calico_ipset_errors_high_1h!CRITICAL- Felix instance {instance} has seen high ipset errors within the last hour!OK- ipset errors are none or low
|
|
hostgroup_name: prometheus-hosts
|
|
- check_felix_int_dataplane_iface_msg_batch_size:
|
|
use: notifying_service
|
|
service_description: Calico_interface-message-batch-size
|
|
check_command: check_prom_alert!calico_datapane_iface_msg_batch_size_high_5m!CRITICAL- Felix instance {instance} has seen a high value of dataplane interface message batch size!OK- dataplane interface message batch size are low
|
|
hostgroup_name: prometheus-hosts
|
|
- check_felix_int_dataplane_addr_msg_batch_size:
|
|
use: notifying_service
|
|
service_description: Calico_address-message-batch-size
|
|
check_command: check_prom_alert!calico_datapane_address_msg_batch_size_high_5m!CRITICAL- Felix instance {instance} has seen a high value of dataplane address message batch size!OK- dataplane address message batch size are low
|
|
hostgroup_name: prometheus-hosts
|
|
- check_felix_int_dataplane_failures:
|
|
use: notifying_service
|
|
service_description: Calico_datapane_failures_high
|
|
check_command: check_prom_alert!calico_datapane_failures_high_1h!CRITICAL- Felix instance {instance} has seen high dataplane failures within the last hour!OK- datapane failures are none or low
|
|
hostgroup_name: prometheus-hosts
|
|
- check_rabbitmq_network_partitions_detected:
|
|
use: generic-service
|
|
service_description: Rabbitmq_network-partitions-exist
|
|
check_command: check_prom_alert!rabbitmq_network_pratitions_detected!CRITICAL- Rabbitmq instance {instance} has network partitions!OK- no network partitions detected in rabbitmq
|
|
hostgroup_name: prometheus-hosts
|
|
- check_rabbitmq_available:
|
|
use: generic-service
|
|
service_description: Rabbitmq_up
|
|
check_command: check_prom_alert!rabbitmq_down!CRITICAL- Rabbitmq instance {instance} is down!OK- rabbitmq is available
|
|
hostgroup_name: prometheus-hosts
|
|
- check_rabbitmq_fd_usage:
|
|
use: generic-service
|
|
service_description: Rabbitmq_file-descriptor-usage
|
|
check_command: check_prom_alert!rabbitmq_file_descriptor_usage_high!CRITICAL- Rabbitmq instance {instance} has file desciptor usage more than 80 percent!OK- rabbitmq file descriptor usage is normal
|
|
hostgroup_name: prometheus-hosts
|
|
- check_rabbitmq_node_disk_alarm:
|
|
use: generic-service
|
|
service_description: Rabbitmq_node-disk-alarm
|
|
check_command: check_prom_alert!rabbitmq_node_disk_free_alarm!CRITICAL- Rabbitmq instance {instance} has a disk usage alarm!OK- rabbitmq node disk has no alarms
|
|
hostgroup_name: prometheus-hosts
|
|
- check_rabbitmq_node_memory_alarm:
|
|
use: generic-service
|
|
service_description: Rabbitmq_node-memory-alarm
|
|
check_command: check_prom_alert!rabbitmq_node_memory_alarm!CRITICAL- Rabbitmq instance {instance} has a memory alarm!OK- rabbitmq node memory has no alarms
|
|
hostgroup_name: prometheus-hosts
|
|
- check_rabbitmq_availability:
|
|
use: generic-service
|
|
service_description: Rabbitmq_high-availability
|
|
check_command: check_prom_alert!rabbitmq_less_than_3_nodes!CRITICAL- Rabbitmq has less than 3 nodes to serve!OK- rabbitmq has atleast 3 nodes serving
|
|
hostgroup_name: prometheus-hosts
|
|
- check_queue_message_return_percent:
|
|
use: generic-service
|
|
service_description: Rabbitmq_message-return-percent
|
|
check_command: check_prom_alert!rabbitmq_queue_messages_returned_high!CRITICAL- Rabbitmq has high percent of messages being returned!OK- rabbitmq messages are consumed and low or no returns exist.
|
|
hostgroup_name: prometheus-hosts
|
|
- check_queue_consumer_util:
|
|
use: generic-service
|
|
service_description: Rabbitmq_consumer-utilization
|
|
check_command: check_prom_alert!rabbitmq_consumers_low_utilization!CRITICAL- Rabbitmq consumer message consumption rate is slow!OK- rabbitmq message consumption speed is normal
|
|
hostgroup_name: prometheus-hosts
|
|
- check_queue_load:
|
|
use: generic-service
|
|
service_description: Rabbitmq_rabbitmq-queue-health
|
|
check_command: check_prom_alert!rabbitmq_high_message_load!CRITICAL- Rabbitmq unacknowledged message count is high!OK- rabbitmq unacknowledged message count is high
|
|
hostgroup_name: prometheus-hosts
|
|
- check_es_high_process_open_file_count:
|
|
use: generic-service
|
|
service_description: ES_high-process-open-file-count
|
|
check_command: check_prom_alert!es_high_process_open_files_count!CRITICAL- Elasticsearch {host} has high process open file count!OK- Elasticsearch process open file count is normal.
|
|
hostgroup_name: prometheus-hosts
|
|
- check_es_high_process_cpu_percent:
|
|
use: generic-service
|
|
service_description: ES_high-process-cpu-percent
|
|
check_command: check_prom_alert!es_high_process_cpu_percent!CRITICAL- Elasticsearch {instance} has high process CPU percent!OK- Elasticsearch process cpu usage is normal.
|
|
hostgroup_name: prometheus-hosts
|
|
- check_es_fs_usage:
|
|
use: generic-service
|
|
service_description: ES_high-filesystem-usage
|
|
check_command: check_prom_alert!es_fs_usage_high!CRITICAL- Elasticsearch {instance} has high filesystem usage!OK- Elasticsearch filesystem usage is normal.
|
|
hostgroup_name: prometheus-hosts
|
|
- check_es_unassigned_shards:
|
|
use: generic-service
|
|
service_description: ES_unassigned-shards
|
|
check_command: check_prom_alert!es_unassigned_shards!CRITICAL- Elasticsearch has unassinged shards!OK- Elasticsearch has no unassigned shards.
|
|
hostgroup_name: prometheus-hosts
|
|
- check_es_cluster_health_timedout:
|
|
use: generic-service
|
|
service_description: ES_cluster-health-timedout
|
|
check_command: check_prom_alert!es_cluster_health_timed_out!CRITICAL- Elasticsearch Cluster health status call timedout!OK- Elasticsearch cluster health is retrievable.
|
|
hostgroup_name: prometheus-hosts
|
|
- check_es_cluster_health_status:
|
|
use: generic-service
|
|
service_description: ES_cluster-health-status
|
|
check_command: check_prom_alert!es_cluster_health_status_alert!CRITICAL- Elasticsearch cluster health status is not green. One or more shards or replicas are unallocated!OK- Elasticsearch cluster health is green.
|
|
hostgroup_name: prometheus-hosts
|
|
- check_es_cluster_number_nodes_running:
|
|
use: generic-service
|
|
service_description: ES_cluster-running-node-count
|
|
check_command: check_prom_alert!es_cluster_health_too_few_nodes_running!CRITICAL- Elasticsearch Cluster has < 3 nodes running!OK- Elasticsearch cluster has 3 or more nodes running.
|
|
hostgroup_name: prometheus-hosts
|
|
- check_es_cluster_number_data_nodes_running:
|
|
use: generic-service
|
|
service_description: ES_cluster-running-data-node-count
|
|
check_command: check_prom_alert!es_cluster_health_too_few_data_nodes_running!CRITICAL- Elasticsearch Cluster has < 3 data nodes running!OK- Elasticsearch cluster has 3 or more data nodes running.
|
|
hostgroup_name: prometheus-hosts
|
|
- check_mariadb_table_lock_waits:
|
|
use: generic-service
|
|
service_description: Mariadb_table-lock-waits-high
|
|
check_command: check_prom_alert!mariadb_table_lock_wait_high!CRITICAL- Mariadb has high number of table lock waits!OK- No issues found with table lock waits.
|
|
hostgroup_name: prometheus-hosts
|
|
- check_mariadb_node_ready:
|
|
use: generic-service
|
|
service_description: Mariadb_node-ready
|
|
check_command: check_prom_alert!mariadb_node_not_ready!CRITICAL- Mariadb {instance} is not ready!OK- All galera cluster nodes are ready.
|
|
hostgroup_name: prometheus-hosts
|
|
- check_mariadb_node_out_of_sync:
|
|
use: generic-service
|
|
service_description: Mariadb_node-synchronized
|
|
check_command: check_prom_alert!mariadb_galera_node_out_of_sync!CRITICAL- Mariadb {instance} is out of sync!OK- All galera cluster nodes are in sync
|
|
hostgroup_name: prometheus-hosts
|
|
- check_mariadb_innodb_replication_lag:
|
|
use: generic-service
|
|
service_description: Mariadb_innodb-replication-lag
|
|
check_command: check_prom_alert!mariadb_innodb_replication_fallen_behind!CRITICAL- Innodb replication has fallen behind and not recovering!OK- innodb replication lag is nominal.
|
|
hostgroup_name: prometheus-hosts
|
|
- check_prometheus_hosts:
|
|
use: notifying_service
|
|
service_description: Prometheus_hosts-update
|
|
check_command: check_prometheus_hosts
|
|
hostgroup_name: prometheus-hosts
|
|
check_interval: 900
|
|
- check_postgresql_replication_lag:
|
|
use: generic-service
|
|
service_description: Postgresql_replication-lag
|
|
check_command: check_prom_alert!pg_replication_fallen_behind!CRITICAL- Postgres Replication lag is over 2 minutes!OK- postgresql replication lag is nominal.
|
|
hostgroup_name: prometheus-hosts
|
|
- check_postgresql_connections:
|
|
use: generic-service
|
|
service_description: Postgresql_connections
|
|
check_command: check_prom_alert!pg_connections_too_high!CRITICAL- Postgres has more than 95% of available connections in use.!OK- postgresql open connections are within bounds.
|
|
hostgroup_name: prometheus-hosts
|
|
- check_postgresql_deadlocks:
|
|
use: generic-service
|
|
service_description: Postgresql_deadlocks
|
|
check_command: check_prom_alert!pg_deadlocks_detected!CRITICAL- Postgres server is experiencing deadlocks!OK- postgresql is not showing any deadlocks.
|
|
hostgroup_name: prometheus-hosts
|
|
- check_prom_exporter_ceph:
|
|
use: generic-service
|
|
service_description: Prometheus-exporter_CEPH
|
|
check_command: check_prom_alert!prom_exporter_ceph_unavailable!CRITICAL- CEPH exporter is not collecting metrics for alerting!OK- CEPH exporter metrics are available.
|
|
hostgroup_name: prometheus-hosts
|
|
- check_prom_exporter_openstack:
|
|
use: generic-service
|
|
service_description: Prometheus-exporter_Openstack
|
|
check_command: check_prom_alert!prom_exporter_openstack_unavailable!CRITICAL- Openstack exporter is not collecting metrics for alerting!OK- Openstack exporter metrics are available.
|
|
hostgroup_name: prometheus-hosts
|
|
- check_prom_exporter_mariadb:
|
|
use: generic-service
|
|
service_description: Prometheus-exporter_MariaDB
|
|
check_command: check_prom_alert!prom_exporter_mariadb_unavailable!CRITICAL- MariaDB exporter is not collecting metrics for alerting!OK- MariaDB exporter metrics are available.
|
|
hostgroup_name: prometheus-hosts
|
|
- check_prom_exporter_kube_state_metrics:
|
|
use: generic-service
|
|
service_description: Prometheus-exporter_Kube-state-metrics
|
|
check_command: check_prom_alert!prom_exporter_kube_state_metrics_unavailable!CRITICAL- kube-state-metrics exporter is not collecting metrics for alerting!OK- kube-state-metrics exporter metrics are available.
|
|
hostgroup_name: prometheus-hosts
|
|
- check_prom_exporter_postgresql:
|
|
use: generic-service
|
|
service_description: Prometheus-exporter_Postgresql
|
|
check_command: check_prom_alert!prom_exporter_postgresql_unavailable!CRITICAL- Postgresql exporter is not collecting metrics for alerting!OK- Postgresql exporter metrics are available.
|
|
hostgroup_name: prometheus-hosts
|
|
- check_prom_exporter_node:
|
|
use: generic-service
|
|
service_description: Prometheus-exporter_Node
|
|
check_command: check_prom_alert!prom_exporter_node_unavailable!CRITICAL- Node exporter is not collecting metrics for alerting!OK- Node exporter metrics are available.
|
|
hostgroup_name: prometheus-hosts
|
|
- check_prom_exporter_calico:
|
|
use: generic-service
|
|
service_description: Prometheus-exporter_Calico
|
|
check_command: check_prom_alert!prom_exporter_calico_unavailable!CRITICAL- Calico exporter is not collecting metrics for alerting!OK- Calico exporter metrics are available.
|
|
hostgroup_name: prometheus-hosts
|
|
- check_prom_exporter_elasticsearch:
|
|
use: generic-service
|
|
service_description: Prometheus-exporter_Elasticsearch
|
|
check_command: check_prom_alert!prom_exporter_elasticsearch_unavailable!CRITICAL- Elasticsearch exporter is not collecting metrics for alerting!OK- Elasticsearch exporter metrics are available.
|
|
hostgroup_name: prometheus-hosts
|
|
- check_prom_exporter_fluentd:
|
|
use: generic-service
|
|
service_description: Prometheus-exporter_Fluentd
|
|
check_command: check_prom_alert!prom_exporter_fluentd_unavailable!CRITICAL- Fluentd exporter is not collecting metrics for alerting!OK- Fluentd exporter metrics are available.
|
|
hostgroup_name: prometheus-hosts
|
|
- check_filespace_mounts-usage-rate-fullin4hrs:
|
|
use: notifying_service
|
|
hostgroup_name: base-os
|
|
service_description: "Filespace_mounts-usage-rate-fullin4hrs"
|
|
check_command: check_filespace_mounts-usage-rate-fullin4hrs
|
|
check_interval: 60
|
|
- check_filespace_mounts-usage:
|
|
use: notifying_service
|
|
hostgroup_name: base-os
|
|
service_description: "Filespace_mounts-usage"
|
|
check_command: check_filespace_mounts-usage
|
|
check_interval: 60
|
|
- check_node_loadavg:
|
|
use: notifying_service
|
|
service_description: CPU_Load-average
|
|
check_command: check_node_loadavg
|
|
hostgroup_name: base-os
|
|
- check_node_cpu_util:
|
|
use: notifying_service
|
|
service_description: CPU_utilization
|
|
check_command: check_node_cpu_util
|
|
hostgroup_name: base-os
|
|
- check_network_connections:
|
|
use: notifying_service
|
|
service_description: Network_connections
|
|
check_command: check_network_connections
|
|
hostgroup_name: base-os
|
|
- check_memory_usage:
|
|
use: notifying_service
|
|
service_description: Memory_usage
|
|
check_command: check_memory_usage
|
|
hostgroup_name: base-os
|
|
- check_disk_write_latency:
|
|
use: notifying_service
|
|
service_description: Disk_write-latency
|
|
check_command: check_disk_write_latency
|
|
hostgroup_name: base-os
|
|
- check_disk_read_latency:
|
|
use: notifying_service
|
|
service_description: Disk_read-latency
|
|
check_command: check_disk_read_latency
|
|
hostgroup_name: base-os
|
|
- check_entropy_availability:
|
|
use: notifying_service
|
|
service_description: Entropy_availability
|
|
check_command: check_entropy_availability
|
|
hostgroup_name: base-os
|
|
- check_filedescriptor_usage_rate:
|
|
use: notifying_service
|
|
service_description: FileDescriptors_usage-rate-high
|
|
check_command: check_filedescriptor_usage_rate
|
|
hostgroup_name: base-os
|
|
- check_hwmon_high_cpu_temp:
|
|
use: notifying_service
|
|
service_description: HW_cpu-temp-high
|
|
check_command: check_hwmon_high_cpu_temp
|
|
hostgroup_name: base-os
|
|
- check_network_receive_drop_high:
|
|
use: notifying_service
|
|
service_description: Network_receive-drop-high
|
|
check_command: check_network_receive_drop_high
|
|
hostgroup_name: base-os
|
|
- check_network_transmit_drop_high:
|
|
use: notifying_service
|
|
service_description: Network_transmit-drop-high
|
|
check_command: check_network_transmit_drop_high
|
|
hostgroup_name: base-os
|
|
- check_network_receive_errors_high:
|
|
use: notifying_service
|
|
service_description: Network_receive-errors-high
|
|
check_command: check_network_receive_errors_high
|
|
hostgroup_name: base-os
|
|
- check_network_transmit_errors_high:
|
|
use: notifying_service
|
|
service_description: Network_transmit-errors-high
|
|
check_command: check_network_transmit_errors_high
|
|
hostgroup_name: base-os
|
|
- check_vmstat_paging_rate:
|
|
use: notifying_service
|
|
service_description: Memory_vmstat-paging-rate
|
|
check_command: check_vmstat_paging_rate
|
|
hostgroup_name: base-os
|
|
- check_xfs_block_allocation:
|
|
use: notifying_service
|
|
service_description: XFS_block-allocation
|
|
check_command: check_xfs_block_allocation
|
|
hostgroup_name: base-os
|
|
- check_network_bond_status:
|
|
use: notifying_service
|
|
service_description: Network_bondstatus
|
|
check_command: check_network_bond_status
|
|
hostgroup_name: base-os
|
|
- check_numa_memory_usage:
|
|
use: notifying_service
|
|
service_description: Memory_NUMA-usage
|
|
check_command: check_numa_memory_usage
|
|
hostgroup_name: base-os
|
|
- check_ntp_sync:
|
|
use: notifying_service
|
|
service_description: NTP_sync
|
|
check_command: check_ntp_sync
|
|
hostgroup_name: base-os
|
|
nagios:
|
|
log_file: /opt/nagios/var/log/nagios.log
|
|
cfg_file:
|
|
- /opt/nagios/etc/nagios_objects.cfg
|
|
- /opt/nagios/etc/objects/commands.cfg
|
|
- /opt/nagios/etc/objects/contacts.cfg
|
|
- /opt/nagios/etc/objects/timeperiods.cfg
|
|
- /opt/nagios/etc/objects/templates.cfg
|
|
- /opt/nagios/etc/objects/prometheus_discovery_objects.cfg
|
|
object_cache_file: /opt/nagios/var/objects.cache
|
|
precached_object_file: /opt/nagios/var/objects.precache
|
|
resource_file: /opt/nagios/etc/resource.cfg
|
|
status_file: /opt/nagios/var/status.dat
|
|
status_update_interval: 10
|
|
nagios_user: nagios
|
|
nagios_group: nagios
|
|
check_external_commands: 1
|
|
command_file: /opt/nagios/var/rw/nagios.cmd
|
|
lock_file: /var/run/nagios.lock
|
|
temp_file: /opt/nagios/var/nagios.tmp
|
|
temp_path: /tmp
|
|
event_broker_options: -1
|
|
log_rotation_method: d
|
|
log_archive_path: /opt/nagios/var/log/archives
|
|
use_syslog: 0
|
|
log_notifications: 0
|
|
log_service_retries: 1
|
|
log_host_retries: 1
|
|
log_event_handlers: 1
|
|
log_initial_states: 0
|
|
log_current_states: 1
|
|
log_external_commands: 1
|
|
log_passive_checks: 1
|
|
service_inter_check_delay_method: s
|
|
max_service_check_spread: 30
|
|
service_interleave_factor: s
|
|
host_inter_check_delay_method: s
|
|
max_host_check_spread: 30
|
|
max_concurrent_checks: 300
|
|
check_result_reaper_frequency: 10
|
|
max_check_result_reaper_time: 30
|
|
check_result_path: /opt/nagios/var/spool/checkresults
|
|
max_check_result_file_age: 3600
|
|
cached_host_check_horizon: 15
|
|
cached_service_check_horizon: 15
|
|
enable_predictive_host_dependency_checks: 1
|
|
enable_predictive_service_dependency_checks: 1
|
|
soft_state_dependencies: 0
|
|
auto_reschedule_checks: 0
|
|
auto_rescheduling_interval: 30
|
|
auto_rescheduling_window: 180
|
|
service_check_timeout: 60
|
|
host_check_timeout: 60
|
|
event_handler_timeout: 60
|
|
notification_timeout: 60
|
|
ocsp_timeout: 5
|
|
perfdata_timeout: 5
|
|
retain_state_information: 1
|
|
state_retention_file: /opt/nagios/var/retention.dat
|
|
retention_update_interval: 60
|
|
use_retained_program_state: 1
|
|
use_retained_scheduling_info: 1
|
|
retained_host_attribute_mask: 0
|
|
retained_service_attribute_mask: 0
|
|
retained_process_host_attribute_mask: 0
|
|
retained_process_service_attribute_mask: 0
|
|
retained_contact_host_attribute_mask: 0
|
|
retained_contact_service_attribute_mask: 0
|
|
interval_length: 1
|
|
check_workers: 4
|
|
check_for_updates: 1
|
|
bare_update_check: 0
|
|
use_aggressive_host_checking: 0
|
|
execute_service_checks: 1
|
|
accept_passive_service_checks: 1
|
|
execute_host_checks: 1
|
|
accept_passive_host_checks: 1
|
|
enable_notifications: 1
|
|
enable_event_handlers: 1
|
|
process_performance_data: 0
|
|
obsess_over_services: 0
|
|
obsess_over_hosts: 0
|
|
translate_passive_host_checks: 0
|
|
passive_host_checks_are_soft: 0
|
|
check_for_orphaned_services: 1
|
|
check_for_orphaned_hosts: 1
|
|
check_service_freshness: 1
|
|
service_freshness_check_interval: 60
|
|
check_host_freshness: 0
|
|
host_freshness_check_interval: 60
|
|
additional_freshness_latency: 15
|
|
enable_flap_detection: 1
|
|
low_service_flap_threshold: 5.0
|
|
high_service_flap_threshold: 20.0
|
|
low_host_flap_threshold: 5.0
|
|
high_host_flap_threshold: 20.0
|
|
date_format: us
|
|
use_regexp_matching: 1
|
|
use_true_regexp_matching: 0
|
|
daemon_dumps_core: 0
|
|
use_large_installation_tweaks: 0
|
|
enable_environment_macros: 0
|
|
debug_level: 0
|
|
debug_verbosity: 1
|
|
debug_file: /opt/nagios/var/nagios.debug
|
|
max_debug_file_size: 1000000
|
|
allow_empty_hostgroup_assignment: 1
|
|
illegal_macro_output_chars: "`~$&|'<>\""
|
|
cgi:
|
|
main_config_file: /opt/nagios/etc/nagios.cfg
|
|
physical_html_path: /opt/nagios/share
|
|
url_html_path: /nagios
|
|
show_context_help: 0
|
|
use_pending_states: 1
|
|
use_authentication: 0
|
|
use_ssl_authentication: 0
|
|
authorized_for_system_information: "*"
|
|
authorized_for_configuration_information: "*"
|
|
authorized_for_system_commands: nagiosadmin
|
|
authorized_for_all_services: "*"
|
|
authorized_for_all_hosts: "*"
|
|
authorized_for_all_service_commands: "*"
|
|
authorized_for_all_host_commands: "*"
|
|
default_statuswrl_layout: 4
|
|
ping_syntax: /bin/ping -n -U -c 5 $HOSTADDRESS$
|
|
refresh_rate: 90
|
|
result_limit: 100
|
|
escape_html_tags: 1
|
|
action_url_target: _blank
|
|
notes_url_target: _blank
|
|
lock_author_names: 1
|
|
navbar_search_for_addresses: 1
|
|
navbar_search_for_aliases: 1
|
|
notification:
|
|
snmp:
|
|
primary_target: 127.0.0.1:15162
|
|
secondary_target: 127.0.0.1:15162
|
|
http:
|
|
primary_target: 127.0.0.1:3904/events
|
|
secondary_target: 127.0.0.1:3904/events
|
|
query_es_clauses: null
|