diff --git a/nagios/templates/configmap-etc.yaml b/nagios/templates/configmap-etc.yaml
index 03d7e4446..2ed3ea834 100644
--- a/nagios/templates/configmap-etc.yaml
+++ b/nagios/templates/configmap-etc.yaml
@@ -28,7 +28,10 @@ data:
 {{- end }}
 {{- include "helm-toolkit.snippets.values_template_renderer" (dict "envAll" $envAll "template" .Values.conf.nagios.nagios.template "key" "nagios.cfg" "format" "Secret") | indent 2 }}
 {{- include "helm-toolkit.snippets.values_template_renderer" (dict "envAll" $envAll "template" .Values.conf.nagios.cgi.template "key" "cgi.cfg" "format" "Secret") | indent 2 }}
-{{- include "helm-toolkit.snippets.values_template_renderer" (dict "envAll" $envAll "template" .Values.conf.nagios.objects.template "key" "nagios_objects.cfg" "format" "Secret") | indent 2 }}
+{{- range $objectType, $config := $envAll.Values.conf.nagios.objects }}
+{{- $objectFile := printf "%s.cfg" $objectType -}}
+{{- include "helm-toolkit.snippets.values_template_renderer" (dict "envAll" $envAll "template" $config.template "key" $objectFile "format" "Secret") | indent 2 }}
+{{- end }}
 #NOTE(portdirect): this must be last, to work round helm ~2.7 bug.
 {{- include "helm-toolkit.snippets.values_template_renderer" (dict "envAll" $envAll "template" .Values.conf.httpd "key" "httpd.conf" "format" "Secret") | indent 2 }}
 {{- end }}
diff --git a/nagios/templates/deployment.yaml b/nagios/templates/deployment.yaml
index ec160f084..0fa3cd3c3 100644
--- a/nagios/templates/deployment.yaml
+++ b/nagios/templates/deployment.yaml
@@ -203,10 +203,13 @@ spec:
               mountPath: /opt/nagios/etc/cgi.cfg
               subPath: cgi.cfg
               readOnly: true
+            {{- $objectKeys := keys $envAll.Values.conf.nagios.objects -}}
+            {{- range $objectType := $objectKeys }}
             - name: nagios-etc
-              mountPath: /opt/nagios/etc/nagios_objects.cfg
-              subPath: nagios_objects.cfg
+              mountPath: /opt/nagios/etc/{{$objectType}}.cfg
+              subPath: {{$objectType}}.cfg
               readOnly: true
+            {{- end }}
             - name: nagios-bin
               mountPath: /tmp/nagios-readiness.sh
               subPath: nagios-readiness.sh
diff --git a/nagios/values.yaml b/nagios/values.yaml index 5a4b74721..1603db1c2 100644 --- a/nagios/values.yaml +++ b/nagios/values.yaml @@ -392,1018 +392,642 @@ conf: primary_target: 127.0.0.1:3904/events secondary_target: 127.0.0.1:3904/events objects: - template: | - define host { - address 127.0.0.1 - alias Prometheus Monitoring - check_command check-prometheus-host-alive - host_name {{ tuple "monitoring" "public" . 
| include "helm-toolkit.endpoints.hostname_short_endpoint_lookup" }} - hostgroups prometheus-hosts - use linux-server - } - - define contact { - alias notifying contact - contact_name notifying_contact - host_notification_options d,u,r,f,s - host_notification_period 24x7 - name notifying_contact - register 0 - service_notification_options w,u,c,r,f,s - service_notification_period 24x7 - } - - define contact { - alias snmp contact - contact_name snmp_notifying_contact - host_notification_commands send_host_snmp_trap - name snmp_notifying_contact - service_notification_commands send_service_snmp_trap - use notifying_contact - } - - define contact { - alias HTTP contact - contact_name http_notifying_contact - host_notification_commands send_host_http_post - name http_notifying_contact - service_notification_commands send_service_http_post - use notifying_contact - } - - define contactgroup { - alias SNMP and HTTP notifying group - contactgroup_name snmp_and_http_notifying_contact_group - members snmp_notifying_contact,http_notifying_contact - } - - define hostgroup { - alias Prometheus Virtual Host - hostgroup_name prometheus-hosts - } - - define hostgroup { - alias all - hostgroup_name all - } - - define hostgroup { - alias base-os - hostgroup_name base-os - } - - define command { - command_line $USER1$/send_service_trap.sh '$USER8$' '$HOSTNAME$' '$SERVICEDESC$' $SERVICESTATEID$ '$SERVICEOUTPUT$' '$USER4$' '$USER5$' - command_name send_service_snmp_trap - } - - define command { - command_line $USER1$/send_host_trap.sh '$USER8$' '$HOSTNAME$' $HOSTSTATEID$ '$HOSTOUTPUT$' '$USER4$' '$USER5$' - command_name send_host_snmp_trap - } - - define command { - command_line $USER1$/send_http_post_event.py --type service --hostname '$HOSTNAME$' --servicedesc '$SERVICEDESC$' --state_id $SERVICESTATEID$ --output '$SERVICEOUTPUT$' --monitoring_hostname '$HOSTNAME$' --primary_url '$USER6$' --secondary_url '$USER7$' - command_name send_service_http_post - } - - define command { - command_line $USER1$/send_http_post_event.py --type host --hostname '$HOSTNAME$' --state_id $HOSTSTATEID$ --output '$HOSTOUTPUT$' --monitoring_hostname '$HOSTNAME$' --primary_url '$USER6$' --secondary_url '$USER7$' - command_name send_host_http_post - } - - define command { - command_line $USER1$/check_rest_get_api.py --url $USER2$ --warning_response_seconds 5 --critical_response_seconds 10 - command_name check-prometheus-host-alive - } - - define command { - command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname '$ARG1$' --labels_csv '$ARG2$' --msg_format '$ARG3$' --ok_message '$ARG4$' - command_name check_prom_alert_with_labels - } - - define command { - command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname '$ARG1$' --msg_format '$ARG2$' --ok_message '$ARG3$' - command_name check_prom_alert - } - - define command { - command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_filesystem_full_in_4h' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Mountpoint {mountpoint} will be full in four hours' --ok_message 'OK- All mountpoints usage rate is normal' - command_name check_filespace_mounts-usage-rate-fullin4hrs - } - - define command { - command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_filesystem_full_80percent' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Mountpoint {mountpoint} is more than 80 pecent full' --ok_message 'OK- All mountpoints usage is normal' - 
command_name check_filespace_mounts-usage - } - - define command { - command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_load1_90percent' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Node load average has been more than 90% for the pash hour' --ok_message 'OK- Node load average is normal' - command_name check_node_loadavg - } - - define command { - command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_cpu_util_90percent' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Node CPU utilization has been more than 90% for the pash hour' --ok_message 'OK- Node cpu utilization is normal' - command_name check_node_cpu_util - } - - define command { - command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_network_conntrack_usage_80percent' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Node network connections are more than 90% in use' --ok_message 'OK- Network connection utilization is normal' - command_name check_network_connections - } - - define command { - command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_memory_load' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Node memory usage is more than 85%' --ok_message 'OK- Node memory usage is less than 85%' - command_name check_memory_usage - } - - define command { - command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_disk_write_latency' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Disk write latency is high' --ok_message 'OK- Node disk write latency is normal' - command_name check_disk_write_latency - } - - define command { - command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_disk_read_latency' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Disk read latency is high' --ok_message 'OK- Node disk read latency is normal' - command_name check_disk_read_latency - } - - define command { - command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_entropy_available_low' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- System has low entropy availability' --ok_message 'OK- System entropy availability is sufficient' - command_name check_entropy_availability - } - - define command { - command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_filedescriptors_full_in_3h' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- at current consumption rate no free file descriptors will be available in 3hrs.' --ok_message 'OK- System file descriptor consumption is ok.' - command_name check_filedescriptor_usage_rate - } - - define command { - command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_hwmon_high_cpu_temp' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- CPU temperature is 90 percent of critical temperature.' --ok_message 'OK- CPU temperatures are normal.' - command_name check_hwmon_high_cpu_temp - } - - define command { - command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_network_drop_rcv' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusally high drop in network reception.' --ok_message 'OK- network packet receive drops not high.' 
- command_name check_network_receive_drop_high - } - - define command { - command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_network_drop_send' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusally high drop in network transmission.' --ok_message 'OK- network packet tramsmit drops not high.' - command_name check_network_transmit_drop_high - } - - define command { - command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_network_drop_send' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusally high error rate in network reception.' --ok_message 'OK- network reception errors not high.' - command_name check_network_receive_errors_high - } - - define command { - command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_network_drop_send' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusally high error rate in network transmission.' --ok_message 'OK- network transmission errors not high.' - command_name check_network_transmit_errors_high - } - - define command { - command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_vmstat_paging_rate_high' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Memory paging rate over 5 minutes is high.' --ok_message 'OK- Memory paging rate over 5 minutes is ok.' - command_name check_vmstat_paging_rate - } - - define command { - command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_xfs_block_allocation_high' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- XFS block allocation is more than 80 percent of available.' --ok_message 'OK- XFS block allocation is less than 80 percent of available.' - command_name check_xfs_block_allocation - } - - define command { - command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_network_bond_slaves_down' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- {master} is missing slave interfaces.' --ok_message 'OK- Network bonds have slave interfaces functional.' - command_name check_network_bond_status - } - - define command { - command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_numa_memory_used' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- NUMA memory usage is more than 80 percent of available.' --ok_message 'OK- NUMA memory usage is normal.' - command_name check_numa_memory_usage - } - - define command { - command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_ntp_clock_skew_high' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- NTP clock skew is more than 2 seconds.' --ok_message 'OK- NTP clock skew is less than 2 seconds.' 
- command_name check_ntp_sync - } - - define command { - command_line $USER1$/check_exporter_health_metric.py --exporter_namespace "ceph" --label_selector "application=ceph,component=manager" --health_metric ceph_health_status --critical 2 --warning 1 - command_name check_ceph_health - } - - define command { - command_line $USER1$/query_elasticsearch.py $USER9$ '$ARG1$' '$ARG2$' '$ARG3$' '$ARG4$' '$ARG5$' --simple_query '$ARG6$' --simple_query_fields '$ARG7$' --match '$ARG8$' --range '$ARG9$' - command_name check_es_query - } - - define command { - command_line $USER1$/query_elasticsearch.py $USER9$ '$ARG1$' '$ARG2$' '$ARG3$' '$ARG4$' '$ARG5$' --simple_query '$ARG6$' --simple_query_fields '$ARG7$' --query_file '/opt/nagios/etc/objects/query_es_clauses.json' --query_clause '$ARG8$' --match '$ARG9$' --range '$ARG10$' - command_name check_es_query_w_file - } - - define service { - check_interval 60 - contact_groups snmp_and_http_notifying_contact_group - flap_detection_enabled 0 - name notifying_service - notification_interval 120 - process_perf_data 0 - register 0 - retry_interval 30 - use generic-service - } - - define service { - check_command check_ceph_health - check_interval 300 - hostgroup_name base-os - service_description CEPH_health - use notifying_service - } - - define service { - check_command check_prom_alert!K8SNodesNotReady!CRITICAL- One or more nodes are not ready. - check_interval 60 - hostgroup_name prometheus-hosts - service_description Nodes_health - use generic-service - } - - define service { - check_command check_prom_alert_with_labels!replicas_unavailable_statefulset!statefulset="prometheus"!statefulset {statefulset} has lesser than configured replicas - check_interval 60 - hostgroup_name prometheus-hosts - service_description Prometheus_replica-count - use notifying_service - } - - define service { - check_command check_prom_alert_with_labels!replicas_unavailable_statefulset!statefulset="alertmanager"!statefulset {statefulset} has lesser than configured replicas - check_interval 60 - hostgroup_name prometheus-hosts - service_description PrometheusAlertmanager_replica-count - use notifying_service - } - - define service { - check_command check_prom_alert!replicas_unavailable_statefulset!CRITICAL- statefulset {statefulset} has lesser than configured replicas!OK- All statefulsets have configured amount of replicas - check_interval 60 - hostgroup_name prometheus-hosts - service_description Statefulset_replica-count - use notifying_service - } - - define service { - check_command check_prom_alert!daemonsets_misscheduled!CRITICAL- Daemonset {daemonset} is incorrectly scheudled!OK- No daemonset misscheduling detected - check_interval 60 - hostgroup_name prometheus-hosts - service_description Daemonset_misscheduled - use notifying_service - } - - define service { - check_command check_prom_alert!daemonsets_not_scheduled!CRITICAL- Daemonset {daemonset} is missing to be scheduled in some nodes!OK- All daemonset scheduling is as desired - check_interval 60 - hostgroup_name prometheus-hosts - service_description Daemonset_not-scheduled - use notifying_service - } - - define service { - check_command check_prom_alert!daemonset_pods_unavailable!CRITICAL- Daemonset {daemonset} has pods unavailable!OK- All daemonset pods available - check_interval 60 - hostgroup_name prometheus-hosts - service_description Daemonset_pods-unavailable - use notifying_service - } - - define service { - check_command check_prom_alert!deployment_replicas_unavailable!CRITICAL- Deployment {deployment} has 
less than desired replicas!OK- All deployments have desired replicas - check_interval 60 - hostgroup_name prometheus-hosts - service_description Deployment_replicas-unavailable - use notifying_service - } - - define service { - check_command check_prom_alert!volume_claim_capacity_high_utilization!CRITICAL- Volume claim {persistentvolumeclaim} has exceed 80% utilization!OK- All volume claims less than 80% utilization - check_interval 60 - hostgroup_name prometheus-hosts - service_description Volume_claim_high_utilization - use notifying_service - } - - define service { - check_command check_prom_alert!rollingupdate_deployment_replica_less_than_spec_max_unavailable!CRITICAL- Deployment {deployment} has less than desired replicas during a rolling update!OK- All deployments have desired replicas - check_interval 60 - hostgroup_name prometheus-hosts - service_description RollingUpdate_Deployment-replicas-unavailable - use notifying_service - } - - define service { - check_command check_prom_alert!job_status_failed!CRITICAL- Job {exported_job} has failed!OK- No Job failures - check_interval 60 - hostgroup_name prometheus-hosts - service_description Job_status-failed - use notifying_service - } - - define service { - check_command check_prom_alert!pod_status_pending!CRITICAL- Pod {pod} in namespace {namespace} has been in pending status for more than 10 minutes!OK- No pods in pending status - check_interval 60 - hostgroup_name prometheus-hosts - service_description Pod_status-pending - use notifying_service - } - - define service { - check_command check_prom_alert!pod_status_error_image_pull!CRITICAL- Pod {pod} in namespace {namespace} has been in errpr status of ErrImagePull for more than 10 minutes!OK- No pods in error status - check_interval 60 - hostgroup_name prometheus-hosts - service_description Pod_status-error-image-pull - use notifying_service - } - - define service { - check_command check_prom_alert! pod_status_error_image_pull_backoff!CRITICAL- Pod {pod} in namespace {namespace} has been in errpr status of ImagePullBackOff for more than 10 minutes!OK- No pods in error status - check_interval 60 - hostgroup_name prometheus-hosts - service_description Pod_status-error-image-pull - use notifying_service - } - - define service { - check_command check_prom_alert! 
pod_error_config_error!CRITICAL- Pod {pod} in namespace {namespace} has been in errpr status of CreateContainerConfigError for more than 10 minutes!OK- No pods in error status - check_interval 60 - hostgroup_name prometheus-hosts - service_description Pod_status-error-image-pull - use notifying_service - } - - define service { - check_command check_prom_alert!pod_error_crash_loop_back_off!CRITICAL- Pod {pod} in namespace {namespace} has been in error status of CrashLoopBackOff for more than 10 minutes!OK- No pods in crashLoopBackOff status - check_interval 60 - hostgroup_name prometheus-hosts - service_description Pod_status-crashLoopBackOff - use notifying_service - } - - define service { - check_command check_prom_alert!replicaset_missing_replicas!CRITICAL- Replicaset {replicaset} is missing replicas!OK- No replicas missing from replicaset - check_interval 60 - hostgroup_name prometheus-hosts - service_description Replicaset_missing-replicas - use notifying_service - } - - define service { - check_command check_prom_alert!pod_container_terminated!CRITICAL- pod {pod} in namespace {namespace} has a container in terminated state!OK- pod container status looks good - check_interval 60 - hostgroup_name prometheus-hosts - service_description Pod_status-container-terminated - use notifying_service - } - - define service { - check_command check_prom_alert!os_glance_api_availability!CRITICAL- Glance API at {url} is not available!OK- Glance API is available - check_interval 60 - hostgroup_name prometheus-hosts - service_description API_glance - use notifying_service - } - - define service { - check_command check_prom_alert!os_nova_api_availability!CRITICAL- Nova API at {url} is not available!OK- Nova API is available - check_interval 60 - hostgroup_name prometheus-hosts - service_description API_nova - use notifying_service - } - - define service { - check_command check_prom_alert!os_keystone_api_availability!CRITICAL- Keystone API at {url} is not available!OK- Keystone API is available - check_interval 60 - hostgroup_name prometheus-hosts - service_description API_keystone - use notifying_service - } - - define service { - check_command check_prom_alert!os_neutron_api_availability!CRITICAL- Neutron API at {url} is not available!OK- Neutron API is available - check_interval 60 - hostgroup_name prometheus-hosts - service_description API_neutron - use notifying_service - } - - define service { - check_command check_prom_alert!os_neutron_metadata_agent_availability!CRITICAL- Some Neutron metadata agents are not available!OK- All the neutron metadata agents are up - check_interval 60 - hostgroup_name prometheus-hosts - service_description Service_neutron-metadata-agent - use notifying_service - } - - define service { - check_command check_prom_alert!os_neutron_openvswitch_agent_availability!CRITICAL- Some Neutron openvswitch agents are not available!OK- All the neutron openvswitch agents are up - check_interval 60 - hostgroup_name prometheus-hosts - service_description Service_neutron-openvswitch-agent - use notifying_service - } - - define service { - check_command check_prom_alert!os_neutron_dhcp_agent_availability!CRITICAL- Some Neutron dhcp agents are not available!OK- All the neutron dhcp agents are up - check_interval 60 - hostgroup_name prometheus-hosts - service_description Service_neutron-dhcp-agent - use notifying_service - } - - define service { - check_command check_prom_alert!os_neutron_l3_agent_availability!CRITICAL- Some Neutron dhcp agents are not available!OK- All the neutron l3 
agents are up - check_interval 60 - hostgroup_name prometheus-hosts - service_description Service_neutron-l3-agent - use notifying_service - } - - define service { - check_command check_prom_alert!os_swift_api_availability!CRITICAL- Swift API at {url} is not available!OK- Swift API is available - check_interval 60 - hostgroup_name prometheus-hosts - service_description API_swift - use notifying_service - } - - define service { - check_command check_prom_alert!os_cinder_api_availability!CRITICAL- Cinder API at {url} is not available!OK- Cinder API is available - hostgroup_name prometheus-hosts - service_description API_cinder - use notifying_service - } - - define service { - check_command check_prom_alert!os_heat_api_availability!CRITICAL- Heat API at {url} is not available!OK- Heat API is available - check_interval 60 - hostgroup_name prometheus-hosts - service_description API_heat - use notifying_service - } - - define service { - check_command check_prom_alert!os_cinder_api_availability!CRITICAL- Cinder API at {url} is not available!OK- Cinder API is available - check_interval 60 - hostgroup_name prometheus-hosts - service_description API_cinder - use notifying_service - } - - define service { - check_command check_prom_alert!os_cinder_scheduler_availability!CRITICAL- Cinder scheduler is not available!OK- Cinder scheduler is available - check_interval 60 - hostgroup_name prometheus-hosts - service_description Service_cinder-scheduler - use notifying_service - } - - define service { - check_command check_prom_alert!os_nova_compute_down!CRITICAL- nova-compute services are down on certain hosts!OK- nova-compute services are up on all hosts - check_interval 60 - hostgroup_name prometheus-hosts - service_description Service_nova-compute - use notifying_service - } - - define service { - check_command check_prom_alert!os_nova_conductor_down!CRITICAL- nova-conductor services are down on certain hosts!OK- nova-conductor services are up on all hosts - check_interval 60 - hostgroup_name prometheus-hosts - service_description Service_nova-conductor - use notifying_service - } - - define service { - check_command check_prom_alert!os_nova_consoleauth_down!CRITICAL- nova-consoleauth services are down on certain hosts!OK- nova-consoleauth services are up on all hosts - check_interval 60 - hostgroup_name prometheus-hosts - service_description Service_nova-consoleauth - use notifying_service - } - - define service { - check_command check_prom_alert!openstack_nova_scheduler_down!CRITICAL- nova-scheduler services are down on certain hosts!OK- nova-scheduler services are up on all hosts - check_interval 60 - hostgroup_name prometheus-hosts - service_description Service_nova-scheduler - use notifying_service - } - - define service { - check_command check_prom_alert!os_vm_vcpu_usage_high!CRITICAL- vcpu usage for openstack VMs is more than 80 percent of available!OK- Openstack VMs vcpu usage is less than 80 percent of available. - check_interval 60 - hostgroup_name prometheus-hosts - service_description OS-Total-Quota_VCPU-usage - use notifying_service - } - - define service { - check_command check_prom_alert!os_vm_ram_usage_high!CRITICAL- RAM usage for openstack VMs is more than 80 percent of available!OK- Openstack VMs RAM usage is less than 80 percent of available. 
- check_interval 60 - hostgroup_name prometheus-hosts - service_description OS-Total-Quota_RAM-usage - use notifying_service - } - - define service { - check_command check_prom_alert!os_vm_disk_usage_high!CRITICAL- Disk usage for openstack VMs is more than 80 percent of available!OK- Openstack VMs Disk usage is less than 80 percent of available. - check_interval 60 - hostgroup_name prometheus-hosts - service_description OS-Total-Quota_Disk-usage - use notifying_service - } - - define service { - check_command check_prom_alert!ceph_monitor_quorum_low!CRITICAL- ceph monitor quorum does not exist!OK- ceph monitor quorum exists - check_interval 60 - hostgroup_name prometheus-hosts - service_description CEPH_quorum - use notifying_service - } - - define service { - check_command check_prom_alert!ceph_cluster_usage_high!CRITICAL- ceph cluster storage is more than 80 percent!OK- ceph storage is less than 80 percent - check_interval 60 - hostgroup_name prometheus-hosts - service_description CEPH_storage-usage - use notifying_service - } - - define service { - check_command check_prom_alert!ceph_placement_group_degrade_pct_high!CRITICAL- ceph cluster PGs down are more than 80 percent!OK- ceph PG degradation is less than 80 percent - check_interval 60 - hostgroup_name prometheus-hosts - service_description CEPH_PGs-degradation - use notifying_service - } - - define service { - check_command check_prom_alert!ceph_osd_down!CRITICAL- One or more CEPH OSDs are down for more than 5 minutes!OK- All the CEPH OSDs are up - check_interval 60 - hostgroup_name prometheus-hosts - service_description CEPH_OSDs-down - use notifying_service - } - - define service { - check_command check_prom_alert!ceph_monitor_clock_skew_high!CRITICAL- CEPH clock skew is more than 2 seconds!OK- CEPH clock skew is less than 2 seconds - check_interval 60 - hostgroup_name prometheus-hosts - service_description CEPH_Clock-skew - use notifying_service - } - - define service { - check_command check_prom_alert!fluentd_not_running!CRITICAL- fluentd is not running on {instance}!OK- Flunetd is working on all nodes - check_interval 60 - hostgroup_name prometheus-hosts - service_description Fluentd_status - use notifying_service - } - - define service { - check_command check_prom_alert_with_labels!etcd_HighNumberOfFailedHTTPRequests!method="DELETE"!CRITICAL- ETCD {instance} has a high HTTP DELETE operations failure!OK- ETCD at {instance} has low or no failures for HTTP DELETE - check_interval 60 - hostgroup_name prometheus-hosts - service_description ETCD_high-http-delete-failures - use notifying_service - } - - define service { - check_command check_prom_alert_with_labels!etcd_HighNumberOfFailedHTTPRequests!method=~"GET|QGET"!CRITICAL- ETCD {instance} has a high HTTP GET operations failure!OK- ETCD at {instance} has low or no failures for HTTP GET - check_interval 60 - hostgroup_name prometheus-hosts - service_description ETCD_high-http-get-failures - use notifying_service - } - - define service { - check_command check_prom_alert_with_labels!etcd_HighNumberOfFailedHTTPRequests!method="PUT"!CRITICAL- ETCD {instance} has a high HTTP PUT operations failure!OK- ETCD at {instance} has low or no failures for HTTP PUT - check_interval 60 - hostgroup_name prometheus-hosts - service_description ETCD_high-http-update-failures - use notifying_service - } - - define service { - check_command check_prom_alert!calico_iptable_save_errors_high_1h!CRITICAL- Felix instance {instance} has seen high iptable save errors within the last hour!OK- iptables save 
errors are none or low - hostgroup_name prometheus-hosts - service_description Calico_iptables-save-errors - use notifying_service - } - - define service { - check_command check_prom_alert!calico_ipset_errors_high_1h!CRITICAL- Felix instance {instance} has seen high ipset errors within the last hour!OK- ipset errors are none or low - hostgroup_name prometheus-hosts - service_description Calico_ipset-errors - use notifying_service - } - - define service { - check_command check_prom_alert!calico_datapane_iface_msg_batch_size_high_5m!CRITICAL- Felix instance {instance} has seen a high value of dataplane interface message batch size!OK- dataplane interface message batch size are low - hostgroup_name prometheus-hosts - service_description Calico_interface-message-batch-size - use notifying_service - } - - define service { - check_command check_prom_alert!calico_datapane_address_msg_batch_size_high_5m!CRITICAL- Felix instance {instance} has seen a high value of dataplane address message batch size!OK- dataplane address message batch size are low - hostgroup_name prometheus-hosts - service_description Calico_address-message-batch-size - use notifying_service - } - - define service { - check_command check_prom_alert!calico_datapane_failures_high_1h!CRITICAL- Felix instance {instance} has seen high dataplane failures within the last hour!OK- datapane failures are none or low - hostgroup_name prometheus-hosts - service_description Calico_datapane_failures_high - use notifying_service - } - - define service { - check_command check_prom_alert!rabbitmq_network_pratitions_detected!CRITICAL- Rabbitmq instance {instance} has network partitions!OK- no network partitions detected in rabbitmq - hostgroup_name prometheus-hosts - service_description Rabbitmq_network-partitions-exist - use generic-service - } - - define service { - check_command check_prom_alert!rabbitmq_down!CRITICAL- Rabbitmq instance {instance} is down!OK- rabbitmq is available - hostgroup_name prometheus-hosts - service_description Rabbitmq_up - use generic-service - } - - define service { - check_command check_prom_alert!rabbitmq_file_descriptor_usage_high!CRITICAL- Rabbitmq instance {instance} has file desciptor usage more than 80 percent!OK- rabbitmq file descriptor usage is normal - hostgroup_name prometheus-hosts - service_description Rabbitmq_file-descriptor-usage - use generic-service - } - - define service { - check_command check_prom_alert!rabbitmq_node_disk_free_alarm!CRITICAL- Rabbitmq instance {instance} has a disk usage alarm!OK- rabbitmq node disk has no alarms - hostgroup_name prometheus-hosts - service_description Rabbitmq_node-disk-alarm - use generic-service - } - - define service { - check_command check_prom_alert!rabbitmq_node_memory_alarm!CRITICAL- Rabbitmq instance {instance} has a memory alarm!OK- rabbitmq node memory has no alarms - hostgroup_name prometheus-hosts - service_description Rabbitmq_node-memory-alarm - use generic-service - } - - define service { - check_command check_prom_alert!rabbitmq_less_than_3_nodes!CRITICAL- Rabbitmq has less than 3 nodes to serve!OK- rabbitmq has atleast 3 nodes serving - hostgroup_name prometheus-hosts - service_description Rabbitmq_high-availability - use generic-service - } - - define service { - check_command check_prom_alert!rabbitmq_queue_messages_returned_high!CRITICAL- Rabbitmq has high percent of messages being returned!OK- rabbitmq messages are consumed and low or no returns exist. 
- hostgroup_name prometheus-hosts - service_description Rabbitmq_message-return-percent - use generic-service - } - - define service { - check_command check_prom_alert!rabbitmq_consumers_low_utilization!CRITICAL- Rabbitmq consumer message consumption rate is slow!OK- rabbitmq message consumption speed is normal - hostgroup_name prometheus-hosts - service_description Rabbitmq_consumer-utilization - use generic-service - } - - define service { - check_command check_prom_alert!rabbitmq_high_message_load!CRITICAL- Rabbitmq unacknowledged message count is high!OK- rabbitmq unacknowledged message count is high - hostgroup_name prometheus-hosts - service_description Rabbitmq_rabbitmq-queue-health - use generic-service - } - - define service { - check_command check_prom_alert!es_high_process_open_files_count!CRITICAL- Elasticsearch {host} has high process open file count!OK- Elasticsearch process open file count is normal. - hostgroup_name prometheus-hosts - service_description ES_high-process-open-file-count - use generic-service - } - - define service { - check_command check_prom_alert!es_high_process_cpu_percent!CRITICAL- Elasticsearch {instance} has high process CPU percent!OK- Elasticsearch process cpu usage is normal. - hostgroup_name prometheus-hosts - service_description ES_high-process-cpu-percent - use generic-service - } - - define service { - check_command check_prom_alert!es_fs_usage_high!CRITICAL- Elasticsearch {instance} has high filesystem usage!OK- Elasticsearch filesystem usage is normal. - hostgroup_name prometheus-hosts - service_description ES_high-filesystem-usage - use generic-service - } - - define service { - check_command check_prom_alert!es_unassigned_shards!CRITICAL- Elasticsearch has unassinged shards!OK- Elasticsearch has no unassigned shards. - hostgroup_name prometheus-hosts - service_description ES_unassigned-shards - use generic-service - } - - define service { - check_command check_prom_alert!es_cluster_health_timed_out!CRITICAL- Elasticsearch Cluster health status call timedout!OK- Elasticsearch cluster health is retrievable. - hostgroup_name prometheus-hosts - service_description ES_cluster-health-timedout - use generic-service - } - - define service { - check_command check_prom_alert!es_cluster_health_status_alert!CRITICAL- Elasticsearch cluster health status is not green. One or more shards or replicas are unallocated!OK- Elasticsearch cluster health is green. - hostgroup_name prometheus-hosts - service_description ES_cluster-health-status - use generic-service - } - - define service { - check_command check_prom_alert!es_cluster_health_too_few_nodes_running!CRITICAL- Elasticsearch Cluster has < 3 nodes running!OK- Elasticsearch cluster has 3 or more nodes running. - hostgroup_name prometheus-hosts - service_description ES_cluster-running-node-count - use generic-service - } - - define service { - check_command check_prom_alert!es_cluster_health_too_few_data_nodes_running!CRITICAL- Elasticsearch Cluster has < 3 data nodes running!OK- Elasticsearch cluster has 3 or more data nodes running. - hostgroup_name prometheus-hosts - service_description ES_cluster-running-data-node-count - use generic-service - } - - define service { - check_command check_prom_alert!mariadb_table_lock_wait_high!CRITICAL- Mariadb has high number of table lock waits!OK- No issues found with table lock waits. 
- hostgroup_name prometheus-hosts - service_description Mariadb_table-lock-waits-high - use generic-service - } - - define service { - check_command check_prom_alert!mariadb_node_not_ready!CRITICAL- Mariadb {instance} is not ready!OK- All galera cluster nodes are ready. - hostgroup_name prometheus-hosts - service_description Mariadb_node-ready - use generic-service - } - - define service { - check_command check_prom_alert!mariadb_galera_node_out_of_sync!CRITICAL- Mariadb {instance} is out of sync!OK- All galera cluster nodes are in sync - hostgroup_name prometheus-hosts - service_description Mariadb_node-synchronized - use generic-service - } - - define service { - check_command check_prom_alert!mariadb_innodb_replication_fallen_behind!CRITICAL- Innodb replication has fallen behind and not recovering!OK- innodb replication lag is nominal. - hostgroup_name prometheus-hosts - service_description Mariadb_innodb-replication-lag - use generic-service - } - - define service { - check_command check_prom_alert!pg_replication_fallen_behind!CRITICAL- Postgres Replication lag is over 2 minutes!OK- postgresql replication lag is nominal. - hostgroup_name prometheus-hosts - service_description Postgresql_replication-lag - use generic-service - } - - define service { - check_command check_prom_alert!pg_connections_too_high!CRITICAL- Postgres has more than 95% of available connections in use.!OK- postgresql open connections are within bounds. - hostgroup_name prometheus-hosts - service_description Postgresql_connections - use generic-service - } - - define service { - check_command check_prom_alert!pg_deadlocks_detected!CRITICAL- Postgres server is experiencing deadlocks!OK- postgresql is not showing any deadlocks. - hostgroup_name prometheus-hosts - service_description Postgresql_deadlocks - use generic-service - } - - define service { - check_command check_prom_alert!prom_exporter_ceph_unavailable!CRITICAL- CEPH exporter is not collecting metrics for alerting!OK- CEPH exporter metrics are available. - hostgroup_name prometheus-hosts - service_description Prometheus-exporter_CEPH - use generic-service - } - - define service { - check_command check_prom_alert!prom_exporter_openstack_unavailable!CRITICAL- Openstack exporter is not collecting metrics for alerting!OK- Openstack exporter metrics are available. - hostgroup_name prometheus-hosts - service_description Prometheus-exporter_Openstack - use generic-service - } - - define service { - check_command check_prom_alert!prom_exporter_mariadb_unavailable!CRITICAL- MariaDB exporter is not collecting metrics for alerting!OK- MariaDB exporter metrics are available. - hostgroup_name prometheus-hosts - service_description Prometheus-exporter_MariaDB - use generic-service - } - - define service { - check_command check_prom_alert!prom_exporter_kube_state_metrics_unavailable!CRITICAL- kube-state-metrics exporter is not collecting metrics for alerting!OK- kube-state-metrics exporter metrics are available. - hostgroup_name prometheus-hosts - service_description Prometheus-exporter_Kube-state-metrics - use generic-service - } - - define service { - check_command check_prom_alert!prom_exporter_postgresql_unavailable!CRITICAL- Postgresql exporter is not collecting metrics for alerting!OK- Postgresql exporter metrics are available. 
- hostgroup_name prometheus-hosts - service_description Prometheus-exporter_Postgresql - use generic-service - } - - define service { - check_command check_prom_alert!prom_exporter_node_unavailable!CRITICAL- Node exporter is not collecting metrics for alerting!OK- Node exporter metrics are available. - hostgroup_name prometheus-hosts - service_description Prometheus-exporter_Node - use generic-service - } - - define service { - check_command check_prom_alert!prom_exporter_calico_unavailable!CRITICAL- Calico exporter is not collecting metrics for alerting!OK- Calico exporter metrics are available. - hostgroup_name prometheus-hosts - service_description Prometheus-exporter_Calico - use generic-service - } - - define service { - check_command check_prom_alert!prom_exporter_elasticsearch_unavailable!CRITICAL- Elasticsearch exporter is not collecting metrics for alerting!OK- Elasticsearch exporter metrics are available. - hostgroup_name prometheus-hosts - service_description Prometheus-exporter_Elasticsearch - use generic-service - } - - define service { - check_command check_prom_alert!prom_exporter_fluentd_unavailable!CRITICAL- Fluentd exporter is not collecting metrics for alerting!OK- Fluentd exporter metrics are available. - hostgroup_name prometheus-hosts - service_description Prometheus-exporter_Fluentd - use generic-service - } - - define service { - check_command check_filespace_mounts-usage-rate-fullin4hrs - check_interval 60 - hostgroup_name base-os - service_description Filespace_mounts-usage-rate-fullin4hrs - use notifying_service - } - - define service { - check_command check_filespace_mounts-usage - check_interval 60 - hostgroup_name base-os - service_description Filespace_mounts-usage - use notifying_service - } - - define service { - check_command check_node_loadavg - hostgroup_name base-os - service_description CPU_Load-average - use notifying_service - } - - define service { - check_command check_node_cpu_util - hostgroup_name base-os - service_description CPU_utilization - use notifying_service - } - - define service { - check_command check_network_connections - hostgroup_name base-os - service_description Network_connections - use notifying_service - } - - define service { - check_command check_memory_usage - hostgroup_name base-os - service_description Memory_usage - use notifying_service - } - - define service { - check_command check_disk_write_latency - hostgroup_name base-os - service_description Disk_write-latency - use notifying_service - } - - define service { - check_command check_disk_read_latency - hostgroup_name base-os - service_description Disk_read-latency - use notifying_service - } - - define service { - check_command check_entropy_availability - hostgroup_name base-os - service_description Entropy_availability - use notifying_service - } - - define service { - check_command check_filedescriptor_usage_rate - hostgroup_name base-os - service_description FileDescriptors_usage-rate-high - use notifying_service - } - - define service { - check_command check_hwmon_high_cpu_temp - hostgroup_name base-os - service_description HW_cpu-temp-high - use notifying_service - } - - define service { - check_command check_network_receive_drop_high - hostgroup_name base-os - service_description Network_receive-drop-high - use notifying_service - } - - define service { - check_command check_network_transmit_drop_high - hostgroup_name base-os - service_description Network_transmit-drop-high - use notifying_service - } - - define service { - check_command 
check_network_receive_errors_high - hostgroup_name base-os - service_description Network_receive-errors-high - use notifying_service - } - - define service { - check_command check_network_transmit_errors_high - hostgroup_name base-os - service_description Network_transmit-errors-high - use notifying_service - } - - define service { - check_command check_vmstat_paging_rate - hostgroup_name base-os - service_description Memory_vmstat-paging-rate - use notifying_service - } - - define service { - check_command check_xfs_block_allocation - hostgroup_name base-os - service_description XFS_block-allocation - use notifying_service - } - - define service { - check_command check_network_bond_status - hostgroup_name base-os - service_description Network_bondstatus - use notifying_service - } - - define service { - check_command check_numa_memory_usage - hostgroup_name base-os - service_description Memory_NUMA-usage - use notifying_service - } - - define service { - check_command check_ntp_sync - hostgroup_name base-os - service_description NTP_sync - use notifying_service - } + base: + template: | + define host { + address 127.0.0.1 + alias Prometheus Monitoring + check_command check-prometheus-host-alive + host_name {{ tuple "monitoring" "public" . | include "helm-toolkit.endpoints.hostname_short_endpoint_lookup" }} + hostgroups prometheus-hosts + use linux-server + } + + define contact { + alias notifying contact + contact_name notifying_contact + host_notification_options d,u,r,f,s + host_notification_period 24x7 + name notifying_contact + register 0 + service_notification_options w,u,c,r,f,s + service_notification_period 24x7 + } + + define contact { + alias snmp contact + contact_name snmp_notifying_contact + host_notification_commands send_host_snmp_trap + name snmp_notifying_contact + service_notification_commands send_service_snmp_trap + use notifying_contact + } + + define contact { + alias HTTP contact + contact_name http_notifying_contact + host_notification_commands send_host_http_post + name http_notifying_contact + service_notification_commands send_service_http_post + use notifying_contact + } + + define contactgroup { + alias SNMP and HTTP notifying group + contactgroup_name snmp_and_http_notifying_contact_group + members snmp_notifying_contact,http_notifying_contact + } + + define hostgroup { + alias Prometheus Virtual Host + hostgroup_name prometheus-hosts + } + + define hostgroup { + alias all + hostgroup_name all + } + + define hostgroup { + alias base-os + hostgroup_name base-os + } + + define command { + command_line $USER1$/send_service_trap.sh '$USER8$' '$HOSTNAME$' '$SERVICEDESC$' $SERVICESTATEID$ '$SERVICEOUTPUT$' '$USER4$' '$USER5$' + command_name send_service_snmp_trap + } + + define command { + command_line $USER1$/send_host_trap.sh '$USER8$' '$HOSTNAME$' $HOSTSTATEID$ '$HOSTOUTPUT$' '$USER4$' '$USER5$' + command_name send_host_snmp_trap + } + + define command { + command_line $USER1$/send_http_post_event.py --type service --hostname '$HOSTNAME$' --servicedesc '$SERVICEDESC$' --state_id $SERVICESTATEID$ --output '$SERVICEOUTPUT$' --monitoring_hostname '$HOSTNAME$' --primary_url '$USER6$' --secondary_url '$USER7$' + command_name send_service_http_post + } + + define command { + command_line $USER1$/send_http_post_event.py --type host --hostname '$HOSTNAME$' --state_id $HOSTSTATEID$ --output '$HOSTOUTPUT$' --monitoring_hostname '$HOSTNAME$' --primary_url '$USER6$' --secondary_url '$USER7$' + command_name send_host_http_post + } + + define command { + command_line 
$USER1$/check_rest_get_api.py --url $USER2$ --warning_response_seconds 5 --critical_response_seconds 10 + command_name check-prometheus-host-alive + } + + define command { + command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname '$ARG1$' --labels_csv '$ARG2$' --msg_format '$ARG3$' --ok_message '$ARG4$' + command_name check_prom_alert_with_labels + } + + define command { + command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname '$ARG1$' --msg_format '$ARG2$' --ok_message '$ARG3$' + command_name check_prom_alert + } + + define service { + check_interval 60 + contact_groups snmp_and_http_notifying_contact_group + flap_detection_enabled 0 + name notifying_service + notification_interval 120 + process_perf_data 0 + register 0 + retry_interval 30 + use generic-service + } + kubernetes: + template: | + define service { + check_command check_prom_alert!prom_exporter_calico_unavailable!CRITICAL- Calico exporter is not collecting metrics for alerting!OK- Calico exporter metrics are available. + hostgroup_name prometheus-hosts + service_description Prometheus-exporter_Calico + use generic-service + } + + define service { + check_command check_prom_alert!prom_exporter_kube_state_metrics_unavailable!CRITICAL- kube-state-metrics exporter is not collecting metrics for alerting!OK- kube-state-metrics exporter metrics are available. + hostgroup_name prometheus-hosts + service_description Prometheus-exporter_Kube-state-metrics + use generic-service + } + + define service { + check_command check_prom_alert!K8SNodesNotReady!CRITICAL- One or more nodes are not ready. + check_interval 60 + hostgroup_name prometheus-hosts + service_description Nodes_health + use generic-service + } + + define service { + check_command check_prom_alert_with_labels!replicas_unavailable_statefulset!statefulset="prometheus"!statefulset {statefulset} has lesser than configured replicas + check_interval 60 + hostgroup_name prometheus-hosts + service_description Prometheus_replica-count + use notifying_service + } + + define service { + check_command check_prom_alert_with_labels!replicas_unavailable_statefulset!statefulset="alertmanager"!statefulset {statefulset} has lesser than configured replicas + check_interval 60 + hostgroup_name prometheus-hosts + service_description PrometheusAlertmanager_replica-count + use notifying_service + } + + define service { + check_command check_prom_alert!replicas_unavailable_statefulset!CRITICAL- statefulset {statefulset} has lesser than configured replicas!OK- All statefulsets have configured amount of replicas + check_interval 60 + hostgroup_name prometheus-hosts + service_description Statefulset_replica-count + use notifying_service + } + + define service { + check_command check_prom_alert!daemonsets_misscheduled!CRITICAL- Daemonset {daemonset} is incorrectly scheudled!OK- No daemonset misscheduling detected + check_interval 60 + hostgroup_name prometheus-hosts + service_description Daemonset_misscheduled + use notifying_service + } + + define service { + check_command check_prom_alert!daemonsets_not_scheduled!CRITICAL- Daemonset {daemonset} is missing to be scheduled in some nodes!OK- All daemonset scheduling is as desired + check_interval 60 + hostgroup_name prometheus-hosts + service_description Daemonset_not-scheduled + use notifying_service + } + + define service { + check_command check_prom_alert!daemonset_pods_unavailable!CRITICAL- Daemonset {daemonset} has pods unavailable!OK- All daemonset pods available + check_interval 60 + 
hostgroup_name prometheus-hosts + service_description Daemonset_pods-unavailable + use notifying_service + } + + define service { + check_command check_prom_alert!deployment_replicas_unavailable!CRITICAL- Deployment {deployment} has less than desired replicas!OK- All deployments have desired replicas + check_interval 60 + hostgroup_name prometheus-hosts + service_description Deployment_replicas-unavailable + use notifying_service + } + + define service { + check_command check_prom_alert!volume_claim_capacity_high_utilization!CRITICAL- Volume claim {persistentvolumeclaim} has exceeded 80% utilization!OK- All volume claims less than 80% utilization + check_interval 60 + hostgroup_name prometheus-hosts + service_description Volume_claim_high_utilization + use notifying_service + } + + define service { + check_command check_prom_alert!rollingupdate_deployment_replica_less_than_spec_max_unavailable!CRITICAL- Deployment {deployment} has less than desired replicas during a rolling update!OK- All deployments have desired replicas + check_interval 60 + hostgroup_name prometheus-hosts + service_description RollingUpdate_Deployment-replicas-unavailable + use notifying_service + } + + define service { + check_command check_prom_alert!job_status_failed!CRITICAL- Job {exported_job} has failed!OK- No Job failures + check_interval 60 + hostgroup_name prometheus-hosts + service_description Job_status-failed + use notifying_service + } + + define service { + check_command check_prom_alert!pod_status_pending!CRITICAL- Pod {pod} in namespace {namespace} has been in pending status for more than 10 minutes!OK- No pods in pending status + check_interval 60 + hostgroup_name prometheus-hosts + service_description Pod_status-pending + use notifying_service + } + + define service { + check_command check_prom_alert!pod_status_error_image_pull!CRITICAL- Pod {pod} in namespace {namespace} has been in error status of ErrImagePull for more than 10 minutes!OK- No pods in error status + check_interval 60 + hostgroup_name prometheus-hosts + service_description Pod_status-error-image-pull + use notifying_service + } + + define service { + check_command check_prom_alert!pod_status_error_image_pull_backoff!CRITICAL- Pod {pod} in namespace {namespace} has been in error status of ImagePullBackOff for more than 10 minutes!OK- No pods in error status + check_interval 60 + hostgroup_name prometheus-hosts + service_description Pod_status-error-image-pull + use notifying_service + } + + define service { + check_command check_prom_alert!
pod_error_config_error!CRITICAL- Pod {pod} in namespace {namespace} has been in error status of CreateContainerConfigError for more than 10 minutes!OK- No pods in error status + check_interval 60 + hostgroup_name prometheus-hosts + service_description Pod_status-error-image-pull + use notifying_service + } + + define service { + check_command check_prom_alert!pod_error_crash_loop_back_off!CRITICAL- Pod {pod} in namespace {namespace} has been in error status of CrashLoopBackOff for more than 10 minutes!OK- No pods in crashLoopBackOff status + check_interval 60 + hostgroup_name prometheus-hosts + service_description Pod_status-crashLoopBackOff + use notifying_service + } + + define service { + check_command check_prom_alert!replicaset_missing_replicas!CRITICAL- Replicaset {replicaset} is missing replicas!OK- No replicas missing from replicaset + check_interval 60 + hostgroup_name prometheus-hosts + service_description Replicaset_missing-replicas + use notifying_service + } + + define service { + check_command check_prom_alert!pod_container_terminated!CRITICAL- pod {pod} in namespace {namespace} has a container in terminated state!OK- pod container status looks good + check_interval 60 + hostgroup_name prometheus-hosts + service_description Pod_status-container-terminated + use notifying_service + } + + define service { + check_command check_prom_alert_with_labels!etcd_HighNumberOfFailedHTTPRequests!method="DELETE"!CRITICAL- ETCD {instance} has a high HTTP DELETE operations failure!OK- ETCD at {instance} has low or no failures for HTTP DELETE + check_interval 60 + hostgroup_name prometheus-hosts + service_description ETCD_high-http-delete-failures + use notifying_service + } + + define service { + check_command check_prom_alert_with_labels!etcd_HighNumberOfFailedHTTPRequests!method=~"GET|QGET"!CRITICAL- ETCD {instance} has a high HTTP GET operations failure!OK- ETCD at {instance} has low or no failures for HTTP GET + check_interval 60 + hostgroup_name prometheus-hosts + service_description ETCD_high-http-get-failures + use notifying_service + } + + define service { + check_command check_prom_alert_with_labels!etcd_HighNumberOfFailedHTTPRequests!method="PUT"!CRITICAL- ETCD {instance} has a high HTTP PUT operations failure!OK- ETCD at {instance} has low or no failures for HTTP PUT + check_interval 60 + hostgroup_name prometheus-hosts + service_description ETCD_high-http-update-failures + use notifying_service + } + + define service { + check_command check_prom_alert!calico_iptable_save_errors_high_1h!CRITICAL- Felix instance {instance} has seen high iptable save errors within the last hour!OK- iptables save errors are none or low + hostgroup_name prometheus-hosts + service_description Calico_iptables-save-errors + use notifying_service + } + + define service { + check_command check_prom_alert!calico_ipset_errors_high_1h!CRITICAL- Felix instance {instance} has seen high ipset errors within the last hour!OK- ipset errors are none or low + hostgroup_name prometheus-hosts + service_description Calico_ipset-errors + use notifying_service + } + + define service { + check_command check_prom_alert!calico_datapane_iface_msg_batch_size_high_5m!CRITICAL- Felix instance {instance} has seen a high value of dataplane interface message batch size!OK- dataplane interface message batch size are low + hostgroup_name prometheus-hosts + service_description Calico_interface-message-batch-size + use notifying_service + } + + define service { + check_command 
check_prom_alert!calico_datapane_address_msg_batch_size_high_5m!CRITICAL- Felix instance {instance} has seen a high value of dataplane address message batch size!OK- dataplane address message batch size are low + hostgroup_name prometheus-hosts + service_description Calico_address-message-batch-size + use notifying_service + } + + define service { + check_command check_prom_alert!calico_datapane_failures_high_1h!CRITICAL- Felix instance {instance} has seen high dataplane failures within the last hour!OK- datapane failures are none or low + hostgroup_name prometheus-hosts + service_description Calico_datapane_failures_high + use notifying_service + } + node: + template: | + define service { + check_command check_prom_alert!prom_exporter_node_unavailable!CRITICAL- Node exporter is not collecting metrics for alerting!OK- Node exporter metrics are available. + hostgroup_name prometheus-hosts + service_description Prometheus-exporter_Node + use generic-service + } + + define command { + command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_filesystem_full_in_4h' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Mountpoint {mountpoint} will be full in four hours' --ok_message 'OK- All mountpoints usage rate is normal' + command_name check_filespace_mounts-usage-rate-fullin4hrs + } + + define command { + command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_filesystem_full_80percent' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Mountpoint {mountpoint} is more than 80 pecent full' --ok_message 'OK- All mountpoints usage is normal' + command_name check_filespace_mounts-usage + } + + define command { + command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_load1_90percent' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Node load average has been more than 90% for the pash hour' --ok_message 'OK- Node load average is normal' + command_name check_node_loadavg + } + + define command { + command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_cpu_util_90percent' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Node CPU utilization has been more than 90% for the pash hour' --ok_message 'OK- Node cpu utilization is normal' + command_name check_node_cpu_util + } + + define command { + command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_network_conntrack_usage_80percent' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Node network connections are more than 90% in use' --ok_message 'OK- Network connection utilization is normal' + command_name check_network_connections + } + + define command { + command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_memory_load' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Node memory usage is more than 85%' --ok_message 'OK- Node memory usage is less than 85%' + command_name check_memory_usage + } + + define command { + command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_disk_write_latency' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Disk write latency is high' --ok_message 'OK- Node disk write latency is normal' + command_name check_disk_write_latency + } + + define command { + command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 
'node_disk_read_latency' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Disk read latency is high' --ok_message 'OK- Node disk read latency is normal' + command_name check_disk_read_latency + } + + define command { + command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_entropy_available_low' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- System has low entropy availability' --ok_message 'OK- System entropy availability is sufficient' + command_name check_entropy_availability + } + + define command { + command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_filedescriptors_full_in_3h' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- at current consumption rate no free file descriptors will be available in 3hrs.' --ok_message 'OK- System file descriptor consumption is ok.' + command_name check_filedescriptor_usage_rate + } + + define command { + command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_hwmon_high_cpu_temp' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- CPU temperature is 90 percent of critical temperature.' --ok_message 'OK- CPU temperatures are normal.' + command_name check_hwmon_high_cpu_temp + } + + define command { + command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_network_drop_rcv' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusually high drop in network reception.' --ok_message 'OK- network packet receive drops not high.' + command_name check_network_receive_drop_high + } + + define command { + command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_network_drop_send' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusually high drop in network transmission.' --ok_message 'OK- network packet transmit drops not high.' + command_name check_network_transmit_drop_high + } + + define command { + command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_network_drop_send' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusually high error rate in network reception.' --ok_message 'OK- network reception errors not high.' + command_name check_network_receive_errors_high + } + + define command { + command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_network_drop_send' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusually high error rate in network transmission.' --ok_message 'OK- network transmission errors not high.' + command_name check_network_transmit_errors_high + } + + define command { + command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_vmstat_paging_rate_high' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Memory paging rate over 5 minutes is high.' --ok_message 'OK- Memory paging rate over 5 minutes is ok.' + command_name check_vmstat_paging_rate + } + + define command { + command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_xfs_block_allocation_high' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- XFS block allocation is more than 80 percent of available.' --ok_message 'OK- XFS block allocation is less than 80 percent of available.' 
+ command_name check_xfs_block_allocation + } + + define command { + command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_network_bond_slaves_down' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- {master} is missing slave interfaces.' --ok_message 'OK- Network bonds have slave interfaces functional.' + command_name check_network_bond_status + } + + define command { + command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_numa_memory_used' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- NUMA memory usage is more than 80 percent of available.' --ok_message 'OK- NUMA memory usage is normal.' + command_name check_numa_memory_usage + } + + define command { + command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_ntp_clock_skew_high' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- NTP clock skew is more than 2 seconds.' --ok_message 'OK- NTP clock skew is less than 2 seconds.' + command_name check_ntp_sync + } + + define service { + check_command check_filespace_mounts-usage-rate-fullin4hrs + check_interval 60 + hostgroup_name base-os + service_description Filespace_mounts-usage-rate-fullin4hrs + use notifying_service + } + + define service { + check_command check_filespace_mounts-usage + check_interval 60 + hostgroup_name base-os + service_description Filespace_mounts-usage + use notifying_service + } + + define service { + check_command check_node_loadavg + hostgroup_name base-os + service_description CPU_Load-average + use notifying_service + } + + define service { + check_command check_node_cpu_util + hostgroup_name base-os + service_description CPU_utilization + use notifying_service + } + + define service { + check_command check_network_connections + hostgroup_name base-os + service_description Network_connections + use notifying_service + } + + define service { + check_command check_memory_usage + hostgroup_name base-os + service_description Memory_usage + use notifying_service + } + + define service { + check_command check_disk_write_latency + hostgroup_name base-os + service_description Disk_write-latency + use notifying_service + } + + define service { + check_command check_disk_read_latency + hostgroup_name base-os + service_description Disk_read-latency + use notifying_service + } + + define service { + check_command check_entropy_availability + hostgroup_name base-os + service_description Entropy_availability + use notifying_service + } + + define service { + check_command check_filedescriptor_usage_rate + hostgroup_name base-os + service_description FileDescriptors_usage-rate-high + use notifying_service + } + + define service { + check_command check_hwmon_high_cpu_temp + hostgroup_name base-os + service_description HW_cpu-temp-high + use notifying_service + } + + define service { + check_command check_network_receive_drop_high + hostgroup_name base-os + service_description Network_receive-drop-high + use notifying_service + } + + define service { + check_command check_network_transmit_drop_high + hostgroup_name base-os + service_description Network_transmit-drop-high + use notifying_service + } + + define service { + check_command check_network_receive_errors_high + hostgroup_name base-os + service_description Network_receive-errors-high + use notifying_service + } + + define service { + check_command check_network_transmit_errors_high + hostgroup_name base-os + service_description Network_transmit-errors-high + use 
notifying_service + } + + define service { + check_command check_vmstat_paging_rate + hostgroup_name base-os + service_description Memory_vmstat-paging-rate + use notifying_service + } + + define service { + check_command check_xfs_block_allocation + hostgroup_name base-os + service_description XFS_block-allocation + use notifying_service + } + + define service { + check_command check_network_bond_status + hostgroup_name base-os + service_description Network_bondstatus + use notifying_service + } + + define service { + check_command check_numa_memory_usage + hostgroup_name base-os + service_description Memory_NUMA-usage + use notifying_service + } + + define service { + check_command check_ntp_sync + hostgroup_name base-os + service_description NTP_sync + use notifying_service + } + ceph: + template: | + define service { + check_command check_prom_alert!prom_exporter_ceph_unavailable!CRITICAL- CEPH exporter is not collecting metrics for alerting!OK- CEPH exporter metrics are available. + hostgroup_name prometheus-hosts + service_description Prometheus-exporter_CEPH + use generic-service + } + + define command { + command_line $USER1$/check_exporter_health_metric.py --exporter_api $USER10$ --health_metric ceph_health_status --critical 2 --warning 1 + command_name check_ceph_health + } + + define service { + check_command check_ceph_health + check_interval 300 + hostgroup_name base-os + service_description CEPH_health + use notifying_service + } + + define service { + check_command check_prom_alert!ceph_monitor_quorum_low!CRITICAL- ceph monitor quorum does not exist!OK- ceph monitor quorum exists + check_interval 60 + hostgroup_name prometheus-hosts + service_description CEPH_quorum + use notifying_service + } + + define service { + check_command check_prom_alert!ceph_cluster_usage_high!CRITICAL- ceph cluster storage is more than 80 percent!OK- ceph storage is less than 80 percent + check_interval 60 + hostgroup_name prometheus-hosts + service_description CEPH_storage-usage + use notifying_service + } + + define service { + check_command check_prom_alert!ceph_placement_group_degrade_pct_high!CRITICAL- ceph cluster PGs down are more than 80 percent!OK- ceph PG degradation is less than 80 percent + check_interval 60 + hostgroup_name prometheus-hosts + service_description CEPH_PGs-degradation + use notifying_service + } + + define service { + check_command check_prom_alert!ceph_osd_down!CRITICAL- One or more CEPH OSDs are down for more than 5 minutes!OK- All the CEPH OSDs are up + check_interval 60 + hostgroup_name prometheus-hosts + service_description CEPH_OSDs-down + use notifying_service + } + + define service { + check_command check_prom_alert!ceph_monitor_clock_skew_high!CRITICAL- CEPH clock skew is more than 2 seconds!OK- CEPH clock skew is less than 2 seconds + check_interval 60 + hostgroup_name prometheus-hosts + service_description CEPH_Clock-skew + use notifying_service + } nagios: template: | accept_passive_host_checks=1 @@ -1416,7 +1040,10 @@ conf: bare_update_check=0 cached_host_check_horizon=15 cached_service_check_horizon=15 - cfg_file=/opt/nagios/etc/nagios_objects.cfg + {{- $objectKeys := keys .Values.conf.nagios.objects -}} + {{- range $object := $objectKeys }} + cfg_file=/opt/nagios/etc/{{$object}}.cfg + {{- end }} cfg_file=/opt/nagios/etc/objects/commands.cfg cfg_file=/opt/nagios/etc/objects/contacts.cfg cfg_file=/opt/nagios/etc/objects/timeperiods.cfg diff --git a/nagios/values_overrides/elasticsearch-objects.yaml b/nagios/values_overrides/elasticsearch-objects.yaml new 
file mode 100644 index 000000000..14119a02f --- /dev/null +++ b/nagios/values_overrides/elasticsearch-objects.yaml @@ -0,0 +1,93 @@ +conf: + nagios: + objects: + fluent: + template: | + define service { + check_command check_prom_alert!fluentd_not_running!CRITICAL- fluentd is not running on {instance}!OK- Fluentd is working on all nodes + check_interval 60 + hostgroup_name prometheus-hosts + service_description Fluentd_status + use notifying_service + } + + define service { + check_command check_prom_alert!prom_exporter_fluentd_unavailable!CRITICAL- Fluentd exporter is not collecting metrics for alerting!OK- Fluentd exporter metrics are available. + hostgroup_name prometheus-hosts + service_description Prometheus-exporter_Fluentd + use generic-service + } + elasticsearch: + template: | + define command { + command_line $USER1$/query_elasticsearch.py $USER9$ '$ARG1$' '$ARG2$' '$ARG3$' '$ARG4$' '$ARG5$' --simple_query '$ARG6$' --simple_query_fields '$ARG7$' --match '$ARG8$' --range '$ARG9$' + command_name check_es_query + } + + define command { + command_line $USER1$/query_elasticsearch.py $USER9$ '$ARG1$' '$ARG2$' '$ARG3$' '$ARG4$' '$ARG5$' --simple_query '$ARG6$' --simple_query_fields '$ARG7$' --query_file '/opt/nagios/etc/objects/query_es_clauses.json' --query_clause '$ARG8$' --match '$ARG9$' --range '$ARG10$' + command_name check_es_query_w_file + } + + define service { + check_command check_prom_alert!prom_exporter_elasticsearch_unavailable!CRITICAL- Elasticsearch exporter is not collecting metrics for alerting!OK- Elasticsearch exporter metrics are available. + hostgroup_name prometheus-hosts + service_description Prometheus-exporter_Elasticsearch + use generic-service + } + + define service { + check_command check_prom_alert!es_high_process_open_files_count!CRITICAL- Elasticsearch {host} has high process open file count!OK- Elasticsearch process open file count is normal. + hostgroup_name prometheus-hosts + service_description ES_high-process-open-file-count + use generic-service + } + + define service { + check_command check_prom_alert!es_high_process_cpu_percent!CRITICAL- Elasticsearch {instance} has high process CPU percent!OK- Elasticsearch process cpu usage is normal. + hostgroup_name prometheus-hosts + service_description ES_high-process-cpu-percent + use generic-service + } + + define service { + check_command check_prom_alert!es_fs_usage_high!CRITICAL- Elasticsearch {instance} has high filesystem usage!OK- Elasticsearch filesystem usage is normal. + hostgroup_name prometheus-hosts + service_description ES_high-filesystem-usage + use generic-service + } + + define service { + check_command check_prom_alert!es_unassigned_shards!CRITICAL- Elasticsearch has unassigned shards!OK- Elasticsearch has no unassigned shards. + hostgroup_name prometheus-hosts + service_description ES_unassigned-shards + use generic-service + } + + define service { + check_command check_prom_alert!es_cluster_health_timed_out!CRITICAL- Elasticsearch Cluster health status call timed out!OK- Elasticsearch cluster health is retrievable. + hostgroup_name prometheus-hosts + service_description ES_cluster-health-timedout + use generic-service + } + + define service { + check_command check_prom_alert!es_cluster_health_status_alert!CRITICAL- Elasticsearch cluster health status is not green. One or more shards or replicas are unallocated!OK- Elasticsearch cluster health is green. 
+ hostgroup_name prometheus-hosts + service_description ES_cluster-health-status + use generic-service + } + + define service { + check_command check_prom_alert!es_cluster_health_too_few_nodes_running!CRITICAL- Elasticsearch Cluster has < 3 nodes running!OK- Elasticsearch cluster has 3 or more nodes running. + hostgroup_name prometheus-hosts + service_description ES_cluster-running-node-count + use generic-service + } + + define service { + check_command check_prom_alert!es_cluster_health_too_few_data_nodes_running!CRITICAL- Elasticsearch Cluster has < 3 data nodes running!OK- Elasticsearch cluster has 3 or more data nodes running. + hostgroup_name prometheus-hosts + service_description ES_cluster-running-data-node-count + use generic-service + } diff --git a/nagios/values_overrides/openstack-objects.yaml b/nagios/values_overrides/openstack-objects.yaml new file mode 100644 index 000000000..07222f7b4 --- /dev/null +++ b/nagios/values_overrides/openstack-objects.yaml @@ -0,0 +1,270 @@ +conf: + nagios: + objects: + mariadb: + template: | + define service { + check_command check_prom_alert!prom_exporter_mariadb_unavailable!CRITICAL- MariaDB exporter is not collecting metrics for alerting!OK- MariaDB exporter metrics are available. + hostgroup_name prometheus-hosts + service_description Prometheus-exporter_MariaDB + use generic-service + } + + define service { + check_command check_prom_alert!mariadb_table_lock_wait_high!CRITICAL- Mariadb has high number of table lock waits!OK- No issues found with table lock waits. + hostgroup_name prometheus-hosts + service_description Mariadb_table-lock-waits-high + use generic-service + } + + define service { + check_command check_prom_alert!mariadb_node_not_ready!CRITICAL- Mariadb {instance} is not ready!OK- All galera cluster nodes are ready. + hostgroup_name prometheus-hosts + service_description Mariadb_node-ready + use generic-service + } + + define service { + check_command check_prom_alert!mariadb_galera_node_out_of_sync!CRITICAL- Mariadb {instance} is out of sync!OK- All galera cluster nodes are in sync + hostgroup_name prometheus-hosts + service_description Mariadb_node-synchronized + use generic-service + } + + define service { + check_command check_prom_alert!mariadb_innodb_replication_fallen_behind!CRITICAL- Innodb replication has fallen behind and not recovering!OK- innodb replication lag is nominal. 
+ hostgroup_name prometheus-hosts + service_description Mariadb_innodb-replication-lag + use generic-service + } + rabbitmq: + template: | + define service { + check_command check_prom_alert!rabbitmq_network_pratitions_detected!CRITICAL- Rabbitmq instance {instance} has network partitions!OK- no network partitions detected in rabbitmq + hostgroup_name prometheus-hosts + service_description Rabbitmq_network-partitions-exist + use generic-service + } + + define service { + check_command check_prom_alert!rabbitmq_down!CRITICAL- Rabbitmq instance {instance} is down!OK- rabbitmq is available + hostgroup_name prometheus-hosts + service_description Rabbitmq_up + use generic-service + } + + define service { + check_command check_prom_alert!rabbitmq_file_descriptor_usage_high!CRITICAL- Rabbitmq instance {instance} has file descriptor usage more than 80 percent!OK- rabbitmq file descriptor usage is normal + hostgroup_name prometheus-hosts + service_description Rabbitmq_file-descriptor-usage + use generic-service + } + + define service { + check_command check_prom_alert!rabbitmq_node_disk_free_alarm!CRITICAL- Rabbitmq instance {instance} has a disk usage alarm!OK- rabbitmq node disk has no alarms + hostgroup_name prometheus-hosts + service_description Rabbitmq_node-disk-alarm + use generic-service + } + + define service { + check_command check_prom_alert!rabbitmq_node_memory_alarm!CRITICAL- Rabbitmq instance {instance} has a memory alarm!OK- rabbitmq node memory has no alarms + hostgroup_name prometheus-hosts + service_description Rabbitmq_node-memory-alarm + use generic-service + } + + define service { + check_command check_prom_alert!rabbitmq_less_than_3_nodes!CRITICAL- Rabbitmq has less than 3 nodes to serve!OK- rabbitmq has at least 3 nodes serving + hostgroup_name prometheus-hosts + service_description Rabbitmq_high-availability + use generic-service + } + + define service { + check_command check_prom_alert!rabbitmq_queue_messages_returned_high!CRITICAL- Rabbitmq has high percent of messages being returned!OK- rabbitmq messages are consumed and low or no returns exist. 
+ hostgroup_name prometheus-hosts + service_description Rabbitmq_message-return-percent + use generic-service + } + + define service { + check_command check_prom_alert!rabbitmq_consumers_low_utilization!CRITICAL- Rabbitmq consumer message consumption rate is slow!OK- rabbitmq message consumption speed is normal + hostgroup_name prometheus-hosts + service_description Rabbitmq_consumer-utilization + use generic-service + } + + define service { + check_command check_prom_alert!rabbitmq_high_message_load!CRITICAL- Rabbitmq unacknowledged message count is high!OK- rabbitmq unacknowledged message count is low + hostgroup_name prometheus-hosts + service_description Rabbitmq_rabbitmq-queue-health + use generic-service + } + openstack: + template: | + define service { + check_command check_prom_alert!os_glance_api_availability!CRITICAL- Glance API at {url} is not available!OK- Glance API is available + check_interval 60 + hostgroup_name prometheus-hosts + service_description API_glance + use notifying_service + } + + define service { + check_command check_prom_alert!os_nova_api_availability!CRITICAL- Nova API at {url} is not available!OK- Nova API is available + check_interval 60 + hostgroup_name prometheus-hosts + service_description API_nova + use notifying_service + } + + define service { + check_command check_prom_alert!os_keystone_api_availability!CRITICAL- Keystone API at {url} is not available!OK- Keystone API is available + check_interval 60 + hostgroup_name prometheus-hosts + service_description API_keystone + use notifying_service + } + + define service { + check_command check_prom_alert!os_neutron_api_availability!CRITICAL- Neutron API at {url} is not available!OK- Neutron API is available + check_interval 60 + hostgroup_name prometheus-hosts + service_description API_neutron + use notifying_service + } + + define service { + check_command check_prom_alert!os_neutron_metadata_agent_availability!CRITICAL- Some Neutron metadata agents are not available!OK- All the neutron metadata agents are up + check_interval 60 + hostgroup_name prometheus-hosts + service_description Service_neutron-metadata-agent + use notifying_service + } + + define service { + check_command check_prom_alert!os_neutron_openvswitch_agent_availability!CRITICAL- Some Neutron openvswitch agents are not available!OK- All the neutron openvswitch agents are up + check_interval 60 + hostgroup_name prometheus-hosts + service_description Service_neutron-openvswitch-agent + use notifying_service + } + + define service { + check_command check_prom_alert!os_neutron_dhcp_agent_availability!CRITICAL- Some Neutron dhcp agents are not available!OK- All the neutron dhcp agents are up + check_interval 60 + hostgroup_name prometheus-hosts + service_description Service_neutron-dhcp-agent + use notifying_service + } + + define service { + check_command check_prom_alert!os_neutron_l3_agent_availability!CRITICAL- Some Neutron l3 agents are not available!OK- All the neutron l3 agents are up + check_interval 60 + hostgroup_name prometheus-hosts + service_description Service_neutron-l3-agent + use notifying_service + } + + define service { + check_command check_prom_alert!os_swift_api_availability!CRITICAL- Swift API at {url} is not available!OK- Swift API is available + check_interval 60 + hostgroup_name prometheus-hosts + service_description API_swift + use notifying_service + } + + define service { + check_command check_prom_alert!os_cinder_api_availability!CRITICAL- Cinder API at {url} is not available!OK- Cinder API is available + 
check_interval 60 + hostgroup_name prometheus-hosts + service_description API_cinder + use notifying_service + } + + define service { + check_command check_prom_alert!os_heat_api_availability!CRITICAL- Heat API at {url} is not available!OK- Heat API is available + check_interval 60 + hostgroup_name prometheus-hosts + service_description API_heat + use notifying_service + } + + define service { + check_command check_prom_alert!os_cinder_scheduler_availability!CRITICAL- Cinder scheduler is not available!OK- Cinder scheduler is available + check_interval 60 + hostgroup_name prometheus-hosts + service_description Service_cinder-scheduler + use notifying_service + } + + define service { + check_command check_prom_alert!os_nova_compute_down!CRITICAL- nova-compute services are down on certain hosts!OK- nova-compute services are up on all hosts + check_interval 60 + hostgroup_name prometheus-hosts + service_description Service_nova-compute + use notifying_service + } + + define service { + check_command check_prom_alert!os_nova_conductor_down!CRITICAL- nova-conductor services are down on certain hosts!OK- nova-conductor services are up on all hosts + check_interval 60 + hostgroup_name prometheus-hosts + service_description Service_nova-conductor + use notifying_service + } + + define service { + check_command check_prom_alert!os_nova_consoleauth_down!CRITICAL- nova-consoleauth services are down on certain hosts!OK- nova-consoleauth services are up on all hosts + check_interval 60 + hostgroup_name prometheus-hosts + service_description Service_nova-consoleauth + use notifying_service + } + + define service { + check_command check_prom_alert!openstack_nova_scheduler_down!CRITICAL- nova-scheduler services are down on certain hosts!OK- nova-scheduler services are up on all hosts + check_interval 60 + hostgroup_name prometheus-hosts + service_description Service_nova-scheduler + use notifying_service + } + + define service { + check_command check_prom_alert!os_vm_vcpu_usage_high!CRITICAL- vcpu usage for openstack VMs is more than 80 percent of available!OK- Openstack VMs vcpu usage is less than 80 percent of available. + check_interval 60 + hostgroup_name prometheus-hosts + service_description OS-Total-Quota_VCPU-usage + use notifying_service + } + + define service { + check_command check_prom_alert!os_vm_ram_usage_high!CRITICAL- RAM usage for openstack VMs is more than 80 percent of available!OK- Openstack VMs RAM usage is less than 80 percent of available. + check_interval 60 + hostgroup_name prometheus-hosts + service_description OS-Total-Quota_RAM-usage + use notifying_service + } + + define service { + check_command check_prom_alert!os_vm_disk_usage_high!CRITICAL- Disk usage for openstack VMs is more than 80 percent of available!OK- Openstack VMs Disk usage is less than 80 percent of available. + check_interval 60 + hostgroup_name prometheus-hosts + service_description OS-Total-Quota_Disk-usage + use notifying_service + } + + define service { + check_command check_prom_alert!prom_exporter_openstack_unavailable!CRITICAL- Openstack exporter is not collecting metrics for alerting!OK- Openstack exporter metrics are available. 
+ hostgroup_name prometheus-hosts + service_description Prometheus-exporter_Openstack + use generic-service + } diff --git a/nagios/values_overrides/postgresql-objects.yaml b/nagios/values_overrides/postgresql-objects.yaml new file mode 100644 index 000000000..caed1789f --- /dev/null +++ b/nagios/values_overrides/postgresql-objects.yaml @@ -0,0 +1,32 @@ +conf: + nagios: + objects: + postgresql: + template: | + define service { + check_command check_prom_alert!prom_exporter_postgresql_unavailable!CRITICAL- Postgresql exporter is not collecting metrics for alerting!OK- Postgresql exporter metrics are available. + hostgroup_name prometheus-hosts + service_description Prometheus-exporter_Postgresql + use generic-service + } + + define service { + check_command check_prom_alert!pg_replication_fallen_behind!CRITICAL- Postgres Replication lag is over 2 minutes!OK- postgresql replication lag is nominal. + hostgroup_name prometheus-hosts + service_description Postgresql_replication-lag + use generic-service + } + + define service { + check_command check_prom_alert!pg_connections_too_high!CRITICAL- Postgres has more than 95% of available connections in use.!OK- postgresql open connections are within bounds. + hostgroup_name prometheus-hosts + service_description Postgresql_connections + use generic-service + } + + define service { + check_command check_prom_alert!pg_deadlocks_detected!CRITICAL- Postgres server is experiencing deadlocks!OK- postgresql is not showing any deadlocks. + hostgroup_name prometheus-hosts + service_description Postgresql_deadlocks + use generic-service + } diff --git a/tools/deployment/common/nagios.sh b/tools/deployment/common/nagios.sh new file mode 100755 index 000000000..c195a4f3e --- /dev/null +++ b/tools/deployment/common/nagios.sh @@ -0,0 +1,45 @@ +#!/bin/bash + +# Copyright 2017 The Openstack-Helm Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+ +set -xe + +#NOTE: Lint and package chart +make nagios + +#NOTE: Deploy command +tee /tmp/nagios.yaml << EOF +conf: + nagios: + query_es_clauses: + test_es_query: + hello: world +EOF +helm upgrade --install nagios ./nagios \ + --namespace=osh-infra \ + --values=/tmp/nagios.yaml \ + --values=nagios/values_overrides/openstack-objects.yaml \ + --values=nagios/values_overrides/postgresql-objects.yaml \ + --values=nagios/values_overrides/elasticsearch-objects.yaml + +#NOTE: Wait for deploy +./tools/deployment/common/wait-for-pods.sh osh-infra + +#NOTE: Validate Deployment info +helm status nagios + +#NOTE: Verify elasticsearch query clauses are functional by execing into pod +NAGIOS_POD=$(kubectl -n osh-infra get pods -l='application=nagios,component=monitoring' --output=jsonpath='{.items[0].metadata.name}') +kubectl exec $NAGIOS_POD -n osh-infra -c nagios -- cat /opt/nagios/etc/objects/query_es_clauses.json | python -m json.tool diff --git a/tools/deployment/osh-infra-monitoring/120-nagios.sh b/tools/deployment/osh-infra-monitoring/120-nagios.sh deleted file mode 100755 index bf585f61c..000000000 --- a/tools/deployment/osh-infra-monitoring/120-nagios.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash - -# Copyright 2017 The Openstack-Helm Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -set -xe - -#NOTE: Lint and package chart -make nagios - -#NOTE: Deploy command -helm upgrade --install nagios ./nagios \ - --namespace=osh-infra - -#NOTE: Wait for deploy -./tools/deployment/common/wait-for-pods.sh osh-infra - -#NOTE: Validate Deployment info -helm status nagios - -helm test nagios diff --git a/tools/deployment/osh-infra-monitoring/120-nagios.sh b/tools/deployment/osh-infra-monitoring/120-nagios.sh new file mode 120000 index 000000000..300a142bb --- /dev/null +++ b/tools/deployment/osh-infra-monitoring/120-nagios.sh @@ -0,0 +1 @@ +../common/nagios.sh \ No newline at end of file
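
Note on extending the objects map: with conf.nagios.objects keyed by object type, an additional object file can be supplied through a standalone values override; each key is rendered to /opt/nagios/etc/<key>.cfg, mounted into the pod, and referenced by the cfg_file loop in nagios.cfg. A minimal sketch, where the "custom" key, the alert name and the file name custom-objects.yaml are illustrative only and not part of this change:

    conf:
      nagios:
        objects:
          custom:
            template: |
              define service {
                check_command check_prom_alert!my_custom_alert!CRITICAL- custom alert fired!OK- no custom alert firing
                hostgroup_name prometheus-hosts
                service_description Custom_my-alert
                use generic-service
              }

Such an override would be applied the same way as the ones above, e.g. by appending --values=custom-objects.yaml to the helm upgrade command in tools/deployment/common/nagios.sh.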