8b29037cec
This is the action item to implement the spec: doc/source/specs/2025.1/chart_versioning.rst Also add overrides env variables - OSH_VALUES_OVERRIDES_PATH - OSH_INFRA_VALUES_OVERRIDES_PATH This commit temporarily disables all jobs that involve scripts in the OSH git repo because they need to be updated to work with the new values_overrides structure in the OSH-infra repo. Once this is merged I4974785c904cf7c8730279854e3ad9b6b7c35498 all these disabled test jobs must be enabled. Depends-On: I327103c18fc0e10e989a17f69b3bff9995c45eb4 Change-Id: I7bfdef3ea2128bbb4e26e3a00161fe30ce29b8e7
96 lines
4.6 KiB
YAML
96 lines
4.6 KiB
YAML
---
|
|
conf:
|
|
nagios:
|
|
objects:
|
|
fluent:
|
|
template: |
|
|
define service {
|
|
check_command check_prom_alert!fluentd_not_running!CRITICAL- fluentd is not running on {instance}!OK- Flunetd is working on all nodes
|
|
check_interval 60
|
|
hostgroup_name prometheus-hosts
|
|
service_description Fluentd_status
|
|
use notifying_service
|
|
}
|
|
|
|
define service {
|
|
check_command check_prom_alert!prom_exporter_fluentd_unavailable!CRITICAL- Fluentd exporter is not collecting metrics for alerting!OK- Fluentd exporter metrics are available.
|
|
hostgroup_name prometheus-hosts
|
|
service_description Prometheus-exporter_Fluentd
|
|
use generic-service
|
|
}
|
|
elasticsearch:
|
|
template: |
|
|
define command {
|
|
command_line $USER1$/query_elasticsearch.py $USER9$ '$ARG1$' '$ARG2$' '$ARG3$' '$ARG4$' '$ARG5$' --simple_query '$ARG6$' --simple_query_fields '$ARG7$' --match '$ARG8$' --range '$ARG9$'
|
|
command_name check_es_query
|
|
}
|
|
|
|
define command {
|
|
command_line $USER1$/query_elasticsearch.py $USER9$ '$ARG1$' '$ARG2$' '$ARG3$' '$ARG4$' '$ARG5$' --simple_query '$ARG6$' --simple_query_fields '$ARG7$' --query_file '/opt/nagios/etc/objects/query_es_clauses.json' --query_clause '$ARG8$' --match '$ARG9$' --range '$ARG10$'
|
|
command_name check_es_query_w_file
|
|
}
|
|
|
|
define service {
|
|
check_command check_prom_alert!prom_exporter_elasticsearch_unavailable!CRITICAL- Elasticsearch exporter is not collecting metrics for alerting!OK- Elasticsearch exporter metrics are available.
|
|
hostgroup_name prometheus-hosts
|
|
service_description Prometheus-exporter_Elasticsearch
|
|
use generic-service
|
|
}
|
|
|
|
define service {
|
|
check_command check_prom_alert!es_high_process_open_files_count!CRITICAL- Elasticsearch {host} has high process open file count!OK- Elasticsearch process open file count is normal.
|
|
hostgroup_name prometheus-hosts
|
|
service_description ES_high-process-open-file-count
|
|
use generic-service
|
|
}
|
|
|
|
define service {
|
|
check_command check_prom_alert!es_high_process_cpu_percent!CRITICAL- Elasticsearch {instance} has high process CPU percent!OK- Elasticsearch process cpu usage is normal.
|
|
hostgroup_name prometheus-hosts
|
|
service_description ES_high-process-cpu-percent
|
|
use generic-service
|
|
}
|
|
|
|
define service {
|
|
check_command check_prom_alert!es_fs_usage_high!CRITICAL- Elasticsearch {instance} has high filesystem usage!OK- Elasticsearch filesystem usage is normal.
|
|
hostgroup_name prometheus-hosts
|
|
service_description ES_high-filesystem-usage
|
|
use generic-service
|
|
}
|
|
|
|
define service {
|
|
check_command check_prom_alert!es_unassigned_shards!CRITICAL- Elasticsearch has unassinged shards!OK- Elasticsearch has no unassigned shards.
|
|
hostgroup_name prometheus-hosts
|
|
service_description ES_unassigned-shards
|
|
use generic-service
|
|
}
|
|
|
|
define service {
|
|
check_command check_prom_alert!es_cluster_health_timed_out!CRITICAL- Elasticsearch Cluster health status call timedout!OK- Elasticsearch cluster health is retrievable.
|
|
hostgroup_name prometheus-hosts
|
|
service_description ES_cluster-health-timedout
|
|
use generic-service
|
|
}
|
|
|
|
define service {
|
|
check_command check_prom_alert!es_cluster_health_status_alert!CRITICAL- Elasticsearch cluster health status is not green. One or more shards or replicas are unallocated!OK- Elasticsearch cluster health is green.
|
|
hostgroup_name prometheus-hosts
|
|
service_description ES_cluster-health-status
|
|
use generic-service
|
|
}
|
|
|
|
define service {
|
|
check_command check_prom_alert!es_cluster_health_too_few_nodes_running!CRITICAL- Elasticsearch Cluster has < 3 nodes running!OK- Elasticsearch cluster has 3 or more nodes running.
|
|
hostgroup_name prometheus-hosts
|
|
service_description ES_cluster-running-node-count
|
|
use generic-service
|
|
}
|
|
|
|
define service {
|
|
check_command check_prom_alert!es_cluster_health_too_few_data_nodes_running!CRITICAL- Elasticsearch Cluster has < 3 data nodes running!OK- Elasticsearch cluster has 3 or more data nodes running.
|
|
hostgroup_name prometheus-hosts
|
|
service_description ES_cluster-running-data-node-count
|
|
use generic-service
|
|
}
|
|
...
|