Merge "Update Prometheus to version 2.0"
commit 8d309f5cff
@@ -45,7 +45,7 @@ pod:
     alertmanager:
       init_container: null
   replicas:
-    alertmanager: 1
+    alertmanager: 3
   lifecycle:
     upgrades:
       revision_history: 3
@@ -21,14 +21,15 @@ COMMAND="${@:-start}"
 
 function start () {
   exec /bin/prometheus \
-    -config.file=/etc/config/prometheus.yml \
-    -alertmanager.url={{ tuple "alerts" "internal" "api" . | include "helm-toolkit.endpoints.host_and_port_endpoint_uri_lookup" }} \
-    -storage.local.path={{ .Values.conf.prometheus.storage.local.path }} \
-    -storage.local.retention={{ .Values.conf.prometheus.storage.local.retention }} \
-    -log.format={{ .Values.conf.prometheus.log.format | quote }} \
-    -log.level={{ .Values.conf.prometheus.log.level | quote }} \
-    -query.max-concurrency={{ .Values.conf.prometheus.query.max_concurrency }} \
-    -query.timeout={{ .Values.conf.prometheus.query.timeout }}
+    --config.file=/etc/config/prometheus.yml \
+    --log.level={{ .Values.conf.prometheus.log.level | quote }} \
+    --query.max-concurrency={{ .Values.conf.prometheus.query.max_concurrency }} \
+    --storage.tsdb.path={{ .Values.conf.prometheus.storage.tsdb.path }} \
+    --storage.tsdb.retention={{ .Values.conf.prometheus.storage.tsdb.retention }} \
+{{ if .Values.conf.prometheus.web_admin_api.enabled }}
+    --web.enable-admin-api \
+{{ end }}
+    --query.timeout={{ .Values.conf.prometheus.query.timeout }}
 }
 
 function stop () {
@@ -19,7 +19,7 @@
 
 images:
   tags:
-    prometheus: docker.io/prom/prometheus:v1.7.1
+    prometheus: docker.io/prom/prometheus:v2.0.0
     helm_tests: docker.io/kolla/ubuntu-source-kolla-toolbox:3.0.3
     dep_check: quay.io/stackanetes/kubernetes-entrypoint:v0.2.1
     image_repo_sync: docker.io/docker:17.07.0
@@ -185,15 +185,17 @@ manifests:
 conf:
   prometheus:
     storage:
-      local:
+      tsdb:
         path: /var/lib/prometheus/data
-        retention: 168h0m0s
+        retention: 7d
     log:
       format: logger:stdout?json=true
       level: info
     query:
       max_concurrency: 20
-      timeout: 2m0s
+      timeout: 2m
+    web_admin_api:
+      enabled: true
   scrape_configs: |
     global:
       scrape_interval: 25s
@@ -409,508 +411,384 @@ conf:
     alerting:
       alertmanagers:
       - kubernetes_sd_configs:
-          - role: endpoints
-        scheme: http
+          - role: pod
         relabel_configs:
-          - action: keep
-            source_labels:
-              - __meta_kubernetes_service_name
-            regex: alerts-api
-          - action: keep
-            source_labels:
-              - __meta_kubernetes_namespace
-            regex: monitoring
-          - action: keep
-            source_labels:
-              - __meta_kubernetes_endpoint_port_name
-            regex: alerts-api
+          - source_labels: [__meta_kubernetes_pod_label_name]
+            regex: alertmanager
+            action: keep
+          - source_labels: [__meta_kubernetes_namespace]
+            regex: openstack
+            action: keep
+          - source_labels: [__meta_kubernetes_pod_container_port_number]
+            regex:
+            action: drop
   rules:
     alertmanager: |-
-      ALERT AlertmanagerConfigInconsistent
-      IF count_values by (service) ("config_hash", alertmanager_config_hash)
-        / on(service) group_left
-        label_replace(prometheus_operator_alertmanager_spec_replicas, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1
-      FOR 5m
-      LABELS {
-        severity = "critical"
-      }
-      ANNOTATIONS {
-        summary = "Alertmanager configurations are inconsistent",
-        description = "The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync."
-      }
-
-      ALERT AlertmanagerDownOrMissing
-      IF label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", "alertmanager", "(.*)")
-        / on(job) group_right
-        sum by(job) (up) != 1
-      FOR 5m
-      LABELS {
-        severity = "warning"
-      }
-      ANNOTATIONS {
-        summary = "Alertmanager down or not discovered",
-        description = "An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery."
-      }
-
-      ALERT FailedReload
-      IF alertmanager_config_last_reload_successful == 0
-      FOR 10m
-      LABELS {
-        severity = "warning"
-      }
-      ANNOTATIONS {
-        summary = "Alertmanager configuration reload has failed",
-        description = "Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}."
-      }
+      groups:
+      - name: alertmanager.rules
+        rules:
+        - alert: AlertmanagerConfigInconsistent
+          expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service)
+            GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas, "service",
+            "alertmanager-$1", "alertmanager", "(.*)") != 1
+          for: 5m
+          labels:
+            severity: critical
+          annotations:
+            description: The configuration of the instances of the Alertmanager cluster
+              `{{$labels.service}}` are out of sync.
+            summary: Alertmanager configurations are inconsistent
+        - alert: AlertmanagerDownOrMissing
+          expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1",
+            "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1
+          for: 5m
+          labels:
+            severity: warning
+          annotations:
+            description: An unexpected number of Alertmanagers are scraped or Alertmanagers
+              disappeared from discovery.
+            summary: Alertmanager down or not discovered
+        - alert: FailedReload
+          expr: alertmanager_config_last_reload_successful == 0
+          for: 10m
+          labels:
+            severity: warning
+          annotations:
+            description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace
+              }}/{{ $labels.pod}}.
+            summary: Alertmanager configuration reload has failed
     etcd3: |-
-      # general cluster availability
-      # alert if another failed member will result in an unavailable cluster
-      ALERT InsufficientMembers
-      IF count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)
-      FOR 3m
-      LABELS {
-        severity = "critical"
-      }
-      ANNOTATIONS {
-        summary = "etcd cluster insufficient members",
-        description = "If one more etcd member goes down the cluster will be unavailable",
-      }
-
-      # etcd leader alerts
-      # ==================
-      # alert if any etcd instance has no leader
-      ALERT NoLeader
-      IF etcd_server_has_leader{job="etcd"} == 0
-      FOR 1m
-      LABELS {
-        severity = "critical"
-      }
-      ANNOTATIONS {
-        summary = "etcd member has no leader",
-        description = "etcd member {{ $labels.instance }} has no leader",
-      }
-
-      # alert if there are lots of leader changes
-      ALERT HighNumberOfLeaderChanges
-      IF increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3
-      LABELS {
-        severity = "warning"
-      }
-      ANNOTATIONS {
-        summary = "a high number of leader changes within the etcd cluster are happening",
-        description = "etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour",
-      }
-
-      # gRPC request alerts
-      # ===================
-      # alert if more than 1% of gRPC method calls have failed within the last 5 minutes
-      ALERT HighNumberOfFailedGRPCRequests
-      IF sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m]))
-        / sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m])) > 0.01
-      FOR 10m
-      LABELS {
-        severity = "warning"
-      }
-      ANNOTATIONS {
-        summary = "a high number of gRPC requests are failing",
-        description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}",
-      }
-
-      # alert if more than 5% of gRPC method calls have failed within the last 5 minutes
-      ALERT HighNumberOfFailedGRPCRequests
-      IF sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m]))
-        / sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m])) > 0.05
-      FOR 5m
-      LABELS {
-        severity = "critical"
-      }
-      ANNOTATIONS {
-        summary = "a high number of gRPC requests are failing",
-        description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}",
-      }
-
-      # alert if the 99th percentile of gRPC method calls take more than 150ms
-      ALERT GRPCRequestsSlow
-      IF histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) > 0.15
-      FOR 10m
-      LABELS {
-        severity = "critical"
-      }
-      ANNOTATIONS {
-        summary = "slow gRPC requests",
-        description = "on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method }} are slow",
-      }
-
-      # HTTP requests alerts
-      # ====================
-      # alert if more than 1% of requests to an HTTP endpoint have failed within the last 5 minutes
-      ALERT HighNumberOfFailedHTTPRequests
-      IF sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m]))
-        / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.01
-      FOR 10m
-      LABELS {
-        severity = "warning"
-      }
-      ANNOTATIONS {
-        summary = "a high number of HTTP requests are failing",
-        description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
-      }
-
-      # alert if more than 5% of requests to an HTTP endpoint have failed within the last 5 minutes
-      ALERT HighNumberOfFailedHTTPRequests
-      IF sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m]))
-        / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.05
-      FOR 5m
-      LABELS {
-        severity = "critical"
-      }
-      ANNOTATIONS {
-        summary = "a high number of HTTP requests are failing",
-        description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
-      }
-
-      # alert if the 99th percentile of HTTP requests take more than 150ms
-      ALERT HTTPRequestsSlow
-      IF histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15
-      FOR 10m
-      LABELS {
-        severity = "warning"
-      }
-      ANNOTATIONS {
-        summary = "slow HTTP requests",
-        description = "on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow",
-      }
-
-      # etcd member communication alerts
-      # ================================
-      # alert if 99th percentile of round trips take 150ms
-      ALERT EtcdMemberCommunicationSlow
-      IF histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) > 0.15
-      FOR 10m
-      LABELS {
-        severity = "warning"
-      }
-      ANNOTATIONS {
-        summary = "etcd member communication is slow",
-        description = "etcd instance {{ $labels.instance }} member communication with {{ $labels.To }} is slow",
-      }
-
-      # etcd proposal alerts
-      # ====================
-      # alert if there are several failed proposals within an hour
-      ALERT HighNumberOfFailedProposals
-      IF increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
-      LABELS {
-        severity = "warning"
-      }
-      ANNOTATIONS {
-        summary = "a high number of proposals within the etcd cluster are failing",
-        description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour",
-      }
-
-      # etcd disk io latency alerts
-      # ===========================
-      # alert if 99th percentile of fsync durations is higher than 500ms
-      ALERT HighFsyncDurations
-      IF histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5
-      FOR 10m
-      LABELS {
-        severity = "warning"
-      }
-      ANNOTATIONS {
-        summary = "high fsync durations",
-        description = "etcd instance {{ $labels.instance }} fync durations are high",
-      }
-
-      # alert if 99th percentile of commit durations is higher than 250ms
-      ALERT HighCommitDurations
-      IF histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25
-      FOR 10m
-      LABELS {
-        severity = "warning"
-      }
-      ANNOTATIONS {
-        summary = "high commit durations",
-        description = "etcd instance {{ $labels.instance }} commit durations are high",
-      }
+      groups:
+      - name: etcd3.rules
+        rules:
+        - alert: InsufficientMembers
+          expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)
+          for: 3m
+          labels:
+            severity: critical
+          annotations:
+            description: If one more etcd member goes down the cluster will be unavailable
+            summary: etcd cluster insufficient members
+        - alert: NoLeader
+          expr: etcd_server_has_leader{job="etcd"} == 0
+          for: 1m
+          labels:
+            severity: critical
+          annotations:
+            description: etcd member {{ $labels.instance }} has no leader
+            summary: etcd member has no leader
+        - alert: HighNumberOfLeaderChanges
+          expr: increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3
+          labels:
+            severity: warning
+          annotations:
+            description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader
+              changes within the last hour
+            summary: a high number of leader changes within the etcd cluster are happening
+        - alert: HighNumberOfFailedGRPCRequests
+          expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method)
+            / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.01
+          for: 10m
+          labels:
+            severity: warning
+          annotations:
+            description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed
+              on etcd instance {{ $labels.instance }}'
+            summary: a high number of gRPC requests are failing
+        - alert: HighNumberOfFailedGRPCRequests
+          expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method)
+            / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.05
+          for: 5m
+          labels:
+            severity: critical
+          annotations:
+            description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed
+              on etcd instance {{ $labels.instance }}'
+            summary: a high number of gRPC requests are failing
+        - alert: GRPCRequestsSlow
+          expr: histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m]))
+            > 0.15
+          for: 10m
+          labels:
+            severity: critical
+          annotations:
+            description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method
+              }} are slow
+            summary: slow gRPC requests
+        - alert: HighNumberOfFailedHTTPRequests
+          expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m]))
+            BY (method) > 0.01
+          for: 10m
+          labels:
+            severity: warning
+          annotations:
+            description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd
+              instance {{ $labels.instance }}'
+            summary: a high number of HTTP requests are failing
+        - alert: HighNumberOfFailedHTTPRequests
+          expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m]))
+            BY (method) > 0.05
+          for: 5m
+          labels:
+            severity: critical
+          annotations:
+            description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd
+              instance {{ $labels.instance }}'
+            summary: a high number of HTTP requests are failing
+        - alert: HTTPRequestsSlow
+          expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
+            > 0.15
+          for: 10m
+          labels:
+            severity: warning
+          annotations:
+            description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method
+              }} are slow
+            summary: slow HTTP requests
+        - alert: EtcdMemberCommunicationSlow
+          expr: histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m]))
+            > 0.15
+          for: 10m
+          labels:
+            severity: warning
+          annotations:
+            description: etcd instance {{ $labels.instance }} member communication with
+              {{ $labels.To }} is slow
+            summary: etcd member communication is slow
+        - alert: HighNumberOfFailedProposals
+          expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
+          labels:
+            severity: warning
+          annotations:
+            description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal
+              failures within the last hour
+            summary: a high number of proposals within the etcd cluster are failing
+        - alert: HighFsyncDurations
+          expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m]))
+            > 0.5
+          for: 10m
+          labels:
+            severity: warning
+          annotations:
+            description: etcd instance {{ $labels.instance }} fync durations are high
+            summary: high fsync durations
+        - alert: HighCommitDurations
+          expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m]))
+            > 0.25
+          for: 10m
+          labels:
+            severity: warning
+          annotations:
+            description: etcd instance {{ $labels.instance }} commit durations are high
+            summary: high commit durations
     kube_apiserver: |-
-      ALERT K8SApiserverDown
-      IF absent(up{job="apiserver"} == 1)
-      FOR 5m
-      LABELS {
-        severity = "critical"
-      }
-      ANNOTATIONS {
-        summary = "API server unreachable",
-        description = "Prometheus failed to scrape API server(s), or all API servers have disappeared from service discovery.",
-      }
-
-      # Some verbs excluded because they are expected to be long-lasting:
-      # WATCHLIST is long-poll, CONNECT is `kubectl exec`.
-      #
-      # apiserver_request_latencies' unit is microseconds
-      ALERT K8SApiServerLatency
-      IF histogram_quantile(
-          0.99,
-          sum without (instance,resource) (apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH|PROXY"})
-        ) / 1e6 > 1.0
-      FOR 10m
-      LABELS {
-        severity = "warning"
-      }
-      ANNOTATIONS {
-        summary = "Kubernetes apiserver latency is high",
-        description = "99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.",
-      }
+      groups:
+      - name: kube-apiserver.rules
+        rules:
+        - alert: K8SApiserverDown
+          expr: absent(up{job="apiserver"} == 1)
+          for: 5m
+          labels:
+            severity: critical
+          annotations:
+            description: Prometheus failed to scrape API server(s), or all API servers have
+              disappeared from service discovery.
+            summary: API server unreachable
+        - alert: K8SApiServerLatency
+          expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH|PROXY"})
+            WITHOUT (instance, resource)) / 1e+06 > 1
+          for: 10m
+          labels:
+            severity: warning
+          annotations:
+            description: 99th percentile Latency for {{ $labels.verb }} requests to the
+              kube-apiserver is higher than 1s.
+            summary: Kubernetes apiserver latency is high
 
     kube_controller_manager: |-
-      ALERT K8SControllerManagerDown
-      IF absent(up{job="kube-controller-manager"} == 1)
-      FOR 5m
-      LABELS {
-        severity = "critical",
-      }
-      ANNOTATIONS {
-        summary = "Controller manager is down",
-        description = "There is no running K8S controller manager. Deployments and replication controllers are not making progress.",
-        runbook = "https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager",
-      }
+      groups:
+      - name: kube-controller-manager.rules
+        rules:
+        - alert: K8SControllerManagerDown
+          expr: absent(up{job="kube-controller-manager"} == 1)
+          for: 5m
+          labels:
+            severity: critical
+          annotations:
+            description: There is no running K8S controller manager. Deployments and replication
+              controllers are not making progress.
+            runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager
+            summary: Controller manager is down
 
     kubelet: |-
-      ALERT K8SNodeNotReady
-      IF kube_node_status_ready{condition="true"} == 0
-      FOR 1h
-      LABELS {
-        severity = "warning",
-      }
-      ANNOTATIONS {
-        summary = "Node status is NotReady",
-        description = "The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour",
-      }
-
-      ALERT K8SManyNodesNotReady
-      IF
-        count(kube_node_status_ready{condition="true"} == 0) > 1
-        AND
-        (
-          count(kube_node_status_ready{condition="true"} == 0)
-          /
-          count(kube_node_status_ready{condition="true"})
-        ) > 0.2
-      FOR 1m
-      LABELS {
-        severity = "critical",
-      }
-      ANNOTATIONS {
-        summary = "Many Kubernetes nodes are Not Ready",
-        description = "{{ $value }} Kubernetes nodes (more than 10% are in the NotReady state).",
-      }
-
-      ALERT K8SKubeletDown
-      IF count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03
-      FOR 1h
-      LABELS {
-        severity = "warning",
-      }
-      ANNOTATIONS {
-        summary = "Many Kubelets cannot be scraped",
-        description = "Prometheus failed to scrape {{ $value }}% of kubelets.",
-      }
-
-      ALERT K8SKubeletDown
-      IF absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.1
-      FOR 1h
-      LABELS {
-        severity = "critical",
-      }
-      ANNOTATIONS {
-        summary = "Many Kubelets cannot be scraped",
-        description = "Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets have disappeared from service discovery.",
-      }
-
-      ALERT K8SKubeletTooManyPods
-      IF kubelet_running_pod_count > 100
-      LABELS {
-        severity = "warning",
-      }
-      ANNOTATIONS {
-        summary = "Kubelet is close to pod limit",
-        description = "Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110",
-      }
+      groups:
+      - name: kubelet.rules
+        rules:
+        - alert: K8SNodeNotReady
+          expr: kube_node_status_ready{condition="true"} == 0
+          for: 1h
+          labels:
+            severity: warning
+          annotations:
+            description: The Kubelet on {{ $labels.node }} has not checked in with the API,
+              or has set itself to NotReady, for more than an hour
+            summary: Node status is NotReady
+        - alert: K8SManyNodesNotReady
+          expr: count(kube_node_status_ready{condition="true"} == 0) > 1 and (count(kube_node_status_ready{condition="true"}
+            == 0) / count(kube_node_status_ready{condition="true"})) > 0.2
+          for: 1m
+          labels:
+            severity: critical
+          annotations:
+            description: '{{ $value }} Kubernetes nodes (more than 10% are in the NotReady
+              state).'
+            summary: Many Kubernetes nodes are Not Ready
+        - alert: K8SKubeletDown
+          expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03
+          for: 1h
+          labels:
+            severity: warning
+          annotations:
+            description: Prometheus failed to scrape {{ $value }}% of kubelets.
+            summary: Many Kubelets cannot be scraped
+        - alert: K8SKubeletDown
+          expr: absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"})
+            > 0.1
+          for: 1h
+          labels:
+            severity: critical
+          annotations:
+            description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets
+              have disappeared from service discovery.
+            summary: Many Kubelets cannot be scraped
+        - alert: K8SKubeletTooManyPods
+          expr: kubelet_running_pod_count > 100
+          labels:
+            severity: warning
+          annotations:
+            description: Kubelet {{$labels.instance}} is running {{$value}} pods, close
+              to the limit of 110
+            summary: Kubelet is close to pod limit
 
     kubernetes: |-
-      # NOTE: These rules were kindly contributed by the SoundCloud engineering team.
-
-      ### Container resources ###
-
-      cluster_namespace_controller_pod_container:spec_memory_limit_bytes =
-        sum by (cluster,namespace,controller,pod_name,container_name) (
-          label_replace(
-            container_spec_memory_limit_bytes{container_name!=""},
-            "controller", "$1",
-            "pod_name", "^(.*)-[a-z0-9]+"
-          )
-        )
-
-      cluster_namespace_controller_pod_container:spec_cpu_shares =
-        sum by (cluster,namespace,controller,pod_name,container_name) (
-          label_replace(
-            container_spec_cpu_shares{container_name!=""},
-            "controller", "$1",
-            "pod_name", "^(.*)-[a-z0-9]+"
-          )
-        )
-
-      cluster_namespace_controller_pod_container:cpu_usage:rate =
-        sum by (cluster,namespace,controller,pod_name,container_name) (
-          label_replace(
-            irate(
-              container_cpu_usage_seconds_total{container_name!=""}[5m]
-            ),
-            "controller", "$1",
-            "pod_name", "^(.*)-[a-z0-9]+"
-          )
-        )
-
-      cluster_namespace_controller_pod_container:memory_usage:bytes =
-        sum by (cluster,namespace,controller,pod_name,container_name) (
-          label_replace(
-            container_memory_usage_bytes{container_name!=""},
-            "controller", "$1",
-            "pod_name", "^(.*)-[a-z0-9]+"
-          )
-        )
-
-      cluster_namespace_controller_pod_container:memory_working_set:bytes =
-        sum by (cluster,namespace,controller,pod_name,container_name) (
-          label_replace(
-            container_memory_working_set_bytes{container_name!=""},
-            "controller", "$1",
-            "pod_name", "^(.*)-[a-z0-9]+"
-          )
-        )
-
-      cluster_namespace_controller_pod_container:memory_rss:bytes =
-        sum by (cluster,namespace,controller,pod_name,container_name) (
-          label_replace(
-            container_memory_rss{container_name!=""},
-            "controller", "$1",
-            "pod_name", "^(.*)-[a-z0-9]+"
-          )
-        )
-
-      cluster_namespace_controller_pod_container:memory_cache:bytes =
-        sum by (cluster,namespace,controller,pod_name,container_name) (
-          label_replace(
-            container_memory_cache{container_name!=""},
-            "controller", "$1",
-            "pod_name", "^(.*)-[a-z0-9]+"
-          )
-        )
-
-      cluster_namespace_controller_pod_container:disk_usage:bytes =
-        sum by (cluster,namespace,controller,pod_name,container_name) (
-          label_replace(
-            container_disk_usage_bytes{container_name!=""},
-            "controller", "$1",
-            "pod_name", "^(.*)-[a-z0-9]+"
-          )
-        )
-
-      cluster_namespace_controller_pod_container:memory_pagefaults:rate =
-        sum by (cluster,namespace,controller,pod_name,container_name,scope,type) (
-          label_replace(
-            irate(
-              container_memory_failures_total{container_name!=""}[5m]
-            ),
-            "controller", "$1",
-            "pod_name", "^(.*)-[a-z0-9]+"
-          )
-        )
-
-      cluster_namespace_controller_pod_container:memory_oom:rate =
-        sum by (cluster,namespace,controller,pod_name,container_name,scope,type) (
-          label_replace(
-            irate(
-              container_memory_failcnt{container_name!=""}[5m]
-            ),
-            "controller", "$1",
-            "pod_name", "^(.*)-[a-z0-9]+"
-          )
-        )
-
-      ### Cluster resources ###
-
-      cluster:memory_allocation:percent =
-        100 * sum by (cluster) (
-          container_spec_memory_limit_bytes{pod_name!=""}
-        ) / sum by (cluster) (
-          machine_memory_bytes
-        )
-
-      cluster:memory_used:percent =
-        100 * sum by (cluster) (
-          container_memory_usage_bytes{pod_name!=""}
-        ) / sum by (cluster) (
-          machine_memory_bytes
-        )
-
-      cluster:cpu_allocation:percent =
-        100 * sum by (cluster) (
-          container_spec_cpu_shares{pod_name!=""}
-        ) / sum by (cluster) (
-          container_spec_cpu_shares{id="/"} * on(cluster,instance) machine_cpu_cores
-        )
-
-      cluster:node_cpu_use:percent =
-        100 * sum by (cluster) (
-          rate(node_cpu{mode!="idle"}[5m])
-        ) / sum by (cluster) (
-          machine_cpu_cores
-        )
-
-      ### API latency ###
-
-      # Raw metrics are in microseconds. Convert to seconds.
-      cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.99"} =
-        histogram_quantile(
-          0.99,
-          sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket)
-        ) / 1e6
-      cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.9"} =
-        histogram_quantile(
-          0.9,
-          sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket)
-        ) / 1e6
-      cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.5"} =
-        histogram_quantile(
-          0.5,
-          sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket)
-        ) / 1e6
-
-      ### Scheduling latency ###
-
-      cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.99"} =
-        histogram_quantile(0.99,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6
-      cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.9"} =
-        histogram_quantile(0.9,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6
-      cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.5"} =
-        histogram_quantile(0.5,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6
-
-      cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.99"} =
-        histogram_quantile(0.99,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6
-      cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.9"} =
-        histogram_quantile(0.9,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6
-      cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.5"} =
-        histogram_quantile(0.5,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6
-
-      cluster:scheduler_binding_latency:quantile_seconds{quantile="0.99"} =
-        histogram_quantile(0.99,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6
-      cluster:scheduler_binding_latency:quantile_seconds{quantile="0.9"} =
-        histogram_quantile(0.9,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6
-      cluster:scheduler_binding_latency:quantile_seconds{quantile="0.5"} =
-        histogram_quantile(0.5,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6
+      groups:
+      - name: kubernetes.rules
+        rules:
+        - record: cluster_namespace_controller_pod_container:spec_memory_limit_bytes
+          expr: sum(label_replace(container_spec_memory_limit_bytes{container_name!=""},
+            "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
+            controller, pod_name, container_name)
+        - record: cluster_namespace_controller_pod_container:spec_cpu_shares
+          expr: sum(label_replace(container_spec_cpu_shares{container_name!=""}, "controller",
+            "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
+            container_name)
+        - record: cluster_namespace_controller_pod_container:cpu_usage:rate
+          expr: sum(label_replace(irate(container_cpu_usage_seconds_total{container_name!=""}[5m]),
+            "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
+            controller, pod_name, container_name)
+        - record: cluster_namespace_controller_pod_container:memory_usage:bytes
+          expr: sum(label_replace(container_memory_usage_bytes{container_name!=""}, "controller",
+            "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
+            container_name)
+        - record: cluster_namespace_controller_pod_container:memory_working_set:bytes
+          expr: sum(label_replace(container_memory_working_set_bytes{container_name!=""},
+            "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
+            controller, pod_name, container_name)
+        - record: cluster_namespace_controller_pod_container:memory_rss:bytes
+          expr: sum(label_replace(container_memory_rss{container_name!=""}, "controller",
+            "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
+            container_name)
+        - record: cluster_namespace_controller_pod_container:memory_cache:bytes
+          expr: sum(label_replace(container_memory_cache{container_name!=""}, "controller",
+            "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
+            container_name)
+        - record: cluster_namespace_controller_pod_container:disk_usage:bytes
+          expr: sum(label_replace(container_disk_usage_bytes{container_name!=""}, "controller",
+            "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
+            container_name)
+        - record: cluster_namespace_controller_pod_container:memory_pagefaults:rate
+          expr: sum(label_replace(irate(container_memory_failures_total{container_name!=""}[5m]),
+            "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
+            controller, pod_name, container_name, scope, type)
+        - record: cluster_namespace_controller_pod_container:memory_oom:rate
+          expr: sum(label_replace(irate(container_memory_failcnt{container_name!=""}[5m]),
+            "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
+            controller, pod_name, container_name, scope, type)
+        - record: cluster:memory_allocation:percent
+          expr: 100 * sum(container_spec_memory_limit_bytes{pod_name!=""}) BY (cluster)
+            / sum(machine_memory_bytes) BY (cluster)
+        - record: cluster:memory_used:percent
+          expr: 100 * sum(container_memory_usage_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes)
+            BY (cluster)
+        - record: cluster:cpu_allocation:percent
+          expr: 100 * sum(container_spec_cpu_shares{pod_name!=""}) BY (cluster) / sum(container_spec_cpu_shares{id="/"}
+            * ON(cluster, instance) machine_cpu_cores) BY (cluster)
+        - record: cluster:node_cpu_use:percent
+          expr: 100 * sum(rate(node_cpu{mode!="idle"}[5m])) BY (cluster) / sum(machine_cpu_cores)
+            BY (cluster)
+        - record: cluster_resource_verb:apiserver_latency:quantile_seconds
+          expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket) BY (le,
+            cluster, job, resource, verb)) / 1e+06
+          labels:
+            quantile: "0.99"
+        - record: cluster_resource_verb:apiserver_latency:quantile_seconds
+          expr: histogram_quantile(0.9, sum(apiserver_request_latencies_bucket) BY (le,
+            cluster, job, resource, verb)) / 1e+06
+          labels:
+            quantile: "0.9"
+        - record: cluster_resource_verb:apiserver_latency:quantile_seconds
+          expr: histogram_quantile(0.5, sum(apiserver_request_latencies_bucket) BY (le,
+            cluster, job, resource, verb)) / 1e+06
+          labels:
+            quantile: "0.5"
+        - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
+          expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
+            BY (le, cluster)) / 1e+06
+          labels:
+            quantile: "0.99"
+        - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
+          expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
+            BY (le, cluster)) / 1e+06
+          labels:
+            quantile: "0.9"
+        - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
+          expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
+            BY (le, cluster)) / 1e+06
+          labels:
+            quantile: "0.5"
+        - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
+          expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
+            BY (le, cluster)) / 1e+06
+          labels:
+            quantile: "0.99"
+        - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
+          expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
+            BY (le, cluster)) / 1e+06
+          labels:
+            quantile: "0.9"
+        - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
+          expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
+            BY (le, cluster)) / 1e+06
+          labels:
+            quantile: "0.5"
+        - record: cluster:scheduler_binding_latency:quantile_seconds
+          expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket)
+            BY (le, cluster)) / 1e+06
+          labels:
+            quantile: "0.99"
+        - record: cluster:scheduler_binding_latency:quantile_seconds
+          expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket)
+            BY (le, cluster)) / 1e+06
+          labels:
+            quantile: "0.9"
+        - record: cluster:scheduler_binding_latency:quantile_seconds
+          expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket)
+            BY (le, cluster)) / 1e+06
+          labels:
+            quantile: "0.5"
     rabbitmq: |-
 
     mysql: |-