From ca6322da876ec187fa2268cf43e20a8b639de91f Mon Sep 17 00:00:00 2001 From: Steve Wilkerson Date: Sat, 2 Dec 2017 17:42:47 -0600 Subject: [PATCH] Update Prometheus to version 2.0 Updates the Prometheus chart to use version 2.0 by default. This introduces a change in the rules format (to yaml), and changes the flags required for the storage layer. Change-Id: Icb06a6570683b7accebc142f75901530c6359180 --- alertmanager/values.yaml | 2 +- prometheus/templates/bin/_prometheus.sh.tpl | 17 +- prometheus/values.yaml | 862 +++++++++----------- 3 files changed, 380 insertions(+), 501 deletions(-) diff --git a/alertmanager/values.yaml b/alertmanager/values.yaml index e7e46ffdf..978d25fa8 100644 --- a/alertmanager/values.yaml +++ b/alertmanager/values.yaml @@ -45,7 +45,7 @@ pod: alertmanager: init_container: null replicas: - alertmanager: 1 + alertmanager: 3 lifecycle: upgrades: revision_history: 3 diff --git a/prometheus/templates/bin/_prometheus.sh.tpl b/prometheus/templates/bin/_prometheus.sh.tpl index 2b95c973c..972a82253 100644 --- a/prometheus/templates/bin/_prometheus.sh.tpl +++ b/prometheus/templates/bin/_prometheus.sh.tpl @@ -21,14 +21,15 @@ COMMAND="${@:-start}" function start () { exec /bin/prometheus \ - -config.file=/etc/config/prometheus.yml \ - -alertmanager.url={{ tuple "alerts" "internal" "api" . | include "helm-toolkit.endpoints.host_and_port_endpoint_uri_lookup" }} \ - -storage.local.path={{ .Values.conf.prometheus.storage.local.path }} \ - -storage.local.retention={{ .Values.conf.prometheus.storage.local.retention }} \ - -log.format={{ .Values.conf.prometheus.log.format | quote }} \ - -log.level={{ .Values.conf.prometheus.log.level | quote }} \ - -query.max-concurrency={{ .Values.conf.prometheus.query.max_concurrency }} \ - -query.timeout={{ .Values.conf.prometheus.query.timeout }} + --config.file=/etc/config/prometheus.yml \ + --log.level={{ .Values.conf.prometheus.log.level | quote }} \ + --query.max-concurrency={{ .Values.conf.prometheus.query.max_concurrency }} \ + --storage.tsdb.path={{ .Values.conf.prometheus.storage.tsdb.path }} \ + --storage.tsdb.retention={{ .Values.conf.prometheus.storage.tsdb.retention }} \ + {{ if .Values.conf.prometheus.web_admin_api.enabled }} + --web.enable-admin-api \ + {{ end }} + --query.timeout={{ .Values.conf.prometheus.query.timeout }} } function stop () { diff --git a/prometheus/values.yaml b/prometheus/values.yaml index debda3306..fbddc61df 100644 --- a/prometheus/values.yaml +++ b/prometheus/values.yaml @@ -19,7 +19,7 @@ images: tags: - prometheus: docker.io/prom/prometheus:v1.7.1 + prometheus: docker.io/prom/prometheus:v2.0.0 helm_tests: docker.io/kolla/ubuntu-source-kolla-toolbox:3.0.3 dep_check: quay.io/stackanetes/kubernetes-entrypoint:v0.2.1 image_repo_sync: docker.io/docker:17.07.0 @@ -185,15 +185,17 @@ manifests: conf: prometheus: storage: - local: + tsdb: path: /var/lib/prometheus/data - retention: 168h0m0s + retention: 7d log: format: logger:stdout?json=true level: info query: max_concurrency: 20 - timeout: 2m0s + timeout: 2m + web_admin_api: + enabled: true scrape_configs: | global: scrape_interval: 25s @@ -409,508 +411,384 @@ conf: alerting: alertmanagers: - kubernetes_sd_configs: - - role: endpoints - scheme: http + - role: pod relabel_configs: - - action: keep - source_labels: - - __meta_kubernetes_service_name - regex: alerts-api - - action: keep - source_labels: - - __meta_kubernetes_namespace - regex: monitoring - - action: keep - source_labels: - - __meta_kubernetes_endpoint_port_name - regex: alerts-api + - source_labels: 
[__meta_kubernetes_pod_label_name] + regex: alertmanager + action: keep + - source_labels: [__meta_kubernetes_namespace] + regex: openstack + action: keep + - source_labels: [__meta_kubernetes_pod_container_port_number] + regex: + action: drop rules: alertmanager: |- - ALERT AlertmanagerConfigInconsistent - IF count_values by (service) ("config_hash", alertmanager_config_hash) - / on(service) group_left - label_replace(prometheus_operator_alertmanager_spec_replicas, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1 - FOR 5m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "Alertmanager configurations are inconsistent", - description = "The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync." - } + groups: + - name: alertmanager.rules + rules: + - alert: AlertmanagerConfigInconsistent + expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service) + GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas, "service", + "alertmanager-$1", "alertmanager", "(.*)") != 1 + for: 5m + labels: + severity: critical + annotations: + description: The configuration of the instances of the Alertmanager cluster + `{{$labels.service}}` are out of sync. + summary: Alertmanager configurations are inconsistent + - alert: AlertmanagerDownOrMissing + expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", + "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1 + for: 5m + labels: + severity: warning + annotations: + description: An unexpected number of Alertmanagers are scraped or Alertmanagers + disappeared from discovery. + summary: Alertmanager down or not discovered + - alert: FailedReload + expr: alertmanager_config_last_reload_successful == 0 + for: 10m + labels: + severity: warning + annotations: + description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace + }}/{{ $labels.pod}}. + summary: Alertmanager configuration reload has failed - ALERT AlertmanagerDownOrMissing - IF label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", "alertmanager", "(.*)") - / on(job) group_right - sum by(job) (up) != 1 - FOR 5m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "Alertmanager down or not discovered", - description = "An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery." - } - - ALERT FailedReload - IF alertmanager_config_last_reload_successful == 0 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "Alertmanager configuration reload has failed", - description = "Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}." 
- } etcd3: |- - # general cluster availability - # alert if another failed member will result in an unavailable cluster - ALERT InsufficientMembers + groups: + - name: etcd3.rules + rules: + - alert: InsufficientMembers + expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1) + for: 3m + labels: + severity: critical + annotations: + description: If one more etcd member goes down the cluster will be unavailable + summary: etcd cluster insufficient members + - alert: NoLeader + expr: etcd_server_has_leader{job="etcd"} == 0 + for: 1m + labels: + severity: critical + annotations: + description: etcd member {{ $labels.instance }} has no leader + summary: etcd member has no leader + - alert: HighNumberOfLeaderChanges + expr: increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3 + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader + changes within the last hour + summary: a high number of leader changes within the etcd cluster are happening + - alert: HighNumberOfFailedGRPCRequests + expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) + / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.01 + for: 10m + labels: + severity: warning + annotations: + description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed + on etcd instance {{ $labels.instance }}' + summary: a high number of gRPC requests are failing + - alert: HighNumberOfFailedGRPCRequests + expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) + / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.05 + for: 5m + labels: + severity: critical + annotations: + description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed + on etcd instance {{ $labels.instance }}' + summary: a high number of gRPC requests are failing + - alert: GRPCRequestsSlow + expr: histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) + > 0.15 + for: 10m + labels: + severity: critical + annotations: + description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method + }} are slow + summary: slow gRPC requests + - alert: HighNumberOfFailedHTTPRequests + expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) + BY (method) > 0.01 + for: 10m + labels: + severity: warning + annotations: + description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd + instance {{ $labels.instance }}' + summary: a high number of HTTP requests are failing + - alert: HighNumberOfFailedHTTPRequests + expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) + BY (method) > 0.05 + for: 5m + labels: + severity: critical + annotations: + description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd + instance {{ $labels.instance }}' + summary: a high number of HTTP requests are failing + - alert: HTTPRequestsSlow + expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) + > 0.15 + for: 10m + labels: + severity: warning + annotations: + description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method + }} are slow + summary: slow HTTP requests + - alert: EtcdMemberCommunicationSlow + expr: histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) + > 0.15 + for: 10m + labels: + severity: 
warning + annotations: + description: etcd instance {{ $labels.instance }} member communication with + {{ $labels.To }} is slow + summary: etcd member communication is slow + - alert: HighNumberOfFailedProposals + expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5 + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal + failures within the last hour + summary: a high number of proposals within the etcd cluster are failing + - alert: HighFsyncDurations + expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) + > 0.5 + for: 10m + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} fync durations are high + summary: high fsync durations + - alert: HighCommitDurations + expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) + > 0.25 + for: 10m + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} commit durations are high + summary: high commit durations - IF count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1) - FOR 3m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "etcd cluster insufficient members", - description = "If one more etcd member goes down the cluster will be unavailable", - } - - # etcd leader alerts - # ================== - # alert if any etcd instance has no leader - ALERT NoLeader - IF etcd_server_has_leader{job="etcd"} == 0 - FOR 1m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "etcd member has no leader", - description = "etcd member {{ $labels.instance }} has no leader", - } - - # alert if there are lots of leader changes - ALERT HighNumberOfLeaderChanges - IF increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3 - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "a high number of leader changes within the etcd cluster are happening", - description = "etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour", - } - - # gRPC request alerts - # =================== - # alert if more than 1% of gRPC method calls have failed within the last 5 minutes - ALERT HighNumberOfFailedGRPCRequests - IF sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) - / sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m])) > 0.01 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "a high number of gRPC requests are failing", - description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}", - } - - # alert if more than 5% of gRPC method calls have failed within the last 5 minutes - ALERT HighNumberOfFailedGRPCRequests - IF sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) - / sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m])) > 0.05 - FOR 5m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "a high number of gRPC requests are failing", - description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}", - } - - # alert if the 99th percentile of gRPC method calls take more than 150ms - ALERT GRPCRequestsSlow - IF histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) > 0.15 - FOR 10m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "slow gRPC requests", - description = "on etcd instance 
{{ $labels.instance }} gRPC requests to {{ $labels.grpc_method }} are slow", - } - - # HTTP requests alerts - # ==================== - # alert if more than 1% of requests to an HTTP endpoint have failed within the last 5 minutes - ALERT HighNumberOfFailedHTTPRequests - IF sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m])) - / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.01 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "a high number of HTTP requests are failing", - description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}", - } - - # alert if more than 5% of requests to an HTTP endpoint have failed within the last 5 minutes - ALERT HighNumberOfFailedHTTPRequests - IF sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m])) - / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.05 - FOR 5m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "a high number of HTTP requests are failing", - description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}", - } - - # alert if the 99th percentile of HTTP requests take more than 150ms - ALERT HTTPRequestsSlow - IF histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "slow HTTP requests", - description = "on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow", - } - - # etcd member communication alerts - # ================================ - # alert if 99th percentile of round trips take 150ms - ALERT EtcdMemberCommunicationSlow - IF histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) > 0.15 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "etcd member communication is slow", - description = "etcd instance {{ $labels.instance }} member communication with {{ $labels.To }} is slow", - } - - # etcd proposal alerts - # ==================== - # alert if there are several failed proposals within an hour - ALERT HighNumberOfFailedProposals - IF increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5 - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "a high number of proposals within the etcd cluster are failing", - description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour", - } - - # etcd disk io latency alerts - # =========================== - # alert if 99th percentile of fsync durations is higher than 500ms - ALERT HighFsyncDurations - IF histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "high fsync durations", - description = "etcd instance {{ $labels.instance }} fync durations are high", - } - - # alert if 99th percentile of commit durations is higher than 250ms - ALERT HighCommitDurations - IF histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "high commit durations", - description = "etcd instance {{ $labels.instance }} commit durations are high", - } kube_apiserver: |- - ALERT K8SApiserverDown - IF absent(up{job="apiserver"} == 1) - FOR 5m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "API server unreachable", - 
description = "Prometheus failed to scrape API server(s), or all API servers have disappeared from service discovery.", - } - - # Some verbs excluded because they are expected to be long-lasting: - # WATCHLIST is long-poll, CONNECT is `kubectl exec`. - # - # apiserver_request_latencies' unit is microseconds - ALERT K8SApiServerLatency - IF histogram_quantile( - 0.99, - sum without (instance,resource) (apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH|PROXY"}) - ) / 1e6 > 1.0 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "Kubernetes apiserver latency is high", - description = "99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.", - } + groups: + - name: kube-apiserver.rules + rules: + - alert: K8SApiserverDown + expr: absent(up{job="apiserver"} == 1) + for: 5m + labels: + severity: critical + annotations: + description: Prometheus failed to scrape API server(s), or all API servers have + disappeared from service discovery. + summary: API server unreachable + - alert: K8SApiServerLatency + expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH|PROXY"}) + WITHOUT (instance, resource)) / 1e+06 > 1 + for: 10m + labels: + severity: warning + annotations: + description: 99th percentile Latency for {{ $labels.verb }} requests to the + kube-apiserver is higher than 1s. + summary: Kubernetes apiserver latency is high kube_controller_manager: |- - ALERT K8SControllerManagerDown - IF absent(up{job="kube-controller-manager"} == 1) - FOR 5m - LABELS { - severity = "critical", - } - ANNOTATIONS { - summary = "Controller manager is down", - description = "There is no running K8S controller manager. Deployments and replication controllers are not making progress.", - runbook = "https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager", - } + groups: + - name: kube-controller-manager.rules + rules: + - alert: K8SControllerManagerDown + expr: absent(up{job="kube-controller-manager"} == 1) + for: 5m + labels: + severity: critical + annotations: + description: There is no running K8S controller manager. Deployments and replication + controllers are not making progress. 
+ runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager + summary: Controller manager is down kubelet: |- - ALERT K8SNodeNotReady - IF kube_node_status_ready{condition="true"} == 0 - FOR 1h - LABELS { - severity = "warning", - } - ANNOTATIONS { - summary = "Node status is NotReady", - description = "The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour", - } - - ALERT K8SManyNodesNotReady - IF - count(kube_node_status_ready{condition="true"} == 0) > 1 - AND - ( - count(kube_node_status_ready{condition="true"} == 0) - / - count(kube_node_status_ready{condition="true"}) - ) > 0.2 - FOR 1m - LABELS { - severity = "critical", - } - ANNOTATIONS { - summary = "Many Kubernetes nodes are Not Ready", - description = "{{ $value }} Kubernetes nodes (more than 10% are in the NotReady state).", - } - - ALERT K8SKubeletDown - IF count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03 - FOR 1h - LABELS { - severity = "warning", - } - ANNOTATIONS { - summary = "Many Kubelets cannot be scraped", - description = "Prometheus failed to scrape {{ $value }}% of kubelets.", - } - - ALERT K8SKubeletDown - IF absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.1 - FOR 1h - LABELS { - severity = "critical", - } - ANNOTATIONS { - summary = "Many Kubelets cannot be scraped", - description = "Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets have disappeared from service discovery.", - } - - ALERT K8SKubeletTooManyPods - IF kubelet_running_pod_count > 100 - LABELS { - severity = "warning", - } - ANNOTATIONS { - summary = "Kubelet is close to pod limit", - description = "Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110", - } + groups: + - name: kubelet.rules + rules: + - alert: K8SNodeNotReady + expr: kube_node_status_ready{condition="true"} == 0 + for: 1h + labels: + severity: warning + annotations: + description: The Kubelet on {{ $labels.node }} has not checked in with the API, + or has set itself to NotReady, for more than an hour + summary: Node status is NotReady + - alert: K8SManyNodesNotReady + expr: count(kube_node_status_ready{condition="true"} == 0) > 1 and (count(kube_node_status_ready{condition="true"} + == 0) / count(kube_node_status_ready{condition="true"})) > 0.2 + for: 1m + labels: + severity: critical + annotations: + description: '{{ $value }} Kubernetes nodes (more than 10% are in the NotReady + state).' + summary: Many Kubernetes nodes are Not Ready + - alert: K8SKubeletDown + expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03 + for: 1h + labels: + severity: warning + annotations: + description: Prometheus failed to scrape {{ $value }}% of kubelets. + summary: Many Kubelets cannot be scraped + - alert: K8SKubeletDown + expr: absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) + > 0.1 + for: 1h + labels: + severity: critical + annotations: + description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets + have disappeared from service discovery. 
+ summary: Many Kubelets cannot be scraped + - alert: K8SKubeletTooManyPods + expr: kubelet_running_pod_count > 100 + labels: + severity: warning + annotations: + description: Kubelet {{$labels.instance}} is running {{$value}} pods, close + to the limit of 110 + summary: Kubelet is close to pod limit kubernetes: |- - # NOTE: These rules were kindly contributed by the SoundCloud engineering team. + groups: + - name: kubernetes.rules + rules: + - record: cluster_namespace_controller_pod_container:spec_memory_limit_bytes + expr: sum(label_replace(container_spec_memory_limit_bytes{container_name!=""}, + "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, + controller, pod_name, container_name) + - record: cluster_namespace_controller_pod_container:spec_cpu_shares + expr: sum(label_replace(container_spec_cpu_shares{container_name!=""}, "controller", + "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, + container_name) + - record: cluster_namespace_controller_pod_container:cpu_usage:rate + expr: sum(label_replace(irate(container_cpu_usage_seconds_total{container_name!=""}[5m]), + "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, + controller, pod_name, container_name) + - record: cluster_namespace_controller_pod_container:memory_usage:bytes + expr: sum(label_replace(container_memory_usage_bytes{container_name!=""}, "controller", + "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, + container_name) + - record: cluster_namespace_controller_pod_container:memory_working_set:bytes + expr: sum(label_replace(container_memory_working_set_bytes{container_name!=""}, + "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, + controller, pod_name, container_name) + - record: cluster_namespace_controller_pod_container:memory_rss:bytes + expr: sum(label_replace(container_memory_rss{container_name!=""}, "controller", + "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, + container_name) + - record: cluster_namespace_controller_pod_container:memory_cache:bytes + expr: sum(label_replace(container_memory_cache{container_name!=""}, "controller", + "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, + container_name) + - record: cluster_namespace_controller_pod_container:disk_usage:bytes + expr: sum(label_replace(container_disk_usage_bytes{container_name!=""}, "controller", + "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, + container_name) + - record: cluster_namespace_controller_pod_container:memory_pagefaults:rate + expr: sum(label_replace(irate(container_memory_failures_total{container_name!=""}[5m]), + "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, + controller, pod_name, container_name, scope, type) + - record: cluster_namespace_controller_pod_container:memory_oom:rate + expr: sum(label_replace(irate(container_memory_failcnt{container_name!=""}[5m]), + "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, + controller, pod_name, container_name, scope, type) + - record: cluster:memory_allocation:percent + expr: 100 * sum(container_spec_memory_limit_bytes{pod_name!=""}) BY (cluster) + / sum(machine_memory_bytes) BY (cluster) + - record: cluster:memory_used:percent + expr: 100 * sum(container_memory_usage_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes) + BY (cluster) + - record: cluster:cpu_allocation:percent + 
expr: 100 * sum(container_spec_cpu_shares{pod_name!=""}) BY (cluster) / sum(container_spec_cpu_shares{id="/"} + * ON(cluster, instance) machine_cpu_cores) BY (cluster) + - record: cluster:node_cpu_use:percent + expr: 100 * sum(rate(node_cpu{mode!="idle"}[5m])) BY (cluster) / sum(machine_cpu_cores) + BY (cluster) + - record: cluster_resource_verb:apiserver_latency:quantile_seconds + expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket) BY (le, + cluster, job, resource, verb)) / 1e+06 + labels: + quantile: "0.99" + - record: cluster_resource_verb:apiserver_latency:quantile_seconds + expr: histogram_quantile(0.9, sum(apiserver_request_latencies_bucket) BY (le, + cluster, job, resource, verb)) / 1e+06 + labels: + quantile: "0.9" + - record: cluster_resource_verb:apiserver_latency:quantile_seconds + expr: histogram_quantile(0.5, sum(apiserver_request_latencies_bucket) BY (le, + cluster, job, resource, verb)) / 1e+06 + labels: + quantile: "0.5" + - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds + expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.99" + - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds + expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.9" + - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds + expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.5" + - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds + expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.99" + - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds + expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.9" + - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds + expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.5" + - record: cluster:scheduler_binding_latency:quantile_seconds + expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.99" + - record: cluster:scheduler_binding_latency:quantile_seconds + expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.9" + - record: cluster:scheduler_binding_latency:quantile_seconds + expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.5" - ### Container resources ### - - cluster_namespace_controller_pod_container:spec_memory_limit_bytes = - sum by (cluster,namespace,controller,pod_name,container_name) ( - label_replace( - container_spec_memory_limit_bytes{container_name!=""}, - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - - cluster_namespace_controller_pod_container:spec_cpu_shares = - sum by (cluster,namespace,controller,pod_name,container_name) ( - label_replace( - container_spec_cpu_shares{container_name!=""}, - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - - cluster_namespace_controller_pod_container:cpu_usage:rate = - sum by 
(cluster,namespace,controller,pod_name,container_name) ( - label_replace( - irate( - container_cpu_usage_seconds_total{container_name!=""}[5m] - ), - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - - cluster_namespace_controller_pod_container:memory_usage:bytes = - sum by (cluster,namespace,controller,pod_name,container_name) ( - label_replace( - container_memory_usage_bytes{container_name!=""}, - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - - cluster_namespace_controller_pod_container:memory_working_set:bytes = - sum by (cluster,namespace,controller,pod_name,container_name) ( - label_replace( - container_memory_working_set_bytes{container_name!=""}, - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - - cluster_namespace_controller_pod_container:memory_rss:bytes = - sum by (cluster,namespace,controller,pod_name,container_name) ( - label_replace( - container_memory_rss{container_name!=""}, - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - - cluster_namespace_controller_pod_container:memory_cache:bytes = - sum by (cluster,namespace,controller,pod_name,container_name) ( - label_replace( - container_memory_cache{container_name!=""}, - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - - cluster_namespace_controller_pod_container:disk_usage:bytes = - sum by (cluster,namespace,controller,pod_name,container_name) ( - label_replace( - container_disk_usage_bytes{container_name!=""}, - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - - cluster_namespace_controller_pod_container:memory_pagefaults:rate = - sum by (cluster,namespace,controller,pod_name,container_name,scope,type) ( - label_replace( - irate( - container_memory_failures_total{container_name!=""}[5m] - ), - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - - cluster_namespace_controller_pod_container:memory_oom:rate = - sum by (cluster,namespace,controller,pod_name,container_name,scope,type) ( - label_replace( - irate( - container_memory_failcnt{container_name!=""}[5m] - ), - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - - ### Cluster resources ### - - cluster:memory_allocation:percent = - 100 * sum by (cluster) ( - container_spec_memory_limit_bytes{pod_name!=""} - ) / sum by (cluster) ( - machine_memory_bytes - ) - - cluster:memory_used:percent = - 100 * sum by (cluster) ( - container_memory_usage_bytes{pod_name!=""} - ) / sum by (cluster) ( - machine_memory_bytes - ) - - cluster:cpu_allocation:percent = - 100 * sum by (cluster) ( - container_spec_cpu_shares{pod_name!=""} - ) / sum by (cluster) ( - container_spec_cpu_shares{id="/"} * on(cluster,instance) machine_cpu_cores - ) - - cluster:node_cpu_use:percent = - 100 * sum by (cluster) ( - rate(node_cpu{mode!="idle"}[5m]) - ) / sum by (cluster) ( - machine_cpu_cores - ) - - ### API latency ### - - # Raw metrics are in microseconds. Convert to seconds. 
- cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.99"} = - histogram_quantile( - 0.99, - sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket) - ) / 1e6 - cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.9"} = - histogram_quantile( - 0.9, - sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket) - ) / 1e6 - cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.5"} = - histogram_quantile( - 0.5, - sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket) - ) / 1e6 - - ### Scheduling latency ### - - cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.99"} = - histogram_quantile(0.99,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6 - cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.9"} = - histogram_quantile(0.9,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6 - cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.5"} = - histogram_quantile(0.5,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6 - - cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.99"} = - histogram_quantile(0.99,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6 - cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.9"} = - histogram_quantile(0.9,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6 - cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.5"} = - histogram_quantile(0.5,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6 - - cluster:scheduler_binding_latency:quantile_seconds{quantile="0.99"} = - histogram_quantile(0.99,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 - cluster:scheduler_binding_latency:quantile_seconds{quantile="0.9"} = - histogram_quantile(0.9,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 - cluster:scheduler_binding_latency:quantile_seconds{quantile="0.5"} = - histogram_quantile(0.5,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 rabbitmq: |- mysql: |-
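
Because the chart now embeds rules in the Prometheus 2.0 group format, any locally maintained 1.x-format rule files have to be converted before they can be dropped back into values.yaml. A minimal sketch of that workflow, assuming the promtool binary from the v2.0.0 release is on PATH and the old rules have been saved to a local file (the file name is illustrative):

    # Convert a 1.x-format rule file; promtool writes the converted
    # groups to a new file with a .yml suffix appended.
    promtool update rules alertmanager.rules

    # Validate the converted (or hand-edited) groups before embedding
    # them in the chart's values.yaml.
    promtool check rules alertmanager.rules.yml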
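
The new values keys introduced by this change (conf.prometheus.storage.tsdb.path, conf.prometheus.storage.tsdb.retention, and conf.prometheus.web_admin_api.enabled) can be overridden per deployment in the usual Helm way. A sketch, with the release name and chart path as illustrative assumptions:

    # Override the new TSDB retention and disable the admin API for a
    # given deployment; release name and chart path are illustrative.
    helm upgrade --install prometheus ./prometheus \
      --set conf.prometheus.storage.tsdb.retention=30d \
      --set conf.prometheus.web_admin_api.enabled=false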
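
Setting conf.prometheus.web_admin_api.enabled passes --web.enable-admin-api to Prometheus, which exposes the 2.0 TSDB admin endpoints (snapshots, series deletion). A sketch, assuming port-forwarded or in-cluster access to Prometheus on its default port:

    # Trigger a TSDB snapshot via the admin API enabled by this chart value.
    curl -XPOST http://localhost:9090/api/v1/admin/tsdb/snapshot

The response names a snapshot directory created under the configured TSDB path (/var/lib/prometheus/data in this chart's defaults).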