# Copyright 2017 The Openstack-Helm Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Default values for prometheus.
# This is a YAML-formatted file.
# Declare name/value pairs to be passed into your templates.
# name: value

images:
  tags:
    prometheus: docker.io/prom/prometheus:v2.0.0
    helm_tests: docker.io/kolla/ubuntu-source-heat-engine:3.0.3
    dep_check: quay.io/stackanetes/kubernetes-entrypoint:v0.3.0
    image_repo_sync: docker.io/docker:17.07.0
  pull_policy: IfNotPresent
  local_registry:
    active: false
    exclude:
      - dep_check
      - image_repo_sync

labels:
  node_selector_key: openstack-control-plane
  node_selector_value: enabled

pod:
  affinity:
    anti:
      type:
        default: preferredDuringSchedulingIgnoredDuringExecution
      topologyKey:
        default: kubernetes.io/hostname
  mounts:
    prometheus:
      prometheus:
      init_container: null
  replicas:
    prometheus: 1
  lifecycle:
    upgrades:
      revision_history: 3
      pod_replacement_strategy: RollingUpdate
      rolling_update:
        max_unavailable: 1
        max_surge: 3
    termination_grace_period:
      prometheus:
        timeout: 30
  resources:
    enabled: false
    prometheus:
      limits:
        memory: "1024Mi"
        cpu: "2000m"
      requests:
        memory: "128Mi"
        cpu: "500m"
    jobs:
      image_repo_sync:
        requests:
          memory: "128Mi"
          cpu: "100m"
        limits:
          memory: "1024Mi"
          cpu: "2000m"
      tests:
        requests:
          memory: "128Mi"
          cpu: "100m"
        limits:
          memory: "1024Mi"
          cpu: "2000m"

endpoints:
  cluster_domain_suffix: cluster.local
  local_image_registry:
    name: docker-registry
    namespace: docker-registry
    hosts:
      default: localhost
      internal: docker-registry
      node: localhost
    host_fqdn_override:
      default: null
    port:
      registry:
        node: 5000
  monitoring:
    name: prometheus
    namespace: null
    hosts:
      default: prom-metrics
      public: prometheus
    host_fqdn_override:
      default: null
    path:
      default: null
    scheme:
      default: 'http'
    port:
      api:
        default: 9090
        public: 80
  alerts:
    name: alertmanager
    namespace: null
    hosts:
      default: alerts-api
      public: alertmanager
    host_fqdn_override:
      default: null
    path:
      default: null
    scheme:
      default: 'http'
    port:
      api:
        default: 9093
        public: 80

dependencies:
  dynamic:
    common:
      local_image_registry:
        jobs:
          - prometheus-image-repo-sync
        services:
          - endpoint: node
            service: local_image_registry
  static:
    image_repo_sync:
      services:
        - endpoint: internal
          service: local_image_registry
    prometheus:
      services: null

monitoring:
  prometheus:
    enabled: true
    prometheus:
      scrape: true

network:
  prometheus:
    ingress:
      public: true
      proxy_body_size: 1024M
    node_port:
      enabled: false
      port: 30900

storage:
  enabled: true
  pvc:
    name: prometheus-pvc
    access_mode: [ "ReadWriteOnce" ]
    requests:
      storage: 5Gi
  storage_class: general

manifests:
  configmap_bin: true
  configmap_etc: true
  ingress_prometheus: true
  helm_tests: true
  job_image_repo_sync: true
  service_ingress_prometheus: true
  service: true
  statefulset_prometheus: true

conf:
  prometheus:
    storage:
      tsdb:
        path: /var/lib/prometheus/data
        retention: 7d
        min_block_duration: 2h
        max_block_duration: 6h
    log:
      format: logger:stdout?json=true
      level: info
    query:
      max_concurrency: 20
      timeout: 2m
    web_admin_api:
      enabled: false
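  # NOTE(illustration): the keys under conf.prometheus above are assumed to be
  # rendered by the chart into the matching Prometheus v2.0 command-line flags.
  # A minimal sketch of the flags implied by the defaults above (flag names are
  # from the Prometheus v2.0 CLI, not defined by this file):
  #
  #   --storage.tsdb.path=/var/lib/prometheus/data
  #   --storage.tsdb.retention=7d
  #   --storage.tsdb.min-block-duration=2h
  #   --storage.tsdb.max-block-duration=6h
  #   --log.level=info
  #   --query.max-concurrency=20
  #   --query.timeout=2m
  #   --web.enable-admin-api    (only when web_admin_api.enabled is true)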
  scrape_configs:
    global:
      scrape_interval: 60s
      evaluation_interval: 60s
    scrape_configs:
      - job_name: kubelet
        scheme: https
        # This TLS & bearer token file config is used to connect to the actual scrape
        # endpoints for cluster components. This is separate to discovery auth
        # configuration because discovery & scraping are two separate concerns in
        # Prometheus. The discovery auth config is automatic if Prometheus runs inside
        # the cluster. Otherwise, more config options have to be provided within the
        # <kubernetes_sd_config>.
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        kubernetes_sd_configs:
          - role: node
        scrape_interval: 45s
        relabel_configs:
          - action: labelmap
            regex: __meta_kubernetes_node_label_(.+)
          - target_label: __address__
            replacement: kubernetes.default.svc:443
          - source_labels:
              - __meta_kubernetes_node_name
            regex: (.+)
            target_label: __metrics_path__
            replacement: /api/v1/nodes/${1}/proxy/metrics
          - source_labels:
              - __meta_kubernetes_node_name
            action: replace
            target_label: kubernetes_io_hostname
      # Scrape config for Kubelet cAdvisor.
      #
      # This is required for Kubernetes 1.7.3 and later, where cAdvisor metrics
      # (those whose names begin with 'container_') have been removed from the
      # Kubelet metrics endpoint. This job scrapes the cAdvisor endpoint to
      # retrieve those metrics.
      #
      # In Kubernetes 1.7.0-1.7.2, these metrics are only exposed on the cAdvisor
      # HTTP endpoint; use "replacement: /api/v1/nodes/${1}:4194/proxy/metrics"
      # in that case (and ensure cAdvisor's HTTP server hasn't been disabled with
      # the --cadvisor-port=0 Kubelet flag).
      #
      # This job is not necessary and should be removed in Kubernetes 1.6 and
      # earlier versions, or it will cause the metrics to be scraped twice.
      - job_name: 'kubernetes-cadvisor'
        # Default to scraping over https. If required, just disable this or change to
        # `http`.
        scheme: https
        # This TLS & bearer token file config is used to connect to the actual scrape
        # endpoints for cluster components. This is separate to discovery auth
        # configuration because discovery & scraping are two separate concerns in
        # Prometheus. The discovery auth config is automatic if Prometheus runs inside
        # the cluster. Otherwise, more config options have to be provided within the
        # <kubernetes_sd_config>.
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        kubernetes_sd_configs:
          - role: node
        scrape_interval: 45s
        relabel_configs:
          - action: labelmap
            regex: __meta_kubernetes_node_label_(.+)
          - target_label: __address__
            replacement: kubernetes.default.svc:443
          - source_labels:
              - __meta_kubernetes_node_name
            regex: (.+)
            target_label: __metrics_path__
            replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
          - source_labels:
              - __meta_kubernetes_node_name
            action: replace
            target_label: kubernetes_io_hostname
        metric_relabel_configs:
          - action: replace
            source_labels:
              - id
            regex: '^/machine\.slice/machine-rkt\\x2d([^\\]+)\\.+/([^/]+)\.service$'
            target_label: rkt_container_name
            replacement: '${2}-${1}'
          - action: replace
            source_labels:
              - id
            regex: '^/system\.slice/(.+)\.service$'
            target_label: systemd_service_name
            replacement: '${1}'
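      # NOTE(illustration): the cAdvisor job above supplies the `container_*`
      # series consumed by the recording rules under conf.rules further down.
      # As a sketch only (not part of this chart), per-node container CPU usage
      # could be queried with:
      #
      #   sum(rate(container_cpu_usage_seconds_total{container_name!=""}[5m]))
      #     by (kubernetes_io_hostname)
      #
      # where `kubernetes_io_hostname` is the label attached by the relabel
      # config above.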
      # Scrape config for API servers.
      #
      # Kubernetes exposes API servers as endpoints to the default/kubernetes
      # service so this uses `endpoints` role and uses relabelling to only keep
      # the endpoints associated with the default/kubernetes service using the
      # default named port `https`. This works for single API server deployments as
      # well as HA API server deployments.
      - job_name: 'apiserver'
        kubernetes_sd_configs:
          - role: endpoints
        scrape_interval: 45s
        # Default to scraping over https. If required, just disable this or change to
        # `http`.
        scheme: https
        # This TLS & bearer token file config is used to connect to the actual scrape
        # endpoints for cluster components. This is separate to discovery auth
        # configuration because discovery & scraping are two separate concerns in
        # Prometheus. The discovery auth config is automatic if Prometheus runs inside
        # the cluster. Otherwise, more config options have to be provided within the
        # <kubernetes_sd_config>.
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
          # If your node certificates are self-signed or use a different CA to the
          # master CA, then disable certificate verification below. Note that
          # certificate verification is an integral part of a secure infrastructure
          # so this should only be disabled in a controlled environment. You can
          # disable certificate verification by uncommenting the line below.
          #
          # insecure_skip_verify: true
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        # Keep only the default/kubernetes service endpoints for the https port. This
        # will add targets for each API server for which Kubernetes adds an endpoint to
        # the default/kubernetes service.
        relabel_configs:
          - source_labels:
              - __meta_kubernetes_namespace
              - __meta_kubernetes_service_name
              - __meta_kubernetes_endpoint_port_name
            action: keep
            regex: default;kubernetes;https
      # Scrape config for service endpoints.
      #
      # The relabeling allows the actual service scrape endpoint to be configured
      # via the following annotations:
      #
      # * `prometheus.io/scrape`: Only scrape services that have a value of `true`
      # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need
      #   to set this to `https` & most likely set the `tls_config` of the scrape config.
      # * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
      # * `prometheus.io/port`: If the metrics are exposed on a different port to the
      #   service then set this appropriately.
      #
      # A commented-out example Service using these annotations follows this job.
      - job_name: 'kubernetes-service-endpoints'
        kubernetes_sd_configs:
          - role: endpoints
        scrape_interval: 60s
        relabel_configs:
          - source_labels:
              - __meta_kubernetes_service_annotation_prometheus_io_scrape
            action: keep
            regex: true
          - source_labels:
              - __meta_kubernetes_service_annotation_prometheus_io_scheme
            action: replace
            target_label: __scheme__
            regex: (https?)
          - source_labels:
              - __meta_kubernetes_service_annotation_prometheus_io_path
            action: replace
            target_label: __metrics_path__
            regex: (.+)
          - source_labels:
              - __address__
              - __meta_kubernetes_service_annotation_prometheus_io_port
            action: replace
            target_label: __address__
            regex: ([^:]+)(?::\d+)?;(\d+)
            replacement: $1:$2
          - action: labelmap
            regex: __meta_kubernetes_service_label_(.+)
          - source_labels:
              - __meta_kubernetes_namespace
            action: replace
            target_label: kubernetes_namespace
          - source_labels:
              - __meta_kubernetes_service_name
            action: replace
            target_label: kubernetes_name
          - source_labels:
              - __meta_kubernetes_service_name
            target_label: job
            replacement: ${1}
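      # NOTE(illustration): the commented-out Service below is a sketch only;
      # the name, namespace, and port are hypothetical and not part of this
      # chart. It shows how a service opts in to the
      # `kubernetes-service-endpoints` job above via the annotations described
      # there:
      #
      #   apiVersion: v1
      #   kind: Service
      #   metadata:
      #     name: example-exporter        # hypothetical
      #     namespace: openstack          # hypothetical
      #     annotations:
      #       prometheus.io/scrape: "true"
      #       prometheus.io/scheme: "http"
      #       prometheus.io/path: "/metrics"
      #       prometheus.io/port: "9113"  # hypothetical exporter port
      #   spec:
      #     selector:
      #       application: example-exporter
      #     ports:
      #       - name: metrics
      #         port: 9113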
      # Example scrape config for pods
      #
      # The relabeling allows the actual pod scrape endpoint to be configured via the
      # following annotations:
      #
      # * `prometheus.io/scrape`: Only scrape pods that have a value of `true`
      # * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
      # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the
      #   pod's declared ports (default is a port-free target if none are declared).
      #
      # A commented-out pod-template fragment using these annotations follows this job.
      - job_name: 'kubernetes-pods'
        kubernetes_sd_configs:
          - role: pod
        relabel_configs:
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
            action: keep
            regex: true
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
            action: replace
            target_label: __metrics_path__
            regex: (.+)
          - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
            action: replace
            regex: ([^:]+)(?::\d+)?;(\d+)
            replacement: $1:$2
            target_label: __address__
          - action: labelmap
            regex: __meta_kubernetes_pod_label_(.+)
          - source_labels: [__meta_kubernetes_namespace]
            action: replace
            target_label: kubernetes_namespace
          - source_labels: [__meta_kubernetes_pod_name]
            action: replace
            target_label: kubernetes_pod_name
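      # NOTE(illustration): pods opt in to the `kubernetes-pods` job above by
      # annotating the pod template itself. The fragment below is a sketch of a
      # Deployment/DaemonSet pod template (the path and port are hypothetical,
      # not part of this chart):
      #
      #   spec:
      #     template:
      #       metadata:
      #         annotations:
      #           prometheus.io/scrape: "true"
      #           prometheus.io/path: "/metrics"
      #           prometheus.io/port: "8080"   # hypothetical container port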
      - job_name: calico-etcd
        honor_labels: false
        kubernetes_sd_configs:
          - role: service
        scrape_interval: 20s
        relabel_configs:
          - action: labelmap
            regex: __meta_kubernetes_service_label_(.+)
          - action: keep
            source_labels:
              - __meta_kubernetes_service_name
            regex: "calico-etcd"
          - action: keep
            source_labels:
              - __meta_kubernetes_namespace
            regex: kube-system
            target_label: namespace
          - source_labels:
              - __meta_kubernetes_pod_name
            target_label: pod
          - source_labels:
              - __meta_kubernetes_service_name
            target_label: service
          - source_labels:
              - __meta_kubernetes_service_name
            target_label: job
            replacement: ${1}
          - source_labels:
              - __meta_kubernetes_service_label
            target_label: job
            regex: calico-etcd
            replacement: ${1}
          - target_label: endpoint
            replacement: "calico-etcd"
    alerting:
      alertmanagers:
        - kubernetes_sd_configs:
            - role: pod
          relabel_configs:
            - source_labels:
                - __meta_kubernetes_pod_label_name
              regex: alertmanager
              action: keep
            - source_labels:
                - __meta_kubernetes_namespace
              regex: openstack
              action: keep
            - source_labels:
                - __meta_kubernetes_pod_container_port_number
              regex:
              action: drop
  rules:
    alertmanager:
      groups:
        - name: alertmanager.rules
          rules:
            - alert: AlertmanagerConfigInconsistent
              expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1
              for: 5m
              labels:
                severity: critical
              annotations:
                description: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` is out of sync.
                summary: Alertmanager configurations are inconsistent
            - alert: AlertmanagerDownOrMissing
              expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1
              for: 5m
              labels:
                severity: warning
              annotations:
                description: An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery.
                summary: Alertmanager down or not discovered
            - alert: FailedReload
              expr: alertmanager_config_last_reload_successful == 0
              for: 10m
              labels:
                severity: warning
              annotations:
                description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod }}.
                summary: Alertmanager configuration reload has failed
    etcd3:
      groups:
        - name: etcd3.rules
          rules:
            - alert: InsufficientMembers
              expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)
              for: 3m
              labels:
                severity: critical
              annotations:
                description: If one more etcd member goes down, the cluster will be unavailable
                summary: etcd cluster insufficient members
            - alert: NoLeader
              expr: etcd_server_has_leader{job="etcd"} == 0
              for: 1m
              labels:
                severity: critical
              annotations:
                description: etcd member {{ $labels.instance }} has no leader
                summary: etcd member has no leader
            - alert: HighNumberOfLeaderChanges
              expr: increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3
              labels:
                severity: warning
              annotations:
                description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour
                summary: a high number of leader changes within the etcd cluster are happening
            - alert: HighNumberOfFailedGRPCRequests
              expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.01
              for: 10m
              labels:
                severity: warning
              annotations:
                description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}'
                summary: a high number of gRPC requests are failing
            - alert: HighNumberOfFailedGRPCRequests
              expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.05
              for: 5m
              labels:
                severity: critical
              annotations:
                description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}'
                summary: a high number of gRPC requests are failing
            - alert: GRPCRequestsSlow
              expr: histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) > 0.15
              for: 10m
              labels:
                severity: critical
              annotations:
                description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method }} are slow
                summary: slow gRPC requests
            - alert: HighNumberOfFailedHTTPRequests
              expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method) > 0.01
              for: 10m
              labels:
                severity: warning
              annotations:
                description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
                summary: a high number of HTTP requests are failing
            - alert: HighNumberOfFailedHTTPRequests
              expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method) > 0.05
              for: 5m
              labels:
                severity: critical
              annotations:
                description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
                summary: a high number of HTTP requests are failing
            - alert: HTTPRequestsSlow
              expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15
              for: 10m
              labels:
                severity: warning
              annotations:
                description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow
                summary: slow HTTP requests
            - alert: EtcdMemberCommunicationSlow
              expr: histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) > 0.15
              for: 10m
              labels:
                severity: warning
              annotations:
                description: etcd instance {{ $labels.instance }} member communication with {{ $labels.To }} is slow
                summary: etcd member communication is slow
            - alert: HighNumberOfFailedProposals
              expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
              labels:
                severity: warning
              annotations:
                description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour
                summary: a high number of proposals within the etcd cluster are failing
            - alert: HighFsyncDurations
              expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5
              for: 10m
              labels:
                severity: warning
              annotations:
                description: etcd instance {{ $labels.instance }} fsync durations are high
                summary: high fsync durations
            - alert: HighCommitDurations
              expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25
              for: 10m
              labels:
                severity: warning
              annotations:
                description: etcd instance {{ $labels.instance }} commit durations are high
                summary: high commit durations
    kube_apiserver:
      groups:
        - name: kube-apiserver.rules
          rules:
            - alert: K8SApiserverDown
              expr: absent(up{job="apiserver"} == 1)
              for: 5m
              labels:
                severity: critical
              annotations:
                description: Prometheus failed to scrape API server(s), or all API servers have disappeared from service discovery.
                summary: API server unreachable
            - alert: K8SApiServerLatency
              expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH|PROXY"}) WITHOUT (instance, resource)) / 1e+06 > 1
              for: 10m
              labels:
                severity: warning
              annotations:
                description: 99th percentile latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.
                summary: Kubernetes apiserver latency is high
    kube_controller_manager:
      groups:
        - name: kube-controller-manager.rules
          rules:
            - alert: K8SControllerManagerDown
              expr: absent(up{job="kube-controller-manager"} == 1)
              for: 5m
              labels:
                severity: critical
              annotations:
                description: There is no running K8S controller manager. Deployments and replication controllers are not making progress.
                runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager
                summary: Controller manager is down
    kubelet:
      groups:
        - name: kubelet.rules
          rules:
            - alert: K8SNodeNotReady
              expr: kube_node_status_ready{condition="true"} == 0
              for: 1h
              labels:
                severity: warning
              annotations:
                description: The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour
                summary: Node status is NotReady
            - alert: K8SManyNodesNotReady
              expr: count(kube_node_status_ready{condition="true"} == 0) > 1 and (count(kube_node_status_ready{condition="true"} == 0) / count(kube_node_status_ready{condition="true"})) > 0.2
              for: 1m
              labels:
                severity: critical
              annotations:
                description: '{{ $value }} Kubernetes nodes (more than 20%) are in the NotReady state.'
                summary: Many Kubernetes nodes are Not Ready
            - alert: K8SKubeletDown
              expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03
              for: 1h
              labels:
                severity: warning
              annotations:
                description: Prometheus failed to scrape {{ $value }}% of kubelets.
                summary: Many Kubelets cannot be scraped
            - alert: K8SKubeletDown
              expr: absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.1
              for: 1h
              labels:
                severity: critical
              annotations:
                description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets have disappeared from service discovery.
                summary: Many Kubelets cannot be scraped
            - alert: K8SKubeletTooManyPods
              expr: kubelet_running_pod_count > 100
              labels:
                severity: warning
              annotations:
                description: Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110
                summary: Kubelet is close to pod limit
    kubernetes:
      groups:
        - name: kubernetes.rules
          rules:
            - record: cluster_namespace_controller_pod_container:spec_memory_limit_bytes
              expr: sum(label_replace(container_spec_memory_limit_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
            - record: cluster_namespace_controller_pod_container:spec_cpu_shares
              expr: sum(label_replace(container_spec_cpu_shares{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
            - record: cluster_namespace_controller_pod_container:cpu_usage:rate
              expr: sum(label_replace(irate(container_cpu_usage_seconds_total{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
            - record: cluster_namespace_controller_pod_container:memory_usage:bytes
              expr: sum(label_replace(container_memory_usage_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
            - record: cluster_namespace_controller_pod_container:memory_working_set:bytes
              expr: sum(label_replace(container_memory_working_set_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
            - record: cluster_namespace_controller_pod_container:memory_rss:bytes
              expr: sum(label_replace(container_memory_rss{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
            - record: cluster_namespace_controller_pod_container:memory_cache:bytes
              expr: sum(label_replace(container_memory_cache{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
            - record: cluster_namespace_controller_pod_container:disk_usage:bytes
              expr: sum(label_replace(container_disk_usage_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
            - record: cluster_namespace_controller_pod_container:memory_pagefaults:rate
              expr: sum(label_replace(irate(container_memory_failures_total{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name, scope, type)
            - record: cluster_namespace_controller_pod_container:memory_oom:rate
              expr: sum(label_replace(irate(container_memory_failcnt{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name, scope, type)
            - record: cluster:memory_allocation:percent
              expr: 100 * sum(container_spec_memory_limit_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes) BY (cluster)
            - record: cluster:memory_used:percent
              expr: 100 * sum(container_memory_usage_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes) BY (cluster)
            - record: cluster:cpu_allocation:percent
              expr: 100 * sum(container_spec_cpu_shares{pod_name!=""}) BY (cluster) / sum(container_spec_cpu_shares{id="/"} * ON(cluster, instance) machine_cpu_cores) BY (cluster)
            - record: cluster:node_cpu_use:percent
              expr: 100 * sum(rate(node_cpu{mode!="idle"}[5m])) BY (cluster) / sum(machine_cpu_cores) BY (cluster)
            - record: cluster_resource_verb:apiserver_latency:quantile_seconds
              expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket) BY (le, cluster, job, resource, verb)) / 1e+06
              labels:
                quantile: "0.99"
            - record: cluster_resource_verb:apiserver_latency:quantile_seconds
              expr: histogram_quantile(0.9, sum(apiserver_request_latencies_bucket) BY (le, cluster, job, resource, verb)) / 1e+06
              labels:
                quantile: "0.9"
            - record: cluster_resource_verb:apiserver_latency:quantile_seconds
              expr: histogram_quantile(0.5, sum(apiserver_request_latencies_bucket) BY (le, cluster, job, resource, verb)) / 1e+06
              labels:
                quantile: "0.5"
            - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
              expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
              labels:
                quantile: "0.99"
            - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
              expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
              labels:
                quantile: "0.9"
            - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
              expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
              labels:
                quantile: "0.5"
            - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
              expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
              labels:
                quantile: "0.99"
            - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
              expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
              labels:
                quantile: "0.9"
            - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
              expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
              labels:
                quantile: "0.5"
            - record: cluster:scheduler_binding_latency:quantile_seconds
              expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
              labels:
                quantile: "0.99"
            - record: cluster:scheduler_binding_latency:quantile_seconds
              expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
              labels:
                quantile: "0.9"
            - record: cluster:scheduler_binding_latency:quantile_seconds
              expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
              labels:
                quantile: "0.5"
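# NOTE(illustration): assuming the groups under conf.rules above are rendered
# by the chart into standard Prometheus 2.x rule files, their syntax can be
# validated with promtool before deployment, e.g. (file name illustrative):
#
#   promtool check rules alertmanager.rules.yaml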