Add node-problem-detector chart
This adds a chart for the node problem detector. This chart will help provide additional insight into the status of the underlying infrastructure of a deployment. Updated the chart with new yamllint checks. Change-Id: I21a24b67b121388107b20ab38ac7703c7a33f1c1 Signed-off-by: Steve Wilkerson <sw5822@att.com>
This commit is contained in:
parent
26350f37aa
commit
a31bb2b049
24
kubernetes-node-problem-detector/Chart.yaml
Normal file
24
kubernetes-node-problem-detector/Chart.yaml
Normal file
@ -0,0 +1,24 @@
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
---
|
||||
apiVersion: v1
|
||||
description: OpenStack-Helm Kubernetes Node Problem Detector
|
||||
name: kubernetes-node-problem-detector
|
||||
version: 0.1.0
|
||||
home: https://github.com/kubernetes/node-problem-detector
|
||||
sources:
|
||||
- https://github.com/kubernetes/node-problem-detector
|
||||
- https://opendev.org/openstack/openstack-helm-infra
|
||||
maintainers:
|
||||
- name: OpenStack-Helm Authors
|
||||
...
|
18
kubernetes-node-problem-detector/requirements.yaml
Normal file
18
kubernetes-node-problem-detector/requirements.yaml
Normal file
@ -0,0 +1,18 @@
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
---
|
||||
dependencies:
|
||||
- name: helm-toolkit
|
||||
repository: http://localhost:8879/charts
|
||||
version: 0.1.0
|
||||
...
|
@ -0,0 +1,25 @@
|
||||
#!/bin/sh
|
||||
{{/*
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/}}
|
||||
|
||||
set -ex
|
||||
|
||||
exec /node-problem-detector \
|
||||
{{- range $monitor, $monitorConfig := .Values.conf.monitors }}
|
||||
{{- if $monitorConfig.enabled }}
|
||||
--config.{{$monitor}}={{ include "helm-toolkit.utils.joinListWithComma" $monitorConfig.enabled }} \
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
--logtostderr \
|
||||
--prometheus-address=0.0.0.0 \
--prometheus-port={{ tuple "node_problem_detector" "internal" "metrics" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}
|
@ -0,0 +1,36 @@
|
||||
{{/*
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/}}
|
||||
|
||||
{{- if .Values.manifests.configmap_bin }}
|
||||
{{- $envAll := . }}
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: node-problem-detector-bin
|
||||
data:
|
||||
node-problem-detector.sh: |
|
||||
{{ tuple "bin/_node-problem-detector.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
|
||||
image-repo-sync.sh: |
|
||||
{{- include "helm-toolkit.scripts.image_repo_sync" . | indent 4 }}
|
||||
{{- range $monitor, $monitorConfig := $envAll.Values.conf.monitors }}
|
||||
{{- $scripts := $monitorConfig.scripts }}
|
||||
{{- range $script, $scriptSource := $scripts.source }}
|
||||
{{- if has $script $scripts.enabled }}
|
||||
{{$script}}: |
|
||||
{{$scriptSource | indent 4 -}}
|
||||
{{- end }}
|
||||
{{- end -}}
|
||||
{{- end -}}
|
||||
{{- end }}
|
@ -0,0 +1,31 @@
|
||||
{{/*
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/}}
|
||||
|
||||
{{- if .Values.manifests.configmap_etc }}
|
||||
|
||||
{{- $envAll := . }}
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Secret
|
||||
metadata:
|
||||
name: node-problem-detector-etc
|
||||
type: Opaque
|
||||
data:
|
||||
{{- range $monitor, $monitorConfig := $envAll.Values.conf.monitors }}
|
||||
{{- $plugins := $monitorConfig.config }}
|
||||
{{- range $plugin, $config := $plugins }}
|
||||
{{$plugin}}.json: {{ toJson $config | b64enc }}
|
||||
{{- end }}
|
||||
{{ end }}
|
||||
{{- end }}
|
135
kubernetes-node-problem-detector/templates/daemonset.yaml
Normal file
135
kubernetes-node-problem-detector/templates/daemonset.yaml
Normal file
@ -0,0 +1,135 @@
|
||||
{{/*
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/}}
|
||||
|
||||
{{- if .Values.manifests.daemonset }}
|
||||
{{- $envAll := . }}
|
||||
|
||||
{{- $serviceAccountName := printf "%s-%s" .Release.Name "node-problem-detector" }}
|
||||
{{ tuple $envAll "node_problem_detector" $serviceAccountName | include "helm-toolkit.snippets.kubernetes_pod_rbac_serviceaccount" }}
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRoleBinding
|
||||
metadata:
|
||||
name: run-node-problem-detector
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: {{ $serviceAccountName }}
|
||||
namespace: {{ .Release.Namespace }}
|
||||
roleRef:
|
||||
kind: ClusterRole
|
||||
# Least privilege: bind to the built-in node-problem-detector role rather than cluster-admin.
name: system:node-problem-detector
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: DaemonSet
|
||||
metadata:
|
||||
name: node-problem-detector
|
||||
annotations:
|
||||
{{ tuple $envAll | include "helm-toolkit.snippets.release_uuid" }}
|
||||
labels:
|
||||
{{ tuple $envAll "node_problem_detector" "metrics" | include "helm-toolkit.snippets.kubernetes_metadata_labels" | indent 4 }}
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
{{ tuple $envAll "node_problem_detector" "metrics" | include "helm-toolkit.snippets.kubernetes_metadata_labels" | indent 6 }}
|
||||
{{ tuple $envAll "node_problem_detector" | include "helm-toolkit.snippets.kubernetes_upgrades_daemonset" | indent 2 }}
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
{{ tuple $envAll "node_problem_detector" "metrics" | include "helm-toolkit.snippets.kubernetes_metadata_labels" | indent 8 }}
|
||||
annotations:
|
||||
{{- if .Values.monitoring.prometheus.pod.enabled }}
|
||||
{{- $prometheus_annotations := $envAll.Values.monitoring.prometheus.node_problem_detector }}
|
||||
{{ tuple $prometheus_annotations | include "helm-toolkit.snippets.prometheus_pod_annotations" | indent 8 }}
|
||||
{{- end }}
|
||||
{{ dict "envAll" $envAll "podName" "node-problem-detector" "containerNames" (list "node-problem-detector") | include "helm-toolkit.snippets.kubernetes_mandatory_access_control_annotation" | indent 8 }}
|
||||
{{ tuple $envAll | include "helm-toolkit.snippets.release_uuid" | indent 8 }}
|
||||
configmap-bin-hash: {{ tuple "configmap-bin.yaml" . | include "helm-toolkit.utils.hash" }}
|
||||
spec:
|
||||
{{ dict "envAll" $envAll "application" "node_problem_detector" | include "helm-toolkit.snippets.kubernetes_pod_security_context" | indent 6 }}
|
||||
serviceAccountName: {{ $serviceAccountName }}
|
||||
{{ if .Values.pod.tolerations.node_problem_detector.enabled }}
|
||||
{{/* Component key must match .Values.pod.tolerations.node_problem_detector, which the guard above checks. */}}
{{ tuple $envAll "node_problem_detector" | include "helm-toolkit.snippets.kubernetes_tolerations" | indent 6 }}
|
||||
{{ else }}
|
||||
nodeSelector:
|
||||
{{ .Values.labels.node_problem_detector.node_selector_key }}: {{ .Values.labels.node_problem_detector.node_selector_value | quote }}
|
||||
{{ end }}
|
||||
containers:
|
||||
- name: node-problem-detector
|
||||
{{ tuple $envAll "node_problem_detector" | include "helm-toolkit.snippets.image" | indent 10 }}
|
||||
{{ tuple $envAll $envAll.Values.pod.resources.node_problem_detector | include "helm-toolkit.snippets.kubernetes_resources" | indent 10 }}
|
||||
{{ dict "envAll" $envAll "application" "node_problem_detector" "container" "node_problem_detector" | include "helm-toolkit.snippets.kubernetes_container_security_context" | indent 10 }}
|
||||
command:
|
||||
- /tmp/node-problem-detector.sh
|
||||
ports:
|
||||
- name: metrics
|
||||
containerPort: {{ tuple "node_problem_detector" "internal" "metrics" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}
|
||||
env:
|
||||
- name: NODE_NAME
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: spec.nodeName
|
||||
volumeMounts:
|
||||
- name: log
|
||||
mountPath: /var/log
|
||||
readOnly: true
|
||||
- name: kmsg
|
||||
mountPath: /dev/kmsg
|
||||
readOnly: true
|
||||
- name: localtime
|
||||
mountPath: /etc/localtime
|
||||
readOnly: true
|
||||
- name: node-problem-detector-bin
|
||||
mountPath: /tmp/node-problem-detector.sh
|
||||
subPath: node-problem-detector.sh
|
||||
readOnly: true
|
||||
{{- range $monitor, $monitorConfig := $envAll.Values.conf.monitors }}
|
||||
{{- $scripts := $monitorConfig.scripts }}
|
||||
{{- range $script, $scriptSource := $scripts.source }}
|
||||
{{- if has $script $scripts.enabled }}
|
||||
- name: node-problem-detector-bin
|
||||
mountPath: /config/plugin/{{$script}}
|
||||
subPath: {{$script}}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- range $monitor, $monitorConfig := $envAll.Values.conf.monitors }}
|
||||
{{- $plugins := $monitorConfig.config }}
|
||||
{{- range $plugin, $config := $plugins }}
|
||||
- name: node-problem-detector-etc
|
||||
mountPath: /config/{{$plugin}}.json
|
||||
subPath: {{$plugin}}.json
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
volumes:
|
||||
- name: pod-tmp
|
||||
emptyDir: {}
|
||||
- name: log
|
||||
hostPath:
|
||||
path: /var/log
|
||||
- name: kmsg
|
||||
hostPath:
|
||||
path: /dev/kmsg
|
||||
- name: localtime
|
||||
hostPath:
|
||||
path: /etc/localtime
|
||||
- name: node-problem-detector-etc
|
||||
secret:
|
||||
secretName: node-problem-detector-etc
|
||||
defaultMode: 292  # decimal for octal 0444 (read-only for owner, group, other)
|
||||
- name: node-problem-detector-bin
|
||||
configMap:
|
||||
name: node-problem-detector-bin
|
||||
defaultMode: 365  # decimal for octal 0555 (read + execute for owner, group, other)
|
||||
{{- end }}
|
@ -0,0 +1,18 @@
|
||||
{{/*
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/}}
|
||||
|
||||
{{- if and .Values.manifests.job_image_repo_sync .Values.images.local_registry.active }}
|
||||
{{- $imageRepoSyncJob := dict "envAll" . "serviceName" "node-problem-detector" -}}
|
||||
{{ $imageRepoSyncJob | include "helm-toolkit.manifests.job_image_repo_sync" }}
|
||||
{{- end }}
|
38
kubernetes-node-problem-detector/templates/service.yaml
Normal file
38
kubernetes-node-problem-detector/templates/service.yaml
Normal file
@ -0,0 +1,38 @@
|
||||
{{/*
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/}}
|
||||
|
||||
{{- if .Values.manifests.service }}
|
||||
{{- $envAll := . }}
|
||||
{{- $prometheus_annotations := $envAll.Values.monitoring.prometheus.node_problem_detector }}
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: {{ tuple "node_problem_detector" "internal" . | include "helm-toolkit.endpoints.hostname_short_endpoint_lookup" }}
|
||||
labels:
|
||||
{{ tuple $envAll "node_problem_detector" "metrics" | include "helm-toolkit.snippets.kubernetes_metadata_labels" | indent 4 }}
|
||||
annotations:
|
||||
{{- if .Values.monitoring.prometheus.service.enabled }}
|
||||
{{ tuple $prometheus_annotations | include "helm-toolkit.snippets.prometheus_service_annotations" | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
type: ClusterIP
|
||||
clusterIP: None
|
||||
ports:
|
||||
- name: metrics
|
||||
port: {{ tuple "node_problem_detector" "internal" "metrics" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}
|
||||
targetPort: {{ tuple "node_problem_detector" "internal" "metrics" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}
|
||||
selector:
|
||||
{{ tuple $envAll "node_problem_detector" "metrics" | include "helm-toolkit.snippets.kubernetes_metadata_labels" | indent 4 }}
|
||||
{{- end }}
|
465
kubernetes-node-problem-detector/values.yaml
Normal file
465
kubernetes-node-problem-detector/values.yaml
Normal file
@ -0,0 +1,465 @@
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Default values for kubernetes-node-problem-detector.
|
||||
# This is a YAML-formatted file.
|
||||
# Declare variables to be passed into your templates.
|
||||
|
||||
---
|
||||
images:
|
||||
tags:
|
||||
node_problem_detector: k8s.gcr.io/node-problem-detector:v0.7.0
|
||||
dep_check: quay.io/airshipit/kubernetes-entrypoint:v1.0.0
|
||||
image_repo_sync: docker.io/docker:17.07.0
|
||||
pull_policy: IfNotPresent
|
||||
local_registry:
|
||||
active: false
|
||||
exclude:
|
||||
- dep_check
|
||||
- image_repo_sync
|
||||
|
||||
labels:
|
||||
node_problem_detector:
|
||||
node_selector_key: openstack-control-plane
|
||||
node_selector_value: enabled
|
||||
job:
|
||||
node_selector_key: openstack-control-plane
|
||||
node_selector_value: enabled
|
||||
|
||||
pod:
|
||||
security_context:
|
||||
node_problem_detector:
|
||||
container:
|
||||
node_problem_detector:
|
||||
privileged: true
|
||||
affinity:
|
||||
anti:
|
||||
type:
|
||||
default: preferredDuringSchedulingIgnoredDuringExecution
|
||||
topologyKey:
|
||||
default: kubernetes.io/hostname
|
||||
mounts:
|
||||
node_problem_detector:
|
||||
node_problem_detector:
|
||||
init_container: null
|
||||
lifecycle:
|
||||
upgrades:
|
||||
daemonsets:
|
||||
pod_replacement_strategy: RollingUpdate
|
||||
node_problem_detector:
|
||||
enabled: true
|
||||
min_ready_seconds: 0
|
||||
revision_history: 3
|
||||
pod_replacement_strategy: RollingUpdate
|
||||
rolling_update:
|
||||
max_unavailable: 1
|
||||
max_surge: 3
|
||||
termination_grace_period:
|
||||
node_problem_detector:
|
||||
timeout: 30
|
||||
resources:
|
||||
enabled: false
|
||||
node_problem_detector:
|
||||
requests:
|
||||
memory: "128Mi"
|
||||
cpu: "100m"
|
||||
limits:
|
||||
memory: "1024Mi"
|
||||
cpu: "2000m"
|
||||
jobs:
|
||||
image_repo_sync:
|
||||
requests:
|
||||
memory: "128Mi"
|
||||
cpu: "100m"
|
||||
limits:
|
||||
memory: "1024Mi"
|
||||
cpu: "2000m"
|
||||
tolerations:
|
||||
node_problem_detector:
|
||||
enabled: false
|
||||
tolerations:
|
||||
- key: node-role.kubernetes.io/master
|
||||
operator: Exists
|
||||
- key: node-role.kubernetes.io/node
|
||||
operator: Exists
|
||||
dependencies:
|
||||
dynamic:
|
||||
common:
|
||||
local_image_registry:
|
||||
jobs:
|
||||
# Must name this chart's own image-repo-sync job (serviceName "node-problem-detector").
- node-problem-detector-image-repo-sync
|
||||
services:
|
||||
- endpoint: node
|
||||
service: local_image_registry
|
||||
static:
|
||||
image_repo_sync:
|
||||
services:
|
||||
- endpoint: internal
|
||||
service: local_image_registry
|
||||
node_problem_detector:
|
||||
services: null
|
||||
|
||||
monitoring:
|
||||
prometheus:
|
||||
pod:
|
||||
enabled: true
|
||||
service:
|
||||
enabled: false
|
||||
node_problem_detector:
|
||||
scrape: true
|
||||
port: 20257
|
||||
|
||||
endpoints:
|
||||
cluster_domain_suffix: cluster.local
|
||||
local_image_registry:
|
||||
name: docker-registry
|
||||
namespace: docker-registry
|
||||
hosts:
|
||||
default: localhost
|
||||
internal: docker-registry
|
||||
node: localhost
|
||||
host_fqdn_override:
|
||||
default: null
|
||||
port:
|
||||
registry:
|
||||
node: 5000
|
||||
node_problem_detector:
|
||||
name: node-problem-detector
|
||||
namespace: null
|
||||
hosts:
|
||||
default: node-problem-detector
|
||||
host_fqdn_override:
|
||||
default: null
|
||||
path:
|
||||
default: null
|
||||
port:
|
||||
metrics:
|
||||
default: 20257
|
||||
|
||||
manifests:
|
||||
configmap_bin: true
|
||||
configmap_etc: true
|
||||
daemonset: true
|
||||
job_image_repo_sync: true
|
||||
service: false
|
||||
|
||||
conf:
|
||||
monitors:
|
||||
system-log-monitor:
|
||||
enabled:
|
||||
- /config/kernel-monitor.json
|
||||
- /config/docker-monitor.json
|
||||
- /config/systemd-monitor.json
|
||||
scripts:
|
||||
enabled: null
|
||||
source: null
|
||||
config:
|
||||
kernel-monitor:
|
||||
plugin: kmsg
|
||||
logPath: "/dev/kmsg"
|
||||
lookback: 5m
|
||||
bufferSize: 10
|
||||
source: kernel-monitor
|
||||
conditions:
|
||||
- type: KernelDeadlock
|
||||
reason: KernelHasNoDeadlock
|
||||
message: kernel has no deadlock
|
||||
- type: ReadonlyFilesystem
|
||||
reason: FilesystemIsNotReadOnly
|
||||
message: Filesystem is not read-only
|
||||
rules:
|
||||
- type: temporary
|
||||
reason: OOMKilling
|
||||
pattern: Kill process \d+ (.+) score \d+ or sacrifice child\nKilled process \d+
|
||||
(.+) total-vm:\d+kB, anon-rss:\d+kB, file-rss:\d+kB.*
|
||||
- type: temporary
|
||||
reason: TaskHung
|
||||
pattern: task \S+:\w+ blocked for more than \w+ seconds\.
|
||||
- type: temporary
|
||||
reason: UnregisterNetDevice
|
||||
pattern: 'unregister_netdevice: waiting for \w+ to become free. Usage count = \d+'
|
||||
- type: temporary
|
||||
reason: KernelOops
|
||||
pattern: 'BUG: unable to handle kernel NULL pointer dereference at .*'
|
||||
- type: temporary
|
||||
reason: KernelOops
|
||||
pattern: 'divide error: 0000 \[#\d+\] SMP'
|
||||
- type: permanent
|
||||
condition: KernelDeadlock
|
||||
reason: AUFSUmountHung
|
||||
pattern: task umount\.aufs:\w+ blocked for more than \w+ seconds\.
|
||||
- type: permanent
|
||||
condition: KernelDeadlock
|
||||
reason: DockerHung
|
||||
pattern: task docker:\w+ blocked for more than \w+ seconds\.
|
||||
- type: permanent
|
||||
condition: ReadonlyFilesystem
|
||||
reason: FilesystemIsReadOnly
|
||||
pattern: Remounting filesystem read-only
|
||||
kernel-monitor-filelog:
|
||||
plugin: filelog
|
||||
pluginConfig:
|
||||
timestamp: "^.{15}"
|
||||
message: 'kernel: \[.*\] (.*)'
|
||||
timestampFormat: Jan _2 15:04:05
|
||||
logPath: "/var/log/kern.log"
|
||||
lookback: 5m
|
||||
bufferSize: 10
|
||||
source: kernel-monitor
|
||||
conditions:
|
||||
- type: KernelDeadlock
|
||||
reason: KernelHasNoDeadlock
|
||||
message: kernel has no deadlock
|
||||
rules:
|
||||
- type: temporary
|
||||
reason: OOMKilling
|
||||
pattern: Kill process \d+ (.+) score \d+ or sacrifice child\nKilled process \d+
|
||||
(.+) total-vm:\d+kB, anon-rss:\d+kB, file-rss:\d+kB.*
|
||||
- type: temporary
|
||||
reason: TaskHung
|
||||
pattern: task \S+:\w+ blocked for more than \w+ seconds\.
|
||||
- type: temporary
|
||||
reason: UnregisterNetDevice
|
||||
pattern: 'unregister_netdevice: waiting for \w+ to become free. Usage count = \d+'
|
||||
- type: temporary
|
||||
reason: KernelOops
|
||||
pattern: 'BUG: unable to handle kernel NULL pointer dereference at .*'
|
||||
- type: temporary
|
||||
reason: KernelOops
|
||||
pattern: 'divide error: 0000 \[#\d+\] SMP'
|
||||
- type: permanent
|
||||
condition: KernelDeadlock
|
||||
reason: AUFSUmountHung
|
||||
pattern: task umount\.aufs:\w+ blocked for more than \w+ seconds\.
|
||||
- type: permanent
|
||||
condition: KernelDeadlock
|
||||
reason: DockerHung
|
||||
pattern: task docker:\w+ blocked for more than \w+ seconds\.
|
||||
kernel-monitor-counter:
|
||||
plugin: custom
|
||||
pluginConfig:
|
||||
invoke_interval: 5m
|
||||
timeout: 1m
|
||||
max_output_length: 80
|
||||
concurrency: 1
|
||||
source: kernel-monitor
|
||||
conditions:
|
||||
- type: FrequentUnregisterNetDevice
|
||||
reason: NoFrequentUnregisterNetDevice
|
||||
message: node is functioning properly
|
||||
rules:
|
||||
- type: permanent
|
||||
condition: FrequentUnregisterNetDevice
|
||||
reason: UnregisterNetDevice
|
||||
path: "/home/kubernetes/bin/log-counter"
|
||||
args:
|
||||
- "--journald-source=kernel"
|
||||
- "--log-path=/var/log/journal"
|
||||
- "--lookback=20m"
|
||||
- "--count=3"
|
||||
- "--pattern=unregister_netdevice: waiting for \\w+ to become free. Usage count
|
||||
= \\d+"
|
||||
timeout: 1m
|
||||
docker-monitor:
|
||||
plugin: journald
|
||||
pluginConfig:
|
||||
source: dockerd
|
||||
logPath: "/var/log/journal"
|
||||
lookback: 5m
|
||||
bufferSize: 10
|
||||
source: docker-monitor
|
||||
conditions: []
|
||||
rules:
|
||||
- type: temporary
|
||||
reason: CorruptDockerImage
|
||||
pattern: 'Error trying v2 registry: failed to register layer: rename /var/lib/docker/image/(.+)
|
||||
/var/lib/docker/image/(.+): directory not empty.*'
|
||||
docker-monitor-filelog:
|
||||
plugin: filelog
|
||||
pluginConfig:
|
||||
timestamp: ^time="(\S*)"
|
||||
message: |-
|
||||
msg="([^
|
||||
]*)"
|
||||
timestampFormat: '2006-01-02T15:04:05.999999999-07:00'
|
||||
logPath: "/var/log/docker.log"
|
||||
lookback: 5m
|
||||
bufferSize: 10
|
||||
source: docker-monitor
|
||||
conditions: []
|
||||
rules:
|
||||
- type: temporary
|
||||
reason: CorruptDockerImage
|
||||
pattern: 'Error trying v2 registry: failed to register layer: rename /var/lib/docker/image/(.+)
|
||||
/var/lib/docker/image/(.+): directory not empty.*'
|
||||
docker-monitor-counter:
|
||||
plugin: custom
|
||||
pluginConfig:
|
||||
invoke_interval: 5m
|
||||
timeout: 1m
|
||||
max_output_length: 80
|
||||
concurrency: 1
|
||||
source: docker-monitor
|
||||
conditions:
|
||||
- type: CorruptDockerOverlay2
|
||||
reason: NoCorruptDockerOverlay2
|
||||
message: docker overlay2 is functioning properly
|
||||
rules:
|
||||
- type: permanent
|
||||
condition: CorruptDockerOverlay2
|
||||
reason: CorruptDockerOverlay2
|
||||
path: "/home/kubernetes/bin/log-counter"
|
||||
args:
|
||||
- "--journald-source=dockerd"
|
||||
- "--log-path=/var/log/journal"
|
||||
- "--lookback=5m"
|
||||
- "--count=10"
|
||||
- "--pattern=returned error: readlink /var/lib/docker/overlay2.*: invalid argument.*"
|
||||
timeout: 1m
|
||||
systemd-monitor:
|
||||
plugin: journald
|
||||
pluginConfig:
|
||||
source: systemd
|
||||
logPath: "/var/log/journal"
|
||||
lookback: ''
|
||||
bufferSize: 10
|
||||
source: systemd-monitor
|
||||
conditions: []
|
||||
rules:
|
||||
- type: temporary
|
||||
reason: KubeletStart
|
||||
pattern: Started Kubernetes kubelet.
|
||||
- type: temporary
|
||||
reason: DockerStart
|
||||
pattern: Starting Docker Application Container Engine...
|
||||
- type: temporary
|
||||
reason: ContainerdStart
|
||||
pattern: Starting containerd container runtime...
|
||||
systemd-monitor-counter:
|
||||
plugin: custom
|
||||
pluginConfig:
|
||||
invoke_interval: 5m
|
||||
timeout: 1m
|
||||
max_output_length: 80
|
||||
concurrency: 1
|
||||
source: systemd-monitor
|
||||
conditions:
|
||||
- type: FrequentKubeletRestart
|
||||
reason: NoFrequentKubeletRestart
|
||||
message: kubelet is functioning properly
|
||||
- type: FrequentDockerRestart
|
||||
reason: NoFrequentDockerRestart
|
||||
message: docker is functioning properly
|
||||
- type: FrequentContainerdRestart
|
||||
reason: NoFrequentContainerdRestart
|
||||
message: containerd is functioning properly
|
||||
rules:
|
||||
- type: permanent
|
||||
condition: FrequentKubeletRestart
|
||||
reason: FrequentKubeletRestart
|
||||
path: "/home/kubernetes/bin/log-counter"
|
||||
args:
|
||||
- "--journald-source=systemd"
|
||||
- "--log-path=/var/log/journal"
|
||||
- "--lookback=20m"
|
||||
- "--delay=5m"
|
||||
- "--count=5"
|
||||
- "--pattern=Started Kubernetes kubelet."
|
||||
timeout: 1m
|
||||
- type: permanent
|
||||
condition: FrequentDockerRestart
|
||||
reason: FrequentDockerRestart
|
||||
path: "/home/kubernetes/bin/log-counter"
|
||||
args:
|
||||
- "--journald-source=systemd"
|
||||
- "--log-path=/var/log/journal"
|
||||
- "--lookback=20m"
|
||||
- "--count=5"
|
||||
- "--pattern=Starting Docker Application Container Engine..."
|
||||
timeout: 1m
|
||||
- type: permanent
|
||||
condition: FrequentContainerdRestart
|
||||
reason: FrequentContainerdRestart
|
||||
path: "/home/kubernetes/bin/log-counter"
|
||||
args:
|
||||
- "--journald-source=systemd"
|
||||
- "--log-path=/var/log/journal"
|
||||
- "--lookback=20m"
|
||||
- "--count=5"
|
||||
- "--pattern=Starting containerd container runtime..."
|
||||
timeout: 1m
|
||||
custom-plugin-monitor:
|
||||
enabled:
|
||||
- /config/network-problem-monitor.json
|
||||
scripts:
|
||||
enabled:
|
||||
- network_problem.sh
|
||||
source:
|
||||
network_problem.sh: |
|
||||
#!/bin/bash
|
||||
|
||||
# This plugin checks for common network issues. Currently, it only checks
|
||||
# if the conntrack table is full.
|
||||
|
||||
OK=0
|
||||
NONOK=1
|
||||
UNKNOWN=2
|
||||
|
||||
[ -f /proc/sys/net/ipv4/netfilter/ip_conntrack_max ] || exit $UNKNOWN
|
||||
[ -f /proc/sys/net/ipv4/netfilter/ip_conntrack_count ] || exit $UNKNOWN
|
||||
|
||||
conntrack_max=$(cat /proc/sys/net/ipv4/netfilter/ip_conntrack_max)
|
||||
conntrack_count=$(cat /proc/sys/net/ipv4/netfilter/ip_conntrack_count)
|
||||
|
||||
if (( conntrack_count >= conntrack_max )); then
|
||||
echo "Conntrack table full"
|
||||
exit $NONOK
|
||||
fi
|
||||
|
||||
echo "Conntrack table available"
|
||||
exit $OK
|
||||
config:
|
||||
network-problem-monitor:
|
||||
plugin: custom
|
||||
pluginConfig:
|
||||
invoke_interval: 30s
|
||||
timeout: 5s
|
||||
max_output_length: 80
|
||||
concurrency: 3
|
||||
source: network-custom-plugin-monitor
|
||||
conditions: []
|
||||
rules:
|
||||
- type: temporary
|
||||
reason: ConntrackFull
|
||||
path: "./config/plugin/network_problem.sh"
|
||||
timeout: 3s
|
||||
system-stats-monitor:
|
||||
enabled:
|
||||
- /config/system-stats-monitor.json
|
||||
scripts:
|
||||
enabled: null
|
||||
source: null
|
||||
config:
|
||||
system-stats-monitor:
|
||||
disk:
|
||||
metricsConfigs:
|
||||
disk/io_time:
|
||||
displayName: disk/io_time
|
||||
disk/weighted_io:
|
||||
displayName: disk/weighted_io
|
||||
disk/avg_queue_len:
|
||||
displayName: disk/avg_queue_len
|
||||
includeRootBlk: true
|
||||
includeAllAttachedBlk: true
|
||||
lsblkTimeout: 5s
|
||||
invokeInterval: 60s
|
||||
...
|
38
tools/deployment/common/node-problem-detector.sh
Executable file
38
tools/deployment/common/node-problem-detector.sh
Executable file
@ -0,0 +1,38 @@
|
||||
#!/bin/bash
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
# not use this file except in compliance with the License. You may obtain
|
||||
# a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
# License for the specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
set -xe
|
||||
|
||||
#NOTE: Lint and package chart
|
||||
make kubernetes-node-problem-detector
|
||||
|
||||
#NOTE: Deploy command
|
||||
tee /tmp/kubernetes-node-problem-detector.yaml << EOF
|
||||
monitoring:
|
||||
prometheus:
|
||||
pod:
|
||||
enabled: false
|
||||
service:
|
||||
enabled: true
|
||||
manifests:
|
||||
service: true
|
||||
EOF
|
||||
helm upgrade --install kubernetes-node-problem-detector \
|
||||
./kubernetes-node-problem-detector --namespace=kube-system \
|
||||
--values=/tmp/kubernetes-node-problem-detector.yaml
|
||||
|
||||
#NOTE: Wait for deploy
|
||||
./tools/deployment/common/wait-for-pods.sh kube-system
|
||||
|
||||
#NOTE: Validate Deployment info
|
||||
helm status kubernetes-node-problem-detector
|
1
tools/deployment/multinode/075-node-problem-detector.sh
Symbolic link
1
tools/deployment/multinode/075-node-problem-detector.sh
Symbolic link
@ -0,0 +1 @@
|
||||
../common/node-problem-detector.sh
|
@ -0,0 +1 @@
|
||||
../common/node-problem-detector.sh
|
@ -67,6 +67,7 @@
|
||||
- ./tools/deployment/multinode/050-prometheus.sh
|
||||
- ./tools/deployment/multinode/060-alertmanager.sh
|
||||
- ./tools/deployment/multinode/070-kube-state-metrics.sh
|
||||
- ./tools/deployment/multinode/075-node-problem-detector.sh
|
||||
- ./tools/deployment/multinode/080-node-exporter.sh
|
||||
- ./tools/deployment/multinode/085-process-exporter.sh
|
||||
- ./tools/deployment/multinode/090-openstack-exporter.sh
|
||||
@ -190,6 +191,7 @@
|
||||
- ./tools/deployment/osh-infra-monitoring/050-prometheus.sh
|
||||
- ./tools/deployment/osh-infra-monitoring/060-alertmanager.sh
|
||||
- ./tools/deployment/osh-infra-monitoring/070-kube-state-metrics.sh
|
||||
- ./tools/deployment/osh-infra-monitoring/075-node-problem-detector.sh
|
||||
- ./tools/deployment/osh-infra-monitoring/080-node-exporter.sh
|
||||
- ./tools/deployment/osh-infra-monitoring/090-process-exporter.sh
|
||||
- ./tools/deployment/osh-infra-monitoring/100-openstack-exporter.sh
|
||||
|
Loading…
Reference in New Issue
Block a user