Add node-problem-detector chart
This adds a chart for the node problem detector. This chart will help provide additional insight into the status of the underlying infrastructure of a deployment. Updated the chart with new yamllint checks. Change-Id: I21a24b67b121388107b20ab38ac7703c7a33f1c1 Signed-off-by: Steve Wilkerson <sw5822@att.com>
This commit is contained in:
parent
26350f37aa
commit
a31bb2b049
24
kubernetes-node-problem-detector/Chart.yaml
Normal file
24
kubernetes-node-problem-detector/Chart.yaml
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
description: OpenStack-Helm Kubernetes Node Problem Detector
|
||||||
|
name: kubernetes-node-problem-detector
|
||||||
|
version: 0.1.0
|
||||||
|
home: https://github.com/kubernetes/node-problem-detector
|
||||||
|
sources:
|
||||||
|
- https://github.com/kubernetes/node-problem-detector
|
||||||
|
- https://opendev.org/openstack/openstack-helm-infra
|
||||||
|
maintainers:
|
||||||
|
- name: OpenStack-Helm Authors
|
||||||
|
...
|
18
kubernetes-node-problem-detector/requirements.yaml
Normal file
18
kubernetes-node-problem-detector/requirements.yaml
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
---
|
||||||
|
dependencies:
|
||||||
|
- name: helm-toolkit
|
||||||
|
repository: http://localhost:8879/charts
|
||||||
|
version: 0.1.0
|
||||||
|
...
|
@ -0,0 +1,25 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
{{/*
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
*/}}
|
||||||
|
|
||||||
|
set -ex
|
||||||
|
|
||||||
|
exec /node-problem-detector \
|
||||||
|
{{- range $monitor, $monitorConfig := .Values.conf.monitors }}
|
||||||
|
{{- if $monitorConfig.enabled }}
|
||||||
|
--config.{{$monitor}}={{ include "helm-toolkit.utils.joinListWithComma" $monitorConfig.enabled }} \
|
||||||
|
{{- end }}
|
||||||
|
{{- end }}
|
||||||
|
--logtostderr \
|
||||||
|
--prometheus-address=0.0.0.0
|
@ -0,0 +1,36 @@
|
|||||||
|
{{/*
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
*/}}
|
||||||
|
|
||||||
|
{{- if .Values.manifests.configmap_bin }}
|
||||||
|
{{- $envAll := . }}
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: ConfigMap
|
||||||
|
metadata:
|
||||||
|
name: node-problem-detector-bin
|
||||||
|
data:
|
||||||
|
node-problem-detector.sh: |
|
||||||
|
{{ tuple "bin/_node-problem-detector.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
|
||||||
|
image-repo-sync.sh: |
|
||||||
|
{{- include "helm-toolkit.scripts.image_repo_sync" . | indent 4 }}
|
||||||
|
{{- range $monitor, $monitorConfig := $envAll.Values.conf.monitors }}
|
||||||
|
{{- $scripts := $monitorConfig.scripts }}
|
||||||
|
{{- range $script, $scriptSource := $scripts.source }}
|
||||||
|
{{- if has $script $scripts.enabled }}
|
||||||
|
{{$script}}: |
|
||||||
|
{{$scriptSource | indent 4 -}}
|
||||||
|
{{- end }}
|
||||||
|
{{- end -}}
|
||||||
|
{{- end -}}
|
||||||
|
{{- end }}
|
@ -0,0 +1,31 @@
|
|||||||
|
{{/*
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
*/}}
|
||||||
|
|
||||||
|
{{- if .Values.manifests.configmap_etc }}
|
||||||
|
|
||||||
|
{{- $envAll := . }}
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Secret
|
||||||
|
metadata:
|
||||||
|
name: node-problem-detector-etc
|
||||||
|
type: Opaque
|
||||||
|
data:
|
||||||
|
{{- range $monitor, $monitorConfig := $envAll.Values.conf.monitors }}
|
||||||
|
{{- $plugins := $monitorConfig.config }}
|
||||||
|
{{- range $plugin, $config := $plugins }}
|
||||||
|
{{$plugin}}.json: {{ toJson $config | b64enc }}
|
||||||
|
{{- end }}
|
||||||
|
{{ end }}
|
||||||
|
{{- end }}
|
135
kubernetes-node-problem-detector/templates/daemonset.yaml
Normal file
135
kubernetes-node-problem-detector/templates/daemonset.yaml
Normal file
@ -0,0 +1,135 @@
|
|||||||
|
{{/*
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
*/}}
|
||||||
|
|
||||||
|
{{- if .Values.manifests.daemonset }}
|
||||||
|
{{- $envAll := . }}
|
||||||
|
|
||||||
|
{{- $serviceAccountName := printf "%s-%s" .Release.Name "node-problem-detector" }}
|
||||||
|
{{ tuple $envAll "node_problem_detector" $serviceAccountName | include "helm-toolkit.snippets.kubernetes_pod_rbac_serviceaccount" }}
|
||||||
|
---
|
||||||
|
# Use the stable RBAC API group version; v1beta1 is deprecated and no longer
# served by newer Kubernetes releases.
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
kind: ClusterRoleBinding
|
||||||
|
metadata:
|
||||||
|
name: run-node-problem-detector
|
||||||
|
subjects:
|
||||||
|
- kind: ServiceAccount
|
||||||
|
name: {{ $serviceAccountName }}
|
||||||
|
namespace: {{ .Release.Namespace }}
|
||||||
|
roleRef:
|
||||||
|
kind: ClusterRole
|
||||||
|
name: cluster-admin
|
||||||
|
apiGroup: rbac.authorization.k8s.io
|
||||||
|
---
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: DaemonSet
|
||||||
|
metadata:
|
||||||
|
name: node-problem-detector
|
||||||
|
annotations:
|
||||||
|
{{ tuple $envAll | include "helm-toolkit.snippets.release_uuid" }}
|
||||||
|
labels:
|
||||||
|
{{ tuple $envAll "node_problem_detector" "metrics" | include "helm-toolkit.snippets.kubernetes_metadata_labels" | indent 4 }}
|
||||||
|
spec:
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
{{ tuple $envAll "node_problem_detector" "metrics" | include "helm-toolkit.snippets.kubernetes_metadata_labels" | indent 6 }}
|
||||||
|
{{ tuple $envAll "node_problem_detector" | include "helm-toolkit.snippets.kubernetes_upgrades_daemonset" | indent 2 }}
|
||||||
|
template:
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
{{ tuple $envAll "node_problem_detector" "metrics" | include "helm-toolkit.snippets.kubernetes_metadata_labels" | indent 8 }}
|
||||||
|
annotations:
|
||||||
|
{{- if .Values.monitoring.prometheus.pod.enabled }}
|
||||||
|
{{- $prometheus_annotations := $envAll.Values.monitoring.prometheus.node_problem_detector }}
|
||||||
|
{{ tuple $prometheus_annotations | include "helm-toolkit.snippets.prometheus_pod_annotations" | indent 8 }}
|
||||||
|
{{- end }}
|
||||||
|
{{ dict "envAll" $envAll "podName" "node-problem-detector" "containerNames" (list "node-problem-detector") | include "helm-toolkit.snippets.kubernetes_mandatory_access_control_annotation" | indent 8 }}
|
||||||
|
{{ tuple $envAll | include "helm-toolkit.snippets.release_uuid" | indent 8 }}
|
||||||
|
configmap-bin-hash: {{ tuple "configmap-bin.yaml" . | include "helm-toolkit.utils.hash" }}
|
||||||
|
spec:
|
||||||
|
{{ dict "envAll" $envAll "application" "node_problem_detector" | include "helm-toolkit.snippets.kubernetes_pod_security_context" | indent 6 }}
|
||||||
|
serviceAccountName: {{ $serviceAccountName }}
|
||||||
|
{{ if .Values.pod.tolerations.node_problem_detector.enabled }}
|
||||||
|
{{/* The application key must match pod.tolerations.node_problem_detector, the only tolerations entry this chart's values define. */}}
{{ tuple $envAll "node_problem_detector" | include "helm-toolkit.snippets.kubernetes_tolerations" | indent 6 }}
|
||||||
|
{{ else }}
|
||||||
|
nodeSelector:
|
||||||
|
{{ .Values.labels.node_problem_detector.node_selector_key }}: {{ .Values.labels.node_problem_detector.node_selector_value | quote }}
|
||||||
|
{{ end }}
|
||||||
|
containers:
|
||||||
|
- name: node-problem-detector
|
||||||
|
{{ tuple $envAll "node_problem_detector" | include "helm-toolkit.snippets.image" | indent 10 }}
|
||||||
|
{{ tuple $envAll $envAll.Values.pod.resources.node_problem_detector | include "helm-toolkit.snippets.kubernetes_resources" | indent 10 }}
|
||||||
|
{{ dict "envAll" $envAll "application" "node_problem_detector" "container" "node_problem_detector" | include "helm-toolkit.snippets.kubernetes_container_security_context" | indent 10 }}
|
||||||
|
command:
|
||||||
|
- /tmp/node-problem-detector.sh
|
||||||
|
ports:
|
||||||
|
- name: metrics
|
||||||
|
containerPort: {{ tuple "node_problem_detector" "internal" "metrics" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}
|
||||||
|
env:
|
||||||
|
- name: NODE_NAME
|
||||||
|
valueFrom:
|
||||||
|
fieldRef:
|
||||||
|
fieldPath: spec.nodeName
|
||||||
|
volumeMounts:
|
||||||
|
- name: log
|
||||||
|
mountPath: /var/log
|
||||||
|
readOnly: true
|
||||||
|
- name: kmsg
|
||||||
|
mountPath: /dev/kmsg
|
||||||
|
readOnly: true
|
||||||
|
- name: localtime
|
||||||
|
mountPath: /etc/localtime
|
||||||
|
readOnly: true
|
||||||
|
- name: node-problem-detector-bin
|
||||||
|
mountPath: /tmp/node-problem-detector.sh
|
||||||
|
subPath: node-problem-detector.sh
|
||||||
|
readOnly: true
|
||||||
|
{{- range $monitor, $monitorConfig := $envAll.Values.conf.monitors }}
|
||||||
|
{{- $scripts := $monitorConfig.scripts }}
|
||||||
|
{{- range $script, $scriptSource := $scripts.source }}
|
||||||
|
{{- if has $script $scripts.enabled }}
|
||||||
|
- name: node-problem-detector-bin
|
||||||
|
mountPath: /config/plugin/{{$script}}
|
||||||
|
subPath: {{$script}}
|
||||||
|
{{- end }}
|
||||||
|
{{- end }}
|
||||||
|
{{- end }}
|
||||||
|
{{- range $monitor, $monitorConfig := $envAll.Values.conf.monitors }}
|
||||||
|
{{- $plugins := $monitorConfig.config }}
|
||||||
|
{{- range $plugin, $config := $plugins }}
|
||||||
|
- name: node-problem-detector-etc
|
||||||
|
mountPath: /config/{{$plugin}}.json
|
||||||
|
subPath: {{$plugin}}.json
|
||||||
|
{{- end }}
|
||||||
|
{{- end }}
|
||||||
|
volumes:
|
||||||
|
- name: pod-tmp
|
||||||
|
emptyDir: {}
|
||||||
|
- name: log
|
||||||
|
hostPath:
|
||||||
|
path: /var/log
|
||||||
|
- name: kmsg
|
||||||
|
hostPath:
|
||||||
|
path: /dev/kmsg
|
||||||
|
- name: localtime
|
||||||
|
hostPath:
|
||||||
|
path: /etc/localtime
|
||||||
|
- name: node-problem-detector-etc
|
||||||
|
secret:
|
||||||
|
secretName: node-problem-detector-etc
|
||||||
|
defaultMode: 292  # decimal form of octal 0444 (read-only for owner, group and others)
|
||||||
|
- name: node-problem-detector-bin
|
||||||
|
configMap:
|
||||||
|
name: node-problem-detector-bin
|
||||||
|
defaultMode: 365  # decimal form of octal 0555 (read + execute, so the mounted scripts can run)
|
||||||
|
{{- end }}
|
@ -0,0 +1,18 @@
|
|||||||
|
{{/*
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
*/}}
|
||||||
|
|
||||||
|
{{- if and .Values.manifests.job_image_repo_sync .Values.images.local_registry.active }}
|
||||||
|
{{- $imageRepoSyncJob := dict "envAll" . "serviceName" "node-problem-detector" -}}
|
||||||
|
{{ $imageRepoSyncJob | include "helm-toolkit.manifests.job_image_repo_sync" }}
|
||||||
|
{{- end }}
|
38
kubernetes-node-problem-detector/templates/service.yaml
Normal file
38
kubernetes-node-problem-detector/templates/service.yaml
Normal file
@ -0,0 +1,38 @@
|
|||||||
|
{{/*
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
*/}}
|
||||||
|
|
||||||
|
{{- if .Values.manifests.service }}
|
||||||
|
{{- $envAll := . }}
|
||||||
|
{{- $prometheus_annotations := $envAll.Values.monitoring.prometheus.node_problem_detector }}
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Service
|
||||||
|
metadata:
|
||||||
|
name: {{ tuple "node_problem_detector" "internal" . | include "helm-toolkit.endpoints.hostname_short_endpoint_lookup" }}
|
||||||
|
labels:
|
||||||
|
{{ tuple $envAll "node_problem_detector" "metrics" | include "helm-toolkit.snippets.kubernetes_metadata_labels" | indent 4 }}
|
||||||
|
annotations:
|
||||||
|
{{- if .Values.monitoring.prometheus.service.enabled }}
|
||||||
|
{{ tuple $prometheus_annotations | include "helm-toolkit.snippets.prometheus_service_annotations" | indent 4 }}
|
||||||
|
{{- end }}
|
||||||
|
spec:
|
||||||
|
type: ClusterIP
|
||||||
|
clusterIP: None
|
||||||
|
ports:
|
||||||
|
- name: metrics
|
||||||
|
port: {{ tuple "node_problem_detector" "internal" "metrics" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}
|
||||||
|
targetPort: {{ tuple "node_problem_detector" "internal" "metrics" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}
|
||||||
|
selector:
|
||||||
|
{{ tuple $envAll "node_problem_detector" "metrics" | include "helm-toolkit.snippets.kubernetes_metadata_labels" | indent 4 }}
|
||||||
|
{{- end }}
|
465
kubernetes-node-problem-detector/values.yaml
Normal file
465
kubernetes-node-problem-detector/values.yaml
Normal file
@ -0,0 +1,465 @@
|
|||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
# Default values for kubernetes-node-problem-detector.
|
||||||
|
# This is a YAML-formatted file.
|
||||||
|
# Declare variables to be passed into your templates.
|
||||||
|
|
||||||
|
---
|
||||||
|
images:
|
||||||
|
tags:
|
||||||
|
node_problem_detector: k8s.gcr.io/node-problem-detector:v0.7.0
|
||||||
|
dep_check: quay.io/airshipit/kubernetes-entrypoint:v1.0.0
|
||||||
|
image_repo_sync: docker.io/docker:17.07.0
|
||||||
|
pull_policy: IfNotPresent
|
||||||
|
local_registry:
|
||||||
|
active: false
|
||||||
|
exclude:
|
||||||
|
- dep_check
|
||||||
|
- image_repo_sync
|
||||||
|
|
||||||
|
labels:
|
||||||
|
node_problem_detector:
|
||||||
|
node_selector_key: openstack-control-plane
|
||||||
|
node_selector_value: enabled
|
||||||
|
job:
|
||||||
|
node_selector_key: openstack-control-plane
|
||||||
|
node_selector_value: enabled
|
||||||
|
|
||||||
|
pod:
|
||||||
|
security_context:
|
||||||
|
node_problem_detector:
|
||||||
|
container:
|
||||||
|
node_problem_detector:
|
||||||
|
privileged: true
|
||||||
|
affinity:
|
||||||
|
anti:
|
||||||
|
type:
|
||||||
|
default: preferredDuringSchedulingIgnoredDuringExecution
|
||||||
|
topologyKey:
|
||||||
|
default: kubernetes.io/hostname
|
||||||
|
mounts:
|
||||||
|
node_problem_detector:
|
||||||
|
node_problem_detector:
|
||||||
|
init_container: null
|
||||||
|
lifecycle:
|
||||||
|
upgrades:
|
||||||
|
daemonsets:
|
||||||
|
pod_replacement_strategy: RollingUpdate
|
||||||
|
node_problem_detector:
|
||||||
|
enabled: true
|
||||||
|
min_ready_seconds: 0
|
||||||
|
revision_history: 3
|
||||||
|
pod_replacement_strategy: RollingUpdate
|
||||||
|
rolling_update:
|
||||||
|
max_unavailable: 1
|
||||||
|
max_surge: 3
|
||||||
|
termination_grace_period:
|
||||||
|
node_problem_detector:
|
||||||
|
timeout: 30
|
||||||
|
resources:
|
||||||
|
enabled: false
|
||||||
|
node_problem_detector:
|
||||||
|
requests:
|
||||||
|
memory: "128Mi"
|
||||||
|
cpu: "100m"
|
||||||
|
limits:
|
||||||
|
memory: "1024Mi"
|
||||||
|
cpu: "2000m"
|
||||||
|
jobs:
|
||||||
|
image_repo_sync:
|
||||||
|
requests:
|
||||||
|
memory: "128Mi"
|
||||||
|
cpu: "100m"
|
||||||
|
limits:
|
||||||
|
memory: "1024Mi"
|
||||||
|
cpu: "2000m"
|
||||||
|
tolerations:
|
||||||
|
node_problem_detector:
|
||||||
|
enabled: false
|
||||||
|
tolerations:
|
||||||
|
- key: node-role.kubernetes.io/master
|
||||||
|
operator: Exists
|
||||||
|
- key: node-role.kubernetes.io/node
|
||||||
|
operator: Exists
|
||||||
|
dependencies:
|
||||||
|
dynamic:
|
||||||
|
common:
|
||||||
|
local_image_registry:
|
||||||
|
jobs:
|
||||||
|
# Wait on this chart's own image-repo-sync job (created with serviceName
# "node-problem-detector"), not the node-exporter chart's job.
# NOTE(review): assumes helm-toolkit names the job "<serviceName>-image-repo-sync" - confirm.
- node-problem-detector-image-repo-sync
|
||||||
|
services:
|
||||||
|
- endpoint: node
|
||||||
|
service: local_image_registry
|
||||||
|
static:
|
||||||
|
image_repo_sync:
|
||||||
|
services:
|
||||||
|
- endpoint: internal
|
||||||
|
service: local_image_registry
|
||||||
|
node_problem_detector:
|
||||||
|
services: null
|
||||||
|
|
||||||
|
monitoring:
|
||||||
|
prometheus:
|
||||||
|
pod:
|
||||||
|
enabled: true
|
||||||
|
service:
|
||||||
|
enabled: false
|
||||||
|
node_problem_detector:
|
||||||
|
scrape: true
|
||||||
|
port: 20257
|
||||||
|
|
||||||
|
endpoints:
|
||||||
|
cluster_domain_suffix: cluster.local
|
||||||
|
local_image_registry:
|
||||||
|
name: docker-registry
|
||||||
|
namespace: docker-registry
|
||||||
|
hosts:
|
||||||
|
default: localhost
|
||||||
|
internal: docker-registry
|
||||||
|
node: localhost
|
||||||
|
host_fqdn_override:
|
||||||
|
default: null
|
||||||
|
port:
|
||||||
|
registry:
|
||||||
|
node: 5000
|
||||||
|
node_problem_detector:
|
||||||
|
name: node-problem-detector
|
||||||
|
namespace: null
|
||||||
|
hosts:
|
||||||
|
default: node-problem-detector
|
||||||
|
host_fqdn_override:
|
||||||
|
default: null
|
||||||
|
path:
|
||||||
|
default: null
|
||||||
|
port:
|
||||||
|
metrics:
|
||||||
|
default: 20257
|
||||||
|
|
||||||
|
manifests:
|
||||||
|
configmap_bin: true
|
||||||
|
configmap_etc: true
|
||||||
|
daemonset: true
|
||||||
|
job_image_repo_sync: true
|
||||||
|
service: false
|
||||||
|
|
||||||
|
conf:
|
||||||
|
monitors:
|
||||||
|
system-log-monitor:
|
||||||
|
enabled:
|
||||||
|
- /config/kernel-monitor.json
|
||||||
|
- /config/docker-monitor.json
|
||||||
|
- /config/systemd-monitor.json
|
||||||
|
scripts:
|
||||||
|
enabled: null
|
||||||
|
source: null
|
||||||
|
config:
|
||||||
|
kernel-monitor:
|
||||||
|
plugin: kmsg
|
||||||
|
logPath: "/dev/kmsg"
|
||||||
|
lookback: 5m
|
||||||
|
bufferSize: 10
|
||||||
|
source: kernel-monitor
|
||||||
|
conditions:
|
||||||
|
- type: KernelDeadlock
|
||||||
|
reason: KernelHasNoDeadlock
|
||||||
|
message: kernel has no deadlock
|
||||||
|
- type: ReadonlyFilesystem
|
||||||
|
reason: FilesystemIsNotReadOnly
|
||||||
|
message: Filesystem is not read-only
|
||||||
|
rules:
|
||||||
|
- type: temporary
|
||||||
|
reason: OOMKilling
|
||||||
|
pattern: Kill process \d+ (.+) score \d+ or sacrifice child\nKilled process \d+
|
||||||
|
(.+) total-vm:\d+kB, anon-rss:\d+kB, file-rss:\d+kB.*
|
||||||
|
- type: temporary
|
||||||
|
reason: TaskHung
|
||||||
|
pattern: task \S+:\w+ blocked for more than \w+ seconds\.
|
||||||
|
- type: temporary
|
||||||
|
reason: UnregisterNetDevice
|
||||||
|
pattern: 'unregister_netdevice: waiting for \w+ to become free. Usage count = \d+'
|
||||||
|
- type: temporary
|
||||||
|
reason: KernelOops
|
||||||
|
pattern: 'BUG: unable to handle kernel NULL pointer dereference at .*'
|
||||||
|
- type: temporary
|
||||||
|
reason: KernelOops
|
||||||
|
pattern: 'divide error: 0000 \[#\d+\] SMP'
|
||||||
|
- type: permanent
|
||||||
|
condition: KernelDeadlock
|
||||||
|
reason: AUFSUmountHung
|
||||||
|
pattern: task umount\.aufs:\w+ blocked for more than \w+ seconds\.
|
||||||
|
- type: permanent
|
||||||
|
condition: KernelDeadlock
|
||||||
|
reason: DockerHung
|
||||||
|
pattern: task docker:\w+ blocked for more than \w+ seconds\.
|
||||||
|
- type: permanent
|
||||||
|
condition: ReadonlyFilesystem
|
||||||
|
reason: FilesystemIsReadOnly
|
||||||
|
pattern: Remounting filesystem read-only
|
||||||
|
kernel-monitor-filelog:
|
||||||
|
plugin: filelog
|
||||||
|
pluginConfig:
|
||||||
|
timestamp: "^.{15}"
|
||||||
|
message: 'kernel: \[.*\] (.*)'
|
||||||
|
timestampFormat: Jan _2 15:04:05
|
||||||
|
logPath: "/var/log/kern.log"
|
||||||
|
lookback: 5m
|
||||||
|
bufferSize: 10
|
||||||
|
source: kernel-monitor
|
||||||
|
conditions:
|
||||||
|
- type: KernelDeadlock
|
||||||
|
reason: KernelHasNoDeadlock
|
||||||
|
message: kernel has no deadlock
|
||||||
|
rules:
|
||||||
|
- type: temporary
|
||||||
|
reason: OOMKilling
|
||||||
|
pattern: Kill process \d+ (.+) score \d+ or sacrifice child\nKilled process \d+
|
||||||
|
(.+) total-vm:\d+kB, anon-rss:\d+kB, file-rss:\d+kB.*
|
||||||
|
- type: temporary
|
||||||
|
reason: TaskHung
|
||||||
|
pattern: task \S+:\w+ blocked for more than \w+ seconds\.
|
||||||
|
- type: temporary
|
||||||
|
reason: UnregisterNetDevice
|
||||||
|
pattern: 'unregister_netdevice: waiting for \w+ to become free. Usage count = \d+'
|
||||||
|
- type: temporary
|
||||||
|
reason: KernelOops
|
||||||
|
pattern: 'BUG: unable to handle kernel NULL pointer dereference at .*'
|
||||||
|
- type: temporary
|
||||||
|
reason: KernelOops
|
||||||
|
pattern: 'divide error: 0000 \[#\d+\] SMP'
|
||||||
|
- type: permanent
|
||||||
|
condition: KernelDeadlock
|
||||||
|
reason: AUFSUmountHung
|
||||||
|
pattern: task umount\.aufs:\w+ blocked for more than \w+ seconds\.
|
||||||
|
- type: permanent
|
||||||
|
condition: KernelDeadlock
|
||||||
|
reason: DockerHung
|
||||||
|
pattern: task docker:\w+ blocked for more than \w+ seconds\.
|
||||||
|
kernel-monitor-counter:
|
||||||
|
plugin: custom
|
||||||
|
pluginConfig:
|
||||||
|
invoke_interval: 5m
|
||||||
|
timeout: 1m
|
||||||
|
max_output_length: 80
|
||||||
|
concurrency: 1
|
||||||
|
source: kernel-monitor
|
||||||
|
conditions:
|
||||||
|
- type: FrequentUnregisterNetDevice
|
||||||
|
reason: NoFrequentUnregisterNetDevice
|
||||||
|
message: node is functioning properly
|
||||||
|
rules:
|
||||||
|
- type: permanent
|
||||||
|
condition: FrequentUnregisterNetDevice
|
||||||
|
reason: UnregisterNetDevice
|
||||||
|
path: "/home/kubernetes/bin/log-counter"
|
||||||
|
args:
|
||||||
|
- "--journald-source=kernel"
|
||||||
|
- "--log-path=/var/log/journal"
|
||||||
|
- "--lookback=20m"
|
||||||
|
- "--count=3"
|
||||||
|
- "--pattern=unregister_netdevice: waiting for \\w+ to become free. Usage count
|
||||||
|
= \\d+"
|
||||||
|
timeout: 1m
|
||||||
|
docker-monitor:
|
||||||
|
plugin: journald
|
||||||
|
pluginConfig:
|
||||||
|
source: dockerd
|
||||||
|
logPath: "/var/log/journal"
|
||||||
|
lookback: 5m
|
||||||
|
bufferSize: 10
|
||||||
|
source: docker-monitor
|
||||||
|
conditions: []
|
||||||
|
rules:
|
||||||
|
- type: temporary
|
||||||
|
reason: CorruptDockerImage
|
||||||
|
pattern: 'Error trying v2 registry: failed to register layer: rename /var/lib/docker/image/(.+)
|
||||||
|
/var/lib/docker/image/(.+): directory not empty.*'
|
||||||
|
docker-monitor-filelog:
|
||||||
|
plugin: filelog
|
||||||
|
pluginConfig:
|
||||||
|
timestamp: ^time="(\S*)"
|
||||||
|
message: |-
|
||||||
|
msg="([^
|
||||||
|
]*)"
|
||||||
|
timestampFormat: '2006-01-02T15:04:05.999999999-07:00'
|
||||||
|
logPath: "/var/log/docker.log"
|
||||||
|
lookback: 5m
|
||||||
|
bufferSize: 10
|
||||||
|
source: docker-monitor
|
||||||
|
conditions: []
|
||||||
|
rules:
|
||||||
|
- type: temporary
|
||||||
|
reason: CorruptDockerImage
|
||||||
|
pattern: 'Error trying v2 registry: failed to register layer: rename /var/lib/docker/image/(.+)
|
||||||
|
/var/lib/docker/image/(.+): directory not empty.*'
|
||||||
|
docker-monitor-counter:
|
||||||
|
plugin: custom
|
||||||
|
pluginConfig:
|
||||||
|
invoke_interval: 5m
|
||||||
|
timeout: 1m
|
||||||
|
max_output_length: 80
|
||||||
|
concurrency: 1
|
||||||
|
source: docker-monitor
|
||||||
|
conditions:
|
||||||
|
- type: CorruptDockerOverlay2
|
||||||
|
reason: NoCorruptDockerOverlay2
|
||||||
|
message: docker overlay2 is functioning properly
|
||||||
|
rules:
|
||||||
|
- type: permanent
|
||||||
|
condition: CorruptDockerOverlay2
|
||||||
|
reason: CorruptDockerOverlay2
|
||||||
|
path: "/home/kubernetes/bin/log-counter"
|
||||||
|
args:
|
||||||
|
- "--journald-source=dockerd"
|
||||||
|
- "--log-path=/var/log/journal"
|
||||||
|
- "--lookback=5m"
|
||||||
|
- "--count=10"
|
||||||
|
- "--pattern=returned error: readlink /var/lib/docker/overlay2.*: invalid argument.*"
|
||||||
|
timeout: 1m
|
||||||
|
systemd-monitor:
|
||||||
|
plugin: journald
|
||||||
|
pluginConfig:
|
||||||
|
source: systemd
|
||||||
|
logPath: "/var/log/journal"
|
||||||
|
lookback: ''
|
||||||
|
bufferSize: 10
|
||||||
|
source: systemd-monitor
|
||||||
|
conditions: []
|
||||||
|
rules:
|
||||||
|
- type: temporary
|
||||||
|
reason: KubeletStart
|
||||||
|
pattern: Started Kubernetes kubelet.
|
||||||
|
- type: temporary
|
||||||
|
reason: DockerStart
|
||||||
|
pattern: Starting Docker Application Container Engine...
|
||||||
|
- type: temporary
|
||||||
|
reason: ContainerdStart
|
||||||
|
pattern: Starting containerd container runtime...
|
||||||
|
systemd-monitor-counter:
|
||||||
|
plugin: custom
|
||||||
|
pluginConfig:
|
||||||
|
invoke_interval: 5m
|
||||||
|
timeout: 1m
|
||||||
|
max_output_length: 80
|
||||||
|
concurrency: 1
|
||||||
|
source: systemd-monitor
|
||||||
|
conditions:
|
||||||
|
- type: FrequentKubeletRestart
|
||||||
|
reason: NoFrequentKubeletRestart
|
||||||
|
message: kubelet is functioning properly
|
||||||
|
- type: FrequentDockerRestart
|
||||||
|
reason: NoFrequentDockerRestart
|
||||||
|
message: docker is functioning properly
|
||||||
|
- type: FrequentContainerdRestart
|
||||||
|
reason: NoFrequentContainerdRestart
|
||||||
|
message: containerd is functioning properly
|
||||||
|
rules:
|
||||||
|
- type: permanent
|
||||||
|
condition: FrequentKubeletRestart
|
||||||
|
reason: FrequentKubeletRestart
|
||||||
|
path: "/home/kubernetes/bin/log-counter"
|
||||||
|
args:
|
||||||
|
- "--journald-source=systemd"
|
||||||
|
- "--log-path=/var/log/journal"
|
||||||
|
- "--lookback=20m"
|
||||||
|
- "--delay=5m"
|
||||||
|
- "--count=5"
|
||||||
|
- "--pattern=Started Kubernetes kubelet."
|
||||||
|
timeout: 1m
|
||||||
|
- type: permanent
|
||||||
|
condition: FrequentDockerRestart
|
||||||
|
reason: FrequentDockerRestart
|
||||||
|
path: "/home/kubernetes/bin/log-counter"
|
||||||
|
args:
|
||||||
|
- "--journald-source=systemd"
|
||||||
|
- "--log-path=/var/log/journal"
|
||||||
|
- "--lookback=20m"
|
||||||
|
- "--count=5"
|
||||||
|
- "--pattern=Starting Docker Application Container Engine..."
|
||||||
|
timeout: 1m
|
||||||
|
- type: permanent
|
||||||
|
condition: FrequentContainerdRestart
|
||||||
|
reason: FrequentContainerdRestart
|
||||||
|
path: "/home/kubernetes/bin/log-counter"
|
||||||
|
args:
|
||||||
|
- "--journald-source=systemd"
|
||||||
|
- "--log-path=/var/log/journal"
|
||||||
|
- "--lookback=20m"
|
||||||
|
- "--count=5"
|
||||||
|
- "--pattern=Starting containerd container runtime..."
|
||||||
|
timeout: 1m
|
||||||
|
custom-plugin-monitor:
|
||||||
|
enabled:
|
||||||
|
- /config/network-problem-monitor.json
|
||||||
|
scripts:
|
||||||
|
enabled:
|
||||||
|
- network_problem.sh
|
||||||
|
source:
|
||||||
|
network_problem.sh: |
|
||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# This plugin checks for common network issues. Currently, it only checks
|
||||||
|
# if the conntrack table is full.
|
||||||
|
|
||||||
|
OK=0
|
||||||
|
NONOK=1
|
||||||
|
UNKNOWN=2
|
||||||
|
|
||||||
|
[ -f /proc/sys/net/ipv4/netfilter/ip_conntrack_max ] || exit $UNKNOWN
|
||||||
|
[ -f /proc/sys/net/ipv4/netfilter/ip_conntrack_count ] || exit $UNKNOWN
|
||||||
|
|
||||||
|
conntrack_max=$(cat /proc/sys/net/ipv4/netfilter/ip_conntrack_max)
|
||||||
|
conntrack_count=$(cat /proc/sys/net/ipv4/netfilter/ip_conntrack_count)
|
||||||
|
|
||||||
|
if (( conntrack_count >= conntrack_max )); then
|
||||||
|
echo "Conntrack table full"
|
||||||
|
exit $NONOK
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "Conntrack table available"
|
||||||
|
exit $OK
|
||||||
|
config:
|
||||||
|
network-problem-monitor:
|
||||||
|
plugin: custom
|
||||||
|
pluginConfig:
|
||||||
|
invoke_interval: 30s
|
||||||
|
timeout: 5s
|
||||||
|
max_output_length: 80
|
||||||
|
concurrency: 3
|
||||||
|
source: network-custom-plugin-monitor
|
||||||
|
conditions: []
|
||||||
|
rules:
|
||||||
|
- type: temporary
|
||||||
|
reason: ConntrackFull
|
||||||
|
path: "./config/plugin/network_problem.sh"
|
||||||
|
timeout: 3s
|
||||||
|
system-stats-monitor:
|
||||||
|
enabled:
|
||||||
|
- /config/system-stats-monitor.json
|
||||||
|
scripts:
|
||||||
|
enabled: null
|
||||||
|
source: null
|
||||||
|
config:
|
||||||
|
system-stats-monitor:
|
||||||
|
disk:
|
||||||
|
metricsConfigs:
|
||||||
|
disk/io_time:
|
||||||
|
displayName: disk/io_time
|
||||||
|
disk/weighted_io:
|
||||||
|
displayName: disk/weighted_io
|
||||||
|
disk/avg_queue_len:
|
||||||
|
displayName: disk/avg_queue_len
|
||||||
|
includeRootBlk: true
|
||||||
|
includeAllAttachedBlk: true
|
||||||
|
lsblkTimeout: 5s
|
||||||
|
invokeInterval: 60s
|
||||||
|
...
|
38
tools/deployment/common/node-problem-detector.sh
Executable file
38
tools/deployment/common/node-problem-detector.sh
Executable file
@ -0,0 +1,38 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||||
|
# not use this file except in compliance with the License. You may obtain
|
||||||
|
# a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||||
|
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||||
|
# License for the specific language governing permissions and limitations
|
||||||
|
# under the License.
|
||||||
|
|
||||||
|
# Trace every command and abort on the first failure.
set -xe

#NOTE: Lint and package chart
make kubernetes-node-problem-detector

#NOTE: Deploy command
# Write the values override consumed by helm below; tee also echoes it to
# stdout so it shows up in the job log. The nesting matters: "enabled" and
# "service" must sit under their parent keys, otherwise they collapse into
# duplicate top-level keys and the override is not applied as intended.
tee /tmp/kubernetes-node-problem-detector.yaml << EOF
monitoring:
  prometheus:
    pod:
      enabled: false
    service:
      enabled: true
manifests:
  service: true
EOF
helm upgrade --install kubernetes-node-problem-detector \
    ./kubernetes-node-problem-detector --namespace=kube-system \
    --values=/tmp/kubernetes-node-problem-detector.yaml

#NOTE: Wait for deploy
./tools/deployment/common/wait-for-pods.sh kube-system

#NOTE: Validate Deployment info
helm status kubernetes-node-problem-detector
|
1
tools/deployment/multinode/075-node-problem-detector.sh
Symbolic link
1
tools/deployment/multinode/075-node-problem-detector.sh
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
../common/node-problem-detector.sh
|
@ -0,0 +1 @@
|
|||||||
|
../common/node-problem-detector.sh
|
@ -67,6 +67,7 @@
|
|||||||
- ./tools/deployment/multinode/050-prometheus.sh
|
- ./tools/deployment/multinode/050-prometheus.sh
|
||||||
- ./tools/deployment/multinode/060-alertmanager.sh
|
- ./tools/deployment/multinode/060-alertmanager.sh
|
||||||
- ./tools/deployment/multinode/070-kube-state-metrics.sh
|
- ./tools/deployment/multinode/070-kube-state-metrics.sh
|
||||||
|
- ./tools/deployment/multinode/075-node-problem-detector.sh
|
||||||
- ./tools/deployment/multinode/080-node-exporter.sh
|
- ./tools/deployment/multinode/080-node-exporter.sh
|
||||||
- ./tools/deployment/multinode/085-process-exporter.sh
|
- ./tools/deployment/multinode/085-process-exporter.sh
|
||||||
- ./tools/deployment/multinode/090-openstack-exporter.sh
|
- ./tools/deployment/multinode/090-openstack-exporter.sh
|
||||||
@ -190,6 +191,7 @@
|
|||||||
- ./tools/deployment/osh-infra-monitoring/050-prometheus.sh
|
- ./tools/deployment/osh-infra-monitoring/050-prometheus.sh
|
||||||
- ./tools/deployment/osh-infra-monitoring/060-alertmanager.sh
|
- ./tools/deployment/osh-infra-monitoring/060-alertmanager.sh
|
||||||
- ./tools/deployment/osh-infra-monitoring/070-kube-state-metrics.sh
|
- ./tools/deployment/osh-infra-monitoring/070-kube-state-metrics.sh
|
||||||
|
- ./tools/deployment/osh-infra-monitoring/075-node-problem-detector.sh
|
||||||
- ./tools/deployment/osh-infra-monitoring/080-node-exporter.sh
|
- ./tools/deployment/osh-infra-monitoring/080-node-exporter.sh
|
||||||
- ./tools/deployment/osh-infra-monitoring/090-process-exporter.sh
|
- ./tools/deployment/osh-infra-monitoring/090-process-exporter.sh
|
||||||
- ./tools/deployment/osh-infra-monitoring/100-openstack-exporter.sh
|
- ./tools/deployment/osh-infra-monitoring/100-openstack-exporter.sh
|
||||||
|
Loading…
Reference in New Issue
Block a user