Add additional dashboards for Calico + systemd logs

This commit is contained in:
Matt Pryor 2023-02-28 14:51:28 +00:00
parent 44b012986f
commit 5b59b587e9
11 changed files with 2435 additions and 93 deletions

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,418 @@
{
"__requires": [
{
"type": "grafana",
"id": "grafana",
"name": "Grafana",
"version": "6.3.4"
},
{
"type": "panel",
"id": "graph",
"name": "Graph",
"version": ""
},
{
"type": "datasource",
"id": "prometheus",
"name": "Prometheus",
"version": "1.0.0"
}
],
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": "-- Grafana --",
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"editable": true,
"gnetId": 10880,
"graphTooltip": 0,
"id": null,
"links": [],
"panels": [
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 0
},
"id": 8,
"interval": "",
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": false,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"dataLinks": []
},
"percentage": false,
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "avg_over_time(loki_ingester_chunk_entries_count[5m])",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Loki's stored chunk entries",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 0
},
"id": 2,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"dataLinks": []
},
"percentage": false,
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "rate (loki_distributor_ingester_appends_total[1m])",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Average batch appends sent to ingesters (Loki)",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 8
},
"id": 6,
"legend": {
"alignAsTable": true,
"avg": true,
"current": false,
"max": false,
"min": true,
"rightSide": true,
"show": true,
"sort": "avg",
"sortDesc": true,
"total": true,
"values": true
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"dataLinks": []
},
"percentage": false,
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum (promtail_file_bytes_total{instance=~\".*\"}) by (instance)",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Number of bytes total by promtail",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 16
},
"id": 4,
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"dataLinks": []
},
"percentage": false,
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum (promtail_encoded_bytes_total{pod=~'.*'}) by (pod)",
"format": "time_series",
"instant": false,
"legendFormat": "{{ pod }}",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Number of bytes encoded and ready to send by Promtail",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
}
],
"schemaVersion": 19,
"style": "dark",
"tags": [],
"templating": {
"list": []
},
"time": {
"from": "now-24h",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
]
},
"timezone": "",
"title": "Loki / Metrics",
"uid": "MQHVDmtWk",
"version": 5,
"description": "Loki and Promtail metrics."
}

View File

@ -24,12 +24,6 @@
"name": "Loki", "name": "Loki",
"version": "1.0.0" "version": "1.0.0"
}, },
{
"type": "datasource",
"id": "prometheus",
"name": "Prometheus",
"version": "1.0.0"
},
{ {
"type": "panel", "type": "panel",
"id": "text", "id": "text",
@ -165,10 +159,10 @@
"templating": { "templating": {
"list": [ "list": [
{ {
"allValue": null, "allValue": ".+",
"current": {}, "current": {},
"datasource": "Prometheus", "datasource": "Loki",
"definition": "label_values(kube_pod_info, namespace)", "definition": "label_values(namespace)",
"hide": 0, "hide": 0,
"includeAll": false, "includeAll": false,
"index": -1, "index": -1,
@ -176,7 +170,7 @@
"multi": false, "multi": false,
"name": "namespace", "name": "namespace",
"options": [], "options": [],
"query": "label_values(kube_pod_info, namespace)", "query": "label_values(namespace)",
"refresh": 1, "refresh": 1,
"regex": "", "regex": "",
"skipUrlSync": false, "skipUrlSync": false,
@ -188,10 +182,10 @@
"useTags": false "useTags": false
}, },
{ {
"allValue": ".*", "allValue": ".+",
"current": {}, "current": {},
"datasource": "Prometheus", "datasource": "Loki",
"definition": "label_values(kube_pod_info{namespace=~\"$namespace\"}, pod)", "definition": "label_values({namespace=~\"$namespace\"}, pod)",
"hide": 0, "hide": 0,
"includeAll": true, "includeAll": true,
"index": -1, "index": -1,
@ -199,7 +193,7 @@
"multi": true, "multi": true,
"name": "pod", "name": "pod",
"options": [], "options": [],
"query": "label_values(kube_pod_info{namespace=~\"$namespace\"}, pod)", "query": "label_values({namespace=~\"$namespace\"}, pod)",
"refresh": 1, "refresh": 1,
"regex": "", "regex": "",
"skipUrlSync": false, "skipUrlSync": false,
@ -241,11 +235,11 @@
] ]
}, },
"timezone": "", "timezone": "",
"title": "Loki / Logs", "title": "Loki / Pod Logs",
"uid": "209fd89b771c318dd442225414a50b59", "uid": "209fd89b771c318dd442225414a50b59",
"variables": { "variables": {
"list": [] "list": []
}, },
"version": 1, "version": 1,
"description": "Search logs stored in Loki" "description": "Search pod logs stored in Loki"
} }

View File

@ -0,0 +1,245 @@
{
"__requires": [
{
"type": "grafana",
"id": "grafana",
"name": "Grafana",
"version": "6.7.0"
},
{
"type": "panel",
"id": "graph",
"name": "Graph",
"version": ""
},
{
"type": "panel",
"id": "logs",
"name": "Logs",
"version": ""
},
{
"type": "datasource",
"id": "loki",
"name": "Loki",
"version": "1.0.0"
},
{
"type": "panel",
"id": "text",
"name": "Text",
"version": ""
}
],
"editable": false,
"gnetId": null,
"graphTooltip": 0,
"id": null,
"links": [],
"panels": [
{
"aliasColors": {},
"bars": true,
"dashLength": 10,
"dashes": false,
"datasource": "Loki",
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 3,
"w": 24,
"x": 0,
"y": 0
},
"hiddenSeries": false,
"id": 6,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": false,
"total": false,
"values": false
},
"lines": false,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"dataLinks": []
},
"percentage": false,
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(count_over_time({unit=~\"$unit\", hostname=~\"$hostname\"} |~ \"$search\"[$__interval]))",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"$$hashKey": "object:168",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
},
{
"$$hashKey": "object:169",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"datasource": "Loki",
"gridPos": {
"h": 25,
"w": 24,
"x": 0,
"y": 3
},
"id": 2,
"maxDataPoints": "",
"options": {
"showLabels": false,
"showTime": true,
"sortOrder": "Descending",
"wrapLogMessage": true
},
"targets": [
{
"expr": "{unit=~\"$unit\", hostname=~\"$hostname\"} |~ \"$search\"",
"refId": "A"
}
],
"timeFrom": null,
"timeShift": null,
"title": "Logs Panel",
"type": "logs"
}
],
"schemaVersion": 22,
"style": "dark",
"tags": [],
"templating": {
"list": [
{
"allValue": ".+",
"current": {},
"datasource": "Loki",
"definition": "label_values(unit)",
"hide": 0,
"includeAll": false,
"index": -1,
"label": null,
"multi": false,
"name": "unit",
"options": [],
"query": "label_values(unit)",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"tagValuesQuery": "",
"tags": [],
"tagsQuery": "",
"type": "query",
"useTags": false
},
{
"allValue": ".+",
"current": {},
"datasource": "Loki",
"definition": "label_values({unit=~\"$unit\"}, hostname)",
"hide": 0,
"includeAll": true,
"index": -1,
"label": null,
"multi": true,
"name": "hostname",
"options": [],
"query": "label_values({unit=~\"$unit\"}, hostname)",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"tagValuesQuery": "",
"tags": [],
"tagsQuery": "",
"type": "query",
"useTags": false
},
{
"current": {},
"hide": 0,
"label": null,
"name": "search",
"options": [],
"query": "",
"skipUrlSync": false,
"type": "textbox"
}
]
},
"time": {
"from": "now-30m",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
]
},
"timezone": "",
"title": "Loki / Systemd Logs",
"uid": "fa1bd43aed803111be9cc923cada9811",
"variables": {
"list": []
},
"version": 1,
"description": "Search systemd logs stored in Loki"
}

View File

@ -13,6 +13,8 @@ stringData:
registry: {{ include "cluster-addons.imagePrefix" . }}quay.io registry: {{ include "cluster-addons.imagePrefix" . }}quay.io
installation: installation:
registry: {{ include "cluster-addons.imagePrefix" . }}docker.io/ registry: {{ include "cluster-addons.imagePrefix" . }}docker.io/
nodeMetricsPort: 9091
typhaMetricsPort: 9093
calicoNetwork: calicoNetwork:
bgp: Disabled bgp: Disabled
nodeAddressAutodetectionV4: nodeAddressAutodetectionV4:
@ -50,4 +52,110 @@ spec:
- secret: - secret:
name: {{ include "cluster-addons.componentName" (list . "cni-calico") }}-config name: {{ include "cluster-addons.componentName" (list . "cni-calico") }}-config
key: overrides key: overrides
{{- if .Values.monitoring.enabled }}
---
# Extra manifests installed for Calico only when cluster monitoring is enabled:
# a ServiceMonitor for calico-kube-controllers, a metrics Service plus ServiceMonitor
# for each of calico-node and calico-typha, and a ConfigMap holding the Grafana dashboard
apiVersion: addons.stackhpc.com/v1alpha1
kind: Manifests
metadata:
name: {{ include "cluster-addons.componentName" (list . "cni-calico-monitoring") }}
labels: {{ include "cluster-addons.componentLabels" (list . "cni-calico-monitoring") | nindent 4 }}
annotations:
# Tell Argo to ignore the non-controller owner references for this object
argocd.argoproj.io/sync-options: "ControllerReferencesOnly=true"
spec:
clusterName: {{ include "cluster-addons.clusterName" . }}
bootstrap: true
# Manifests without an explicit namespace (the dashboard ConfigMap) use this namespace;
# the Services and ServiceMonitors below pin themselves to calico-system
targetNamespace: {{ .Values.cni.calico.release.namespace }}
releaseName: cni-calico-monitoring
manifestSources:
# calico-kube-controllers
# Only a ServiceMonitor is defined here; it selects a Service labelled
# k8s-app=calico-kube-controllers that is not created by this template
# NOTE(review): presumably that Service is created by the Calico operator - confirm
- template: |
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: calico-kube-controllers-metrics
namespace: calico-system
spec:
endpoints:
- port: metrics-port
namespaceSelector:
matchNames:
- calico-system
selector:
matchLabels:
k8s-app: calico-kube-controllers
# calico-node
# Headless Service exposing port 9091 (the nodeMetricsPort value set in this component's
# defaults) followed by the ServiceMonitor that selects it by the k8s-app label
- template: |
apiVersion: v1
kind: Service
metadata:
name: calico-node-metrics
namespace: calico-system
labels:
k8s-app: calico-node
spec:
clusterIP: None
ports:
- name: metrics-port
port: 9091
selector:
k8s-app: calico-node
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: calico-node-metrics
namespace: calico-system
spec:
endpoints:
- port: metrics-port
namespaceSelector:
matchNames:
- calico-system
selector:
matchLabels:
k8s-app: calico-node
# calico-typha
# Same pattern as calico-node, using port 9093 (the typhaMetricsPort value set in this
# component's defaults)
- template: |
apiVersion: v1
kind: Service
metadata:
name: calico-typha-metrics
namespace: calico-system
labels:
k8s-app: calico-typha
spec:
clusterIP: None
ports:
- name: metrics-port
port: 9093
selector:
k8s-app: calico-typha
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: calico-typha-metrics
namespace: calico-system
spec:
endpoints:
- port: metrics-port
namespaceSelector:
matchNames:
- calico-system
selector:
matchLabels:
k8s-app: calico-typha
# dashboard
# ConfigMap embedding the dashboard JSON shipped with the chart
# NOTE(review): presumably discovered by the Grafana sidecar through the grafana_dashboard label - confirm
- template: |
apiVersion: v1
kind: ConfigMap
metadata:
name: cni-calico-dashboard
labels:
grafana_dashboard: "1"
data:
cni-calico-dashboard.json: |
{{- .Files.Get "grafana-dashboards/cni-calico-dashboard.json" | nindent 12 }}
{{- end }}
{{- end }} {{- end }}

View File

@ -10,14 +10,12 @@ metadata:
stringData: stringData:
defaults: | defaults: |
controller: controller:
# Indicates whether ingress controller metrics should be included in prometheus {{- if .Values.monitoring.enabled }}
metrics: metrics:
# Enable by default if cluster monitoring is enabled enabled: true
enabled: {{ .Values.monitoring.enabled }}
serviceMonitor: serviceMonitor:
enabled: {{ .Values.monitoring.enabled }} enabled: true
namespace: {{ .Values.monitoring.kubePrometheusStack.release.namespace }} {{- end }}
additionalLabels: {{ toYaml .Values.monitoring.serviceMonitorLabels | nindent 12 }}
image: image:
registry: {{ include "cluster-addons.imagePrefix" . }}registry.k8s.io registry: {{ include "cluster-addons.imagePrefix" . }}registry.k8s.io
admissionWebhooks: admissionWebhooks:
@ -51,4 +49,31 @@ spec:
- secret: - secret:
name: {{ include "cluster-addons.componentName" (list . "ingress-nginx") }}-config name: {{ include "cluster-addons.componentName" (list . "ingress-nginx") }}-config
key: overrides key: overrides
{{- if .Values.monitoring.enabled }}
---
# Extra manifests installed for ingress-nginx only when cluster monitoring is enabled:
# a single ConfigMap holding the ingress-nginx Grafana dashboard shipped with the chart
apiVersion: addons.stackhpc.com/v1alpha1
kind: Manifests
metadata:
name: {{ include "cluster-addons.componentName" (list . "ingress-nginx-dashboards") }}
labels: {{ include "cluster-addons.componentLabels" (list . "ingress-nginx-dashboards") | nindent 4 }}
annotations:
# Tell Argo to ignore the non-controller owner references for this object
argocd.argoproj.io/sync-options: "ControllerReferencesOnly=true"
spec:
clusterName: {{ include "cluster-addons.clusterName" . }}
bootstrap: true
# The ConfigMap below sets no namespace of its own, so it is created in the
# ingress-nginx release namespace
targetNamespace: {{ .Values.ingress.nginx.release.namespace }}
releaseName: ingress-nginx-dashboards
manifestSources:
# dashboard
# NOTE(review): presumably discovered by the Grafana sidecar through the grafana_dashboard label - confirm
- template: |
apiVersion: v1
kind: ConfigMap
metadata:
name: ingress-nginx-dashboards
labels:
grafana_dashboard: "1"
data:
nginx-ingress-dashboard.json: |
{{- .Files.Get "grafana-dashboards/ingress-nginx-dashboard.json" | nindent 12 }}
{{- end }}
{{- end }} {{- end }}

View File

@ -10,9 +10,20 @@ metadata:
stringData: stringData:
defaults: | defaults: |
alertmanager: alertmanager:
# Don't apply the namespace grouping by default
config:
route:
group_by: []
alertmanagerSpec: alertmanagerSpec:
image: image:
registry: {{ include "cluster-addons.imagePrefix" . }}quay.io registry: {{ include "cluster-addons.imagePrefix" . }}quay.io
# Make sure that alertmanager finds configurations with the alertmanager name as a label
alertmanagerConfigSelector:
matchLabels:
alertmanager: kube-prometheus-stack-alertmanager
# Do NOT add the namespace matcher to routes from AlertmanagerConfig resources
alertmanagerConfigMatcherStrategy:
type: None
prometheusOperator: prometheusOperator:
admissionWebhooks: admissionWebhooks:
patch: patch:
@ -29,14 +40,28 @@ stringData:
prometheusSpec: prometheusSpec:
image: image:
registry: {{ include "cluster-addons.imagePrefix" . }}quay.io registry: {{ include "cluster-addons.imagePrefix" . }}quay.io
retentionSize: {{ mulf 0.95 .Values.monitoring.prometheusVolumeCapacity }}GB # Tell Prometheus to pick up all monitors, regardless of labels
storageSpec: podMonitorSelectorNilUsesHelmValues: false
volumeClaimTemplate: serviceMonitorSelectorNilUsesHelmValues: false
spec: {{-
accessModes: ["ReadWriteOnce"] $storageSize := dig
resources: "prometheus"
requests: "prometheusSpec"
storage: {{ .Values.monitoring.prometheusVolumeCapacity }}Gi "storageSpec"
"volumeClaimTemplate"
"spec"
"resources"
"requests"
"storage"
""
.Values.monitoring.kubePrometheusStack.release.values
}}
{{- if $storageSize }}
# Set the retention size to 95% of the given volume size
# The volume size is a Kubernetes quantity: a numeric amount followed by an optional suffix
# Prometheus reads KB/MB/GB/... as powers of 1024, whereas a quantity suffix without the
# trailing "i" (e.g. 10G) is a power of 1000, so reusing a decimal suffix verbatim would give
# a limit larger than the volume - decimal sizes are converted to an explicit byte count instead
{{- $storageAmount := mustRegexFind "^([0-9]*[.])?[0-9]+" $storageSize | float64 }}
{{- $storageUnits := mustRegexFind "(k|K|M|G|T|E|P)i?$" $storageSize }}
{{- $decimalFactors := dict "k" 1e3 "K" 1e3 "M" 1e6 "G" 1e9 "T" 1e12 "P" 1e15 "E" 1e18 }}
{{- if hasKey $decimalFactors $storageUnits }}
retentionSize: {{ mulf 0.95 $storageAmount (get $decimalFactors $storageUnits) | int64 }}B
{{- else }}
retentionSize: {{ mulf 0.95 $storageAmount }}{{ $storageUnits }}B
{{- end }}
{{- end }}
thanosRuler: thanosRuler:
thanosRulerSpec: thanosRulerSpec:
image: image:
@ -46,13 +71,16 @@ stringData:
repository: {{ include "cluster-addons.imagePrefix" . }}registry.k8s.io/kube-state-metrics/kube-state-metrics repository: {{ include "cluster-addons.imagePrefix" . }}registry.k8s.io/kube-state-metrics/kube-state-metrics
prometheus-node-exporter: prometheus-node-exporter:
image: image:
repository: {{ include "cluster-addons.imagePrefix" . }}quay.io/prometheus/node-exporter registry: {{ include "cluster-addons.imagePrefix" . }}quay.io
grafana: grafana:
image: image:
repository: {{ include "cluster-addons.imagePrefix" . }}docker.io/grafana/grafana repository: {{ include "cluster-addons.imagePrefix" . }}docker.io/grafana/grafana
sidecar: sidecar:
image: image:
repository: {{ include "cluster-addons.imagePrefix" . }}quay.io/kiwigrid/k8s-sidecar repository: {{ include "cluster-addons.imagePrefix" . }}quay.io/kiwigrid/k8s-sidecar
# Tell Grafana to include dashboards from all namespaces
dashboards:
searchNamespace: ALL
downloadDashboardsImage: downloadDashboardsImage:
repository: {{ include "cluster-addons.imagePrefix" . }}docker.io/curlimages/curl repository: {{ include "cluster-addons.imagePrefix" . }}docker.io/curlimages/curl
initChownData: initChownData:
@ -104,14 +132,10 @@ spec:
apiVersion: v1 apiVersion: v1
kind: ConfigMap kind: ConfigMap
metadata: metadata:
name: additional-grafana-dashboard name: additional-grafana-dashboards
labels: labels:
grafana_dashboard: "1" grafana_dashboard: "1"
data: data:
nvidia-dcgm-exporter-dashboard.json: | nvidia-dcgm-exporter-dashboard.json: |
{{- .Files.Get "grafana-dashboards/nvidia-dcgm-exporter-dashboard_rev3.json" | nindent 12 }} {{- .Files.Get "grafana-dashboards/nvidia-dcgm-exporter-dashboard_rev3.json" | nindent 12 }}
{{- if .Values.ingress.enabled }}
nginx-ingress-dashboard.json: |
{{- .Files.Get "grafana-dashboards/nginx-ingress-dashboard.json" | nindent 12 }}
{{- end }}
{{- end }} {{- end }}

View File

@ -12,12 +12,45 @@ stringData:
loki: loki:
image: image:
repository: {{ include "cluster-addons.imagePrefix" . }}docker.io/grafana/loki repository: {{ include "cluster-addons.imagePrefix" . }}docker.io/grafana/loki
persistence: serviceMonitor:
enabled: true enabled: true
size: {{ .Values.monitoring.lokiVolumeCapacity }}Gi
promtail: promtail:
image: image:
registry: {{ include "cluster-addons.imagePrefix" . }}docker.io registry: {{ include "cluster-addons.imagePrefix" . }}docker.io
# Enable the ServiceMonitor for promtail
serviceMonitor:
enabled: true
# Get promtail to scrape systemd services
# The extra scrape config reads the journal from /var/log/journal (entries up to 12h old),
# tags them with job=systemd-journal and copies the journal's unit, hostname and priority
# keyword fields into the unit, hostname and level labels
config:
snippets:
extraScrapeConfigs: |
- job_name: journal
journal:
path: /var/log/journal
max_age: 12h
labels:
job: systemd-journal
relabel_configs:
- source_labels: ['__journal__systemd_unit']
target_label: 'unit'
- source_labels: ['__journal__hostname']
target_label: 'hostname'
- source_labels: ['__journal_priority_keyword']
target_label: level
# Mount journal directory and machine-id file into promtail pods
# Both are host paths and are mounted read-only at the same path inside the pod
extraVolumes:
- name: journal
hostPath:
path: /var/log/journal
- name: machine-id
hostPath:
path: /etc/machine-id
extraVolumeMounts:
- name: journal
mountPath: /var/log/journal
readOnly: true
- name: machine-id
mountPath: /etc/machine-id
readOnly: true
grafana: grafana:
sidecar: sidecar:
datasources: datasources:
@ -58,7 +91,7 @@ metadata:
spec: spec:
clusterName: {{ include "cluster-addons.clusterName" . }} clusterName: {{ include "cluster-addons.clusterName" . }}
bootstrap: true bootstrap: true
targetNamespace: {{ .Values.monitoring.kubePrometheusStack.release.namespace }} targetNamespace: {{ .Values.monitoring.lokiStack.release.namespace }}
releaseName: loki-stack-dashboards releaseName: loki-stack-dashboards
manifestSources: manifestSources:
- template: | - template: |
@ -82,10 +115,14 @@ spec:
apiVersion: v1 apiVersion: v1
kind: ConfigMap kind: ConfigMap
metadata: metadata:
name: loki-stack-grafana-dashboard name: loki-stack-grafana-dashboards
labels: labels:
grafana_dashboard: "1" grafana_dashboard: "1"
data: data:
loki-dashboard.json: | loki-pod-logs-dashboard.json: |
{{- .Files.Get "grafana-dashboards/loki-dashboard.json" | nindent 12 }} {{- .Files.Get "grafana-dashboards/loki-pod-logs-dashboard.json" | nindent 12 }}
loki-systemd-logs-dashboard.json: |
{{- .Files.Get "grafana-dashboards/loki-systemd-logs-dashboard.json" | nindent 12 }}
loki-metrics-dashboard.json: |
{{- .Files.Get "grafana-dashboards/loki-metrics-dashboard.json" | nindent 12 }}
{{- end }} {{- end }}

View File

@ -168,54 +168,29 @@ ingress:
monitoring: monitoring:
# Indicates if the cluster monitoring should be enabled # Indicates if the cluster monitoring should be enabled
enabled: false enabled: false
# labels to be added to ServiceMonitor resources
# must match labels from .serviceMonitorSelector.matchLabels
# field of Prometheus resource created by kube-prometheus-stack
# in order for Prometheus to scrape metrics from the services
serviceMonitorLabels:
release: kube-prometheus-stack
# Size of the volume in GB to provision on the target cloud for persistent storage of prometheus data
prometheusVolumeCapacity: 10
# Config for the kube-prometheus-stack helm chart # Config for the kube-prometheus-stack helm chart
# https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-prometheus-stack # https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-prometheus-stack
kubePrometheusStack: kubePrometheusStack:
chart: chart:
repo: https://prometheus-community.github.io/helm-charts repo: https://prometheus-community.github.io/helm-charts
name: kube-prometheus-stack name: kube-prometheus-stack
version: 43.3.1 version: 45.4.0
release: release:
namespace: monitoring-system namespace: monitoring-system
values: {} values:
prometheus:
############################################################################# prometheusSpec:
# Alertmanager does not come with pre-configured alert sinks so we have to # Enable persistence by default
# write our own (and store it elsewhere to keep credential/secrets hidden). # The amount of data that is retained will be 90 days or 95% of the size of the
# # persistent volume, whichever is reached first
# Example config to send alerts to a slack channel: retention: 90d
# storageSpec:
# Note - 'null' receiver must be include as default kube-prometheus-stack volumeClaimTemplate:
# alerting rules (specifically the WatchDog alert) require it. spec:
# If it is not included then alertmanager pods will fail to launch accessModes: ["ReadWriteOnce"]
# and errors will be printed in prometheus operator pod logs. resources:
# requests:
# alertmanager: storage: 10Gi
# enabled: true
# config:
# global:
# slack_api_url: '<insert-secret-slack-webhook-url>'
# route:
# receiver: 'slack-notifications'
# group_by: ['namespace']
# receivers:
# - name: 'null'
# - name: 'slack-notifications'
# slack_configs:
# - channel: '#<insert-channel-name>'
# send_resolved: true
# - name: 'gmail-notifications'
# TODO: Add example here
#############################################################################
lokiStack: lokiStack:
enabled: true enabled: true
chart: chart:
@ -224,9 +199,12 @@ monitoring:
version: 2.8.9 version: 2.8.9
release: release:
namespace: monitoring-system namespace: monitoring-system
values: {} values:
# Size of the volume in GB to provision on the target cloud for persistent storage of loki data loki:
lokiVolumeCapacity: 10 # Enable persistence by default
persistence:
enabled: true
size: 10Gi
# Settings for node feature discovery # Settings for node feature discovery
# https://github.com/kubernetes-sigs/node-feature-discovery/tree/master/deployment/helm/node-feature-discovery # https://github.com/kubernetes-sigs/node-feature-discovery/tree/master/deployment/helm/node-feature-discovery

View File

@ -1,28 +1,28 @@
docker.io: docker.io:
images: images:
grafana/grafana: grafana/grafana:
- 9.3.1 - 9.3.6
quay.io: quay.io:
images: images:
kiwigrid/k8s-sidecar: kiwigrid/k8s-sidecar:
- 1.21.0 - 1.22.0
prometheus/alertmanager: prometheus/alertmanager:
- v0.25.0 - v0.25.0
prometheus/node-exporter: prometheus/node-exporter:
- v1.5.0 - v1.5.0
prometheus/prometheus: prometheus/prometheus:
- v2.40.5 - v2.42.0
prometheus-operator/prometheus-config-reloader: prometheus-operator/prometheus-config-reloader:
- v0.61.1 - v0.63.0
prometheus-operator/prometheus-operator: prometheus-operator/prometheus-operator:
- v0.61.1 - v0.63.0
thanos/thanos: thanos/thanos:
- v0.29.0 - v0.30.2
registry.k8s.io: registry.k8s.io:
images: images:
ingress-nginx/kube-webhook-certgen: ingress-nginx/kube-webhook-certgen:
- v1.3.0 - v20221220-controller-v1.5.1-58-g787ea74b6
kube-state-metrics/kube-state-metrics: kube-state-metrics/kube-state-metrics:
- v2.7.0 - v2.8.0