Merge pull request #37 from stackhpc/feature/monitoring
Add monitoring and alerting config for kube-prometheus-stack
This commit is contained in:
commit
44b012986f
3
.gitignore
vendored
3
.gitignore
vendored
@ -1,2 +1,3 @@
|
||||
charts/*/charts
|
||||
Chart.lock
|
||||
Chart.lock
|
||||
.vscode
|
File diff suppressed because it is too large
Load Diff
@ -1,68 +1,51 @@
|
||||
{
|
||||
"__inputs": [
|
||||
{
|
||||
"name": "DS_PROMETHEUS",
|
||||
"label": "Prometheus",
|
||||
"description": "",
|
||||
"type": "datasource",
|
||||
"pluginId": "prometheus",
|
||||
"pluginName": "Prometheus"
|
||||
}
|
||||
],
|
||||
"__requires": [
|
||||
{
|
||||
"type": "panel",
|
||||
"id": "gauge",
|
||||
"name": "Gauge",
|
||||
"version": ""
|
||||
},
|
||||
{
|
||||
"type": "grafana",
|
||||
"id": "grafana",
|
||||
"name": "Grafana",
|
||||
"version": "6.7.3"
|
||||
},
|
||||
{
|
||||
"type": "panel",
|
||||
"id": "graph",
|
||||
"name": "Graph",
|
||||
"version": ""
|
||||
},
|
||||
{
|
||||
"type": "datasource",
|
||||
"id": "prometheus",
|
||||
"name": "Prometheus",
|
||||
"version": "1.0.0"
|
||||
}
|
||||
],
|
||||
"annotations": {
|
||||
"list": [
|
||||
{
|
||||
"$$hashKey": "object:192",
|
||||
"builtIn": 1,
|
||||
"datasource": "-- Grafana --",
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "grafana"
|
||||
},
|
||||
"enable": true,
|
||||
"hide": true,
|
||||
"iconColor": "rgba(0, 211, 255, 1)",
|
||||
"name": "Annotations & Alerts",
|
||||
"target": {
|
||||
"limit": 100,
|
||||
"matchAny": false,
|
||||
"tags": [],
|
||||
"type": "dashboard"
|
||||
},
|
||||
"type": "dashboard"
|
||||
}
|
||||
]
|
||||
},
|
||||
"description": "This dashboard is to display the metrics from DCGM Exporter on a Kubernetes (1.13+) cluster",
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"gnetId": 12239,
|
||||
"graphTooltip": 0,
|
||||
"id": null,
|
||||
"iteration": 1588401887165,
|
||||
"id": 33,
|
||||
"links": [],
|
||||
"liveNow": false,
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": {},
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"links": []
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
@ -88,9 +71,10 @@
|
||||
"linewidth": 2,
|
||||
"nullPointMode": "null",
|
||||
"options": {
|
||||
"dataLinks": []
|
||||
"alertThreshold": true
|
||||
},
|
||||
"percentage": false,
|
||||
"pluginVersion": "9.3.1",
|
||||
"pointradius": 2,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
@ -100,6 +84,10 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"expr": "DCGM_FI_DEV_GPU_TEMP{instance=~\"${instance}\", gpu=~\"${gpu}\"}",
|
||||
"instant": false,
|
||||
"interval": "",
|
||||
@ -108,9 +96,7 @@
|
||||
}
|
||||
],
|
||||
"thresholds": [],
|
||||
"timeFrom": null,
|
||||
"timeRegions": [],
|
||||
"timeShift": null,
|
||||
"title": "GPU Temperature",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
@ -119,37 +105,60 @@
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": []
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "celsius",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
],
|
||||
"yaxis": {
|
||||
"align": false,
|
||||
"alignLevel": null
|
||||
"align": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"max": 100,
|
||||
"min": 0,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "#EAB839",
|
||||
"value": 83
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 87
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "celsius"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 6,
|
||||
@ -158,54 +167,30 @@
|
||||
},
|
||||
"id": 14,
|
||||
"options": {
|
||||
"fieldOptions": {
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"mean"
|
||||
],
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"max": 100,
|
||||
"min": 0,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "#EAB839",
|
||||
"value": 83
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 87
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "celsius"
|
||||
},
|
||||
"overrides": [],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"orientation": "auto",
|
||||
"showThresholdLabels": false,
|
||||
"showThresholdMarkers": true
|
||||
},
|
||||
"pluginVersion": "6.7.3",
|
||||
"pluginVersion": "9.3.1",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"expr": "avg(DCGM_FI_DEV_GPU_TEMP{instance=~\"${instance}\", gpu=~\"${gpu}\"})",
|
||||
"interval": "",
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "GPU Avg. Temp",
|
||||
"type": "gauge"
|
||||
},
|
||||
@ -214,7 +199,16 @@
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"links": []
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
@ -240,10 +234,10 @@
|
||||
"linewidth": 2,
|
||||
"nullPointMode": "null",
|
||||
"options": {
|
||||
"dataLinks": []
|
||||
"alertThreshold": true
|
||||
},
|
||||
"percentage": false,
|
||||
"pluginVersion": "6.5.2",
|
||||
"pluginVersion": "9.3.1",
|
||||
"pointradius": 2,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
@ -253,6 +247,10 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"expr": "DCGM_FI_DEV_POWER_USAGE{instance=~\"${instance}\", gpu=~\"${gpu}\"}",
|
||||
"interval": "",
|
||||
"legendFormat": "GPU {{gpu}}",
|
||||
@ -260,9 +258,7 @@
|
||||
}
|
||||
],
|
||||
"thresholds": [],
|
||||
"timeFrom": null,
|
||||
"timeRegions": [],
|
||||
"timeShift": null,
|
||||
"title": "GPU Power Usage",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
@ -271,38 +267,60 @@
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": []
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "watt",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
],
|
||||
"yaxis": {
|
||||
"align": false,
|
||||
"alignLevel": null
|
||||
"align": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"cacheTimeout": null,
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"max": 2400,
|
||||
"min": 0,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "#EAB839",
|
||||
"value": 1800
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 2200
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "watt"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 6,
|
||||
@ -312,55 +330,30 @@
|
||||
"id": 16,
|
||||
"links": [],
|
||||
"options": {
|
||||
"fieldOptions": {
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"sum"
|
||||
],
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"max": 2400,
|
||||
"min": 0,
|
||||
"nullValueMode": "connected",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "#EAB839",
|
||||
"value": 1800
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 2200
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "watt"
|
||||
},
|
||||
"overrides": [],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"orientation": "horizontal",
|
||||
"showThresholdLabels": false,
|
||||
"showThresholdMarkers": true
|
||||
},
|
||||
"pluginVersion": "6.7.3",
|
||||
"pluginVersion": "9.3.1",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"expr": "sum(DCGM_FI_DEV_POWER_USAGE{instance=~\"${instance}\", gpu=~\"${gpu}\"})",
|
||||
"interval": "",
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "GPU Power Total",
|
||||
"type": "gauge"
|
||||
},
|
||||
@ -369,7 +362,16 @@
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"links": []
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
@ -389,7 +391,6 @@
|
||||
"min": false,
|
||||
"rightSide": true,
|
||||
"show": true,
|
||||
"sideWidth": null,
|
||||
"total": false,
|
||||
"values": true
|
||||
},
|
||||
@ -397,9 +398,10 @@
|
||||
"linewidth": 2,
|
||||
"nullPointMode": "null",
|
||||
"options": {
|
||||
"dataLinks": []
|
||||
"alertThreshold": true
|
||||
},
|
||||
"percentage": false,
|
||||
"pluginVersion": "9.3.1",
|
||||
"pointradius": 2,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
@ -409,6 +411,10 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"expr": "DCGM_FI_DEV_SM_CLOCK{instance=~\"${instance}\", gpu=~\"${gpu}\"} * 1000000",
|
||||
"format": "time_series",
|
||||
"interval": "",
|
||||
@ -418,9 +424,7 @@
|
||||
}
|
||||
],
|
||||
"thresholds": [],
|
||||
"timeFrom": null,
|
||||
"timeRegions": [],
|
||||
"timeShift": null,
|
||||
"title": "GPU SM Clocks",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
@ -429,34 +433,25 @@
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": []
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"decimals": null,
|
||||
"format": "hertz",
|
||||
"label": "",
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
],
|
||||
"yaxis": {
|
||||
"align": false,
|
||||
"alignLevel": null
|
||||
"align": false
|
||||
}
|
||||
},
|
||||
{
|
||||
@ -464,7 +459,10 @@
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
@ -502,6 +500,10 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"expr": "DCGM_FI_DEV_GPU_UTIL{instance=~\"${instance}\", gpu=~\"${gpu}\"}",
|
||||
"interval": "",
|
||||
"legendFormat": "GPU {{gpu}}",
|
||||
@ -509,9 +511,7 @@
|
||||
}
|
||||
],
|
||||
"thresholds": [],
|
||||
"timeFrom": null,
|
||||
"timeRegions": [],
|
||||
"timeShift": null,
|
||||
"title": "GPU Utilization",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
@ -520,16 +520,13 @@
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": []
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "percent",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": "100",
|
||||
"min": "0",
|
||||
@ -537,16 +534,12 @@
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
],
|
||||
"yaxis": {
|
||||
"align": false,
|
||||
"alignLevel": null
|
||||
"align": false
|
||||
}
|
||||
},
|
||||
{
|
||||
@ -554,7 +547,10 @@
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
@ -564,95 +560,6 @@
|
||||
"y": 32
|
||||
},
|
||||
"hiddenSeries": false,
|
||||
"id": 18,
|
||||
"legend": {
|
||||
"avg": true,
|
||||
"current": false,
|
||||
"max": true,
|
||||
"min": false,
|
||||
"rightSide": true,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": true
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 2,
|
||||
"nullPointMode": "null",
|
||||
"options": {
|
||||
"dataLinks": []
|
||||
},
|
||||
"percentage": false,
|
||||
"pointradius": 2,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [],
|
||||
"spaceLength": 10,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "DCGM_FI_DEV_FB_USED{instance=~\"${instance}\", gpu=~\"${gpu}\"}",
|
||||
"interval": "",
|
||||
"legendFormat": "GPU {{gpu}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": [],
|
||||
"timeFrom": null,
|
||||
"timeRegions": [],
|
||||
"timeShift": null,
|
||||
"title": "GPU Framebuffer Mem Used",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": []
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "decmbytes",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
],
|
||||
"yaxis": {
|
||||
"align": false,
|
||||
"alignLevel": null
|
||||
}
|
||||
},
|
||||
{
|
||||
"aliasColors": {},
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 24
|
||||
},
|
||||
"hiddenSeries": false,
|
||||
"id": 4,
|
||||
"legend": {
|
||||
"alignAsTable": true,
|
||||
@ -681,6 +588,10 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"expr": "DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{instance=~\"${instance}\", gpu=~\"${gpu}\"}",
|
||||
"interval": "",
|
||||
"legendFormat": "GPU {{gpu}}",
|
||||
@ -688,9 +599,7 @@
|
||||
}
|
||||
],
|
||||
"thresholds": [],
|
||||
"timeFrom": null,
|
||||
"timeRegions": [],
|
||||
"timeShift": null,
|
||||
"title": "Tensor Core Utilization",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
@ -699,16 +608,13 @@
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": []
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "percentunit",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": "1",
|
||||
"min": "0",
|
||||
@ -716,66 +622,161 @@
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
],
|
||||
"yaxis": {
|
||||
"align": false,
|
||||
"alignLevel": null
|
||||
"align": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"aliasColors": {},
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 40
|
||||
},
|
||||
"hiddenSeries": false,
|
||||
"id": 18,
|
||||
"legend": {
|
||||
"avg": true,
|
||||
"current": false,
|
||||
"max": true,
|
||||
"min": false,
|
||||
"rightSide": true,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": true
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 2,
|
||||
"nullPointMode": "null",
|
||||
"options": {
|
||||
"dataLinks": []
|
||||
},
|
||||
"percentage": false,
|
||||
"pointradius": 2,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [],
|
||||
"spaceLength": 10,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"expr": "DCGM_FI_DEV_FB_USED{instance=~\"${instance}\", gpu=~\"${gpu}\"}",
|
||||
"interval": "",
|
||||
"legendFormat": "GPU {{gpu}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": [],
|
||||
"timeRegions": [],
|
||||
"title": "GPU Framebuffer Mem Used",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"mode": "time",
|
||||
"show": true,
|
||||
"values": []
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "decmbytes",
|
||||
"logBase": 1,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"logBase": 1,
|
||||
"show": true
|
||||
}
|
||||
],
|
||||
"yaxis": {
|
||||
"align": false
|
||||
}
|
||||
}
|
||||
],
|
||||
"refresh": false,
|
||||
"schemaVersion": 22,
|
||||
"schemaVersion": 37,
|
||||
"style": "dark",
|
||||
"tags": [],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"allValue": null,
|
||||
"current": {},
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"current": {
|
||||
"isNone": true,
|
||||
"selected": false,
|
||||
"text": "None",
|
||||
"value": ""
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"definition": "label_values(DCGM_FI_DEV_GPU_TEMP, instance)",
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
"label": null,
|
||||
"multi": true,
|
||||
"name": "instance",
|
||||
"options": [],
|
||||
"query": "label_values(DCGM_FI_DEV_GPU_TEMP, instance)",
|
||||
"query": {
|
||||
"query": "label_values(DCGM_FI_DEV_GPU_TEMP, instance)",
|
||||
"refId": "Prometheus-instance-Variable-Query"
|
||||
},
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"skipUrlSync": false,
|
||||
"sort": 0,
|
||||
"tagValuesQuery": "",
|
||||
"tags": [],
|
||||
"tagsQuery": "",
|
||||
"type": "query",
|
||||
"useTags": false
|
||||
},
|
||||
{
|
||||
"allValue": null,
|
||||
"current": {},
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"current": {
|
||||
"selected": false,
|
||||
"text": "All",
|
||||
"value": "$__all"
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"definition": "label_values(gpu)",
|
||||
"hide": 0,
|
||||
"includeAll": true,
|
||||
"index": -1,
|
||||
"label": null,
|
||||
"multi": true,
|
||||
"name": "gpu",
|
||||
"options": [],
|
||||
"query": "label_values(gpu)",
|
||||
"query": {
|
||||
"query": "label_values(gpu)",
|
||||
"refId": "Prometheus-gpu-Variable-Query"
|
||||
},
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"skipUrlSync": false,
|
||||
"sort": 1,
|
||||
"tagValuesQuery": "",
|
||||
"tags": [],
|
||||
"tagsQuery": "",
|
||||
"type": "query",
|
||||
"useTags": false
|
||||
@ -803,8 +804,6 @@
|
||||
"timezone": "",
|
||||
"title": "NVIDIA DCGM Exporter Dashboard",
|
||||
"uid": "Oxed_c6Wz",
|
||||
"variables": {
|
||||
"list": []
|
||||
},
|
||||
"version": 1
|
||||
"version": 1,
|
||||
"weekStart": ""
|
||||
}
|
@ -10,6 +10,14 @@ metadata:
|
||||
stringData:
|
||||
defaults: |
|
||||
controller:
|
||||
# Indicates whether ingress controller metrics should be included in prometheus
|
||||
metrics:
|
||||
# Enable by default if cluster monitoring is enabled
|
||||
enabled: {{ .Values.monitoring.enabled }}
|
||||
serviceMonitor:
|
||||
enabled: {{ .Values.monitoring.enabled }}
|
||||
namespace: {{ .Values.monitoring.kubePrometheusStack.release.namespace }}
|
||||
additionalLabels: {{ toYaml .Values.monitoring.serviceMonitorLabels | nindent 12 }}
|
||||
image:
|
||||
registry: {{ include "cluster-addons.imagePrefix" . }}registry.k8s.io
|
||||
admissionWebhooks:
|
||||
|
@ -29,6 +29,14 @@ stringData:
|
||||
prometheusSpec:
|
||||
image:
|
||||
registry: {{ include "cluster-addons.imagePrefix" . }}quay.io
|
||||
retentionSize: {{ mulf 0.95 .Values.monitoring.prometheusVolumeCapacity }}GB
|
||||
storageSpec:
|
||||
volumeClaimTemplate:
|
||||
spec:
|
||||
accessModes: ["ReadWriteOnce"]
|
||||
resources:
|
||||
requests:
|
||||
storage: {{ .Values.monitoring.prometheusVolumeCapacity }}Gi
|
||||
thanosRuler:
|
||||
thanosRulerSpec:
|
||||
image:
|
||||
@ -96,10 +104,14 @@ spec:
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: nvidia-dcgm-exporter-dashboard
|
||||
name: additional-grafana-dashboard
|
||||
labels:
|
||||
grafana_dashboard: "1"
|
||||
data:
|
||||
nvidia-dcgm-exporter-dashboard.json: |
|
||||
{{- .Files.Get "grafana-dashboards/nvidia-dcgm-exporter-dashboard_rev2.json" | nindent 12 }}
|
||||
{{- .Files.Get "grafana-dashboards/nvidia-dcgm-exporter-dashboard_rev3.json" | nindent 12 }}
|
||||
{{- if .Values.ingress.enabled }}
|
||||
nginx-ingress-dashboard.json: |
|
||||
{{- .Files.Get "grafana-dashboards/nginx-ingress-dashboard.json" | nindent 12 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
|
@ -12,6 +12,9 @@ stringData:
|
||||
loki:
|
||||
image:
|
||||
repository: {{ include "cluster-addons.imagePrefix" . }}docker.io/grafana/loki
|
||||
persistence:
|
||||
enabled: true
|
||||
size: {{ .Values.monitoring.lokiVolumeCapacity }}Gi
|
||||
promtail:
|
||||
image:
|
||||
registry: {{ include "cluster-addons.imagePrefix" . }}docker.io
|
||||
|
@ -163,12 +163,21 @@ ingress:
|
||||
version: 4.4.2
|
||||
release:
|
||||
namespace: ingress-nginx
|
||||
values: {}
|
||||
|
||||
# Settings for cluster monitoring
|
||||
monitoring:
|
||||
# Indicates if the cluster monitoring should be enabled
|
||||
enabled: false
|
||||
# labels to be added to ServiceMonitor resources
|
||||
# must match labels from .serviceMonitorSelector.matchLabels
|
||||
# field of Prometheus resource created by kube-prometheus-stack
|
||||
# in order for Prometheus to scrape metrics from the services
|
||||
serviceMonitorLabels:
|
||||
release: kube-prometheus-stack
|
||||
# Size of the volume in GB to provision on the target cloud for persistent storage of prometheus data
|
||||
prometheusVolumeCapacity: 10
|
||||
# Config for the kube-prometheus-stack helm chart
|
||||
# https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-prometheus-stack
|
||||
kubePrometheusStack:
|
||||
chart:
|
||||
repo: https://prometheus-community.github.io/helm-charts
|
||||
@ -177,6 +186,36 @@ monitoring:
|
||||
release:
|
||||
namespace: monitoring-system
|
||||
values: {}
|
||||
|
||||
#############################################################################
|
||||
# Alertmanager does not come with pre-configured alert sinks so we have to
|
||||
# write our own (and store it elsewhere to keep credential/secrets hidden).
|
||||
#
|
||||
# Example config to send alerts to a slack channel:
|
||||
#
|
||||
# Note - 'null' receiver must be include as default kube-prometheus-stack
|
||||
# alerting rules (specifically the WatchDog alert) require it.
|
||||
# If it is not included then alertmanager pods will fail to launch
|
||||
# and errors will be printed in prometheus operator pod logs.
|
||||
#
|
||||
# alertmanager:
|
||||
# enabled: true
|
||||
# config:
|
||||
# global:
|
||||
# slack_api_url: '<insert-secret-slack-webhook-url>'
|
||||
# route:
|
||||
# receiver: 'slack-notifications'
|
||||
# group_by: ['namespace']
|
||||
# receivers:
|
||||
# - name: 'null'
|
||||
# - name: 'slack-notifications'
|
||||
# slack_configs:
|
||||
# - channel: '#<insert-channel-name>'
|
||||
# send_resolved: true
|
||||
# - name: 'gmail-notifications'
|
||||
# TODO: Add example here
|
||||
#############################################################################
|
||||
|
||||
lokiStack:
|
||||
enabled: true
|
||||
chart:
|
||||
@ -186,6 +225,8 @@ monitoring:
|
||||
release:
|
||||
namespace: monitoring-system
|
||||
values: {}
|
||||
# Size of the volume in GB to provision on the target cloud for persistent storage of loki data
|
||||
lokiVolumeCapacity: 10
|
||||
|
||||
# Settings for node feature discovery
|
||||
# https://github.com/kubernetes-sigs/node-feature-discovery/tree/master/deployment/helm/node-feature-discovery
|
||||
|
Loading…
x
Reference in New Issue
Block a user