{
  "annotations": {
    "list": [
      {
        "builtIn": 1,
        "datasource": {
          "type": "datasource",
          "uid": "grafana"
        },
        "enable": true,
        "hide": true,
        "iconColor": "rgba(0, 211, 255, 1)",
        "name": "Annotations & Alerts",
        "target": {
          "limit": 100,
          "matchAny": false,
          "tags": [],
          "type": "dashboard"
        },
        "type": "dashboard"
      }
    ]
  },
  "description": "This dashboard is to display the metrics from DCGM Exporter on a Kubernetes (1.13+) cluster",
  "editable": true,
  "fiscalYearStartMonth": 0,
  "gnetId": 12239,
  "graphTooltip": 0,
  "id": null,
  "links": [],
  "liveNow": false,
  "panels": [
    {
      "aliasColors": {},
      "bars": false,
      "dashLength": 10,
      "dashes": false,
      "datasource": {
        "type": "prometheus",
        "uid": "prometheus"
      },
      "fieldConfig": {
        "defaults": {
          "links": []
        },
        "overrides": []
      },
      "fill": 1,
      "fillGradient": 0,
      "gridPos": {
        "h": 8,
        "w": 18,
        "x": 0,
        "y": 0
      },
      "hiddenSeries": false,
      "id": 12,
      "legend": {
        "alignAsTable": true,
        "avg": true,
        "current": true,
        "max": true,
        "min": false,
        "rightSide": true,
        "show": true,
        "total": false,
        "values": true
      },
      "lines": true,
      "linewidth": 2,
      "nullPointMode": "null",
      "options": {
        "alertThreshold": true
      },
      "percentage": false,
      "pluginVersion": "9.3.1",
      "pointradius": 2,
      "points": false,
      "renderer": "flot",
      "seriesOverrides": [],
      "spaceLength": 10,
      "stack": false,
      "steppedLine": false,
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "expr": "DCGM_FI_DEV_GPU_TEMP{instance=~\"${instance}\", gpu=~\"${gpu}\"}",
          "instant": false,
          "interval": "",
          "legendFormat": "GPU {{gpu}}",
          "refId": "A"
        }
      ],
      "thresholds": [],
      "timeRegions": [],
      "title": "GPU Temperature",
      "tooltip": {
        "shared": true,
        "sort": 0,
        "value_type": "individual"
      },
      "type": "graph",
      "xaxis": {
        "mode": "time",
        "show": true,
        "values": []
      },
      "yaxes": [
        {
          "format": "celsius",
          "logBase": 1,
          "show": true
        },
        {
          "format": "short",
          "logBase": 1,
          "show": true
        }
      ],
      "yaxis": {
        "align": false
      }
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "prometheus"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "thresholds"
          },
          "mappings": [],
          "max": 100,
          "min": 0,
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "#EAB839",
                "value": 83
              },
              {
                "color": "red",
                "value": 87
              }
            ]
          },
          "unit": "celsius"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 8,
        "w": 6,
        "x": 18,
        "y": 0
      },
      "id": 14,
      "options": {
        "orientation": "auto",
        "reduceOptions": {
          "calcs": [
            "mean"
          ],
          "fields": "",
          "values": false
        },
        "showThresholdLabels": false,
        "showThresholdMarkers": true
      },
      "pluginVersion": "9.3.1",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "expr": "avg(DCGM_FI_DEV_GPU_TEMP{instance=~\"${instance}\", gpu=~\"${gpu}\"})",
          "interval": "",
          "legendFormat": "",
          "refId": "A"
        }
      ],
      "title": "GPU Avg. Temp",
      "type": "gauge"
    },
    {
      "aliasColors": {},
      "bars": false,
      "dashLength": 10,
      "dashes": false,
      "datasource": {
        "type": "prometheus",
        "uid": "prometheus"
      },
      "fieldConfig": {
        "defaults": {
          "links": []
        },
        "overrides": []
      },
      "fill": 1,
      "fillGradient": 0,
      "gridPos": {
        "h": 8,
        "w": 18,
        "x": 0,
        "y": 8
      },
      "hiddenSeries": false,
      "id": 10,
      "legend": {
        "alignAsTable": true,
        "avg": true,
        "current": true,
        "max": true,
        "min": false,
        "rightSide": true,
        "show": true,
        "total": false,
        "values": true
      },
      "lines": true,
      "linewidth": 2,
      "nullPointMode": "null",
      "options": {
        "alertThreshold": true
      },
      "percentage": false,
      "pluginVersion": "9.3.1",
      "pointradius": 2,
      "points": false,
      "renderer": "flot",
      "seriesOverrides": [],
      "spaceLength": 10,
      "stack": false,
      "steppedLine": false,
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "expr": "DCGM_FI_DEV_POWER_USAGE{instance=~\"${instance}\", gpu=~\"${gpu}\"}",
          "interval": "",
          "legendFormat": "GPU {{gpu}}",
          "refId": "A"
        }
      ],
      "thresholds": [],
      "timeRegions": [],
      "title": "GPU Power Usage",
      "tooltip": {
        "shared": true,
        "sort": 0,
        "value_type": "individual"
      },
      "type": "graph",
      "xaxis": {
        "mode": "time",
        "show": true,
        "values": []
      },
      "yaxes": [
        {
          "format": "watt",
          "logBase": 1,
          "show": true
        },
        {
          "format": "short",
          "logBase": 1,
          "show": true
        }
      ],
      "yaxis": {
        "align": false
      }
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "prometheus"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "thresholds"
          },
          "mappings": [],
          "max": 2400,
          "min": 0,
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "#EAB839",
                "value": 1800
              },
              {
                "color": "red",
                "value": 2200
              }
            ]
          },
          "unit": "watt"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 8,
        "w": 6,
        "x": 18,
        "y": 8
      },
      "id": 16,
      "links": [],
      "options": {
        "orientation": "horizontal",
        "reduceOptions": {
          "calcs": [
            "sum"
          ],
          "fields": "",
          "values": false
        },
        "showThresholdLabels": false,
        "showThresholdMarkers": true
      },
      "pluginVersion": "9.3.1",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "expr": "sum(DCGM_FI_DEV_POWER_USAGE{instance=~\"${instance}\", gpu=~\"${gpu}\"})",
          "interval": "",
          "legendFormat": "",
          "refId": "A"
        }
      ],
      "title": "GPU Power Total",
      "type": "gauge"
    },
    {
      "aliasColors": {},
      "bars": false,
      "dashLength": 10,
      "dashes": false,
      "datasource": {
        "type": "prometheus",
        "uid": "prometheus"
      },
      "fieldConfig": {
        "defaults": {
          "links": []
        },
        "overrides": []
      },
      "fill": 1,
      "fillGradient": 0,
      "gridPos": {
        "h": 8,
        "w": 12,
        "x": 0,
        "y": 16
      },
      "hiddenSeries": false,
      "id": 2,
      "interval": "",
      "legend": {
        "alignAsTable": true,
        "avg": true,
        "current": true,
        "max": true,
        "min": false,
        "rightSide": true,
        "show": true,
        "total": false,
        "values": true
      },
      "lines": true,
      "linewidth": 2,
      "nullPointMode": "null",
      "options": {
        "alertThreshold": true
      },
      "percentage": false,
      "pluginVersion": "9.3.1",
      "pointradius": 2,
      "points": false,
      "renderer": "flot",
      "seriesOverrides": [],
      "spaceLength": 10,
      "stack": false,
      "steppedLine": false,
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "expr": "DCGM_FI_DEV_SM_CLOCK{instance=~\"${instance}\", gpu=~\"${gpu}\"} * 1000000",
          "format": "time_series",
          "interval": "",
          "intervalFactor": 1,
          "legendFormat": "GPU {{gpu}}",
          "refId": "A"
        }
      ],
      "thresholds": [],
      "timeRegions": [],
      "title": "GPU SM Clocks",
      "tooltip": {
        "shared": true,
        "sort": 0,
        "value_type": "individual"
      },
      "type": "graph",
      "xaxis": {
        "mode": "time",
        "show": true,
        "values": []
      },
      "yaxes": [
        {
          "format": "hertz",
          "label": "",
          "logBase": 1,
          "show": true
        },
        {
          "format": "short",
          "logBase": 1,
          "show": true
        }
      ],
      "yaxis": {
        "align": false
      }
    },
    {
      "aliasColors": {},
      "bars": false,
      "dashLength": 10,
      "dashes": false,
      "datasource": {
        "type": "prometheus",
        "uid": "prometheus"
      },
      "fill": 1,
      "fillGradient": 0,
      "gridPos": {
        "h": 8,
        "w": 12,
        "x": 0,
        "y": 24
      },
      "hiddenSeries": false,
      "id": 6,
      "legend": {
        "alignAsTable": true,
        "avg": true,
        "current": true,
        "max": true,
        "min": false,
        "rightSide": true,
        "show": true,
        "total": false,
        "values": true
      },
      "lines": true,
      "linewidth": 2,
      "nullPointMode": "null",
      "options": {
        "dataLinks": []
      },
      "percentage": false,
      "pointradius": 2,
      "points": false,
      "renderer": "flot",
      "seriesOverrides": [],
      "spaceLength": 10,
      "stack": false,
      "steppedLine": false,
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "expr": "DCGM_FI_DEV_GPU_UTIL{instance=~\"${instance}\", gpu=~\"${gpu}\"}",
          "interval": "",
          "legendFormat": "GPU {{gpu}}",
          "refId": "A"
        }
      ],
      "thresholds": [],
      "timeRegions": [],
      "title": "GPU Utilization",
      "tooltip": {
        "shared": true,
        "sort": 0,
        "value_type": "cumulative"
      },
      "type": "graph",
      "xaxis": {
        "mode": "time",
        "show": true,
        "values": []
      },
      "yaxes": [
        {
          "format": "percent",
          "logBase": 1,
          "max": "100",
          "min": "0",
          "show": true
        },
        {
          "format": "short",
          "logBase": 1,
          "show": true
        }
      ],
      "yaxis": {
        "align": false
      }
    },
    {
      "aliasColors": {},
      "bars": false,
      "dashLength": 10,
      "dashes": false,
      "datasource": {
        "type": "prometheus",
        "uid": "prometheus"
      },
      "fill": 1,
      "fillGradient": 0,
      "gridPos": {
        "h": 8,
        "w": 12,
        "x": 0,
        "y": 32
      },
      "hiddenSeries": false,
      "id": 4,
      "legend": {
        "alignAsTable": true,
        "avg": true,
        "current": true,
        "max": true,
        "min": false,
        "rightSide": true,
        "show": true,
        "total": false,
        "values": true
      },
      "lines": true,
      "linewidth": 2,
      "nullPointMode": "null",
      "options": {
        "dataLinks": []
      },
      "percentage": false,
      "pointradius": 2,
      "points": false,
      "renderer": "flot",
      "seriesOverrides": [],
      "spaceLength": 10,
      "stack": false,
      "steppedLine": false,
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "expr": "DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{instance=~\"${instance}\", gpu=~\"${gpu}\"}",
          "interval": "",
          "legendFormat": "GPU {{gpu}}",
          "refId": "A"
        }
      ],
      "thresholds": [],
      "timeRegions": [],
      "title": "Tensor Core Utilization",
      "tooltip": {
        "shared": true,
        "sort": 0,
        "value_type": "cumulative"
      },
      "type": "graph",
      "xaxis": {
        "mode": "time",
        "show": true,
        "values": []
      },
      "yaxes": [
        {
          "format": "percentunit",
          "logBase": 1,
          "max": "1",
          "min": "0",
          "show": true
        },
        {
          "format": "short",
          "logBase": 1,
          "show": true
        }
      ],
      "yaxis": {
        "align": false
      }
    },
    {
      "aliasColors": {},
      "bars": false,
      "dashLength": 10,
      "dashes": false,
      "datasource": {
        "type": "prometheus",
        "uid": "prometheus"
      },
      "fill": 1,
      "fillGradient": 0,
      "gridPos": {
        "h": 8,
        "w": 12,
        "x": 0,
        "y": 40
      },
      "hiddenSeries": false,
      "id": 18,
      "legend": {
        "avg": true,
        "current": false,
        "max": true,
        "min": false,
        "rightSide": true,
        "show": true,
        "total": false,
        "values": true
      },
      "lines": true,
      "linewidth": 2,
      "nullPointMode": "null",
      "options": {
        "dataLinks": []
      },
      "percentage": false,
      "pointradius": 2,
      "points": false,
      "renderer": "flot",
      "seriesOverrides": [],
      "spaceLength": 10,
      "stack": false,
      "steppedLine": false,
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "expr": "DCGM_FI_DEV_FB_USED{instance=~\"${instance}\", gpu=~\"${gpu}\"}",
          "interval": "",
          "legendFormat": "GPU {{gpu}}",
          "refId": "A"
        }
      ],
      "thresholds": [],
      "timeRegions": [],
      "title": "GPU Framebuffer Mem Used",
      "tooltip": {
        "shared": true,
        "sort": 0,
        "value_type": "individual"
      },
      "type": "graph",
      "xaxis": {
        "mode": "time",
        "show": true,
        "values": []
      },
      "yaxes": [
        {
          "format": "mbytes",
          "logBase": 1,
          "show": true
        },
        {
          "format": "short",
          "logBase": 1,
          "show": true
        }
      ],
      "yaxis": {
        "align": false
      }
    }
  ],
  "refresh": false,
  "schemaVersion": 37,
  "style": "dark",
  "tags": [],
  "templating": {
    "list": [
      {
        "current": {
          "isNone": true,
          "selected": false,
          "text": "None",
          "value": ""
        },
        "datasource": {
          "type": "prometheus",
          "uid": "prometheus"
        },
        "definition": "label_values(DCGM_FI_DEV_GPU_TEMP, instance)",
        "hide": 0,
        "includeAll": false,
        "multi": true,
        "name": "instance",
        "options": [],
        "query": {
          "query": "label_values(DCGM_FI_DEV_GPU_TEMP, instance)",
          "refId": "Prometheus-instance-Variable-Query"
        },
        "refresh": 1,
        "regex": "",
        "skipUrlSync": false,
        "sort": 0,
        "tagValuesQuery": "",
        "tagsQuery": "",
        "type": "query",
        "useTags": false
      },
      {
        "current": {
          "selected": false,
          "text": "All",
          "value": "$__all"
        },
        "datasource": {
          "type": "prometheus",
          "uid": "prometheus"
        },
        "definition": "label_values(DCGM_FI_DEV_GPU_TEMP{instance=~\"${instance}\"}, gpu)",
        "hide": 0,
        "includeAll": true,
        "multi": true,
        "name": "gpu",
        "options": [],
        "query": {
          "query": "label_values(DCGM_FI_DEV_GPU_TEMP{instance=~\"${instance}\"}, gpu)",
          "refId": "Prometheus-gpu-Variable-Query"
        },
        "refresh": 1,
        "regex": "",
        "skipUrlSync": false,
        "sort": 1,
        "tagValuesQuery": "",
        "tagsQuery": "",
        "type": "query",
        "useTags": false
      }
    ]
  },
  "time": {
    "from": "now-15m",
    "to": "now"
  },
  "timepicker": {
    "refresh_intervals": [
      "5s",
      "10s",
      "30s",
      "1m",
      "5m",
      "15m",
      "30m",
      "1h",
      "2h",
      "1d"
    ]
  },
  "timezone": "",
  "title": "NVIDIA DCGM Exporter Dashboard",
  "uid": "Oxed_c6Wz",
  "version": 1,
  "weekStart": ""
}