|
- apiVersion: v1
- data:
- cluster-device-statistic-dashboard.json: |
- {
- "dashboard": {
- "annotations": {
- "list": [
- {
- "builtIn": 1,
- "datasource": "-- Grafana --",
- "enable": true,
- "hide": true,
- "iconColor": "rgba(0, 211, 255, 1)",
- "name": "Annotations & Alerts",
- "type": "dashboard"
- }
- ]
- },
- "editable": true,
- "gnetId": null,
- "graphTooltip": 0,
- "hideControls": false,
- "id": null,
- "links": [],
- "rows": [
- {
- "collapse": false,
- "height": "250px",
- "panels": [
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 0,
- "id": 1,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 12,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "count (avg by (uuid,username) (task_device_percent)) by (username)",
- "format": "time_series",
- "hide": false,
- "intervalFactor": 2,
- "legendFormat": "{{username}}",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "Per user allocation",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": "0",
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": false
- }
- ]
- }
- ],
- "repeat": null,
- "repeatIteration": null,
- "repeatRowId": null,
- "showTitle": false,
- "title": "Dashboard Row",
- "titleSize": "h6"
- },
- {
- "collapse": false,
- "height": 250,
- "panels": [
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 0,
- "id": 2,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 12,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "count (avg by (instance,vc_name) (task_device_percent)) by (vc_name)",
- "format": "time_series",
- "hide": false,
- "intervalFactor": 2,
- "legendFormat": "{{vc_name}} allocation",
- "refId": "A"
- },
- {
- "expr": "count(avg by (uuid) (task_device_percent))",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "Cluster wide allocation",
- "refId": "B"
- },
- {
- "expr": "avg by (instance,vc_name) (k8s_vc_device_total)",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "{{vc_name}} quota",
- "refId": "C"
- },
- {
- "expr": "sum(avg by (instance,device_str,device_str) (k8s_vc_device_total))",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "Cluster wide quota",
- "refId": "D"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "Per VC allocation & quota",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": "0",
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": false
- }
- ]
- }
- ],
- "repeat": null,
- "repeatIteration": null,
- "repeatRowId": null,
- "showTitle": false,
- "title": "Dashboard Row",
- "titleSize": "h6"
- },
- {
- "collapse": false,
- "height": 250,
- "panels": [
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 0,
- "id": 3,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 12,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "count (avg by (uuid,vc_name) (task_device_percent)) by (vc_name) / sum (avg by (instance,vc_name,device_str) (k8s_vc_device_total)) by (vc_name)",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "{{vc_name}}",
- "refId": "A"
- },
- {
- "expr": "count (avg by (uuid,vc_name)(task_device_percent)) / sum (avg by (instance,vc_name,device_str) (k8s_vc_device_total))",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "Cluster wide",
- "refId": "B"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "Per VC allocation rate",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "percentunit",
- "label": null,
- "logBase": 1,
- "max": "1",
- "min": "0",
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": false
- }
- ]
- }
- ],
- "repeat": null,
- "repeatIteration": null,
- "repeatRowId": null,
- "showTitle": false,
- "title": "Dashboard Row",
- "titleSize": "h6"
- },
- {
- "collapse": false,
- "height": 250,
- "panels": [
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 0,
- "id": 4,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 12,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "avg (task_device_percent) by (username)",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "{{username}}",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "Per user device utils",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "percent",
- "label": null,
- "logBase": 1,
- "max": "100",
- "min": "0",
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": false
- }
- ]
- }
- ],
- "repeat": null,
- "repeatIteration": null,
- "repeatRowId": null,
- "showTitle": false,
- "title": "Dashboard Row",
- "titleSize": "h6"
- },
- {
- "collapse": false,
- "height": 250,
- "panels": [
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 0,
- "id": 5,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 12,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "avg (task_device_percent) by (vc_name)",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "{{vc_name}}",
- "refId": "A"
- },
- {
- "expr": "avg (task_device_percent)",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "Cluster wide",
- "refId": "B"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "Per VC device utils",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "percent",
- "label": null,
- "logBase": 1,
- "max": "100",
- "min": "0",
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": false
- }
- ]
- }
- ],
- "repeat": null,
- "repeatIteration": null,
- "repeatRowId": null,
- "showTitle": false,
- "title": "Dashboard Row",
- "titleSize": "h6"
- }
- ],
- "schemaVersion": 14,
- "style": "dark",
- "tags": [],
- "templating": {
- "list": []
- },
- "time": {
- "from": "now-6h",
- "to": "now"
- },
- "timepicker": {
- "refresh_intervals": [
- "30s",
- "1m",
- "5m",
- "15m",
- "30m",
- "1h",
- "2h",
- "1d"
- ],
- "time_options": [
- "5m",
- "15m",
- "1h",
- "6h",
- "12h",
- "24h",
- "2d",
- "7d",
- "30d"
- ]
- },
- "timezone": "browser",
- "title": "Cluster Device statistic",
- "version": 0,
- "uid": "cluster-gpu-statistic"
- }
- }
- cluster-status-dashboard.json: |
- {
- "dashboard": {
- "annotations": {
- "list": [
- {
- "builtIn": 1,
- "datasource": "-- Grafana --",
- "enable": true,
- "hide": true,
- "iconColor": "rgba(0, 211, 255, 1)",
- "name": "Annotations & Alerts",
- "type": "dashboard"
- }
- ]
- },
- "editable": true,
- "gnetId": null,
- "graphTooltip": 0,
- "hideControls": false,
- "id": null,
- "links": [],
- "refresh": "30s",
- "rows": [
- {
- "collapse": false,
- "height": "250px",
- "panels": [
- {
- "cacheTimeout": null,
- "colorBackground": false,
- "colorValue": false,
- "colors": [
- "#299c46",
- "rgba(237, 129, 40, 0.89)",
- "#d44a3a"
- ],
- "datasource": null,
- "format": "none",
- "gauge": {
- "maxValue": 100,
- "minValue": 0,
- "show": false,
- "thresholdLabels": false,
- "thresholdMarkers": true
- },
- "id": 2,
- "interval": null,
- "links": [],
- "mappingType": 1,
- "mappingTypes": [
- {
- "name": "value to text",
- "value": 1
- },
- {
- "name": "range to text",
- "value": 2
- }
- ],
- "maxDataPoints": 100,
- "nullPointMode": "connected",
- "nullText": null,
- "postfix": "",
- "postfixFontSize": "50%",
- "prefix": "",
- "prefixFontSize": "50%",
- "rangeMaps": [
- {
- "from": "null",
- "text": "N/A",
- "to": "null"
- }
- ],
- "span": 3,
- "sparkline": {
- "fillColor": "rgba(31, 118, 189, 0.18)",
- "full": false,
- "lineColor": "rgb(31, 120, 193)",
- "show": false
- },
- "tableColumn": "",
- "targets": [
- {
- "expr": "sum(avg by (host_ip) (k8s_node_device_used{device_str='nvidia.com/gpu'})) OR on() vector(0)",
- "format": "time_series",
- "instant": true,
- "intervalFactor": 2,
- "refId": "A"
- }
- ],
- "thresholds": "",
- "title": "Used GPUs",
- "type": "singlestat",
- "valueFontSize": "80%",
- "valueMaps": [
- {
- "op": "=",
- "text": "N/A",
- "value": "null"
- }
- ],
- "valueName": "avg"
- },
- {
- "cacheTimeout": null,
- "colorBackground": false,
- "colorValue": false,
- "colors": [
- "#299c46",
- "rgba(237, 129, 40, 0.89)",
- "#d44a3a"
- ],
- "datasource": null,
- "format": "none",
- "gauge": {
- "maxValue": 100,
- "minValue": 0,
- "show": false,
- "thresholdLabels": false,
- "thresholdMarkers": true
- },
- "id": 3,
- "interval": null,
- "links": [],
- "mappingType": 1,
- "mappingTypes": [
- {
- "name": "value to text",
- "value": 1
- },
- {
- "name": "range to text",
- "value": 2
- }
- ],
- "maxDataPoints": 100,
- "nullPointMode": "connected",
- "nullText": null,
- "postfix": "",
- "postfixFontSize": "50%",
- "prefix": "",
- "prefixFontSize": "50%",
- "rangeMaps": [
- {
- "from": "null",
- "text": "N/A",
- "to": "null"
- }
- ],
- "span": 3,
- "sparkline": {
- "fillColor": "rgba(31, 118, 189, 0.18)",
- "full": false,
- "lineColor": "rgb(31, 120, 193)",
- "show": false
- },
- "tableColumn": "",
- "targets": [
- {
- "expr": "sum(avg by (host_ip) (k8s_node_device_available{device_str='nvidia.com/gpu'})) OR on() vector(0)",
- "format": "time_series",
- "instant": true,
- "intervalFactor": 2,
- "refId": "A"
- }
- ],
- "thresholds": "",
- "title": "Avaliable GPUs",
- "type": "singlestat",
- "valueFontSize": "80%",
- "valueMaps": [
- {
- "op": "=",
- "text": "N/A",
- "value": "null"
- }
- ],
- "valueName": "avg"
- },
- {
- "cacheTimeout": null,
- "colorBackground": false,
- "colorValue": false,
- "colors": [
- "#299c46",
- "rgba(237, 129, 40, 0.89)",
- "#d44a3a"
- ],
- "datasource": null,
- "format": "none",
- "gauge": {
- "maxValue": 100,
- "minValue": 0,
- "show": false,
- "thresholdLabels": false,
- "thresholdMarkers": true
- },
- "id": 7,
- "interval": null,
- "links": [],
- "mappingType": 1,
- "mappingTypes": [
- {
- "name": "value to text",
- "value": 1
- },
- {
- "name": "range to text",
- "value": 2
- }
- ],
- "maxDataPoints": 100,
- "nullPointMode": "connected",
- "nullText": null,
- "postfix": "",
- "postfixFontSize": "50%",
- "prefix": "",
- "prefixFontSize": "50%",
- "rangeMaps": [
- {
- "from": "null",
- "text": "N/A",
- "to": "null"
- }
- ],
- "span": 3,
- "sparkline": {
- "fillColor": "rgba(31, 118, 189, 0.18)",
- "full": false,
- "lineColor": "rgb(31, 120, 193)",
- "show": false
- },
- "tableColumn": "",
- "targets": [
- {
- "expr": "sum(avg by (host_ip) (k8s_node_device_reserved{device_str='nvidia.com/gpu'})) OR on() vector(0)",
- "format": "time_series",
- "instant": true,
- "intervalFactor": 2,
- "refId": "A"
- }
- ],
- "thresholds": "",
- "title": "Reserved GPU",
- "type": "singlestat",
- "valueFontSize": "80%",
- "valueMaps": [
- {
- "op": "=",
- "text": "N/A",
- "value": "null"
- }
- ],
- "valueName": "avg"
- },
- {
- "cacheTimeout": null,
- "colorBackground": false,
- "colorValue": false,
- "colors": [
- "#299c46",
- "rgba(237, 129, 40, 0.89)",
- "#d44a3a"
- ],
- "datasource": null,
- "format": "none",
- "gauge": {
- "maxValue": 100,
- "minValue": 0,
- "show": false,
- "thresholdLabels": false,
- "thresholdMarkers": true
- },
- "id": 1,
- "interval": null,
- "links": [],
- "mappingType": 1,
- "mappingTypes": [
- {
- "name": "value to text",
- "value": 1
- },
- {
- "name": "range to text",
- "value": 2
- }
- ],
- "maxDataPoints": 100,
- "nullPointMode": "connected",
- "nullText": null,
- "postfix": "",
- "postfixFontSize": "50%",
- "prefix": "",
- "prefixFontSize": "50%",
- "rangeMaps": [
- {
- "from": "null",
- "text": "N/A",
- "to": "null"
- }
- ],
- "span": 3,
- "sparkline": {
- "fillColor": "rgba(31, 118, 189, 0.18)",
- "full": false,
- "lineColor": "rgb(31, 120, 193)",
- "show": false
- },
- "tableColumn": "",
- "targets": [
- {
- "expr": "sum(avg by (host_ip) (k8s_node_device_total{device_str='nvidia.com/gpu'})) OR on() vector(0)",
- "format": "time_series",
- "hide": false,
- "instant": true,
- "intervalFactor": 2,
- "legendFormat": "",
- "refId": "A"
- }
- ],
- "thresholds": "",
- "title": "Total GPUs",
- "type": "singlestat",
- "valueFontSize": "80%",
- "valueMaps": [
- {
- "op": "=",
- "text": "N/A",
- "value": "null"
- }
- ],
- "valueName": "avg"
- }
- ],
- "repeat": null,
- "repeatIteration": null,
- "repeatRowId": null,
- "showTitle": false,
- "title": "Dashboard Row",
- "titleSize": "h6"
- },
- {
- "collapse": false,
- "height": "250px",
- "panels": [
- {
- "cacheTimeout": null,
- "colorBackground": false,
- "colorValue": false,
- "colors": [
- "#299c46",
- "rgba(237, 129, 40, 0.89)",
- "#d44a3a"
- ],
- "datasource": null,
- "format": "none",
- "gauge": {
- "maxValue": 100,
- "minValue": 0,
- "show": false,
- "thresholdLabels": false,
- "thresholdMarkers": true
- },
- "id": 8,
- "interval": null,
- "links": [],
- "mappingType": 1,
- "mappingTypes": [
- {
- "name": "value to text",
- "value": 1
- },
- {
- "name": "range to text",
- "value": 2
- }
- ],
- "maxDataPoints": 100,
- "nullPointMode": "connected",
- "nullText": null,
- "postfix": "",
- "postfixFontSize": "50%",
- "prefix": "",
- "prefixFontSize": "50%",
- "rangeMaps": [
- {
- "from": "null",
- "text": "N/A",
- "to": "null"
- }
- ],
- "span": 3,
- "sparkline": {
- "fillColor": "rgba(31, 118, 189, 0.18)",
- "full": false,
- "lineColor": "rgb(31, 120, 193)",
- "show": false
- },
- "tableColumn": "",
- "targets": [
- {
- "expr": "sum(avg by (host_ip) (k8s_node_device_used{device_str='npu.huawei.com/NPU'})) OR on() vector(0)",
- "format": "time_series",
- "instant": true,
- "intervalFactor": 2,
- "refId": "A"
- }
- ],
- "thresholds": "",
- "title": "Used NPUs",
- "type": "singlestat",
- "valueFontSize": "80%",
- "valueMaps": [
- {
- "op": "=",
- "text": "N/A",
- "value": "null"
- }
- ],
- "valueName": "avg"
- },
- {
- "cacheTimeout": null,
- "colorBackground": false,
- "colorValue": false,
- "colors": [
- "#299c46",
- "rgba(237, 129, 40, 0.89)",
- "#d44a3a"
- ],
- "datasource": null,
- "format": "none",
- "gauge": {
- "maxValue": 100,
- "minValue": 0,
- "show": false,
- "thresholdLabels": false,
- "thresholdMarkers": true
- },
- "id": 9,
- "interval": null,
- "links": [],
- "mappingType": 1,
- "mappingTypes": [
- {
- "name": "value to text",
- "value": 1
- },
- {
- "name": "range to text",
- "value": 2
- }
- ],
- "maxDataPoints": 100,
- "nullPointMode": "connected",
- "nullText": null,
- "postfix": "",
- "postfixFontSize": "50%",
- "prefix": "",
- "prefixFontSize": "50%",
- "rangeMaps": [
- {
- "from": "null",
- "text": "N/A",
- "to": "null"
- }
- ],
- "span": 3,
- "sparkline": {
- "fillColor": "rgba(31, 118, 189, 0.18)",
- "full": false,
- "lineColor": "rgb(31, 120, 193)",
- "show": false
- },
- "tableColumn": "",
- "targets": [
- {
- "expr": "sum(avg by (host_ip) (k8s_node_device_available{device_str='npu.huawei.com/NPU'})) OR on() vector(0)",
- "format": "time_series",
- "instant": true,
- "intervalFactor": 2,
- "refId": "A"
- }
- ],
- "thresholds": "",
- "title": "Avaliable NPUs",
- "type": "singlestat",
- "valueFontSize": "80%",
- "valueMaps": [
- {
- "op": "=",
- "text": "N/A",
- "value": "null"
- }
- ],
- "valueName": "avg"
- },
- {
- "cacheTimeout": null,
- "colorBackground": false,
- "colorValue": false,
- "colors": [
- "#299c46",
- "rgba(237, 129, 40, 0.89)",
- "#d44a3a"
- ],
- "datasource": null,
- "format": "none",
- "gauge": {
- "maxValue": 100,
- "minValue": 0,
- "show": false,
- "thresholdLabels": false,
- "thresholdMarkers": true
- },
- "id": 10,
- "interval": null,
- "links": [],
- "mappingType": 1,
- "mappingTypes": [
- {
- "name": "value to text",
- "value": 1
- },
- {
- "name": "range to text",
- "value": 2
- }
- ],
- "maxDataPoints": 100,
- "nullPointMode": "connected",
- "nullText": null,
- "postfix": "",
- "postfixFontSize": "50%",
- "prefix": "",
- "prefixFontSize": "50%",
- "rangeMaps": [
- {
- "from": "null",
- "text": "N/A",
- "to": "null"
- }
- ],
- "span": 3,
- "sparkline": {
- "fillColor": "rgba(31, 118, 189, 0.18)",
- "full": false,
- "lineColor": "rgb(31, 120, 193)",
- "show": false
- },
- "tableColumn": "",
- "targets": [
- {
- "expr": "sum(avg by (host_ip) (k8s_node_device_reserved{device_str='npu.huawei.com/NPU'})) OR on() vector(0)",
- "format": "time_series",
- "instant": true,
- "intervalFactor": 2,
- "refId": "A"
- }
- ],
- "thresholds": "",
- "title": "Reserved NPU",
- "type": "singlestat",
- "valueFontSize": "80%",
- "valueMaps": [
- {
- "op": "=",
- "text": "N/A",
- "value": "null"
- }
- ],
- "valueName": "avg"
- },
- {
- "cacheTimeout": null,
- "colorBackground": false,
- "colorValue": false,
- "colors": [
- "#299c46",
- "rgba(237, 129, 40, 0.89)",
- "#d44a3a"
- ],
- "datasource": null,
- "format": "none",
- "gauge": {
- "maxValue": 100,
- "minValue": 0,
- "show": false,
- "thresholdLabels": false,
- "thresholdMarkers": true
- },
- "id": 11,
- "interval": null,
- "links": [],
- "mappingType": 1,
- "mappingTypes": [
- {
- "name": "value to text",
- "value": 1
- },
- {
- "name": "range to text",
- "value": 2
- }
- ],
- "maxDataPoints": 100,
- "nullPointMode": "connected",
- "nullText": null,
- "postfix": "",
- "postfixFontSize": "50%",
- "prefix": "",
- "prefixFontSize": "50%",
- "rangeMaps": [
- {
- "from": "null",
- "text": "N/A",
- "to": "null"
- }
- ],
- "span": 3,
- "sparkline": {
- "fillColor": "rgba(31, 118, 189, 0.18)",
- "full": false,
- "lineColor": "rgb(31, 120, 193)",
- "show": false
- },
- "tableColumn": "",
- "targets": [
- {
- "expr": "sum(avg by (host_ip) (k8s_node_device_total{device_str='npu.huawei.com/NPU'})) OR on() vector(0)",
- "format": "time_series",
- "hide": false,
- "instant": true,
- "intervalFactor": 2,
- "legendFormat": "",
- "refId": "A"
- }
- ],
- "thresholds": "",
- "title": "Total NPUs",
- "type": "singlestat",
- "valueFontSize": "80%",
- "valueMaps": [
- {
- "op": "=",
- "text": "N/A",
- "value": "null"
- }
- ],
- "valueName": "avg"
- }
- ],
- "repeat": null,
- "repeatIteration": null,
- "repeatRowId": null,
- "showTitle": false,
- "title": "Dashboard NPU Row",
- "titleSize": "h6"
- },
- {
- "collapse": false,
- "height": 250,
- "panels": [
- {
- "cacheTimeout": null,
- "colorBackground": false,
- "colorValue": false,
- "colors": [
- "#299c46",
- "rgba(237, 129, 40, 0.89)",
- "#d44a3a"
- ],
- "datasource": null,
- "format": "none",
- "gauge": {
- "maxValue": 100,
- "minValue": 0,
- "show": false,
- "thresholdLabels": false,
- "thresholdMarkers": true
- },
- "id": 4,
- "interval": null,
- "links": [],
- "mappingType": 1,
- "mappingTypes": [
- {
- "name": "value to text",
- "value": 1
- },
- {
- "name": "range to text",
- "value": 2
- }
- ],
- "maxDataPoints": 100,
- "nullPointMode": "connected",
- "nullText": null,
- "postfix": "",
- "postfixFontSize": "50%",
- "prefix": "",
- "prefixFontSize": "50%",
- "rangeMaps": [
- {
- "from": "null",
- "text": "N/A",
- "to": "null"
- }
- ],
- "span": 4,
- "sparkline": {
- "fillColor": "rgba(31, 118, 189, 0.18)",
- "full": false,
- "lineColor": "rgb(31, 120, 193)",
- "show": false
- },
- "tableColumn": "",
- "targets": [
- {
- "expr": "sum(avg by (job_id) (job_pod_count{phase='running'})) OR on() vector(0)",
- "format": "time_series",
- "hide": false,
- "instant": true,
- "intervalFactor": 2,
- "refId": "A"
- }
- ],
- "thresholds": "",
- "title": "Running Jobs",
- "type": "singlestat",
- "valueFontSize": "80%",
- "valueMaps": [
- {
- "op": "=",
- "text": "N/A",
- "value": "null"
- }
- ],
- "valueName": "avg"
- },
- {
- "cacheTimeout": null,
- "colorBackground": false,
- "colorValue": false,
- "colors": [
- "#299c46",
- "rgba(237, 129, 40, 0.89)",
- "#d44a3a"
- ],
- "datasource": null,
- "format": "none",
- "gauge": {
- "maxValue": 100,
- "minValue": 0,
- "show": false,
- "thresholdLabels": false,
- "thresholdMarkers": true
- },
- "id": 6,
- "interval": null,
- "links": [],
- "mappingType": 1,
- "mappingTypes": [
- {
- "name": "value to text",
- "value": 1
- },
- {
- "name": "range to text",
- "value": 2
- }
- ],
- "maxDataPoints": 100,
- "nullPointMode": "connected",
- "nullText": null,
- "postfix": "",
- "postfixFontSize": "50%",
- "prefix": "",
- "prefixFontSize": "50%",
- "rangeMaps": [
- {
- "from": "null",
- "text": "N/A",
- "to": "null"
- }
- ],
- "span": 4,
- "sparkline": {
- "fillColor": "rgba(31, 118, 189, 0.18)",
- "full": false,
- "lineColor": "rgb(31, 120, 193)",
- "show": false
- },
- "tableColumn": "",
- "targets": [
- {
- "expr": "count(avg by (name) (pai_node_count{ready=\"true\", unschedulable=\"false\"})) OR on() vector(0)",
- "format": "time_series",
- "hide": false,
- "instant": true,
- "intervalFactor": 2,
- "refId": "A"
- }
- ],
- "thresholds": "",
- "title": "Active Nodes",
- "type": "singlestat",
- "valueFontSize": "80%",
- "valueMaps": [
- {
- "op": "=",
- "text": "N/A",
- "value": "null"
- }
- ],
- "valueName": "avg"
- },
- {
- "cacheTimeout": null,
- "colorBackground": false,
- "colorValue": false,
- "colors": [
- "#299c46",
- "rgba(237, 129, 40, 0.89)",
- "#d44a3a"
- ],
- "datasource": null,
- "format": "none",
- "gauge": {
- "maxValue": 100,
- "minValue": 0,
- "show": false,
- "thresholdLabels": false,
- "thresholdMarkers": true
- },
- "id": 5,
- "interval": null,
- "links": [],
- "mappingType": 1,
- "mappingTypes": [
- {
- "name": "value to text",
- "value": 1
- },
- {
- "name": "range to text",
- "value": 2
- }
- ],
- "maxDataPoints": 100,
- "nullPointMode": "connected",
- "nullText": null,
- "postfix": "",
- "postfixFontSize": "50%",
- "prefix": "",
- "prefixFontSize": "50%",
- "rangeMaps": [
- {
- "from": "null",
- "text": "N/A",
- "to": "null"
- }
- ],
- "span": 4,
- "sparkline": {
- "fillColor": "rgba(31, 118, 189, 0.18)",
- "full": false,
- "lineColor": "rgb(31, 120, 193)",
- "show": false
- },
- "tableColumn": "",
- "targets": [
- {
- "expr": "count(avg by (name) (pai_node_count)) OR on() vector(0)",
- "format": "time_series",
- "hide": false,
- "instant": true,
- "intervalFactor": 2,
- "refId": "A"
- }
- ],
- "thresholds": "",
- "title": "Total Node",
- "type": "singlestat",
- "valueFontSize": "80%",
- "valueMaps": [
- {
- "op": "=",
- "text": "N/A",
- "value": "null"
- }
- ],
- "valueName": "avg"
- }
- ],
- "repeat": null,
- "repeatIteration": null,
- "repeatRowId": null,
- "showTitle": false,
- "title": "Dashboard Row",
- "titleSize": "h6"
- }
- ],
- "schemaVersion": 14,
- "style": "dark",
- "tags": [],
- "templating": {
- "list": []
- },
- "time": {
- "from": "now-5m",
- "to": "now"
- },
- "timepicker": {
- "refresh_intervals": [
- "30s",
- "1m",
- "5m",
- "15m",
- "30m",
- "1h",
- "2h",
- "1d"
- ],
- "time_options": [
- "5m",
- "15m",
- "1h",
- "6h",
- "12h",
- "24h",
- "2d",
- "7d",
- "30d"
- ]
- },
- "timezone": "browser",
- "title": "Cluster Status",
- "version": 3,
- "uid": "cluster-status"
- }
- }
- device-fragmentation-dashboard.json: |
- {"dashboard": {
- "annotations": {
- "list": [
- {
- "builtIn": 1,
- "datasource": "-- Grafana --",
- "enable": true,
- "hide": true,
- "iconColor": "rgba(0, 211, 255, 1)",
- "name": "Annotations & Alerts",
- "type": "dashboard"
- }
- ]
- },
- "editable": true,
- "gnetId": null,
- "graphTooltip": 0,
- "hideControls": false,
- "id": null,
- "links": [],
- "refresh": "30s",
- "rows": [
- {
- "collapse": false,
- "height": "250px",
- "panels": [
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 0,
- "id": 1,
- "legend": {
- "avg": false,
- "current": true,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": true
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 12,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "count_values(\"gpu_available\", holt_winters(k8s_node_device_available{device_str='nvidia.com/gpu'}[1m],0.001,0.999))",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "{{gpu_available}} available gpus",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "Node count with certain number of available gpu",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "short",
- "label": "Node count",
- "logBase": 1,
- "max": null,
- "min": "0",
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": false
- }
- ]
- },
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 0,
- "id": 2,
- "legend": {
- "avg": false,
- "current": true,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": true
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 12,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "count_values(\"npu_available\", holt_winters(k8s_node_device_available{device_str='npu.huawei.com/NPU'}[1m],0.001,0.999))",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "{{npu_available}} available npus",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "Node count with certain number of available npu",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "short",
- "label": "Node count",
- "logBase": 1,
- "max": null,
- "min": "0",
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": false
- }
- ]
- }
- ],
- "repeat": null,
- "repeatIteration": null,
- "repeatRowId": null,
- "showTitle": false,
- "title": "Dashboard Row",
- "titleSize": "h6"
- }
- ],
- "schemaVersion": 14,
- "style": "dark",
- "tags": [],
- "templating": {
- "list": []
- },
- "time": {
- "from": "now-6h",
- "to": "now"
- },
- "timepicker": {
- "refresh_intervals": [
- "30s",
- "1m",
- "5m",
- "15m",
- "30m",
- "1h",
- "2h",
- "1d"
- ],
- "time_options": [
- "5m",
- "15m",
- "1h",
- "6h",
- "12h",
- "24h",
- "2d",
- "7d",
- "30d"
- ]
- },
- "timezone": "browser",
- "title": "Device fragmentation",
- "version": 0,
- "uid": "device-fragmentation"
- }}
- device-usage-history-dashboard.json: |
- {"dashboard": {
- "annotations": {
- "list": [
- {
- "builtIn": 1,
- "datasource": "-- Grafana --",
- "enable": true,
- "hide": true,
- "iconColor": "rgba(0, 211, 255, 1)",
- "name": "Annotations & Alerts",
- "type": "dashboard"
- }
- ]
- },
- "editable": false,
- "gnetId": null,
- "graphTooltip": 0,
- "hideControls": false,
- "id": null,
- "links": [],
- "refresh": "30s",
- "rows": [
- {
- "collapse": false,
- "height": "250px",
- "panels": [
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 0,
- "id": 1,
- "legend": {
- "avg": false,
- "current": true,
- "max": true,
- "min": true,
- "rightSide": false,
- "show": true,
- "total": false,
- "values": true
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "nullPointMode": "connected",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 12,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "sum (avg (k8s_node_device_available{device_str='nvidia.com/gpu'})by (host_ip) )",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "Available",
- "refId": "A"
- },
- {
- "expr": "sum(avg (k8s_node_device_total{device_str='nvidia.com/gpu'})by (host_ip) )",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "Total",
- "refId": "B"
- },
- {
- "expr": "sum(avg (k8s_node_device_reserved{device_str='nvidia.com/gpu'})by (host_ip) )",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "Reserved",
- "refId": "C"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "GPU Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": "0",
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": false
- }
- ]
- },
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 0,
- "id": 2,
- "legend": {
- "avg": false,
- "current": true,
- "max": true,
- "min": true,
- "rightSide": false,
- "show": true,
- "total": false,
- "values": true
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 12,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "sum(avg (k8s_node_device_available{device_str='npu.huawei.com/NPU'})by (host_ip))",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "Available",
- "refId": "A"
- },
- {
- "expr": "sum(avg (k8s_node_device_total{device_str='npu.huawei.com/NPU'})by (host_ip))",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "Total",
- "refId": "B"
- },
- {
- "expr": "sum(avg (k8s_node_device_reserved{device_str='npu.huawei.com/NPU'})by (host_ip))",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "Reserved",
- "refId": "C"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "NPU Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": "0",
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": false
- }
- ]
- }
- ],
- "repeat": null,
- "repeatIteration": null,
- "repeatRowId": null,
- "showTitle": false,
- "title": "Dashboard Row",
- "titleSize": "h6"
- }
- ],
- "schemaVersion": 14,
- "style": "dark",
- "tags": [],
- "templating": {
- "list": []
- },
- "time": {
- "from": "now-6h",
- "to": "now"
- },
- "timepicker": {
- "refresh_intervals": [
- "30s",
- "1m",
- "5m",
- "15m",
- "30m",
- "1h",
- "2h",
- "1d"
- ],
- "time_options": [
- "5m",
- "15m",
- "1h",
- "6h",
- "12h",
- "24h",
- "2d",
- "7d",
- "30d"
- ]
- },
- "timezone": "browser",
- "title": "device Usage",
- "version": 0,
- "uid": "device-usage-history"
- }}
- email-notification.json: |
- {
- "uid": "default-email-notification",
- "name": "default email notification",
- "type": "email",
- "isDefault": true,
- "sendReminder": true,
- "frequency": "15m",
- "settings": {
- "addresses": ""
- }
- }
- job-dcgm-metrics-dashboard.json: |
- {
- "dashboard": {
- "annotations": {
- "list": [
- {
- "builtIn": 1,
- "datasource": "-- Grafana --",
- "enable": true,
- "hide": true,
- "iconColor": "rgba(0, 211, 255, 1)",
- "name": "Annotations & Alerts",
- "type": "dashboard"
- }
- ]
- },
- "description": "Dashboard to view job's DCGM metrics",
- "editable": true,
- "gnetId": null,
- "graphTooltip": 0,
- "hideControls": false,
- "id": null,
- "links": [],
- "refresh": "30s",
- "rows": [
- {
- "collapse": false,
- "height": "250px",
- "panels": [
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 1,
- "id": 0,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "NonePointMode": "None",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "avg by (instance,minor_number) (task_dcgm_gpu_util{job_name=\"$job_name\"})",
- "format": "time_series",
- "hide": false,
- "intervalFactor": 2,
- "legendFormat": "GPU {{minor_number}} on {{instance}}",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "GPU utilization (in %).",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "percent",
- "label": null,
- "logBase": 1,
- "max": "100",
- "min": "0",
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- }
- ]
- },
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 1,
- "id": 1,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "NonePointMode": "None",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "avg by (instance,minor_number) (task_dcgm_memory_clock{job_name=\"$job_name\"})",
- "format": "time_series",
- "hide": false,
- "intervalFactor": 2,
- "legendFormat": "GPU {{minor_number}} on {{instance}}",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "Memory clock frequency (in MHz).",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": "0",
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- }
- ]
- }
- ],
- "repeat": null,
- "repeatIteration": null,
- "repeatRowId": null,
- "showTitle": false,
- "title": "Dashboard Row",
- "titleSize": "h6"
- },
- {
- "collapse": false,
- "height": "250px",
- "panels": [
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 1,
- "id": 2,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "NonePointMode": "None",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "avg by (instance,minor_number) (task_dcgm_gpu_temp{job_name=\"$job_name\"})",
- "format": "time_series",
- "hide": false,
- "intervalFactor": 2,
- "legendFormat": "GPU {{minor_number}} on {{instance}}",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "GPU temperature (in C).",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "celsius",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- }
- ]
- },
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 1,
- "id": 3,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "NonePointMode": "None",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "avg by (instance,minor_number) (task_dcgm_memory_temp{job_name=\"$job_name\"})",
- "format": "time_series",
- "hide": false,
- "intervalFactor": 2,
- "legendFormat": "GPU {{minor_number}} on {{instance}}",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "Memory temperature (in C).",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "celsius",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- }
- ]
- }
- ],
- "repeat": null,
- "repeatIteration": null,
- "repeatRowId": null,
- "showTitle": false,
- "title": "Dashboard Row",
- "titleSize": "h6"
- },
- {
- "collapse": false,
- "height": "250px",
- "panels": [
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 1,
- "id": 4,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "NonePointMode": "None",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "avg by (instance,minor_number) (task_dcgm_sm_clock{job_name=\"$job_name\"})",
- "format": "time_series",
- "hide": false,
- "intervalFactor": 2,
- "legendFormat": "GPU {{minor_number}} on {{instance}}",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "SM clock frequency (in MHz).",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": "0",
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- }
- ]
- },
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 1,
- "id": 5,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "NonePointMode": "None",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "avg by (instance,minor_number) (task_dcgm_power_usage{job_name=\"$job_name\"})",
- "format": "time_series",
- "hide": false,
- "intervalFactor": 2,
- "legendFormat": "GPU {{minor_number}} on {{instance}}",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "Power draw (in W).",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "watt",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": "0",
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- }
- ]
- }
- ],
- "repeat": null,
- "repeatIteration": null,
- "repeatRowId": null,
- "showTitle": false,
- "title": "Dashboard Row",
- "titleSize": "h6"
- },
- {
- "collapse": false,
- "height": "250px",
- "panels": [
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 1,
- "id": 6,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "NonePointMode": "None",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "avg by (instance,minor_number) (task_dcgm_total_energy_consumption{job_name=\"$job_name\"})",
- "format": "time_series",
- "hide": false,
- "intervalFactor": 2,
- "legendFormat": "GPU {{minor_number}} on {{instance}}",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "Total energy consumption since boot (in mJ).",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": "0",
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- }
- ]
- },
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 1,
- "id": 7,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "NonePointMode": "None",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "avg by (instance,minor_number) (task_dcgm_pcie_tx_throughput{job_name=\"$job_name\"})",
- "format": "time_series",
- "hide": false,
- "intervalFactor": 2,
- "legendFormat": "GPU {{minor_number}} on {{instance}}",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "Total number of bytes transmitted through PCIe TX (in KB) via NVML.",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "deckbytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": "0",
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- }
- ]
- }
- ],
- "repeat": null,
- "repeatIteration": null,
- "repeatRowId": null,
- "showTitle": false,
- "title": "Dashboard Row",
- "titleSize": "h6"
- },
- {
- "collapse": false,
- "height": "250px",
- "panels": [
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 1,
- "id": 8,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "NonePointMode": "None",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "avg by (instance,minor_number) (task_dcgm_pcie_rx_throughput{job_name=\"$job_name\"})",
- "format": "time_series",
- "hide": false,
- "intervalFactor": 2,
- "legendFormat": "GPU {{minor_number}} on {{instance}}",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "Total number of bytes received through PCIe RX (in KB) via NVML.",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "deckbytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": "0",
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- }
- ]
- },
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 1,
- "id": 9,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "NonePointMode": "None",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "avg by (instance,minor_number) (task_dcgm_pcie_replay_counter{job_name=\"$job_name\"})",
- "format": "time_series",
- "hide": false,
- "intervalFactor": 2,
- "legendFormat": "GPU {{minor_number}} on {{instance}}",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "Total number of PCIe retries.",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- }
- ]
- }
- ],
- "repeat": null,
- "repeatIteration": null,
- "repeatRowId": null,
- "showTitle": false,
- "title": "Dashboard Row",
- "titleSize": "h6"
- },
- {
- "collapse": false,
- "height": "250px",
- "panels": [
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 1,
- "id": 10,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "NonePointMode": "None",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "avg by (instance,minor_number) (task_dcgm_mem_copy_util{job_name=\"$job_name\"})",
- "format": "time_series",
- "hide": false,
- "intervalFactor": 2,
- "legendFormat": "GPU {{minor_number}} on {{instance}}",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "Memory utilization (in %).",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "percent",
- "label": null,
- "logBase": 1,
- "max": "100",
- "min": "0",
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- }
- ]
- },
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 1,
- "id": 11,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "NonePointMode": "None",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "avg by (instance,minor_number) (task_dcgm_enc_util{job_name=\"$job_name\"})",
- "format": "time_series",
- "hide": false,
- "intervalFactor": 2,
- "legendFormat": "GPU {{minor_number}} on {{instance}}",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "Encoder utilization (in %).",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "percent",
- "label": null,
- "logBase": 1,
- "max": "100",
- "min": "0",
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- }
- ]
- }
- ],
- "repeat": null,
- "repeatIteration": null,
- "repeatRowId": null,
- "showTitle": false,
- "title": "Dashboard Row",
- "titleSize": "h6"
- },
- {
- "collapse": false,
- "height": "250px",
- "panels": [
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 1,
- "id": 12,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "NonePointMode": "None",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "avg by (instance,minor_number) (task_dcgm_dec_util{job_name=\"$job_name\"})",
- "format": "time_series",
- "hide": false,
- "intervalFactor": 2,
- "legendFormat": "GPU {{minor_number}} on {{instance}}",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "Decoder utilization (in %).",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "percent",
- "label": null,
- "logBase": 1,
- "max": "100",
- "min": "0",
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- }
- ]
- },
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 1,
- "id": 13,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "NonePointMode": "None",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "avg by (instance,minor_number) (task_dcgm_xid_errors{job_name=\"$job_name\"})",
- "format": "time_series",
- "hide": false,
- "intervalFactor": 2,
- "legendFormat": "GPU {{minor_number}} on {{instance}}",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "Value of the last XID error encountered.",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- }
- ]
- }
- ],
- "repeat": null,
- "repeatIteration": null,
- "repeatRowId": null,
- "showTitle": false,
- "title": "Dashboard Row",
- "titleSize": "h6"
- },
- {
- "collapse": false,
- "height": "250px",
- "panels": [
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 1,
- "id": 14,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "NonePointMode": "None",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "avg by (instance,minor_number) (task_dcgm_power_violation{job_name=\"$job_name\"})",
- "format": "time_series",
- "hide": false,
- "intervalFactor": 2,
- "legendFormat": "GPU {{minor_number}} on {{instance}}",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "Throttling duration due to power constraints (in us).",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "µs",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": "0",
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- }
- ]
- },
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 1,
- "id": 15,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "NonePointMode": "None",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "avg by (instance,minor_number) (task_dcgm_thermal_violation{job_name=\"$job_name\"})",
- "format": "time_series",
- "hide": false,
- "intervalFactor": 2,
- "legendFormat": "GPU {{minor_number}} on {{instance}}",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "Throttling duration due to thermal constraints (in us).",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "µs",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": "0",
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- }
- ]
- }
- ],
- "repeat": null,
- "repeatIteration": null,
- "repeatRowId": null,
- "showTitle": false,
- "title": "Dashboard Row",
- "titleSize": "h6"
- },
- {
- "collapse": false,
- "height": "250px",
- "panels": [
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 1,
- "id": 16,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "NonePointMode": "None",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "avg by (instance,minor_number) (task_dcgm_sync_boost_violation{job_name=\"$job_name\"})",
- "format": "time_series",
- "hide": false,
- "intervalFactor": 2,
- "legendFormat": "GPU {{minor_number}} on {{instance}}",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "Throttling duration due to sync-boost constraints (in us).",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "µs",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": "0",
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- }
- ]
- },
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 1,
- "id": 17,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "NonePointMode": "None",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "avg by (instance,minor_number) (task_dcgm_board_limit_violation{job_name=\"$job_name\"})",
- "format": "time_series",
- "hide": false,
- "intervalFactor": 2,
- "legendFormat": "GPU {{minor_number}} on {{instance}}",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "Throttling duration due to board limit constraints (in us).",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "µs",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": "0",
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- }
- ]
- }
- ],
- "repeat": null,
- "repeatIteration": null,
- "repeatRowId": null,
- "showTitle": false,
- "title": "Dashboard Row",
- "titleSize": "h6"
- },
- {
- "collapse": false,
- "height": "250px",
- "panels": [
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 1,
- "id": 18,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "NonePointMode": "None",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "avg by (instance,minor_number) (task_dcgm_low_util_violation{job_name=\"$job_name\"})",
- "format": "time_series",
- "hide": false,
- "intervalFactor": 2,
- "legendFormat": "GPU {{minor_number}} on {{instance}}",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "Throttling duration due to low utilization (in us).",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "µs",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": "0",
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- }
- ]
- },
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 1,
- "id": 19,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "NonePointMode": "None",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "avg by (instance,minor_number) (task_dcgm_reliability_violation{job_name=\"$job_name\"})",
- "format": "time_series",
- "hide": false,
- "intervalFactor": 2,
- "legendFormat": "GPU {{minor_number}} on {{instance}}",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "Throttling duration due to reliability constraints (in us).",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "µs",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": "0",
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- }
- ]
- }
- ],
- "repeat": null,
- "repeatIteration": null,
- "repeatRowId": null,
- "showTitle": false,
- "title": "Dashboard Row",
- "titleSize": "h6"
- },
- {
- "collapse": false,
- "height": "250px",
- "panels": [
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 1,
- "id": 20,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "NonePointMode": "None",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "avg by (instance,minor_number) (task_dcgm_app_clock_violation{job_name=\"$job_name\"})",
- "format": "time_series",
- "hide": false,
- "intervalFactor": 2,
- "legendFormat": "GPU {{minor_number}} on {{instance}}",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "Total throttling duration (in us).",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "µs",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": "0",
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- }
- ]
- },
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 1,
- "id": 21,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "NonePointMode": "None",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "avg by (instance,minor_number) (task_dcgm_fb_free{job_name=\"$job_name\"})",
- "format": "time_series",
- "hide": false,
- "intervalFactor": 2,
- "legendFormat": "GPU {{minor_number}} on {{instance}}",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "Framebuffer memory free (in MiB).",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "decmbytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": "0",
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- }
- ]
- }
- ],
- "repeat": null,
- "repeatIteration": null,
- "repeatRowId": null,
- "showTitle": false,
- "title": "Dashboard Row",
- "titleSize": "h6"
- },
- {
- "collapse": false,
- "height": "250px",
- "panels": [
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 1,
- "id": 22,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "NonePointMode": "None",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "avg by (instance,minor_number) (task_dcgm_fb_used{job_name=\"$job_name\"})",
- "format": "time_series",
- "hide": false,
- "intervalFactor": 2,
- "legendFormat": "GPU {{minor_number}} on {{instance}}",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "Framebuffer memory used (in MiB).",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "decmbytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": "0",
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- }
- ]
- },
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 1,
- "id": 23,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "NonePointMode": "None",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "avg by (instance,minor_number) (task_dcgm_ecc_sbe_volatile_total{job_name=\"$job_name\"})",
- "format": "time_series",
- "hide": false,
- "intervalFactor": 2,
- "legendFormat": "GPU {{minor_number}} on {{instance}}",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "Total number of single-bit volatile ECC errors.",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- }
- ]
- }
- ],
- "repeat": null,
- "repeatIteration": null,
- "repeatRowId": null,
- "showTitle": false,
- "title": "Dashboard Row",
- "titleSize": "h6"
- },
- {
- "collapse": false,
- "height": "250px",
- "panels": [
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 1,
- "id": 24,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "NonePointMode": "None",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "avg by (instance,minor_number) (task_dcgm_ecc_dbe_volatile_total{job_name=\"$job_name\"})",
- "format": "time_series",
- "hide": false,
- "intervalFactor": 2,
- "legendFormat": "GPU {{minor_number}} on {{instance}}",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "Total number of double-bit volatile ECC errors.",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- }
- ]
- },
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 1,
- "id": 25,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "NonePointMode": "None",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "avg by (instance,minor_number) (task_dcgm_ecc_sbe_aggregate_total{job_name=\"$job_name\"})",
- "format": "time_series",
- "hide": false,
- "intervalFactor": 2,
- "legendFormat": "GPU {{minor_number}} on {{instance}}",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "Total number of single-bit persistent ECC errors.",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- }
- ]
- }
- ],
- "repeat": null,
- "repeatIteration": null,
- "repeatRowId": null,
- "showTitle": false,
- "title": "Dashboard Row",
- "titleSize": "h6"
- },
- {
- "collapse": false,
- "height": "250px",
- "panels": [
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 1,
- "id": 26,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "NonePointMode": "None",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "avg by (instance,minor_number) (task_dcgm_ecc_dbe_aggregate_total{job_name=\"$job_name\"})",
- "format": "time_series",
- "hide": false,
- "intervalFactor": 2,
- "legendFormat": "GPU {{minor_number}} on {{instance}}",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "Total number of double-bit persistent ECC errors.",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- }
- ]
- },
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 1,
- "id": 27,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "NonePointMode": "None",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "avg by (instance,minor_number) (task_dcgm_retired_pages_sbe{job_name=\"$job_name\"})",
- "format": "time_series",
- "hide": false,
- "intervalFactor": 2,
- "legendFormat": "GPU {{minor_number}} on {{instance}}",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "Total number of retired pages due to single-bit errors.",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- }
- ]
- }
- ],
- "repeat": null,
- "repeatIteration": null,
- "repeatRowId": null,
- "showTitle": false,
- "title": "Dashboard Row",
- "titleSize": "h6"
- },
- {
- "collapse": false,
- "height": "250px",
- "panels": [
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 1,
- "id": 28,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "NonePointMode": "None",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "avg by (instance,minor_number) (task_dcgm_retired_pages_dbe{job_name=\"$job_name\"})",
- "format": "time_series",
- "hide": false,
- "intervalFactor": 2,
- "legendFormat": "GPU {{minor_number}} on {{instance}}",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "Total number of retired pages due to double-bit errors.",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- }
- ]
- },
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 1,
- "id": 29,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "NonePointMode": "None",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "avg by (instance,minor_number) (task_dcgm_retired_pages_pending{job_name=\"$job_name\"})",
- "format": "time_series",
- "hide": false,
- "intervalFactor": 2,
- "legendFormat": "GPU {{minor_number}} on {{instance}}",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "Total number of pages pending retirement.",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- }
- ]
- }
- ],
- "repeat": null,
- "repeatIteration": null,
- "repeatRowId": null,
- "showTitle": false,
- "title": "Dashboard Row",
- "titleSize": "h6"
- },
- {
- "collapse": false,
- "height": "250px",
- "panels": [
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 1,
- "id": 30,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "NonePointMode": "None",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "avg by (instance,minor_number) (task_dcgm_nvlink_flit_crc_error_count_total{job_name=\"$job_name\"})",
- "format": "time_series",
- "hide": false,
- "intervalFactor": 2,
- "legendFormat": "GPU {{minor_number}} on {{instance}}",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "Total number of NVLink flow-control CRC errors.",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- }
- ]
- },
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 1,
- "id": 31,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "NonePointMode": "None",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "avg by (instance,minor_number) (task_dcgm_nvlink_data_crc_error_count_total{job_name=\"$job_name\"})",
- "format": "time_series",
- "hide": false,
- "intervalFactor": 2,
- "legendFormat": "GPU {{minor_number}} on {{instance}}",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "Total number of NVLink data CRC errors.",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- }
- ]
- }
- ],
- "repeat": null,
- "repeatIteration": null,
- "repeatRowId": null,
- "showTitle": false,
- "title": "Dashboard Row",
- "titleSize": "h6"
- },
- {
- "collapse": false,
- "height": "250px",
- "panels": [
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 1,
- "id": 32,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "NonePointMode": "None",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "avg by (instance,minor_number) (task_dcgm_nvlink_replay_error_count_total{job_name=\"$job_name\"})",
- "format": "time_series",
- "hide": false,
- "intervalFactor": 2,
- "legendFormat": "GPU {{minor_number}} on {{instance}}",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "Total number of NVLink retries.",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- }
- ]
- },
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 1,
- "id": 33,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "NonePointMode": "None",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "avg by (instance,minor_number) (task_dcgm_nvlink_recovery_error_count_total{job_name=\"$job_name\"})",
- "format": "time_series",
- "hide": false,
- "intervalFactor": 2,
- "legendFormat": "GPU {{minor_number}} on {{instance}}",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "Total number of NVLink recovery errors.",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- }
- ]
- }
- ],
- "repeat": null,
- "repeatIteration": null,
- "repeatRowId": null,
- "showTitle": false,
- "title": "Dashboard Row",
- "titleSize": "h6"
- },
- {
- "collapse": false,
- "height": "250px",
- "panels": [
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 1,
- "id": 34,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "NonePointMode": "None",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "avg by (instance,minor_number) (task_dcgm_nvlink_bandwidth_total{job_name=\"$job_name\"})",
- "format": "time_series",
- "hide": false,
- "intervalFactor": 2,
- "legendFormat": "GPU {{minor_number}} on {{instance}}",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "Total number of NVLink bandwidth counters for all lanes",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- }
- ]
- }
- ],
- "repeat": null,
- "repeatIteration": null,
- "repeatRowId": null,
- "showTitle": false,
- "title": "Dashboard Row",
- "titleSize": "h6"
- }
- ],
- "schemaVersion": 14,
- "style": "dark",
- "tags": [],
- "templating": {
- "list": [
- {
- "allValue": null,
- "current": {},
- "datasource": "PM",
- "hide": 0,
- "includeAll": false,
- "label": null,
- "multi": false,
- "name": "job_name",
- "options": [],
- "query": "label_values(task_cpu_percent, job_name)",
- "refresh": 1,
- "regex": "",
- "sort": 1,
- "tagValuesQuery": "",
- "tags": [],
- "tagsQuery": "",
- "type": "query",
- "useTags": false
- }
- ]
- },
- "time": {
- "from": "now-1h",
- "to": "now"
- },
- "timepicker": {
- "now": true,
- "refresh_intervals": [
- "30s",
- "1m",
- "5m",
- "15m",
- "30m",
- "1h",
- "2h",
- "1d"
- ],
- "time_options": [
- "5m",
- "15m",
- "1h",
- "6h",
- "12h",
- "24h",
- "2d",
- "7d",
- "30d"
- ]
- },
- "timezone": "browser",
- "title": "Job DCGM Metrics",
- "version": 1,
- "uid": "job-dcgm-metrics"
- }
- }
- job-status-dashboard.json: |
- {"dashboard": {
- "annotations": {
- "list": [
- {
- "builtIn": 1,
- "datasource": "-- Grafana --",
- "enable": true,
- "hide": true,
- "iconColor": "rgba(0, 211, 255, 1)",
- "name": "Annotations & Alerts",
- "type": "dashboard"
- }
- ]
- },
- "description": "Dashboard to view job metrics",
- "editable": true,
- "gnetId": null,
- "graphTooltip": 0,
- "hideControls": false,
- "id": null,
- "links": [],
- "refresh": "30s",
- "rows": [
-
- {
- "collapse": false,
- "height": "250px",
- "panels": [
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "PM",
- "fill": 0,
- "id": 3,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "nullPointMode": "connected",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "avg (task_cpu_percent{job_name=\"$job_id\"}) by (instance)",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "{{instance}}",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "CPU",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "percent",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": false
- }
- ]
- },
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "PM",
- "fill": 0,
- "id": 4,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "nullPointMode": "connected",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "avg (task_mem_usage_byte{job_name=\"$job_id\"}) by (instance)",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "{{instance}}",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "Memory Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "decbytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- }
- ]
- }
- ],
- "repeat": null,
- "repeatIteration": null,
- "repeatRowId": null,
- "showTitle": false,
- "title": "Dashboard Row",
- "titleSize": "h6"
- },
- {
- "collapse": false,
- "height": "250px",
- "panels": [
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "PM",
- "fill": 0,
- "id": 5,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "nullPointMode": "connected",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "sum(rate(task_net_in_byte{job_name=\"$job_id\"}[5m])) by (instance)",
- "format": "time_series",
- "interval": "",
- "intervalFactor": 2,
- "legendFormat": "{{instance}} Inbound",
- "refId": "A"
- },
- {
- "expr": "sum(rate(task_net_out_byte{job_name=\"$job_id\"}[5m])) by (instance)",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "{{instance}} Outbound",
- "refId": "B"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "Network",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "Bps",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- }
- ]
- },
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "PM",
- "fill": 0,
- "id": 6,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "nullPointMode": "connected",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "avg (irate(task_block_in_byte{job_name=\"$job_id\"}[300s])) by (instance)",
- "format": "time_series",
- "interval": "",
- "intervalFactor": 2,
- "legendFormat": "{{instance}} Read",
- "refId": "A"
- },
- {
- "expr": "avg (irate(task_block_out_byte{job_name=\"$job_id\"}[300s])) by (instance)",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "{{instance}} Write",
- "refId": "B"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "Block IO",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "Bps",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- }
- ]
- }
- ],
- "repeat": null,
- "repeatIteration": null,
- "repeatRowId": null,
- "showTitle": false,
- "title": "Dashboard Row",
- "titleSize": "h6"
- },
- {
- "collapse": false,
- "height": "250px",
- "panels": [
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 1,
- "id": 9,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "nullPointMode": "connected",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "avg by (instance,id) (npu_chip_info_utilization{instance=~\".*(:[0-9]*)?$\"})",
- "format": "time_series",
- "hide": false,
- "intervalFactor": 2,
- "legendFormat": "NPU {{id}} on {{instance}}",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "NPU Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "percent",
- "label": null,
- "logBase": 1,
- "max": "100",
- "min": "0",
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- }
- ]
- },
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 1,
- "id": 10,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "nullPointMode": "connected",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "avg by (instance,id) (npu_chip_info_used_memory{instance=~\".*(:[0-9]*)?$\"}/npu_chip_info_total_memory{instance=~\".*(:[0-9]*)?$\"})",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "NPU {{id}} on {{instance}}",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "NPU Memory Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "percent",
- "label": null,
- "logBase": 1,
- "max": "100",
- "min": "0",
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- }
- ]
- }
- ],
- "repeat": null,
- "repeatIteration": null,
- "repeatRowId": null,
- "showTitle": false,
- "title": "Dashboard Row",
- "titleSize": "h6"
- },
- {
- "collapse": false,
- "height": "250px",
- "panels": [
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 1,
- "id": 1,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "nullPointMode": "connected",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "avg by (minor_number,instance) (task_device_percent{job_name=\"$job_id\",device_str='nvidia.com/gpu'})",
- "format": "time_series",
- "hide": false,
- "intervalFactor": 2,
- "legendFormat": "GPU {{minor_number}} on {{instance}}",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "GPU Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "percent",
- "label": null,
- "logBase": 1,
- "max": "100",
- "min": "0",
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- }
- ]
- },
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 1,
- "id": 2,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "nullPointMode": "connected",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "avg by (minor_number,instance) (task_device_mem_percent{job_name=\"$job_id\",device_str='nvidia.com/gpu'})",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "GPU {{minor_number}} on {{instance}}",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "GPU Memory Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "percent",
- "label": null,
- "logBase": 1,
- "max": "100",
- "min": "0",
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- }
- ]
- }
- ],
- "repeat": null,
- "repeatIteration": null,
- "repeatRowId": null,
- "showTitle": false,
- "title": "Dashboard Row",
- "titleSize": "h6"
- },
- {
- "collapse": false,
- "height": "250px",
- "panels": [
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 1,
- "id": 7,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "nullPointMode": "connected",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "avg by (minor_number,instance) (task_dcgm_gpu_util{job_name=\"$job_id\"})",
- "format": "time_series",
- "hide": false,
- "intervalFactor": 2,
- "legendFormat": "GPU {{minor_number}} on {{instance}}",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "GPU DCGM Util",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "percent",
- "label": null,
- "logBase": 1,
- "max": "100",
- "min": "0",
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- }
- ]
- },
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 1,
- "id": 8,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "nullPointMode": "connected",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "avg by (minor_number,instance) (task_dcgm_mem_copy_util{job_name=\"$job_id\"})",
- "format": "time_series",
- "hide": false,
- "intervalFactor": 2,
- "legendFormat": "GPU {{minor_number}} on {{instance}}",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "GPU DCGM Mem Copy Util",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "percent",
- "label": null,
- "logBase": 1,
- "max": "100",
- "min": "0",
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- }
- ]
- }
- ],
- "repeat": null,
- "repeatIteration": null,
- "repeatRowId": null,
- "showTitle": false,
- "title": "Dashboard Row",
- "titleSize": "h6"
- }
- ],
- "schemaVersion": 14,
- "style": "dark",
- "tags": [],
- "templating": {
- "list": [
- {
- "allValue": null,
- "current": {},
- "datasource": "PM",
- "hide": 0,
- "includeAll": false,
- "label": null,
- "multi": false,
- "name": "job_id",
- "options": [],
- "query": "label_values(task_cpu_percent, job_name)",
- "refresh": 1,
- "regex": "",
- "sort": 1,
- "tagValuesQuery": "",
- "tags": [],
- "tagsQuery": "",
- "type": "query",
- "useTags": false
- }
- ]
- },
- "time": {
- "from": "now-1h",
- "to": "now"
- },
- "timepicker": {
- "now": true,
- "refresh_intervals": [
- "30s",
- "1m",
- "5m",
- "15m",
- "30m",
- "1h",
- "2h",
- "1d"
- ],
- "time_options": [
- "5m",
- "15m",
- "1h",
- "6h",
- "12h",
- "24h",
- "2d",
- "7d",
- "30d"
- ]
- },
- "timezone": "browser",
- "title": "Job Status",
- "version": 1,
- "uid": "job-status"
- }}
- nfs-read-write-dashboard.json: |
- {
- "dashboard": {
- "annotations": {
- "list": [
- {
- "builtIn": 1,
- "datasource": "-- Grafana --",
- "enable": true,
- "hide": true,
- "iconColor": "rgba(0, 211, 255, 1)",
- "name": "Annotations & Alerts",
- "type": "dashboard"
- }
- ]
- },
- "editable": true,
- "gnetId": null,
- "graphTooltip": 0,
- "id": null,
- "links": [],
- "panels": [
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 1,
- "fillGradient": 0,
- "gridPos": {
- "h": 11,
- "w": 24,
- "x": 0,
- "y": 0
- },
- "hiddenSeries": false,
- "id": 2,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "nullPointMode": "null",
- "options": {
- "dataLinks": []
- },
- "percentage": false,
- "pointradius": 2,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "node_nfsd_disk_bytes_written_total",
- "interval": "",
- "legendFormat": "",
- "refId": "A"
- },
- {
- "expr": "node_nfsd_disk_bytes_read_total",
- "interval": "",
- "legendFormat": "",
- "refId": "B"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeRegions": [],
- "timeShift": null,
- "title": "NFS读写统计/bytes",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- }
- ],
- "yaxis": {
- "align": false,
- "alignLevel": null
- }
- }
- ],
- "schemaVersion": 22,
- "style": "dark",
- "tags": [],
- "templating": {
- "list": []
- },
- "time": {
- "from": "now-6h",
- "to": "now"
- },
- "timepicker": {
- "refresh_intervals": [
- "5s",
- "10s",
- "30s",
- "1m",
- "5m",
- "15m",
- "30m",
- "1h",
- "2h",
- "1d"
- ]
- },
- "timezone": "browser",
- "title": "NFS性能监控",
- "uid": null,
- "variables": {
- "list": []
- },
- "version": 0
- }
- }
- node-status-dashboard.json: |
- {"dashboard": {
- "annotations": {
- "list": [
- {
- "builtIn": 1,
- "datasource": "-- Grafana --",
- "enable": true,
- "hide": true,
- "iconColor": "rgba(0, 211, 255, 1)",
- "name": "Annotations & Alerts",
- "type": "dashboard"
- }
- ]
- },
- "description": "Dashboard to view multiple Node resource usage",
- "editable": false,
- "gnetId": null,
- "graphTooltip": 0,
- "hideControls": true,
- "id": null,
- "links": [],
- "refresh": "30s",
- "rows": [
- {
- "collapse": false,
- "height": 266,
- "panels": [
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "PM",
- "decimals": 3,
- "editable": true,
- "error": false,
- "fill": 0,
- "grid": {},
- "id": 7,
- "legend": {
- "alignAsTable": false,
- "avg": false,
- "current": false,
- "hideEmpty": false,
- "max": false,
- "min": false,
- "rightSide": false,
- "show": false,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "minSpan": null,
- "nullPointMode": "null",
- "percentage": true,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 4,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "100 - (avg by (instance)(irate(node_cpu_seconds_total{mode=\"idle\",instance=~\"$node(:[0-9]*)?$\"}[300s])) * 100)",
- "format": "time_series",
- "hide": false,
- "intervalFactor": 2,
- "legendFormat": "usage",
- "refId": "B"
- }
- ],
- "thresholds": [
- {
- "colorMode": "custom",
- "fill": false,
- "fillColor": "rgba(216, 200, 27, 0.27)",
- "op": "gt",
- "value": 0
- }
- ],
- "timeFrom": null,
- "timeShift": null,
- "title": "CPU",
- "tooltip": {
- "msResolution": false,
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "transparent": false,
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "percent",
- "label": "%",
- "logBase": 1,
- "max": "100",
- "min": 0,
- "show": true
- },
- {
- "format": "short",
- "logBase": 1,
- "max": null,
- "min": null,
- "show": false
- }
- ]
- },
- {
- "aliasColors": {
- "Slab": "#E5A8E2",
- "Swap": "#E24D42"
- },
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "PM",
- "decimals": 2,
- "editable": true,
- "error": false,
- "fill": 0,
- "grid": {},
- "id": 17,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "minSpan": 2,
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
- {
- "alias": "/Apps|Buffers|Cached|Free|Slab|SwapCached|PageTables|VmallocUsed/"
- },
- {
- "alias": "Swap"
- }
- ],
- "spaceLength": 10,
- "span": 4,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "avg by (instance)(node_memory_MemTotal_bytes{instance=~'$node(:[0-9]*)?$'} - node_memory_MemFree_bytes - node_memory_Buffers_bytes - node_memory_Cached_bytes)",
- "format": "time_series",
- "interval": "",
- "intervalFactor": 2,
- "legendFormat": "usage",
- "metric": "",
- "refId": "A",
- "step": 600,
- "target": ""
- },
- {
- "expr": "avg by (instance)(node_memory_Buffers_bytes{instance=~'$node(:[0-9]*)?$'} + node_memory_Cached_bytes)",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "buff/cache",
- "refId": "B"
- },
- {
- "expr": "avg by (instance)(node_memory_MemTotal_bytes{instance=~'$node(:[0-9]*)?$'})",
- "format": "time_series",
- "hide": false,
- "interval": "",
- "intervalFactor": 2,
- "legendFormat": "total",
- "refId": "C",
- "step": 600
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "Memory",
- "tooltip": {
- "msResolution": false,
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "bytes",
- "label": "GB",
- "logBase": 1,
- "max": null,
- "min": "0",
- "show": true
- },
- {
- "format": "short",
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- }
- ]
- },
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "PM",
- "editable": true,
- "error": false,
- "fill": 0,
- "grid": {},
- "id": 12,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "minSpan": null,
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
- {
- "alias": "/.*_in/",
- "transform": "negative-Y"
- }
- ],
- "spaceLength": 10,
- "span": 4,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "sum(rate(node_network_receive_bytes_total{instance=~\"$node(:[0-9]*)?$\"}[300s]))",
- "format": "time_series",
- "interval": "",
- "intervalFactor": 2,
- "legendFormat": "in",
- "refId": "A",
- "step": 600
- },
- {
- "expr": "sum(rate(node_network_transmit_bytes_total{instance=~\"$node(:[0-9]*)?$\"}[300s]))",
- "format": "time_series",
- "interval": "",
- "intervalFactor": 2,
- "legendFormat": "out",
- "refId": "B",
- "step": 600
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "Network Traffic",
- "tooltip": {
- "msResolution": false,
- "shared": true,
- "sort": 0,
- "value_type": "cumulative"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "Bps",
- "label": "",
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- },
- {
- "format": "short",
- "logBase": 1,
- "max": null,
- "min": null,
- "show": false
- }
- ]
- }
- ],
- "repeat": null,
- "repeatIteration": null,
- "repeatRowId": null,
- "showTitle": false,
- "title": "CPU",
- "titleSize": "h6"
- },
- {
- "collapse": false,
- "height": 262,
- "panels": [
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "PM",
- "decimals": 3,
- "editable": true,
- "error": false,
- "fill": 0,
- "grid": {},
- "id": 9,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "minSpan": null,
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 4,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "sum(rate(node_disk_read_bytes_total{instance=~\"$node(:[0-9]*)?$\"}[300s]))",
- "format": "time_series",
- "interval": "",
- "intervalFactor": 2,
- "legendFormat": "read",
- "metric": "",
- "refId": "A",
- "step": 600,
- "target": ""
- },
- {
- "expr": "sum(rate(node_disk_written_bytes_total{instance=~\"$node(:[0-9]*)?$\"}[300s]))",
- "format": "time_series",
- "hide": false,
- "intervalFactor": 2,
- "legendFormat": "write",
- "refId": "B"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "Disk",
- "tooltip": {
- "msResolution": true,
- "shared": true,
- "sort": 0,
- "value_type": "cumulative"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "Bps",
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
- },
- {
- "format": "short",
- "logBase": 1,
- "max": null,
- "min": null,
- "show": false
- }
- ]
- },
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "PM",
- "decimals": 3,
- "editable": true,
- "error": false,
- "fill": 0,
- "grid": {},
- "id": 18,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "minSpan": null,
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 4,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "avg by (instance,minor_number) (nvidiasmi_utilization_gpu{instance=~\"$node(:[0-9]*)?$\"})",
- "format": "time_series",
- "legendFormat": "{{minor_number}}",
- "interval": "",
- "intervalFactor": 2,
- "metric": "",
- "refId": "A",
- "step": 600,
- "target": ""
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "GPU Utilization",
- "tooltip": {
- "msResolution": true,
- "shared": true,
- "sort": 0,
- "value_type": "cumulative"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "percent",
- "logBase": 1,
- "max": 100,
- "min": 0,
- "show": true
- },
- {
- "format": "short",
- "logBase": 1,
- "max": null,
- "min": null,
- "show": false
- }
- ]
- },
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "PM",
- "decimals": 3,
- "editable": true,
- "error": false,
- "fill": 0,
- "grid": {},
- "id": 23,
- "legend": {
- "show": true,
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "minSpan": null,
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 4,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "avg by (instance,minor_number) (nvidiasmi_utilization_memory{instance=~\"$node(:[0-9]*)?$\"})",
- "format": "time_series",
- "legendFormat": "{{minor_number}}",
- "intervalFactor": 2,
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "GPU Memory",
- "tooltip": {
- "msResolution": true,
- "shared": true,
- "sort": 0,
- "value_type": "cumulative"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "percent",
- "logBase": 1,
- "max": 100,
- "min": 0,
- "show": true
- },
- {
- "format": "short",
- "logBase": 1,
- "max": null,
- "min": null,
- "show": false
- }
- ]
- }
- ],
- "repeat": null,
- "repeatIteration": null,
- "repeatRowId": null,
- "showTitle": false,
- "title": "GPU row",
- "titleSize": "h6"
- },
- {
- "collapse": false,
- "height": 262,
- "panels": [
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "PM",
- "decimals": 3,
- "editable": true,
- "error": false,
- "fill": 0,
- "grid": {},
- "id": 33,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "minSpan": null,
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "avg by (instance,id) (npu_chip_info_utilization{instance=~\"$node(:[0-9]*)?$\"})",
- "format": "time_series",
- "legendFormat": "{{id}}",
- "interval": "",
- "intervalFactor": 2,
- "metric": "",
- "refId": "A",
- "step": 600,
- "target": ""
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "NPU Utilization",
- "tooltip": {
- "msResolution": true,
- "shared": true,
- "sort": 0,
- "value_type": "cumulative"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "percent",
- "logBase": 1,
- "max": 100,
- "min": 0,
- "show": true
- },
- {
- "format": "short",
- "logBase": 1,
- "max": null,
- "min": null,
- "show": false
- }
- ]
- },
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "PM",
- "decimals": 3,
- "editable": true,
- "error": false,
- "fill": 0,
- "grid": {},
- "id": 35,
- "legend": {
- "show": true,
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "minSpan": null,
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "avg by (instance,id) (npu_chip_info_used_memory{instance=~\"$node(:[0-9]*)?$\"}/npu_chip_info_total_memory{instance=~\"$node(:[0-9]*)?$\"})",
- "format": "time_series",
- "legendFormat": "{{id}}",
- "intervalFactor": 2,
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "NPU Memory",
- "tooltip": {
- "msResolution": true,
- "shared": true,
- "sort": 0,
- "value_type": "cumulative"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "percent",
- "logBase": 1,
- "max": 100,
- "min": 0,
- "show": true
- },
- {
- "format": "short",
- "logBase": 1,
- "max": null,
- "min": null,
- "show": false
- }
- ]
- }
- ],
- "repeat": null,
- "repeatIteration": null,
- "repeatRowId": null,
- "showTitle": false,
- "title": "NPU row",
- "titleSize": "h6"
- },
- {
- "collapse": false,
- "height": 250,
- "panels": [
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 0,
- "id": 24,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "avg by (device) (node_filesystem_free_bytes{instance=~\"$node(:[0-9]*)?$\", device=~\"/dev/.*\"} / node_filesystem_size_bytes) * 100",
- "format": "time_series",
- "hide": false,
- "intervalFactor": 2,
- "legendFormat": "{{device}}",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "Filesystem free space percent",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "percent",
- "label": null,
- "logBase": 1,
- "max": "100",
- "min": "0",
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- }
- ]
- },
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 0,
- "id": 26,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "avg by (device) (node_filesystem_files_free{instance=~\"$node(:[0-9]*)?$\", device=~\"/dev/.*\"} / node_filesystem_files) * 100",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "{{device}}",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "Filesystem free inode percent",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "percent",
- "label": null,
- "logBase": 1,
- "max": "100",
- "min": "0",
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- }
- ]
- }
- ],
- "repeat": null,
- "repeatIteration": null,
- "repeatRowId": null,
- "showTitle": false,
- "title": "Dashboard Row",
- "titleSize": "h6"
- },
- {
- "collapse": false,
- "height": 250,
- "panels": [
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 0,
- "id": 25,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 12,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "avg by (instance)(node_load1{instance=~\"$node(:[0-9]*)?$\"})",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "load1",
- "refId": "A"
- },
- {
- "expr": "avg by (instance)(node_load5{instance=~\"$node(:[0-9]*)?$\"})",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "load5",
- "refId": "B"
- },
- {
- "expr": "avg by (instance)(node_load15{instance=~\"$node(:[0-9]*)?$\"})",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "load15",
- "refId": "C"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "Node load",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- }
- ]
- }
- ],
- "repeat": null,
- "repeatIteration": null,
- "repeatRowId": null,
- "showTitle": false,
- "title": "Dashboard Row",
- "titleSize": "h6"
- }
- ],
- "schemaVersion": 14,
- "style": "dark",
- "tags": [],
- "templating": {
- "list": [
- {
- "allFormat": "glob",
- "allValue": null,
- "current": {
- "text": "",
- "value": ""
- },
- "datasource": "PM",
- "hide": 0,
- "includeAll": false,
- "label": "",
- "multi": false,
- "multiFormat": "regex values",
- "name": "node",
- "options": [],
- "query": "label_values(node_uname_info, instance)",
- "refresh": 1,
- "regex": "/([^:]*)/",
- "sort": 1,
- "tagValuesQuery": "",
- "tags": [],
- "tagsQuery": "",
- "type": "query",
- "useTags": false
- }
- ]
- },
- "time": {
- "from": "now-1h",
- "to": "now"
- },
- "timepicker": {
- "now": true,
- "refresh_intervals": [
- "30s",
- "1m",
- "5m",
- "15m",
- "30m",
- "1h",
- "2h",
- "1d"
- ],
- "time_options": [
- "5m",
- "15m",
- "1h",
- "6h",
- "12h",
- "24h",
- "2d",
- "7d",
- "30d"
- ]
- },
- "timezone": "browser",
- "title": "Node Status",
- "version": 1,
- "uid": "node-status"
- }}
- per-vc-device-usage-dashboard.json: |
- {
- "dashboard": {
- "annotations": {
- "list": [
- {
- "builtIn": 1,
- "datasource": "-- Grafana --",
- "enable": true,
- "hide": true,
- "iconColor": "rgba(0, 211, 255, 1)",
- "name": "Annotations & Alerts",
- "type": "dashboard"
- }
- ]
- },
- "editable": true,
- "gnetId": null,
- "graphTooltip": 0,
- "hideControls": false,
- "id": null,
- "links": [],
- "refresh": "30s",
- "rows": [
- {
- "collapse": false,
- "height": "250px",
- "panels": [
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 0,
- "id": 1,
- "legend": {
- "avg": false,
- "current": true,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": true
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 12,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "sum( avg (k8s_vc_device_total{vc_name=\"$vc_name\",device_str='nvidia.com/gpu'}) by(vc_name) )",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "Total",
- "refId": "A"
- },
- {
- "expr": "sum( avg (k8s_vc_device_available{vc_name=\"$vc_name\",device_str='nvidia.com/gpu'}) by(vc_name) )",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "Available",
- "refId": "B"
- },
- {
- "expr": "sum( avg (k8s_vc_device_preemptive_availabe{vc_name=\"$vc_name\",device_str='nvidia.com/gpu'}) by(vc_name) )",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "Preemptive available",
- "refId": "C"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "Per VC GPU Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": "0",
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": false
- }
- ]
- },
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 0,
- "id": 2,
- "legend": {
- "avg": false,
- "current": true,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": true
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 12,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "sum( avg (k8s_vc_device_total{vc_name=\"$vc_name\",device_str='npu.huawei.com/NPU'}) by(vc_name) )",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "Total",
- "refId": "A"
- },
- {
- "expr": "sum( avg (k8s_vc_device_available{vc_name=\"$vc_name\",device_str='npu.huawei.com/NPU'}) by(vc_name) )",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "Available",
- "refId": "B"
- },
- {
- "expr": "sum( avg (k8s_vc_device_preemptive_availabe{vc_name=\"$vc_name\",device_str='npu.huawei.com/NPU'}) by(vc_name) )",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "Preemptive available",
- "refId": "C"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "Per VC NPU Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": "0",
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": false
- }
- ]
- }
- ],
- "repeat": null,
- "repeatIteration": null,
- "repeatRowId": null,
- "showTitle": false,
- "title": "Dashboard Row",
- "titleSize": "h6"
- }
- ],
- "schemaVersion": 14,
- "style": "dark",
- "tags": [],
- "templating": {
- "list": [
- {
- "allValue": null,
- "current": {
- "text": "platform",
- "value": "platform"
- },
- "datasource": "PM",
- "hide": 0,
- "includeAll": false,
- "label": null,
- "multi": false,
- "name": "vc_name",
- "options": [],
- "query": "label_values(k8s_vc_device_total, vc_name)",
- "refresh": 1,
- "regex": "",
- "sort": 1,
- "tagValuesQuery": "",
- "tags": [],
- "tagsQuery": "",
- "type": "query",
- "useTags": false
- }
- ]
- },
- "time": {
- "from": "now-6h",
- "to": "now"
- },
- "timepicker": {
- "refresh_intervals": [
- "30s",
- "1m",
- "5m",
- "15m",
- "30m",
- "1h",
- "2h",
- "1d"
- ],
- "time_options": [
- "5m",
- "15m",
- "1h",
- "6h",
- "12h",
- "24h",
- "2d",
- "7d",
- "30d"
- ]
- },
- "timezone": "browser",
- "title": "Per VC Device Statistic",
- "version": 0,
- "uid": "per-vc-gpu-statistic"
- }
- }
- perf-dashboard.json: |
- {
- "dashboard": {
- "annotations": {
- "list": [
- {
- "builtIn": 1,
- "datasource": "-- Grafana --",
- "enable": true,
- "hide": true,
- "iconColor": "rgba(0, 211, 255, 1)",
- "name": "Annotations & Alerts",
- "type": "dashboard"
- }
- ]
- },
- "editable": true,
- "gnetId": null,
- "graphTooltip": 0,
- "hideControls": false,
- "id": null,
- "links": [],
- "refresh": "30s",
- "rows": [
- {
- "collapse": false,
- "height": "250px",
- "panels": [
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 0,
- "id": 1,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "histogram_quantile(0.9, sum(rate(datahandler_fn_latency_seconds_bucket{scraped_from=~\"jobmanager.*\"}[5m])) by (le, fn_name))",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "{{fn_name}}",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "Datahandler 90th percentile latency per function from jobmanager",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "s",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": "0",
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": false
- }
- ]
- },
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 0,
- "id": 2,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "histogram_quantile(0.9, sum(rate(datahandler_fn_latency_seconds_bucket{scraped_from=~\"restfulapi.*\"}[5m])) by (le, fn_name))",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "{{fn_name}}",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "Datahandler 90th percentile latency per function from restfulapi",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "s",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": "0",
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- }
- ]
- }
- ],
- "repeat": null,
- "repeatIteration": null,
- "repeatRowId": null,
- "showTitle": false,
- "title": "Dashboard Row",
- "titleSize": "h6"
- },
- {
- "collapse": false,
- "height": 250,
- "panels": [
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 0,
- "id": 3,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "histogram_quantile(0.9, sum(rate(db_connect_latency_seconds_bucket{scraped_from=~\"jobmanager.*\"}[5m])) by (le))",
- "format": "time_series",
- "hide": false,
- "intervalFactor": 2,
- "legendFormat": "Connection Latency",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "90th percentile DB connection latency from jobmanager",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "s",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": "0",
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": false
- }
- ]
- },
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": null,
- "fill": 0,
- "id": 4,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "histogram_quantile(0.9, sum(rate(db_connect_latency_seconds_bucket{scraped_from=~\"restfulapi.*\"}[5m])) by (le))",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "Connection Latency",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "90th percentile DB connection latency from restfulapi",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "s",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": "0",
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": false
- }
- ]
- }
- ],
- "repeat": null,
- "repeatIteration": null,
- "repeatRowId": null,
- "showTitle": false,
- "title": "Dashboard Row",
- "titleSize": "h6"
- }
- ],
- "schemaVersion": 14,
- "style": "dark",
- "tags": [],
- "templating": {
- "list": []
- },
- "time": {
- "from": "now-6h",
- "to": "now"
- },
- "timepicker": {
- "refresh_intervals": [
- "30s",
- "1m",
- "5m",
- "15m",
- "30m",
- "1h",
- "2h",
- "1d"
- ],
- "time_options": [
- "5m",
- "15m",
- "1h",
- "6h",
- "12h",
- "24h",
- "2d",
- "7d",
- "30d"
- ]
- },
- "timezone": "browser",
- "title": "Performance dashboard",
- "version": 0,
- "uid": "perf"
- }
- }
- prom-datasource.json: |
- {
- "name": "PM",
- "url": "http://localhost:9091/prometheus",
- "basicAuth": false,
- "access": "proxy",
- "type": "prometheus",
- "isDefault": true
- }
- service-status-dashboard.json: |
- {"dashboard": {
- "annotations": {
- "list": [
- {
- "builtIn": 1,
- "datasource": "-- Grafana --",
- "enable": true,
- "hide": true,
- "iconColor": "rgba(0, 211, 255, 1)",
- "name": "Annotations & Alerts",
- "type": "dashboard"
- }
- ]
- },
- "description": "Dashboard to view service metrics",
- "editable": true,
- "gnetId": null,
- "graphTooltip": 0,
- "hideControls": false,
- "id": null,
- "links": [],
- "refresh": "30s",
- "rows": [
- {
- "collapse": false,
- "height": "250px",
- "panels": [
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "PM",
- "fill": 0,
- "height": "400px",
- "id": 1,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "nullPointMode": "connected",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "avg by (instance)(service_cpu_percent{name=\"$service_name\"})",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "{{instance}}",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "CPU",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "percent",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": false
- }
- ]
- },
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "PM",
- "fill": 0,
- "height": "400px",
- "id": 2,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "avg by (instance)(service_mem_usage_byte{name=\"$service_name\"})",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "{{instance}}",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "Memory Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "decbytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- }
- ]
- }
- ],
- "repeat": null,
- "repeatIteration": null,
- "repeatRowId": null,
- "showTitle": false,
- "title": "Dashboard Row",
- "titleSize": "h6"
- },
- {
- "collapse": false,
- "height": 250,
- "panels": [
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "PM",
- "fill": 0,
- "height": "400px",
- "id": 3,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "avg by (instance)(service_net_in_byte{name=\"$service_name\"})",
- "format": "time_series",
- "interval": "",
- "intervalFactor": 2,
- "legendFormat": "{{instance}} Inbound",
- "refId": "A"
- },
- {
- "expr": "avg by (instance)(service_net_out_byte{name=\"$service_name\"})",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "{{instance}} Outbound",
- "refId": "B"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "Network",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "Bps",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- }
- ]
- },
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "PM",
- "fill": 0,
- "height": "400px",
- "id": 4,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "avg by (instance)(irate(service_block_in_byte{name=\"$service_name\"}[300s]))",
- "format": "time_series",
- "interval": "",
- "intervalFactor": 2,
- "legendFormat": "{{instance}} Read",
- "refId": "A"
- },
- {
- "expr": "avg by (instance)(irate(service_block_out_byte{name=\"$service_name\"}[300s]))",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "{{instance}} Write",
- "refId": "B"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "Block IO",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "Bps",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- }
- ]
- }
- ],
- "repeat": null,
- "repeatIteration": null,
- "repeatRowId": null,
- "showTitle": false,
- "title": "Dashboard Row",
- "titleSize": "h6"
- }
- ],
- "schemaVersion": 14,
- "style": "dark",
- "tags": [],
- "templating": {
- "list": [
- {
- "allValue": null,
- "current": {},
- "datasource": "PM",
- "hide": 0,
- "includeAll": false,
- "label": null,
- "multi": false,
- "name": "service_name",
- "options": [],
- "query": "label_values(service_cpu_percent, name)",
- "refresh": 1,
- "regex": "",
- "sort": 1,
- "tagValuesQuery": "",
- "tags": [],
- "tagsQuery": "",
- "type": "query",
- "useTags": false
- }
- ]
- },
- "time": {
- "from": "now-1h",
- "to": "now"
- },
- "timepicker": {
- "now": true,
- "refresh_intervals": [
- "30s",
- "1m",
- "5m",
- "15m",
- "30m",
- "1h",
- "2h",
- "1d"
- ],
- "time_options": [
- "5m",
- "15m",
- "1h",
- "6h",
- "12h",
- "24h",
- "2d",
- "7d",
- "30d"
- ]
- },
- "timezone": "browser",
- "title": "Service Status",
- "version": 1,
- "uid": "service-status"
- }}
- storage-usage-dashboard.json: "{\"dashboard\": {\n \"annotations\": {\n \"list\":
- [\n {\n \"builtIn\": 1,\n \"datasource\": \"-- Grafana --\",\n
- \ \"enable\": true,\n \"hide\": true,\n \"iconColor\": \"rgba(0,
- 211, 255, 1)\",\n \"name\": \"Annotations & Alerts\",\n \"type\":
- \"dashboard\"\n }\n ]\n },\n \"editable\": true,\n \"gnetId\": null,\n
- \ \"graphTooltip\": 0,\n \"hideControls\": false,\n \"id\": null,\n \"links\":
- [],\n \"rows\": [\n {\n \"collapse\": false,\n \"height\": \"250px\",\n
- \ \"panels\": [\n {\n\t\t\t\t\t\"alert\": {\n\t\t\t\t\t\t\"alertRuleTags\":
- {},\n\t\t\t\t\t\t\"conditions\": [\n\t\t\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\t\t\t\"evaluator\":
- {\n\t\t\t\t\t\t\t\t\t\t\t\t\"params\": [\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t70\n\t\t\t\t\t\t\t\t\t\t\t\t],\n\t\t\t\t\t\t\t\t\t\t\t\t\"type\":
- \"gt\"\n\t\t\t\t\t\t\t\t\t\t},\n\t\t\t\t\t\t\t\t\t\t\"operator\": {\n\t\t\t\t\t\t\t\t\t\t\t\t\"type\":
- \"and\"\n\t\t\t\t\t\t\t\t\t\t},\n\t\t\t\t\t\t\t\t\t\t\"query\": {\n\t\t\t\t\t\t\t\t\t\t\t\t\"params\":
- [\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\"A\",\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\"5m\",\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\"now\"\n\t\t\t\t\t\t\t\t\t\t\t\t]\n\t\t\t\t\t\t\t\t\t\t},\n\t\t\t\t\t\t\t\t\t\t\"reducer\":
- {\n\t\t\t\t\t\t\t\t\t\t\t\t\"params\": [],\n\t\t\t\t\t\t\t\t\t\t\t\t\"type\":
- \"avg\"\n\t\t\t\t\t\t\t\t\t\t},\n\t\t\t\t\t\t\t\t\t\t\"type\": \"query\"\n\t\t\t\t\t\t\t\t}\n\t\t\t\t\t\t],\n\t\t\t\t\t\t\"executionErrorState\":
- \"alerting\",\n\t\t\t\t\t\t\"for\": \"5m\",\n\t\t\t\t\t\t\"frequency\": \"1m\",\n\t\t\t\t\t\t\"handler\":
- 1,\n\t\t\t\t\t\t\"message\": \"Storage usage is more than 70%\",\n\t\t\t\t\t\t\"name\":
- \"Storage Usage alert\",\n\t\t\t\t\t\t\"noDataState\": \"no_data\",\n\t\t\t\t\t\t\"notifications\":
- []\n\t\t\t\t\t},\n \"aliasColors\": {},\n \"bars\": false,\n
- \ \"dashLength\": 10,\n \"dashes\": false,\n \"datasource\":
- null,\n \"fill\": 0,\n \"id\": 1,\n \"legend\": {\n
- \ \"avg\": false,\n \"current\": false,\n \"max\":
- false,\n \"min\": false,\n \"show\": true,\n \"total\":
- false,\n \"values\": false\n },\n \"lines\": true,\n
- \ \"linewidth\": 1,\n \"links\": [],\n \"nullPointMode\":
- \"null\",\n \"percentage\": false,\n \"pointradius\": 5,\n \"points\":
- false,\n \"renderer\": \"flot\",\n \"seriesOverrides\": [],\n
- \ \"spaceLength\": 10,\n \"span\": 12,\n \"stack\":
- false,\n \"steppedLine\": false,\n \"targets\": [\n {\n
- \ \"expr\": \"min ((node_filesystem_size_bytes - node_filesystem_avail_bytes{fstype=~\\\"nfs[0-9]?\\\"})
- / node_filesystem_size_bytes) by (device) * 100\",\n \"format\":
- \"time_series\",\n \"hide\": false,\n \"intervalFactor\":
- 2,\n \"legendFormat\": \"{{ device }}\",\n \"refId\":
- \"A\"\n }\n ],\n \"thresholds\": [],\n \"timeFrom\":
- null,\n \"timeShift\": null,\n \"title\": \"Storage Usage\",\n
- \ \"tooltip\": {\n \"shared\": true,\n \"sort\":
- 0,\n \"value_type\": \"individual\"\n },\n \"type\":
- \"graph\",\n \"xaxis\": {\n \"buckets\": null,\n \"mode\":
- \"time\",\n \"name\": null,\n \"show\": true,\n \"values\":
- []\n },\n \"yaxes\": [\n {\n \"format\":
- \"percent\",\n \"label\": null,\n \"logBase\": 1,\n
- \ \"max\": \"100\",\n \"min\": \"0\",\n \"show\":
- true\n },\n {\n \"format\": \"short\",\n \"label\":
- null,\n \"logBase\": 1,\n \"max\": null,\n \"min\":
- null,\n \"show\": false\n }\n ]\n }\n
- \ ],\n \"repeat\": null,\n \"repeatIteration\": null,\n \"repeatRowId\":
- null,\n \"showTitle\": false,\n \"title\": \"Dashboard Row\",\n \"titleSize\":
- \"h6\"\n }\n ],\n \"schemaVersion\": 14,\n \"style\": \"dark\",\n \"tags\":
- [],\n \"templating\": {\n \"list\": []\n },\n \"time\": {\n \"from\":
- \"now-6h\",\n \"to\": \"now\"\n },\n \"timepicker\": {\n \"refresh_intervals\":
- [\n \"30s\",\n \"1m\",\n \"5m\",\n \"15m\",\n \"30m\",\n
- \ \"1h\",\n \"2h\",\n \"1d\"\n ],\n \"time_options\": [\n
- \ \"5m\",\n \"15m\",\n \"1h\",\n \"6h\",\n \"12h\",\n
- \ \"24h\",\n \"2d\",\n \"7d\",\n \"30d\"\n ]\n },\n \"timezone\":
- \"browser\",\n \"title\": \"Storage usage\",\n \"version\": 0,\n \"uid\": \"storage-usage\"\n}}\n"
- user-status-dashboard.json: |
- {
- "dashboard": {
- "annotations": {
- "list": [
- {
- "builtIn": 1,
- "datasource": "-- Grafana --",
- "enable": true,
- "hide": true,
- "iconColor": "rgba(0, 211, 255, 1)",
- "name": "Annotations & Alerts",
- "type": "dashboard"
- }
- ]
- },
- "editable": true,
- "gnetId": null,
- "graphTooltip": 0,
- "hideControls": false,
- "id": null,
- "links": [],
- "refresh": "30s",
- "rows": [
- {
- "collapse": false,
- "height": "250px",
- "panels": [
- {
- "columns": [],
- "datasource": null,
- "fontSize": "100%",
- "id": 1,
- "links": [],
- "pageSize": null,
- "scroll": true,
- "showHeader": true,
- "sort": {
- "col": 0,
- "desc": true
- },
- "span": 12,
- "styles": [
- {
- "alias": "Time",
- "dateFormat": "YYYY-MM-DD HH:mm:ss",
- "pattern": "Time",
- "type": "hidden"
- },
- {
- "alias": "",
- "colorMode": null,
- "colors": [
- "rgba(245, 54, 54, 0.9)",
- "rgba(237, 129, 40, 0.89)",
- "rgba(50, 172, 45, 0.97)"
- ],
- "dateFormat": "YYYY-MM-DD HH:mm:ss",
- "decimals": 2,
- "pattern": "",
- "thresholds": [],
- "type": "number",
- "unit": "short"
- },
- {
- "alias": "used GPU",
- "colorMode": null,
- "colors": [
- "rgba(245, 54, 54, 0.9)",
- "rgba(237, 129, 40, 0.89)",
- "rgba(50, 172, 45, 0.97)"
- ],
- "dateFormat": "YYYY-MM-DD HH:mm:ss",
- "decimals": null,
- "pattern": "Value",
- "thresholds": [],
- "type": "number",
- "unit": "none"
- },
- {
- "alias": "",
- "colorMode": null,
- "colors": [
- "rgba(245, 54, 54, 0.9)",
- "rgba(237, 129, 40, 0.89)",
- "rgba(50, 172, 45, 0.97)"
- ],
- "decimals": 2,
- "pattern": "/.*/",
- "thresholds": [],
- "type": "number",
- "unit": "short"
- }
- ],
- "targets": [
- {
- "expr": "count by (username) (avg by (uuid,username) (task_device_percent{device_str='nvidia.com/gpu'}))",
- "format": "table",
- "hide": false,
- "instant": true,
- "interval": "",
- "intervalFactor": 1,
- "refId": "A"
- }
- ],
- "title": "User GPU Status",
- "transform": "table",
- "type": "table"
- },
- {
- "columns": [],
- "datasource": null,
- "fontSize": "100%",
- "id": 2,
- "links": [],
- "pageSize": null,
- "scroll": true,
- "showHeader": true,
- "sort": {
- "col": 0,
- "desc": true
- },
- "span": 12,
- "styles": [
- {
- "alias": "Time",
- "dateFormat": "YYYY-MM-DD HH:mm:ss",
- "pattern": "Time",
- "type": "hidden"
- },
- {
- "alias": "",
- "colorMode": null,
- "colors": [
- "rgba(245, 54, 54, 0.9)",
- "rgba(237, 129, 40, 0.89)",
- "rgba(50, 172, 45, 0.97)"
- ],
- "dateFormat": "YYYY-MM-DD HH:mm:ss",
- "decimals": 2,
- "pattern": "",
- "thresholds": [],
- "type": "number",
- "unit": "short"
- },
- {
- "alias": "used NPU",
- "colorMode": null,
- "colors": [
- "rgba(245, 54, 54, 0.9)",
- "rgba(237, 129, 40, 0.89)",
- "rgba(50, 172, 45, 0.97)"
- ],
- "dateFormat": "YYYY-MM-DD HH:mm:ss",
- "decimals": null,
- "pattern": "Value",
- "thresholds": [],
- "type": "number",
- "unit": "none"
- },
- {
- "alias": "",
- "colorMode": null,
- "colors": [
- "rgba(245, 54, 54, 0.9)",
- "rgba(237, 129, 40, 0.89)",
- "rgba(50, 172, 45, 0.97)"
- ],
- "decimals": 2,
- "pattern": "/.*/",
- "thresholds": [],
- "type": "number",
- "unit": "short"
- }
- ],
- "targets": [
- {
- "expr": "count by (username) (avg by (uuid,username) (task_device_percent{device_str='npu.huawei.com/NPU'}))",
- "format": "table",
- "hide": false,
- "instant": true,
- "interval": "",
- "intervalFactor": 1,
- "refId": "A"
- }
- ],
- "title": "User NPU Status",
- "transform": "table",
- "type": "table"
- }
- ],
- "repeat": null,
- "repeatIteration": null,
- "repeatRowId": null,
- "showTitle": false,
- "title": "Dashboard Row",
- "titleSize": "h6"
- }
- ],
- "schemaVersion": 14,
- "style": "dark",
- "tags": [],
- "templating": {
- "list": []
- },
- "time": {
- "from": "now-15m",
- "to": "now"
- },
- "timepicker": {
- "refresh_intervals": [
- "30s",
- "1m",
- "5m",
- "15m",
- "30m",
- "1h",
- "2h",
- "1d"
- ],
- "time_options": [
- "5m",
- "15m",
- "1h",
- "6h",
- "12h",
- "24h",
- "2d",
- "7d",
- "30d"
- ]
- },
- "timezone": "browser",
- "title": "User Status",
- "version": 1,
- "uid": "user-status"
- }
- }
- kind: ConfigMap
- metadata:
- creationTimestamp: null
- name: grafana-configuration
- namespace: kube-system
|