|
18 | 18 | "editable": true, |
19 | 19 | "gnetId": 11752, |
20 | 20 | "graphTooltip": 0, |
21 | | - "id": 5, |
22 | | - "iteration": 1605896702545, |
| 21 | + "iteration": 1606131081690, |
23 | 22 | "links": [ |
24 | 23 | { |
25 | 24 | "icon": "external link", |
|
270 | 269 | "tableColumn": "", |
271 | 270 | "targets": [ |
272 | 271 | { |
273 | | - "expr": "avg(DCGM_FI_DEV_GPU_UTIL{instance_id=\"$instance_id\"})", |
| 272 | + "expr": "DCGM_FI_DEV_GPU_UTIL{instance_id=\"$instance_id\"}", |
274 | 273 | "format": "time_series", |
| 274 | + "instant": true, |
275 | 275 | "interval": "", |
276 | 276 | "intervalFactor": 1, |
277 | 277 | "legendFormat": "", |
|
309 | 309 | }, |
310 | 310 | "format": "watt", |
311 | 311 | "gauge": { |
312 | | - "maxValue": 2400, |
| 312 | + "maxValue": 100, |
313 | 313 | "minValue": 0, |
314 | 314 | "show": true, |
315 | 315 | "thresholdLabels": false, |
|
358 | 358 | "tableColumn": "", |
359 | 359 | "targets": [ |
360 | 360 | { |
361 | | - "expr": "sum(DCGM_FI_DEV_POWER_USAGE{instance_id=\"$instance_id\"})", |
| 361 | + "expr": "DCGM_FI_DEV_POWER_USAGE{instance_id=\"$instance_id\"}", |
362 | 362 | "format": "time_series", |
363 | | - "instant": false, |
| 363 | + "instant": true, |
364 | 364 | "interval": "", |
365 | 365 | "intervalFactor": 1, |
366 | 366 | "legendFormat": "", |
367 | 367 | "refId": "A" |
368 | 368 | } |
369 | 369 | ], |
370 | | - "thresholds": "1800,2200", |
| 370 | + "thresholds": "60,90", |
371 | 371 | "title": "GPU Total Power", |
372 | 372 | "type": "singlestat", |
373 | 373 | "valueFontSize": "80%", |
|
390 | 390 | "#d44a3a" |
391 | 391 | ], |
392 | 392 | "datasource": "prometheus", |
393 | | - "description": "", |
394 | 393 | "fieldConfig": { |
395 | 394 | "defaults": { |
396 | 395 | "custom": {} |
397 | 396 | }, |
398 | 397 | "overrides": [] |
399 | 398 | }, |
400 | | - "format": "percent", |
| 399 | + "format": "celsius", |
401 | 400 | "gauge": { |
402 | | - "maxValue": 100, |
| 401 | + "maxValue": 90, |
403 | 402 | "minValue": 0, |
404 | 403 | "show": true, |
405 | 404 | "thresholdLabels": false, |
|
411 | 410 | "x": 12, |
412 | 411 | "y": 0 |
413 | 412 | }, |
414 | | - "id": 68, |
| 413 | + "id": 31, |
415 | 414 | "interval": null, |
416 | 415 | "links": [], |
417 | 416 | "mappingType": 1, |
|
448 | 447 | "tableColumn": "", |
449 | 448 | "targets": [ |
450 | 449 | { |
451 | | - "expr": "avg(DCGM_FI_DEV_DEC_UTIL{instance_id=\"$instance_id\"})", |
| 450 | + "expr": "DCGM_FI_DEV_GPU_TEMP{instance_id=\"$instance_id\"}", |
452 | 451 | "format": "time_series", |
| 452 | + "instant": true, |
453 | 453 | "interval": "", |
454 | 454 | "intervalFactor": 1, |
455 | 455 | "legendFormat": "", |
456 | 456 | "refId": "A" |
457 | 457 | } |
458 | 458 | ], |
459 | | - "thresholds": "80,90", |
460 | | - "title": "GPU Decored Utilization", |
| 459 | + "thresholds": "83,87", |
| 460 | + "title": "GPU Avg. Temperature", |
461 | 461 | "type": "singlestat", |
462 | 462 | "valueFontSize": "80%", |
463 | 463 | "valueMaps": [ |
|
479 | 479 | "#d44a3a" |
480 | 480 | ], |
481 | 481 | "datasource": "prometheus", |
| 482 | + "description": "", |
482 | 483 | "fieldConfig": { |
483 | 484 | "defaults": { |
484 | 485 | "custom": {} |
485 | 486 | }, |
486 | 487 | "overrides": [] |
487 | 488 | }, |
488 | | - "format": "celsius", |
| 489 | + "format": "percent", |
489 | 490 | "gauge": { |
490 | | - "maxValue": 90, |
| 491 | + "maxValue": 100, |
491 | 492 | "minValue": 0, |
492 | 493 | "show": true, |
493 | 494 | "thresholdLabels": false, |
|
499 | 500 | "x": 15, |
500 | 501 | "y": 0 |
501 | 502 | }, |
502 | | - "id": 31, |
| 503 | + "id": 68, |
503 | 504 | "interval": null, |
504 | 505 | "links": [], |
505 | 506 | "mappingType": 1, |
|
536 | 537 | "tableColumn": "", |
537 | 538 | "targets": [ |
538 | 539 | { |
539 | | - "expr": "avg(DCGM_FI_DEV_GPU_TEMP{instance_id=\"$instance_id\"})", |
| 540 | + "expr": "DCGM_FI_DEV_DEC_UTIL{instance_id=\"$instance_id\"}", |
540 | 541 | "format": "time_series", |
| 542 | + "instant": true, |
541 | 543 | "interval": "", |
542 | 544 | "intervalFactor": 1, |
543 | 545 | "legendFormat": "", |
544 | 546 | "refId": "A" |
545 | 547 | } |
546 | 548 | ], |
547 | | - "thresholds": "83,87", |
548 | | - "title": "GPU Avg. Temperature", |
| 549 | + "thresholds": "80,90", |
| 550 | + "title": "GPU Decoder Utilization", |
549 | 551 | "type": "singlestat", |
550 | 552 | "valueFontSize": "80%", |
551 | 553 | "valueMaps": [ |
|
625 | 627 | "tableColumn": "", |
626 | 628 | "targets": [ |
627 | 629 | { |
628 | | - "expr": "avg(DCGM_FI_DEV_ENC_UTIL{instance_id=\"$instance_id\"})", |
| 630 | + "expr": "DCGM_FI_DEV_ENC_UTIL{instance_id=\"$instance_id\"}", |
629 | 631 | "format": "time_series", |
| 632 | + "instant": true, |
630 | 633 | "interval": "", |
631 | 634 | "intervalFactor": 1, |
632 | 635 | "legendFormat": "", |
|
714 | 717 | "tableColumn": "", |
715 | 718 | "targets": [ |
716 | 719 | { |
717 | | - "expr": "avg(DCGM_FI_DEV_MEM_COPY_UTIL{instance_id=\"$instance_id\"})", |
| 720 | + "expr": "DCGM_FI_DEV_FB_USED{instance_id=\"$instance_id\"}/(DCGM_FI_DEV_FB_FREE{instance_id=\"$instance_id\"}+DCGM_FI_DEV_FB_USED{instance_id=\"$instance_id\"})*100", |
718 | 721 | "format": "time_series", |
| 722 | + "instant": true, |
719 | 723 | "interval": "", |
720 | 724 | "intervalFactor": 1, |
721 | 725 | "legendFormat": "", |
722 | 726 | "refId": "A" |
723 | 727 | } |
724 | 728 | ], |
725 | 729 | "thresholds": "70,90", |
726 | | - "title": "GPU Total Mem Cpy Utilization", |
| 730 | + "title": "GPU Mem Util.", |
727 | 731 | "type": "singlestat", |
728 | 732 | "valueFontSize": "80%", |
729 | 733 | "valueMaps": [ |
|
802 | 806 | "tableColumn": "", |
803 | 807 | "targets": [ |
804 | 808 | { |
805 | | - "expr": "avg(DCGM_FI_DEV_SM_CLOCK{instance_id=\"$instance_id\"}*1000000)", |
| 809 | + "expr": "DCGM_FI_DEV_SM_CLOCK{instance_id=\"$instance_id\"}*1000000", |
806 | 810 | "format": "time_series", |
807 | 811 | "interval": "", |
808 | 812 | "intervalFactor": 1, |
|
890 | 894 | "tableColumn": "", |
891 | 895 | "targets": [ |
892 | 896 | { |
893 | | - "expr": "avg(DCGM_FI_DEV_MEM_CLOCK{instance_id=\"$instance_id\"}*1000000)", |
| 897 | + "expr": "DCGM_FI_DEV_MEM_CLOCK{instance_id=\"$instance_id\"}*1000000", |
894 | 898 | "format": "time_series", |
895 | 899 | "interval": "", |
896 | 900 | "intervalFactor": 1, |
|
1044 | 1048 | "hiddenSeries": false, |
1045 | 1049 | "id": 57, |
1046 | 1050 | "legend": { |
1047 | | - "avg": false, |
| 1051 | + "alignAsTable": true, |
| 1052 | + "avg": true, |
1048 | 1053 | "current": true, |
1049 | | - "max": false, |
1050 | | - "min": false, |
1051 | | - "show": false, |
| 1054 | + "max": true, |
| 1055 | + "min": true, |
| 1056 | + "rightSide": true, |
| 1057 | + "show": true, |
1052 | 1058 | "total": false, |
1053 | 1059 | "values": true |
1054 | 1060 | }, |
|
1562 | 1568 | "hiddenSeries": false, |
1563 | 1569 | "id": 42, |
1564 | 1570 | "legend": { |
1565 | | - "avg": false, |
| 1571 | + "alignAsTable": true, |
| 1572 | + "avg": true, |
1566 | 1573 | "current": true, |
1567 | | - "max": false, |
1568 | | - "min": false, |
1569 | | - "show": false, |
| 1574 | + "max": true, |
| 1575 | + "min": true, |
| 1576 | + "rightSide": true, |
| 1577 | + "show": true, |
1570 | 1578 | "total": false, |
1571 | 1579 | "values": true |
1572 | 1580 | }, |
|
1588 | 1596 | "steppedLine": false, |
1589 | 1597 | "targets": [ |
1590 | 1598 | { |
1591 | | - "expr": "DCGM_FI_DEV_FB_USED{instance_id=\"$instance_id\"}/(DCGM_FI_DEV_FB_USED{instance_id=\"$instance_id\"}+DCGM_FI_DEV_FB_FREE{instance_id=\"$instance_id\"})", |
| 1599 | + "expr": "DCGM_FI_DEV_FB_USED{instance_id=\"$instance_id\"}", |
1592 | 1600 | "format": "time_series", |
1593 | 1601 | "hide": false, |
1594 | 1602 | "interval": "", |
|
1617 | 1625 | }, |
1618 | 1626 | "yaxes": [ |
1619 | 1627 | { |
| 1628 | + "$$hashKey": "object:1193", |
1620 | 1629 | "decimals": null, |
1621 | | - "format": "percentunit", |
| 1630 | + "format": "decmbytes", |
1622 | 1631 | "label": null, |
1623 | 1632 | "logBase": 1, |
1624 | 1633 | "max": null, |
1625 | 1634 | "min": "0", |
1626 | 1635 | "show": true |
1627 | 1636 | }, |
1628 | 1637 | { |
| 1638 | + "$$hashKey": "object:1194", |
1629 | 1639 | "format": "watt", |
1630 | 1640 | "label": null, |
1631 | 1641 | "logBase": 1, |
|
1848 | 1858 | "list": [ |
1849 | 1859 | { |
1850 | 1860 | "allValue": null, |
| 1861 | + "current": { |
| 1862 | + "selected": false, |
| 1863 | + "text": "i-076225e1b1aefc813", |
| 1864 | + "value": "i-076225e1b1aefc813" |
| 1865 | + }, |
1851 | 1866 | "datasource": "prometheus", |
1852 | 1867 | "definition": "label_values(node_uname_info{job=~\"ec2_instances\",instance_type=~\"g[3-4].*\"}, instance_id)", |
1853 | 1868 | "error": null, |
|
0 commit comments