"
+#define DCGM_FR_NO_MEMORY_CLOCKS_NEXT ""
+#define DCGM_FR_NO_GRAPHICS_CLOCKS_NEXT ""
+#define DCGM_FR_HAD_TO_RESTORE_STATE_NEXT ""
+#define DCGM_FR_L1TAG_UNSUPPORTED_NEXT ""
+#define DCGM_FR_L1TAG_MISCOMPARE_NEXT TRIAGE_RUN_FIELD_DIAG_MSG
+#define DCGM_FR_ROW_REMAP_FAILURE_NEXT DCGM_FR_VOLATILE_DBE_DETECTED_NEXT
+#define DCGM_FR_UNCONTAINED_ERROR_NEXT DCGM_FR_VOLATILE_DBE_DETECTED_NEXT
dcgmErrorSeverity_t dcgmErrorGetPriorityByCode(unsigned int code);
const char *dcgmErrorGetFormatMsgByCode(unsigned int code);
diff --git a/bindings/go/dcgm/dcgm_fields.h b/bindings/go/dcgm/dcgm_fields.h
index b514766..8bbf799 100644
--- a/bindings/go/dcgm/dcgm_fields.h
+++ b/bindings/go/dcgm/dcgm_fields.h
@@ -15,70 +15,79 @@
#ifdef __cplusplus
extern "C" {
#endif
-
+
/***************************************************************************************************/
/** @defgroup dcgmFieldTypes Field Types
* Field Types are a single byte.
* @{
*/
-/***************************************************************************************************/
+/***************************************************************************************************/
/**
* Blob of binary data representing a structure
*/
-#define DCGM_FT_BINARY 'b'
-
+#define DCGM_FT_BINARY 'b'
+
/**
* 8-byte double precision
*/
-#define DCGM_FT_DOUBLE 'd'
-
+#define DCGM_FT_DOUBLE 'd'
+
/**
* 8-byte signed integer
*/
-#define DCGM_FT_INT64 'i'
-
+#define DCGM_FT_INT64 'i'
+
/**
* Null-terminated ASCII Character string
*/
-#define DCGM_FT_STRING 's'
-
+#define DCGM_FT_STRING 's'
+
/**
* 8-byte signed integer usec since 1970
*/
#define DCGM_FT_TIMESTAMP 't'
-
-/** @} */
-
+
+/** @} */
+
/***************************************************************************************************/
/** @defgroup dcgmFieldScope Field Scope
* Represents field association with entity scope or global scope.
* @{
*/
-/***************************************************************************************************/
+/***************************************************************************************************/
/**
* Field is global (ex: driver version)
*/
-#define DCGM_FS_GLOBAL 0
+#define DCGM_FS_GLOBAL 0
/**
* Field is associated with an entity (GPU, VGPU...etc)
*/
-#define DCGM_FS_ENTITY 1
+#define DCGM_FS_ENTITY 1
/**
* Field is associated with a device. Deprecated. Use DCGM_FS_ENTITY
*/
-#define DCGM_FS_DEVICE DCGM_FS_ENTITY
+#define DCGM_FS_DEVICE DCGM_FS_ENTITY
+
+/** @} */
+
+/***************************************************************************************************/
+/** @defgroup dcgmFieldConstants Field Constants
+ * Constants that represent contents of individual field values.
+ * @{
+ */
+/***************************************************************************************************/
/**
* DCGM_FI_DEV_CUDA_COMPUTE_CAPABILITY is 16 bits of major version followed by
* 16 bits of the minor version. These macros separate the two.
*/
-#define DCGM_CUDA_COMPUTE_CAPABILITY_MAJOR(x) ((uint64_t)(x) & 0xFFFF0000)
-#define DCGM_CUDA_COMPUTE_CAPABILITY_MINOR(x) ((uint64_t)(x) & 0x0000FFFF)
+#define DCGM_CUDA_COMPUTE_CAPABILITY_MAJOR(x) ((uint64_t)(x)&0xFFFF0000)
+#define DCGM_CUDA_COMPUTE_CAPABILITY_MINOR(x) ((uint64_t)(x)&0x0000FFFF)
/**
* DCGM_FI_DEV_CLOCK_THROTTLE_REASONS is a bitmap of why the clock is throttled.
@@ -88,23 +97,23 @@ extern "C" {
/** Nothing is running on the GPU and the clocks are dropping to Idle state
* \note This limiter may be removed in a later release
*/
-#define DCGM_CLOCKS_THROTTLE_REASON_GPU_IDLE 0x0000000000000001LL
+#define DCGM_CLOCKS_THROTTLE_REASON_GPU_IDLE 0x0000000000000001LL
/** GPU clocks are limited by current setting of applications clocks
*/
-#define DCGM_CLOCKS_THROTTLE_REASON_CLOCKS_SETTING 0x0000000000000002LL
-/** SW Power Scaling algorithm is reducing the clocks below requested clocks
+#define DCGM_CLOCKS_THROTTLE_REASON_CLOCKS_SETTING 0x0000000000000002LL
+/** SW Power Scaling algorithm is reducing the clocks below requested clocks
*/
-#define DCGM_CLOCKS_THROTTLE_REASON_SW_POWER_CAP 0x0000000000000004LL
+#define DCGM_CLOCKS_THROTTLE_REASON_SW_POWER_CAP 0x0000000000000004LL
/** HW Slowdown (reducing the core clocks by a factor of 2 or more) is engaged
*
- *This is an indicator of:
- * - temperature being too high
- * - External Power Brake Assertion is triggered (e.g. by the system power supply)
- * - Power draw is too high and Fast Trigger protection is reducing the clocks
- * - May be also reported during PState or clock change
- * - This behavior may be removed in a later release.
- */
-#define DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN 0x0000000000000008LL
+ * This is an indicator of:
+ * - temperature being too high
+ * - External Power Brake Assertion is triggered (e.g. by the system power supply)
+ * - Power draw is too high and Fast Trigger protection is reducing the clocks
+ * - May be also reported during PState or clock change
+ * - This behavior may be removed in a later release.
+ */
+#define DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN 0x0000000000000008LL
/** Sync Boost
*
* This GPU has been added to a Sync boost group with nvidia-smi or DCGM in
@@ -113,29 +122,42 @@ extern "C" {
* the throttle reasons for other GPUs in the system to see why those GPUs are
* holding this one at lower clocks.
*/
-#define DCGM_CLOCKS_THROTTLE_REASON_SYNC_BOOST 0x0000000000000010LL
+#define DCGM_CLOCKS_THROTTLE_REASON_SYNC_BOOST 0x0000000000000010LL
/** SW Thermal Slowdown
*
* This is an indicator of one or more of the following:
* - Current GPU temperature above the GPU Max Operating Temperature
* - Current memory temperature above the Memory Max Operating Temperature
*/
-#define DCGM_CLOCKS_THROTTLE_REASON_SW_THERMAL 0x0000000000000020LL
+#define DCGM_CLOCKS_THROTTLE_REASON_SW_THERMAL 0x0000000000000020LL
/** HW Thermal Slowdown (reducing the core clocks by a factor of 2 or more) is engaged
*
* This is an indicator of:
* - temperature being too high
*/
-#define DCGM_CLOCKS_THROTTLE_REASON_HW_THERMAL 0x0000000000000040LL
+#define DCGM_CLOCKS_THROTTLE_REASON_HW_THERMAL 0x0000000000000040LL
/** HW Power Brake Slowdown (reducing the core clocks by a factor of 2 or more) is engaged
*
* This is an indicator of:
* - External Power Brake Assertion being triggered (e.g. by the system power supply)
*/
-#define DCGM_CLOCKS_THROTTLE_REASON_HW_POWER_BRAKE 0x0000000000000080LL
+#define DCGM_CLOCKS_THROTTLE_REASON_HW_POWER_BRAKE 0x0000000000000080LL
/** GPU clocks are limited by current setting of Display clocks
*/
-#define DCGM_CLOCKS_THROTTLE_REASON_DISPLAY_CLOCKS 0x0000000000000100LL
+#define DCGM_CLOCKS_THROTTLE_REASON_DISPLAY_CLOCKS 0x0000000000000100LL
+
+/**
+ * GPU virtualization mode types for DCGM_FI_DEV_VIRTUAL_MODE
+ */
+typedef enum
+{
+ DCGM_GPU_VIRTUALIZATION_MODE_NONE = 0, //!< Represents Bare Metal GPU
+ DCGM_GPU_VIRTUALIZATION_MODE_PASSTHROUGH = 1, //!< Device is associated with GPU-Passthrough
+ DCGM_GPU_VIRTUALIZATION_MODE_VGPU = 2, //!< Device is associated with vGPU inside virtual machine.
+ DCGM_GPU_VIRTUALIZATION_MODE_HOST_VGPU = 3, //!< Device is associated with VGX hypervisor in vGPU mode
+ DCGM_GPU_VIRTUALIZATION_MODE_HOST_VSGA = 4, //!< Device is associated with VGX hypervisor in vSGA mode
+} dcgmGpuVirtualizationMode_t;
+
/** @} */
@@ -151,12 +173,14 @@ extern "C" {
*/
typedef enum dcgm_field_entity_group_t
{
- DCGM_FE_NONE = 0, /** Field is not associated with an entity. Field scope should be DCGM_FS_GLOBAL */
- DCGM_FE_GPU, /** Field is associated with a GPU entity */
- DCGM_FE_VGPU, /** Field is associated with a VGPU entity */
- DCGM_FE_SWITCH, /** Field is associated with a Switch entity */
-
- DCGM_FE_COUNT /** Number of elements in this enumeration. Keep this entry last */
+ DCGM_FE_NONE = 0, /*!< Field is not associated with an entity. Field scope should be DCGM_FS_GLOBAL */
+ DCGM_FE_GPU, /*!< Field is associated with a GPU entity */
+ DCGM_FE_VGPU, /*!< Field is associated with a VGPU entity */
+ DCGM_FE_SWITCH, /*!< Field is associated with a Switch entity */
+ DCGM_FE_GPU_I, /*!< Field is associated with a GPU Instance entity */
+ DCGM_FE_GPU_CI, /*!< Field is associated with a GPU Compute Instance entity */
+
+ DCGM_FE_COUNT /*!< Number of elements in this enumeration. Keep this entry last */
} dcgm_field_entity_group_t;
/**
@@ -172,98 +196,106 @@ typedef unsigned int dcgm_field_eid_t;
* @{
*/
/***************************************************************************************************/
-
+
/**
* NULL field
- */
-#define DCGM_FI_UNKNOWN 0
-
+ */
+#define DCGM_FI_UNKNOWN 0
+
/**
* Driver Version
*/
-#define DCGM_FI_DRIVER_VERSION 1
-
+#define DCGM_FI_DRIVER_VERSION 1
+
/* Underlying NVML version */
-#define DCGM_FI_NVML_VERSION 2
-
+#define DCGM_FI_NVML_VERSION 2
+
/*
* Process Name
*/
-#define DCGM_FI_PROCESS_NAME 3
-
+#define DCGM_FI_PROCESS_NAME 3
+
/**
* Number of Devices on the node
- */
-#define DCGM_FI_DEV_COUNT 4
+ */
+#define DCGM_FI_DEV_COUNT 4
+
+/**
+ * Cuda Driver Version
+ * The value is a number with the major version in the thousands place and the minor version in the hundreds place,
+ * e.g. CUDA 11.1 is reported as 11100.
+ */
+#define DCGM_FI_CUDA_DRIVER_VERSION 5
+
/**
* Name of the GPU device
*/
-#define DCGM_FI_DEV_NAME 50
-
+#define DCGM_FI_DEV_NAME 50
+
/**
* Device Brand
*/
-#define DCGM_FI_DEV_BRAND 51
-
+#define DCGM_FI_DEV_BRAND 51
+
/**
* NVML index of this GPU
*/
-#define DCGM_FI_DEV_NVML_INDEX 52
+#define DCGM_FI_DEV_NVML_INDEX 52
/**
* Device Serial Number
*/
-#define DCGM_FI_DEV_SERIAL 53
+#define DCGM_FI_DEV_SERIAL 53
/**
* UUID corresponding to the device
*/
-#define DCGM_FI_DEV_UUID 54
+#define DCGM_FI_DEV_UUID 54
/**
* Device node minor number /dev/nvidia#
*/
-#define DCGM_FI_DEV_MINOR_NUMBER 55
+#define DCGM_FI_DEV_MINOR_NUMBER 55
/**
* OEM inforom version
*/
-#define DCGM_FI_DEV_OEM_INFOROM_VER 56
+#define DCGM_FI_DEV_OEM_INFOROM_VER 56
/**
* PCI attributes for the device
*/
-#define DCGM_FI_DEV_PCI_BUSID 57
+#define DCGM_FI_DEV_PCI_BUSID 57
/**
* The combined 16-bit device id and 16-bit vendor id
*/
-#define DCGM_FI_DEV_PCI_COMBINED_ID 58
-
+#define DCGM_FI_DEV_PCI_COMBINED_ID 58
+
/**
* The 32-bit Sub System Device ID
*/
-#define DCGM_FI_DEV_PCI_SUBSYS_ID 59
+#define DCGM_FI_DEV_PCI_SUBSYS_ID 59
/**
* Topology of all GPUs on the system via PCI (static)
*/
-#define DCGM_FI_GPU_TOPOLOGY_PCI 60
+#define DCGM_FI_GPU_TOPOLOGY_PCI 60
/**
* Topology of all GPUs on the system via NVLINK (static)
*/
-#define DCGM_FI_GPU_TOPOLOGY_NVLINK 61
+#define DCGM_FI_GPU_TOPOLOGY_NVLINK 61
/**
* Affinity of all GPUs on the system (static)
*/
-#define DCGM_FI_GPU_TOPOLOGY_AFFINITY 62
+#define DCGM_FI_GPU_TOPOLOGY_AFFINITY 62
/**
* Cuda compute capability for the device.
- * The major version is the upper 32 bits and
+ * The major version is the upper 32 bits and
* the minor version is the lower 32 bits.
*/
#define DCGM_FI_DEV_CUDA_COMPUTE_CAPABILITY 63
@@ -271,103 +303,114 @@ typedef unsigned int dcgm_field_eid_t;
/**
* Compute mode for the device
*/
-#define DCGM_FI_DEV_COMPUTE_MODE 65
+#define DCGM_FI_DEV_COMPUTE_MODE 65
+/**
+ * Persistence mode for the device
+ * Boolean: 0 is disabled, 1 is enabled
+ */
+#define DCGM_FI_DEV_PERSISTENCE_MODE 66
+
+/**
+ * MIG mode for the device
+ * Boolean: 0 is disabled, 1 is enabled
+ */
+#define DCGM_FI_DEV_MIG_MODE 67
/**
* Device CPU affinity. part 1/8 = cpus 0 - 63
*/
-#define DCGM_FI_DEV_CPU_AFFINITY_0 70
-
+#define DCGM_FI_DEV_CPU_AFFINITY_0 70
+
/**
* Device CPU affinity. part 1/8 = cpus 64 - 127
- */
-#define DCGM_FI_DEV_CPU_AFFINITY_1 71
-
+ */
+#define DCGM_FI_DEV_CPU_AFFINITY_1 71
+
/**
* Device CPU affinity. part 2/8 = cpus 128 - 191
- */
-#define DCGM_FI_DEV_CPU_AFFINITY_2 72
+ */
+#define DCGM_FI_DEV_CPU_AFFINITY_2 72
/**
* Device CPU affinity. part 3/8 = cpus 192 - 255
*/
-#define DCGM_FI_DEV_CPU_AFFINITY_3 73
+#define DCGM_FI_DEV_CPU_AFFINITY_3 73
/**
* ECC inforom version
*/
-#define DCGM_FI_DEV_ECC_INFOROM_VER 80
+#define DCGM_FI_DEV_ECC_INFOROM_VER 80
/**
* Power management object inforom version
*/
-#define DCGM_FI_DEV_POWER_INFOROM_VER 81
+#define DCGM_FI_DEV_POWER_INFOROM_VER 81
/**
* Inforom image version
*/
-#define DCGM_FI_DEV_INFOROM_IMAGE_VER 82
+#define DCGM_FI_DEV_INFOROM_IMAGE_VER 82
/**
* Inforom configuration checksum
*/
-#define DCGM_FI_DEV_INFOROM_CONFIG_CHECK 83
+#define DCGM_FI_DEV_INFOROM_CONFIG_CHECK 83
/**
* Reads the infoROM from the flash and verifies the checksums
*/
-#define DCGM_FI_DEV_INFOROM_CONFIG_VALID 84
+#define DCGM_FI_DEV_INFOROM_CONFIG_VALID 84
/**
* VBIOS version of the device
*/
-#define DCGM_FI_DEV_VBIOS_VERSION 85
+#define DCGM_FI_DEV_VBIOS_VERSION 85
/**
* Total BAR1 of the GPU in MB
*/
-#define DCGM_FI_DEV_BAR1_TOTAL 90
+#define DCGM_FI_DEV_BAR1_TOTAL 90
/**
- * Sync boost settings on the node
+ * Deprecated - Sync boost settings on the node
*/
-#define DCGM_FI_SYNC_BOOST 91
+#define DCGM_FI_SYNC_BOOST 91
/**
* Used BAR1 of the GPU in MB
*/
-#define DCGM_FI_DEV_BAR1_USED 92
+#define DCGM_FI_DEV_BAR1_USED 92
/**
* Free BAR1 of the GPU in MB
*/
-#define DCGM_FI_DEV_BAR1_FREE 93
+#define DCGM_FI_DEV_BAR1_FREE 93
/**
* SM clock for the device
*/
-#define DCGM_FI_DEV_SM_CLOCK 100
+#define DCGM_FI_DEV_SM_CLOCK 100
/**
* Memory clock for the device
*/
-#define DCGM_FI_DEV_MEM_CLOCK 101
+#define DCGM_FI_DEV_MEM_CLOCK 101
/**
* Video encoder/decoder clock for the device
*/
-#define DCGM_FI_DEV_VIDEO_CLOCK 102
+#define DCGM_FI_DEV_VIDEO_CLOCK 102
/**
* SM Application clocks
*/
-#define DCGM_FI_DEV_APP_SM_CLOCK 110
+#define DCGM_FI_DEV_APP_SM_CLOCK 110
/**
* Memory Application clocks
*/
-#define DCGM_FI_DEV_APP_MEM_CLOCK 111
+#define DCGM_FI_DEV_APP_MEM_CLOCK 111
/**
* Current clock throttle reasons (bitmask of DCGM_CLOCKS_THROTTLE_REASON_*)
@@ -377,42 +420,53 @@ typedef unsigned int dcgm_field_eid_t;
/**
* Maximum supported SM clock for the device
*/
-#define DCGM_FI_DEV_MAX_SM_CLOCK 113
+#define DCGM_FI_DEV_MAX_SM_CLOCK 113
/**
* Maximum supported Memory clock for the device
*/
-#define DCGM_FI_DEV_MAX_MEM_CLOCK 114
+#define DCGM_FI_DEV_MAX_MEM_CLOCK 114
/**
* Maximum supported Video encoder/decoder clock for the device
*/
-#define DCGM_FI_DEV_MAX_VIDEO_CLOCK 115
+#define DCGM_FI_DEV_MAX_VIDEO_CLOCK 115
/**
* Auto-boost for the device (1 = enabled. 0 = disabled)
*/
-#define DCGM_FI_DEV_AUTOBOOST 120
+#define DCGM_FI_DEV_AUTOBOOST 120
/**
* Supported clocks for the device
*/
-#define DCGM_FI_DEV_SUPPORTED_CLOCKS 130
+#define DCGM_FI_DEV_SUPPORTED_CLOCKS 130
/**
* Memory temperature for the device
*/
-#define DCGM_FI_DEV_MEMORY_TEMP 140
+#define DCGM_FI_DEV_MEMORY_TEMP 140
/**
* Current temperature readings for the device, in degrees C
*/
-#define DCGM_FI_DEV_GPU_TEMP 150
+#define DCGM_FI_DEV_GPU_TEMP 150
+
+/**
+ * Maximum operating temperature for the memory of this GPU
+ */
+#define DCGM_FI_DEV_MEM_MAX_OP_TEMP 151
+
+/**
+ * Maximum operating temperature for this GPU
+ */
+#define DCGM_FI_DEV_GPU_MAX_OP_TEMP 152
+
/**
* Power usage for the device in Watts
*/
-#define DCGM_FI_DEV_POWER_USAGE 155
+#define DCGM_FI_DEV_POWER_USAGE 155
/**
* Total energy consumption for the GPU in mJ since the driver was last reloaded
@@ -422,72 +476,76 @@ typedef unsigned int dcgm_field_eid_t;
/**
* Slowdown temperature for the device
*/
-#define DCGM_FI_DEV_SLOWDOWN_TEMP 158
+#define DCGM_FI_DEV_SLOWDOWN_TEMP 158
/**
* Shutdown temperature for the device
*/
-#define DCGM_FI_DEV_SHUTDOWN_TEMP 159
+#define DCGM_FI_DEV_SHUTDOWN_TEMP 159
/**
* Current Power limit for the device
*/
-#define DCGM_FI_DEV_POWER_MGMT_LIMIT 160
+#define DCGM_FI_DEV_POWER_MGMT_LIMIT 160
/**
* Minimum power management limit for the device
*/
-#define DCGM_FI_DEV_POWER_MGMT_LIMIT_MIN 161
+#define DCGM_FI_DEV_POWER_MGMT_LIMIT_MIN 161
/**
* Maximum power management limit for the device
*/
-#define DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX 162
+#define DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX 162
/**
* Default power management limit for the device
*/
-#define DCGM_FI_DEV_POWER_MGMT_LIMIT_DEF 163
+#define DCGM_FI_DEV_POWER_MGMT_LIMIT_DEF 163
/**
* Effective power limit that the driver enforces after taking into account all limiters
*/
-#define DCGM_FI_DEV_ENFORCED_POWER_LIMIT 164
+#define DCGM_FI_DEV_ENFORCED_POWER_LIMIT 164
/**
* Performance state (P-State) 0-15. 0=highest
*/
-#define DCGM_FI_DEV_PSTATE 190
+#define DCGM_FI_DEV_PSTATE 190
/**
* Fan speed for the device in percent 0-100
*/
-#define DCGM_FI_DEV_FAN_SPEED 191
+#define DCGM_FI_DEV_FAN_SPEED 191
/**
* PCIe Tx utilization information
+ *
+ * Deprecated: Use DCGM_FI_PROF_PCIE_TX_BYTES instead.
*/
-#define DCGM_FI_DEV_PCIE_TX_THROUGHPUT 200
-
+#define DCGM_FI_DEV_PCIE_TX_THROUGHPUT 200
+
/**
* PCIe Rx utilization information
- */
-#define DCGM_FI_DEV_PCIE_RX_THROUGHPUT 201
-
+ *
+ * Deprecated: Use DCGM_FI_PROF_PCIE_RX_BYTES instead.
+ */
+#define DCGM_FI_DEV_PCIE_RX_THROUGHPUT 201
+
/**
* PCIe replay counter
*/
-#define DCGM_FI_DEV_PCIE_REPLAY_COUNTER 202
+#define DCGM_FI_DEV_PCIE_REPLAY_COUNTER 202
/**
* GPU Utilization
*/
-#define DCGM_FI_DEV_GPU_UTIL 203
+#define DCGM_FI_DEV_GPU_UTIL 203
/**
* Memory Utilization
*/
-#define DCGM_FI_DEV_MEM_COPY_UTIL 204
+#define DCGM_FI_DEV_MEM_COPY_UTIL 204
/**
* Process accounting stats.
@@ -496,17 +554,17 @@ typedef unsigned int dcgm_field_eid_t;
* enable accounting ahead of time. Accounting mode can be enabled by
* running "nvidia-smi -am 1" as root on the same node the host engine is running on.
*/
-#define DCGM_FI_DEV_ACCOUNTING_DATA 205
+#define DCGM_FI_DEV_ACCOUNTING_DATA 205
/**
* Encoder Utilization
*/
-#define DCGM_FI_DEV_ENC_UTIL 206
+#define DCGM_FI_DEV_ENC_UTIL 206
/**
* Decoder Utilization
*/
-#define DCGM_FI_DEV_DEC_UTIL 207
+#define DCGM_FI_DEV_DEC_UTIL 207
/**
* Memory utilization samples
@@ -516,57 +574,57 @@ typedef unsigned int dcgm_field_eid_t;
/*
* SM utilization samples
*/
-#define DCGM_FI_DEV_GPU_UTIL_SAMPLES 211
+#define DCGM_FI_DEV_GPU_UTIL_SAMPLES 211
/**
* Graphics processes running on the GPU.
*/
-#define DCGM_FI_DEV_GRAPHICS_PIDS 220
+#define DCGM_FI_DEV_GRAPHICS_PIDS 220
/**
* Compute processes running on the GPU.
*/
-#define DCGM_FI_DEV_COMPUTE_PIDS 221
+#define DCGM_FI_DEV_COMPUTE_PIDS 221
/**
* XID errors. The value is the specific XID error
*/
-#define DCGM_FI_DEV_XID_ERRORS 230
+#define DCGM_FI_DEV_XID_ERRORS 230
/**
* PCIe Max Link Generation
*/
-#define DCGM_FI_DEV_PCIE_MAX_LINK_GEN 235
+#define DCGM_FI_DEV_PCIE_MAX_LINK_GEN 235
/**
* PCIe Max Link Width
*/
-#define DCGM_FI_DEV_PCIE_MAX_LINK_WIDTH 236
+#define DCGM_FI_DEV_PCIE_MAX_LINK_WIDTH 236
/**
* PCIe Current Link Generation
*/
-#define DCGM_FI_DEV_PCIE_LINK_GEN 237
+#define DCGM_FI_DEV_PCIE_LINK_GEN 237
/**
* PCIe Current Link Width
*/
-#define DCGM_FI_DEV_PCIE_LINK_WIDTH 238
+#define DCGM_FI_DEV_PCIE_LINK_WIDTH 238
/**
* Power Violation time in usec
*/
-#define DCGM_FI_DEV_POWER_VIOLATION 240
+#define DCGM_FI_DEV_POWER_VIOLATION 240
/**
* Thermal Violation time in usec
*/
-#define DCGM_FI_DEV_THERMAL_VIOLATION 241
+#define DCGM_FI_DEV_THERMAL_VIOLATION 241
/**
* Sync Boost Violation time in usec
*/
-#define DCGM_FI_DEV_SYNC_BOOST_VIOLATION 242
+#define DCGM_FI_DEV_SYNC_BOOST_VIOLATION 242
/**
* Board violation limit.
@@ -576,7 +634,7 @@ typedef unsigned int dcgm_field_eid_t;
/**
*Low utilisation violation limit.
*/
-#define DCGM_FI_DEV_LOW_UTIL_VIOLATION 244
+#define DCGM_FI_DEV_LOW_UTIL_VIOLATION 244
/**
*Reliability violation limit.
@@ -596,1301 +654,1439 @@ typedef unsigned int dcgm_field_eid_t;
/**
* Total Frame Buffer of the GPU in MB
*/
-#define DCGM_FI_DEV_FB_TOTAL 250
+#define DCGM_FI_DEV_FB_TOTAL 250
/**
* Free Frame Buffer in MB
*/
-#define DCGM_FI_DEV_FB_FREE 251
+#define DCGM_FI_DEV_FB_FREE 251
/**
* Used Frame Buffer in MB
*/
-#define DCGM_FI_DEV_FB_USED 252
+#define DCGM_FI_DEV_FB_USED 252
/**
* Current ECC mode for the device
*/
-#define DCGM_FI_DEV_ECC_CURRENT 300
-
+#define DCGM_FI_DEV_ECC_CURRENT 300
+
/**
* Pending ECC mode for the device
- */
-#define DCGM_FI_DEV_ECC_PENDING 301
-
+ */
+#define DCGM_FI_DEV_ECC_PENDING 301
+
/**
* Total single bit volatile ECC errors
- */
-#define DCGM_FI_DEV_ECC_SBE_VOL_TOTAL 310
-
+ */
+#define DCGM_FI_DEV_ECC_SBE_VOL_TOTAL 310
+
/**
* Total double bit volatile ECC errors
- */
-#define DCGM_FI_DEV_ECC_DBE_VOL_TOTAL 311
-
+ */
+#define DCGM_FI_DEV_ECC_DBE_VOL_TOTAL 311
+
/**
* Total single bit aggregate (persistent) ECC errors
* Note: monotonically increasing
- */
-#define DCGM_FI_DEV_ECC_SBE_AGG_TOTAL 312
-
+ */
+#define DCGM_FI_DEV_ECC_SBE_AGG_TOTAL 312
+
/**
* Total double bit aggregate (persistent) ECC errors
* Note: monotonically increasing
- */
-#define DCGM_FI_DEV_ECC_DBE_AGG_TOTAL 313
-
+ */
+#define DCGM_FI_DEV_ECC_DBE_AGG_TOTAL 313
+
/**
* L1 cache single bit volatile ECC errors
*/
-#define DCGM_FI_DEV_ECC_SBE_VOL_L1 314
-
+#define DCGM_FI_DEV_ECC_SBE_VOL_L1 314
+
/**
* L1 cache double bit volatile ECC errors
*/
-#define DCGM_FI_DEV_ECC_DBE_VOL_L1 315
-
+#define DCGM_FI_DEV_ECC_DBE_VOL_L1 315
+
/**
* L2 cache single bit volatile ECC errors
*/
-#define DCGM_FI_DEV_ECC_SBE_VOL_L2 316
-
+#define DCGM_FI_DEV_ECC_SBE_VOL_L2 316
+
/**
* L2 cache double bit volatile ECC errors
*/
-#define DCGM_FI_DEV_ECC_DBE_VOL_L2 317
-
+#define DCGM_FI_DEV_ECC_DBE_VOL_L2 317
+
/**
* Device memory single bit volatile ECC errors
*/
-#define DCGM_FI_DEV_ECC_SBE_VOL_DEV 318
+#define DCGM_FI_DEV_ECC_SBE_VOL_DEV 318
/**
* Device memory double bit volatile ECC errors
*/
-#define DCGM_FI_DEV_ECC_DBE_VOL_DEV 319
-
+#define DCGM_FI_DEV_ECC_DBE_VOL_DEV 319
+
/**
* Register file single bit volatile ECC errors
*/
-#define DCGM_FI_DEV_ECC_SBE_VOL_REG 320
-
+#define DCGM_FI_DEV_ECC_SBE_VOL_REG 320
+
/**
* Register file double bit volatile ECC errors
- */
-#define DCGM_FI_DEV_ECC_DBE_VOL_REG 321
-
+ */
+#define DCGM_FI_DEV_ECC_DBE_VOL_REG 321
+
/**
* Texture memory single bit volatile ECC errors
- */
-#define DCGM_FI_DEV_ECC_SBE_VOL_TEX 322
-
+ */
+#define DCGM_FI_DEV_ECC_SBE_VOL_TEX 322
+
/**
* Texture memory double bit volatile ECC errors
- */
-#define DCGM_FI_DEV_ECC_DBE_VOL_TEX 323
-
+ */
+#define DCGM_FI_DEV_ECC_DBE_VOL_TEX 323
+
/**
* L1 cache single bit aggregate (persistent) ECC errors
* Note: monotonically increasing
- */
-#define DCGM_FI_DEV_ECC_SBE_AGG_L1 324
-
+ */
+#define DCGM_FI_DEV_ECC_SBE_AGG_L1 324
+
/**
* L1 cache double bit aggregate (persistent) ECC errors
* Note: monotonically increasing
- */
-#define DCGM_FI_DEV_ECC_DBE_AGG_L1 325
-
+ */
+#define DCGM_FI_DEV_ECC_DBE_AGG_L1 325
+
/**
* L2 cache single bit aggregate (persistent) ECC errors
* Note: monotonically increasing
- */
-#define DCGM_FI_DEV_ECC_SBE_AGG_L2 326
+ */
+#define DCGM_FI_DEV_ECC_SBE_AGG_L2 326
/**
* L2 cache double bit aggregate (persistent) ECC errors
* Note: monotonically increasing
- */
-#define DCGM_FI_DEV_ECC_DBE_AGG_L2 327
-
+ */
+#define DCGM_FI_DEV_ECC_DBE_AGG_L2 327
+
/**
* Device memory single bit aggregate (persistent) ECC errors
* Note: monotonically increasing
- */
-#define DCGM_FI_DEV_ECC_SBE_AGG_DEV 328
-
+ */
+#define DCGM_FI_DEV_ECC_SBE_AGG_DEV 328
+
/**
* Device memory double bit aggregate (persistent) ECC errors
* Note: monotonically increasing
- */
-#define DCGM_FI_DEV_ECC_DBE_AGG_DEV 329
-
+ */
+#define DCGM_FI_DEV_ECC_DBE_AGG_DEV 329
+
/**
* Register File single bit aggregate (persistent) ECC errors
* Note: monotonically increasing
- */
-#define DCGM_FI_DEV_ECC_SBE_AGG_REG 330
-
+ */
+#define DCGM_FI_DEV_ECC_SBE_AGG_REG 330
+
/**
* Register File double bit aggregate (persistent) ECC errors
* Note: monotonically increasing
*/
-#define DCGM_FI_DEV_ECC_DBE_AGG_REG 331
-
+#define DCGM_FI_DEV_ECC_DBE_AGG_REG 331
+
/**
* Texture memory single bit aggregate (persistent) ECC errors
* Note: monotonically increasing
*/
-#define DCGM_FI_DEV_ECC_SBE_AGG_TEX 332
+#define DCGM_FI_DEV_ECC_SBE_AGG_TEX 332
/**
* Texture memory double bit aggregate (persistent) ECC errors
* Note: monotonically increasing
- */
-#define DCGM_FI_DEV_ECC_DBE_AGG_TEX 333
-
+ */
+#define DCGM_FI_DEV_ECC_DBE_AGG_TEX 333
+
/**
* Number of retired pages because of single bit errors
* Note: monotonically increasing
*/
-#define DCGM_FI_DEV_RETIRED_SBE 390
+#define DCGM_FI_DEV_RETIRED_SBE 390
/**
* Number of retired pages because of double bit errors
* Note: monotonically increasing
*/
-#define DCGM_FI_DEV_RETIRED_DBE 391
+#define DCGM_FI_DEV_RETIRED_DBE 391
/**
* Number of pages pending retirement
*/
-#define DCGM_FI_DEV_RETIRED_PENDING 392
+#define DCGM_FI_DEV_RETIRED_PENDING 392
+
+/**
+ * Number of remapped rows for uncorrectable errors
+ */
+#define DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS 393
+
+/**
+ * Number of remapped rows for correctable errors
+ */
+#define DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS 394
+
+/**
+ * Whether remapping of rows has failed
+ */
+#define DCGM_FI_DEV_ROW_REMAP_FAILURE 395
/*
-* NV Link flow control CRC Error Counter for Lane 0
-*/
-#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L0 400
+ * NV Link flow control CRC Error Counter for Lane 0
+ */
+#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L0 400
/*
-* NV Link flow control CRC Error Counter for Lane 1
-*/
-#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L1 401
+ * NV Link flow control CRC Error Counter for Lane 1
+ */
+#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L1 401
/*
-* NV Link flow control CRC Error Counter for Lane 2
-*/
-#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L2 402
+ * NV Link flow control CRC Error Counter for Lane 2
+ */
+#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L2 402
/*
-* NV Link flow control CRC Error Counter for Lane 3
-*/
-#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L3 403
+ * NV Link flow control CRC Error Counter for Lane 3
+ */
+#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L3 403
/*
-* NV Link flow control CRC Error Counter for Lane 4
-*/
-#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L4 404
+ * NV Link flow control CRC Error Counter for Lane 4
+ */
+#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L4 404
/*
-* NV Link flow control CRC Error Counter for Lane 5
-*/
-#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L5 405
+ * NV Link flow control CRC Error Counter for Lane 5
+ */
+#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L5 405
/*
-* NV Link flow control CRC Error Counter total for all Lanes
-*/
-#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL 409
+ * NV Link flow control CRC Error Counter total for all Lanes
+ */
+#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL 409
/*
-* NV Link data CRC Error Counter for Lane 0
-*/
-#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L0 410
+ * NV Link data CRC Error Counter for Lane 0
+ */
+#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L0 410
/*
-* NV Link data CRC Error Counter for Lane 1
-*/
-#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L1 411
+ * NV Link data CRC Error Counter for Lane 1
+ */
+#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L1 411
/*
-* NV Link data CRC Error Counter for Lane 2
-*/
-#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L2 412
+ * NV Link data CRC Error Counter for Lane 2
+ */
+#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L2 412
/*
-* NV Link data CRC Error Counter for Lane 3
-*/
-#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L3 413
+ * NV Link data CRC Error Counter for Lane 3
+ */
+#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L3 413
/*
-* NV Link data CRC Error Counter for Lane 4
-*/
-#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L4 414
+ * NV Link data CRC Error Counter for Lane 4
+ */
+#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L4 414
/*
-* NV Link data CRC Error Counter for Lane 5
-*/
-#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L5 415
+ * NV Link data CRC Error Counter for Lane 5
+ */
+#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L5 415
/*
-* NV Link data CRC Error Counter total for all Lanes
-*/
-#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL 419
+ * NV Link data CRC Error Counter total for all Lanes
+ */
+#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL 419
/*
-* NV Link Replay Error Counter for Lane 0
-*/
-#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L0 420
+ * NV Link Replay Error Counter for Lane 0
+ */
+#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L0 420
/*
-* NV Link Replay Error Counter for Lane 1
-*/
-#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L1 421
+ * NV Link Replay Error Counter for Lane 1
+ */
+#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L1 421
/*
-* NV Link Replay Error Counter for Lane 2
-*/
-#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L2 422
+ * NV Link Replay Error Counter for Lane 2
+ */
+#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L2 422
/*
-* NV Link Replay Error Counter for Lane 3
-*/
-#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L3 423
+ * NV Link Replay Error Counter for Lane 3
+ */
+#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L3 423
/*
-* NV Link Replay Error Counter for Lane 4
-*/
-#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L4 424
+ * NV Link Replay Error Counter for Lane 4
+ */
+#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L4 424
/*
-* NV Link Replay Error Counter for Lane 5
-*/
-#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L5 425
+ * NV Link Replay Error Counter for Lane 5
+ */
+#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L5 425
/*
-* NV Link Replay Error Counter total for all Lanes
-*/
-#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL 429
+ * NV Link Replay Error Counter total for all Lanes
+ */
+#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL 429
/*
-* NV Link Recovery Error Counter for Lane 0
-*/
-#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L0 430
+ * NV Link Recovery Error Counter for Lane 0
+ */
+#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L0 430
/*
-* NV Link Recovery Error Counter for Lane 1
-*/
-#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L1 431
+ * NV Link Recovery Error Counter for Lane 1
+ */
+#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L1 431
/*
-* NV Link Recovery Error Counter for Lane 2
-*/
-#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L2 432
+ * NV Link Recovery Error Counter for Lane 2
+ */
+#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L2 432
/*
-* NV Link Recovery Error Counter for Lane 3
-*/
-#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L3 433
+ * NV Link Recovery Error Counter for Lane 3
+ */
+#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L3 433
/*
-* NV Link Recovery Error Counter for Lane 4
-*/
-#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L4 434
+ * NV Link Recovery Error Counter for Lane 4
+ */
+#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L4 434
/*
-* NV Link Recovery Error Counter for Lane 5
-*/
-#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L5 435
+ * NV Link Recovery Error Counter for Lane 5
+ */
+#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L5 435
/*
-* NV Link Recovery Error Counter total for all Lanes
-*/
-#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL 439
+ * NV Link Recovery Error Counter total for all Lanes
+ */
+#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL 439
/*
-* NV Link Bandwidth Counter for Lane 0
-*/
-#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L0 440
+ * NV Link Bandwidth Counter for Lane 0 - Not supported in DCGM 2.0
+ */
+#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L0 440
/*
-* NV Link Bandwidth Counter for Lane 1
-*/
-#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L1 441
+ * NV Link Bandwidth Counter for Lane 1 - Not supported in DCGM 2.0
+ */
+#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L1 441
/*
-* NV Link Bandwidth Counter for Lane 2
-*/
-#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L2 442
+ * NV Link Bandwidth Counter for Lane 2 - Not supported in DCGM 2.0
+ */
+#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L2 442
/*
-* NV Link Bandwidth Counter for Lane 3
-*/
-#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L3 443
+ * NV Link Bandwidth Counter for Lane 3 - Not supported in DCGM 2.0
+ */
+#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L3 443
/*
-* NV Link Bandwidth Counter for Lane 4
-*/
-#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L4 444
+ * NV Link Bandwidth Counter for Lane 4 - Not supported in DCGM 2.0
+ */
+#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L4 444
/*
-* NV Link Bandwidth Counter for Lane 5
-*/
-#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L5 445
+ * NV Link Bandwidth Counter for Lane 5 - Not supported in DCGM 2.0
+ */
+#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L5 445
/*
-* NV Link Bandwidth Counter total for all Lanes
-*/
-#define DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL 449
+ * NV Link Bandwidth Counter total for all Lanes
+ */
+#define DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL 449
/*
-* GPU NVLink error information
-*/
-#define DCGM_FI_DEV_GPU_NVLINK_ERRORS 450
+ * GPU NVLink error information
+ */
+#define DCGM_FI_DEV_GPU_NVLINK_ERRORS 450
+
+/*
+ * NV Link flow control CRC Error Counter for Lane 6
+ */
+#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L6 451
+
+/*
+ * NV Link flow control CRC Error Counter for Lane 7
+ */
+#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L7 452
+
+/*
+ * NV Link flow control CRC Error Counter for Lane 8
+ */
+#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L8 453
+
+/*
+ * NV Link flow control CRC Error Counter for Lane 9
+ */
+#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L9 454
+
+/*
+ * NV Link flow control CRC Error Counter for Lane 10
+ */
+#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L10 455
+
+/*
+ * NV Link flow control CRC Error Counter for Lane 11
+ */
+#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L11 456
+
+/*
+ * NV Link data CRC Error Counter for Lane 6
+ */
+#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L6 457
+
+/*
+ * NV Link data CRC Error Counter for Lane 7
+ */
+#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L7 458
+
+/*
+ * NV Link data CRC Error Counter for Lane 8
+ */
+#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L8 459
+
+/*
+ * NV Link data CRC Error Counter for Lane 9
+ */
+#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L9 460
+
+/*
+ * NV Link data CRC Error Counter for Lane 10
+ */
+#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L10 461
+
+/*
+ * NV Link data CRC Error Counter for Lane 11
+ */
+#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L11 462
+
+/*
+ * NV Link Replay Error Counter for Lane 6
+ */
+#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L6 463
+
+/*
+ * NV Link Replay Error Counter for Lane 7
+ */
+#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L7 464
+
+/*
+ * NV Link Replay Error Counter for Lane 8
+ */
+#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L8 465
+
+/*
+ * NV Link Replay Error Counter for Lane 9
+ */
+#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L9 466
+
+/*
+ * NV Link Replay Error Counter for Lane 10
+ */
+#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L10 467
+
+/*
+ * NV Link Replay Error Counter for Lane 11
+ */
+#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L11 468
+
+/*
+ * NV Link Recovery Error Counter for Lane 6
+ */
+#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L6 469
+
+/*
+ * NV Link Recovery Error Counter for Lane 7
+ */
+#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L7 470
+
+/*
+ * NV Link Recovery Error Counter for Lane 8
+ */
+#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L8 471
+
+/*
+ * NV Link Recovery Error Counter for Lane 9
+ */
+#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L9 472
+
+/*
+ * NV Link Recovery Error Counter for Lane 10
+ */
+#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L10 473
+
+/*
+ * NV Link Recovery Error Counter for Lane 11
+ */
+#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L11 474
/**
- * Virtualization Mode corresponding to the GPU
+ * Virtualization Mode corresponding to the GPU.
+ *
+ * One of DCGM_GPU_VIRTUALIZATION_MODE_* constants.
*/
-#define DCGM_FI_DEV_VIRTUAL_MODE 500
+#define DCGM_FI_DEV_VIRTUAL_MODE 500
/**
* Includes Count and Static info of vGPU types supported on a device
*/
-#define DCGM_FI_DEV_SUPPORTED_TYPE_INFO 501
+#define DCGM_FI_DEV_SUPPORTED_TYPE_INFO 501
/**
* Includes Count and currently Creatable vGPU types on a device
*/
-#define DCGM_FI_DEV_CREATABLE_VGPU_TYPE_IDS 502
+#define DCGM_FI_DEV_CREATABLE_VGPU_TYPE_IDS 502
/**
* Includes Count and currently Active vGPU Instances on a device
*/
-#define DCGM_FI_DEV_VGPU_INSTANCE_IDS 503
+#define DCGM_FI_DEV_VGPU_INSTANCE_IDS 503
/**
* Utilization values for vGPUs running on the device
*/
-#define DCGM_FI_DEV_VGPU_UTILIZATIONS 504
+#define DCGM_FI_DEV_VGPU_UTILIZATIONS 504
/**
* Utilization values for processes running within vGPU VMs using the device
*/
-#define DCGM_FI_DEV_VGPU_PER_PROCESS_UTILIZATION 505
+#define DCGM_FI_DEV_VGPU_PER_PROCESS_UTILIZATION 505
/**
* Current encoder statistics for a given device
*/
-#define DCGM_FI_DEV_ENC_STATS 506
+#define DCGM_FI_DEV_ENC_STATS 506
/**
* Statistics of current active frame buffer capture sessions on a given device
*/
-#define DCGM_FI_DEV_FBC_STATS 507
+#define DCGM_FI_DEV_FBC_STATS 507
/**
* Information about active frame buffer capture sessions on a target device
*/
-#define DCGM_FI_DEV_FBC_SESSIONS_INFO 508
+#define DCGM_FI_DEV_FBC_SESSIONS_INFO 508
/**
* VM ID of the vGPU instance
*/
-#define DCGM_FI_DEV_VGPU_VM_ID 520
+#define DCGM_FI_DEV_VGPU_VM_ID 520
/**
* VM name of the vGPU instance
*/
-#define DCGM_FI_DEV_VGPU_VM_NAME 521
+#define DCGM_FI_DEV_VGPU_VM_NAME 521
/**
* vGPU type of the vGPU instance
*/
-#define DCGM_FI_DEV_VGPU_TYPE 522
+#define DCGM_FI_DEV_VGPU_TYPE 522
/**
* UUID of the vGPU instance
*/
-#define DCGM_FI_DEV_VGPU_UUID 523
+#define DCGM_FI_DEV_VGPU_UUID 523
/**
* Driver version of the vGPU instance
*/
-#define DCGM_FI_DEV_VGPU_DRIVER_VERSION 524
+#define DCGM_FI_DEV_VGPU_DRIVER_VERSION 524
/**
* Memory usage of the vGPU instance
*/
-#define DCGM_FI_DEV_VGPU_MEMORY_USAGE 525
+#define DCGM_FI_DEV_VGPU_MEMORY_USAGE 525
/**
* License status of the vGPU instance
*/
-#define DCGM_FI_DEV_VGPU_LICENSE_STATUS 526
+#define DCGM_FI_DEV_VGPU_LICENSE_STATUS 526
/**
* Frame rate limit of the vGPU instance
*/
-#define DCGM_FI_DEV_VGPU_FRAME_RATE_LIMIT 527
+#define DCGM_FI_DEV_VGPU_FRAME_RATE_LIMIT 527
/**
* Current encoder statistics of the vGPU instance
*/
-#define DCGM_FI_DEV_VGPU_ENC_STATS 528
+#define DCGM_FI_DEV_VGPU_ENC_STATS 528
/**
* Information about all active encoder sessions on the vGPU instance
*/
-#define DCGM_FI_DEV_VGPU_ENC_SESSIONS_INFO 529
+#define DCGM_FI_DEV_VGPU_ENC_SESSIONS_INFO 529
/**
* Statistics of current active frame buffer capture sessions on the vGPU instance
*/
-#define DCGM_FI_DEV_VGPU_FBC_STATS 530
+#define DCGM_FI_DEV_VGPU_FBC_STATS 530
/**
* Information about active frame buffer capture sessions on the vGPU instance
*/
-#define DCGM_FI_DEV_VGPU_FBC_SESSIONS_INFO 531
+#define DCGM_FI_DEV_VGPU_FBC_SESSIONS_INFO 531
/**
* Starting field ID of the vGPU instance
*/
-#define DCGM_FI_FIRST_VGPU_FIELD_ID 520
+#define DCGM_FI_FIRST_VGPU_FIELD_ID 520
/**
* Last field ID of the vGPU instance
*/
-#define DCGM_FI_LAST_VGPU_FIELD_ID 570
+#define DCGM_FI_LAST_VGPU_FIELD_ID 570
/**
* For now max vGPU field Ids taken as difference of DCGM_FI_LAST_VGPU_FIELD_ID and DCGM_FI_LAST_VGPU_FIELD_ID i.e. 50
*/
-#define DCGM_FI_MAX_VGPU_FIELDS DCGM_FI_LAST_VGPU_FIELD_ID - DCGM_FI_FIRST_VGPU_FIELD_ID
+#define DCGM_FI_MAX_VGPU_FIELDS (DCGM_FI_LAST_VGPU_FIELD_ID - DCGM_FI_FIRST_VGPU_FIELD_ID)
/**
* Starting ID for all the internal fields
*/
-#define DCGM_FI_INTERNAL_FIELDS_0_START 600
+#define DCGM_FI_INTERNAL_FIELDS_0_START 600
/**
* Last ID for all the internal fields
*/
/**
-*
-*
-*
-* NVSwitch entity field IDs start here.
-*
-*
-* NVSwitch latency bins for port 0
-*/
-
-#define DCGM_FI_INTERNAL_FIELDS_0_END 699
-
-
-/**
-* Low latency bin
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P00 700
-/**
-* Medium latency bin
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P00 701
-/**
-* High latency bin
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P00 702
-/**
-* Max latency bin
-*
-*
-* NVSwitch latency bins for port 1
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P00 703
-
-/**
-* Low latency bin
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P01 704
-/**
-* Medium latency bin
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P01 705
-/**
-* High latency bin
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P01 706
-/**
-* Max latency bin
-*
-*
-* NVSwitch latency bins for port 2
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P01 707
-
-/**
-* Low latency bin
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P02 708
-/**
-* Medium latency bin
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P02 709
-/**
-* High latency bin
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P02 710
-/**
-* Max latency bin
-*
-*
-* NVSwitch latency bins for port 3
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P02 711
-
-/**
-* Low latency bin
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P03 712
-/**
-* Medium latency bin
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P03 713
-/**
-* High latency bin
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P03 714
-/**
-* Max latency bin
-*
-*
-* NVSwitch latency bins for port 4
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P03 715
-
-/**
-* Low latency bin
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P04 716
-/**
-* Medium latency bin
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P04 717
-/**
-* High latency bin
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P04 718
-/**
-* Max latency bin
-*
-*
-* NVSwitch latency bins for port 5
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P04 719
-
-/**
-* Low latency bin
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P05 720
-/**
-* Medium latency bin
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P05 721
-/**
-* High latency bin
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P05 722
-/**
-* Max latency bin
-*
-*
-* NVSwitch latency bins for port 6
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P05 723
-
-/**
-* Low latency bin
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P06 724
-/**
-* Medium latency bin
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P06 725
-/**
-* High latency bin
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P06 726
-/**
-* Max latency bin
-*
-*
-* NVSwitch latency bins for port 7
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P06 727
-
-/**
-* Low latency bin
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P07 728
-/**
-* Medium latency bin
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P07 729
-/**
-* High latency bin
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P07 730
-/**
-* Max latency bin
-*
-*
-* NVSwitch latency bins for port 8
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P07 731
-
-/**
-* Low latency bin
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P08 732
-/**
-* Medium latency bin
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P08 733
-/**
-* High latency bin
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P08 734
-/**
-* Max latency bin
-*
-*
-* NVSwitch latency bins for port 9
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P08 735
-
-/**
-* Low latency bin
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P09 736
-/**
-* Medium latency bin
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P09 737
-/**
-* High latency bin
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P09 738
-/**
-* Max latency bin
-*
-*
-* NVSwitch latency bins for port 10
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P09 739
-
-/**
-* Low latency bin
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P10 740
-/**
-* Medium latency bin
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P10 741
-/**
-* High latency bin
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P10 742
-/**
-* Max latency bin
-*
-*
-* NVSwitch latency bins for port 11
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P10 743
-
-/**
-* Low latency bin
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P11 744
-/**
-* Medium latency bin
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P11 745
-/**
-* High latency bin
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P11 746
-/**
-* Max latency bin
-*
-*
-* NVSwitch latency bins for port 12
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P11 747
-
-/**
-* Low latency bin
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P12 748
-/**
-* Medium latency bin
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P12 749
-/**
-* High latency bin
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P12 750
-/**
-* Max latency bin
-*
-*
-* NVSwitch latency bins for port 13
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P12 751
-
-/**
-* Low latency bin
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P13 752
-/**
-* Medium latency bin
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P13 753
-/**
-* High latency bin
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P13 754
-/**
-* Max latency bin
-*
-*
-* NVSwitch latency bins for port 14
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P13 755
-
-/**
-* Low latency bin
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P14 756
-/**
-* Medium latency bin
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P14 757
-/**
-* High latency bin
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P14 758
-/**
-* Max latency bin
-*
-*
-* NVSwitch latency bins for port 15
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P14 759
-
-/**
-* Low latency bin
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P15 760
-/**
-* Medium latency bin
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P15 761
-/**
-* High latency bin
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P15 762
-/**
-* Max latency bin
-*
-*
-* NVSwitch latency bins for port 16
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P15 763
-
-/**
-* Low latency bin
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P16 764
-/**
-* Medium latency bin
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P16 765
-/**
-* High latency bin
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P16 766
-/**
-* Max latency bin
-*
-*
-* NVSwitch latency bins for port 17
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P16 767
-
-/**
-* Low latency bin
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P17 768
-/**
-* Medium latency bin
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P17 769
-/**
-* High latency bin
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P17 770
-/**
-* Max latency bin
-*
-*
-*
-* NVSwitch Tx and Rx Counter 0 for each port
-* By default, Counter 0 counts bytes.
-*/
-#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P17 771
-
-/**
-* NVSwitch Tx Bandwidth Counter 0 for port 0
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P00 780
-/**
-* NVSwitch Rx Bandwidth Counter 0 for port 0
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P00 781
-
-/**
-* NVSwitch Tx Bandwidth Counter 0 for port 1
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P01 782
-/**
-* NVSwitch Rx Bandwidth Counter 0 for port 1
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P01 783
-
-/**
-* NVSwitch Tx Bandwidth Counter 0 for port 2
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P02 784
-/**
-* NVSwitch Rx Bandwidth Counter 0 for port 2
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P02 785
-
-/**
-* NVSwitch Tx Bandwidth Counter 0 for port 3
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P03 786
+ *
+ *
+ *
+ * NVSwitch entity field IDs start here.
+ *
+ *
+ * NVSwitch latency bins for port 0
+ */
+
+#define DCGM_FI_INTERNAL_FIELDS_0_END 699
+
+
+/**
+ * Low latency bin
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P00 700
+/**
+ * Medium latency bin
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P00 701
+/**
+ * High latency bin
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P00 702
/**
-* NVSwitch Rx Bandwidth Counter 0 for port 3
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P03 787
+ * Max latency bin
+ *
+ *
+ * NVSwitch latency bins for port 1
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P00 703
/**
-* NVSwitch Tx Bandwidth Counter 0 for port 4
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P04 788
+ * Low latency bin
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P01 704
+/**
+ * Medium latency bin
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P01 705
+/**
+ * High latency bin
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P01 706
/**
-* NVSwitch Rx Bandwidth Counter 0 for port 4
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P04 789
+ * Max latency bin
+ *
+ *
+ * NVSwitch latency bins for port 2
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P01 707
/**
-* NVSwitch Tx Bandwidth Counter 0 for port 5
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P05 790
+ * Low latency bin
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P02 708
/**
-* NVSwitch Rx Bandwidth Counter 0 for port 5
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P05 791
+ * Medium latency bin
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P02 709
+/**
+ * High latency bin
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P02 710
+/**
+ * Max latency bin
+ *
+ *
+ * NVSwitch latency bins for port 3
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P02 711
/**
-* NVSwitch Tx Bandwidth Counter 0 for port 6
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P06 792
+ * Low latency bin
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P03 712
/**
-* NVSwitch Rx Bandwidth Counter 0 for port 6
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P06 793
+ * Medium latency bin
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P03 713
+/**
+ * High latency bin
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P03 714
+/**
+ * Max latency bin
+ *
+ *
+ * NVSwitch latency bins for port 4
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P03 715
+
+/**
+ * Low latency bin
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P04 716
+/**
+ * Medium latency bin
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P04 717
+/**
+ * High latency bin
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P04 718
+/**
+ * Max latency bin
+ *
+ *
+ * NVSwitch latency bins for port 5
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P04 719
+
+/**
+ * Low latency bin
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P05 720
+/**
+ * Medium latency bin
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P05 721
+/**
+ * High latency bin
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P05 722
+/**
+ * Max latency bin
+ *
+ *
+ * NVSwitch latency bins for port 6
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P05 723
+
+/**
+ * Low latency bin
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P06 724
+/**
+ * Medium latency bin
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P06 725
+/**
+ * High latency bin
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P06 726
+/**
+ * Max latency bin
+ *
+ *
+ * NVSwitch latency bins for port 7
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P06 727
+
+/**
+ * Low latency bin
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P07 728
+/**
+ * Medium latency bin
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P07 729
+/**
+ * High latency bin
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P07 730
+/**
+ * Max latency bin
+ *
+ *
+ * NVSwitch latency bins for port 8
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P07 731
+
+/**
+ * Low latency bin
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P08 732
+/**
+ * Medium latency bin
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P08 733
+/**
+ * High latency bin
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P08 734
+/**
+ * Max latency bin
+ *
+ *
+ * NVSwitch latency bins for port 9
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P08 735
+
+/**
+ * Low latency bin
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P09 736
+/**
+ * Medium latency bin
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P09 737
+/**
+ * High latency bin
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P09 738
+/**
+ * Max latency bin
+ *
+ *
+ * NVSwitch latency bins for port 10
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P09 739
+
+/**
+ * Low latency bin
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P10 740
+/**
+ * Medium latency bin
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P10 741
+/**
+ * High latency bin
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P10 742
+/**
+ * Max latency bin
+ *
+ *
+ * NVSwitch latency bins for port 11
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P10 743
+
+/**
+ * Low latency bin
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P11 744
+/**
+ * Medium latency bin
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P11 745
+/**
+ * High latency bin
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P11 746
+/**
+ * Max latency bin
+ *
+ *
+ * NVSwitch latency bins for port 12
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P11 747
+
+/**
+ * Low latency bin
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P12 748
+/**
+ * Medium latency bin
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P12 749
+/**
+ * High latency bin
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P12 750
+/**
+ * Max latency bin
+ *
+ *
+ * NVSwitch latency bins for port 13
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P12 751
+
+/**
+ * Low latency bin
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P13 752
+/**
+ * Medium latency bin
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P13 753
+/**
+ * High latency bin
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P13 754
+/**
+ * Max latency bin
+ *
+ *
+ * NVSwitch latency bins for port 14
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P13 755
+
+/**
+ * Low latency bin
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P14 756
+/**
+ * Medium latency bin
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P14 757
+/**
+ * High latency bin
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P14 758
+/**
+ * Max latency bin
+ *
+ *
+ * NVSwitch latency bins for port 15
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P14 759
+
+/**
+ * Low latency bin
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P15 760
+/**
+ * Medium latency bin
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P15 761
+/**
+ * High latency bin
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P15 762
+/**
+ * Max latency bin
+ *
+ *
+ * NVSwitch latency bins for port 16
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P15 763
+
+/**
+ * Low latency bin
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P16 764
+/**
+ * Medium latency bin
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P16 765
+/**
+ * High latency bin
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P16 766
+/**
+ * Max latency bin
+ *
+ *
+ * NVSwitch latency bins for port 17
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P16 767
/**
-* NVSwitch Tx Bandwidth Counter 0 for port 7
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P07 794
+ * Low latency bin
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P17 768
+/**
+ * Medium latency bin
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P17 769
+/**
+ * High latency bin
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P17 770
/**
-* NVSwitch Rx Bandwidth Counter 0 for port 7
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P07 795
+ * Max latency bin
+ *
+ *
+ *
+ * NVSwitch Tx and Rx Counter 0 for each port
+ * By default, Counter 0 counts bytes.
+ */
+#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P17 771
/**
-* NVSwitch Tx Bandwidth Counter 0 for port 8
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P08 796
+ * NVSwitch Tx Bandwidth Counter 0 for port 0
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P00 780
/**
-* NVSwitch Rx Bandwidth Counter 0 for port 8
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P08 797
+ * NVSwitch Rx Bandwidth Counter 0 for port 0
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P00 781
/**
-* NVSwitch Tx Bandwidth Counter 0 for port 9
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P09 798
+ * NVSwitch Tx Bandwidth Counter 0 for port 1
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P01 782
/**
-* NVSwitch Rx Bandwidth Counter 0 for port 9
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P09 799
+ * NVSwitch Rx Bandwidth Counter 0 for port 1
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P01 783
/**
-* NVSwitch Tx Bandwidth Counter 0 for port 10
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P10 800
+ * NVSwitch Tx Bandwidth Counter 0 for port 2
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P02 784
/**
-* NVSwitch Rx Bandwidth Counter 0 for port 10
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P10 801
+ * NVSwitch Rx Bandwidth Counter 0 for port 2
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P02 785
/**
-* NVSwitch Tx Bandwidth Counter 0 for port 11
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P11 802
+ * NVSwitch Tx Bandwidth Counter 0 for port 3
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P03 786
/**
-* NVSwitch Rx Bandwidth Counter 0 for port 11
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P11 803
-
+ * NVSwitch Rx Bandwidth Counter 0 for port 3
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P03 787
+
/**
-* NVSwitch Tx Bandwidth Counter 0 for port 12
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P12 804
+ * NVSwitch Tx Bandwidth Counter 0 for port 4
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P04 788
/**
-* NVSwitch Rx Bandwidth Counter 0 for port 12
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P12 805
+ * NVSwitch Rx Bandwidth Counter 0 for port 4
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P04 789
/**
-* NVSwitch Tx Bandwidth Counter 0 for port 13
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P13 806
+ * NVSwitch Tx Bandwidth Counter 0 for port 5
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P05 790
/**
-* NVSwitch Rx Bandwidth Counter 0 for port 13
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P13 807
+ * NVSwitch Rx Bandwidth Counter 0 for port 5
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P05 791
/**
-* NVSwitch Tx Bandwidth Counter 0 for port 14
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P14 808
+ * NVSwitch Tx Bandwidth Counter 0 for port 6
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P06 792
/**
-* NVSwitch Rx Bandwidth Counter 0 for port 14
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P14 809
+ * NVSwitch Rx Bandwidth Counter 0 for port 6
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P06 793
/**
-* NVSwitch Tx Bandwidth Counter 0 for port 15
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P15 810
+ * NVSwitch Tx Bandwidth Counter 0 for port 7
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P07 794
/**
-* NVSwitch Rx Bandwidth Counter 0 for port 15
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P15 811
+ * NVSwitch Rx Bandwidth Counter 0 for port 7
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P07 795
/**
-* NVSwitch Tx Bandwidth Counter 0 for port 16
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P16 812
+ * NVSwitch Tx Bandwidth Counter 0 for port 8
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P08 796
/**
-* NVSwitch Rx Bandwidth Counter 0 for port 16
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P16 813
+ * NVSwitch Rx Bandwidth Counter 0 for port 8
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P08 797
/**
-* NVSwitch Tx Bandwidth Counter 0 for port 17
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P17 814
+ * NVSwitch Tx Bandwidth Counter 0 for port 9
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P09 798
/**
-* NVSwitch Rx Bandwidth Counter 0 for port 17
-*
-*
-*
-* NVSwitch Tx and RX Bandwidth Counter 1 for each port
-* By default, Counter 1 counts packets.
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P17 815
+ * NVSwitch Rx Bandwidth Counter 0 for port 9
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P09 799
/**
-* NVSwitch Tx Bandwidth Counter 1 for port 0
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P00 820
+ * NVSwitch Tx Bandwidth Counter 0 for port 10
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P10 800
/**
-* NVSwitch Rx Bandwidth Counter 1 for port 0
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P00 821
+ * NVSwitch Rx Bandwidth Counter 0 for port 10
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P10 801
/**
-* NVSwitch Tx Bandwidth Counter 1 for port 1
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P01 822
+ * NVSwitch Tx Bandwidth Counter 0 for port 11
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P11 802
/**
-* NVSwitch Rx Bandwidth Counter 1 for port 1
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P01 823
+ * NVSwitch Rx Bandwidth Counter 0 for port 11
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P11 803
/**
-* NVSwitch Tx Bandwidth Counter 1 for port 2
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P02 824
+ * NVSwitch Tx Bandwidth Counter 0 for port 12
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P12 804
/**
-* NVSwitch Rx Bandwidth Counter 1 for port 2
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P02 825
+ * NVSwitch Rx Bandwidth Counter 0 for port 12
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P12 805
/**
-* NVSwitch Tx Bandwidth Counter 1 for port 3
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P03 826
+ * NVSwitch Tx Bandwidth Counter 0 for port 13
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P13 806
/**
-* NVSwitch Rx Bandwidth Counter 1 for port 3
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P03 827
+ * NVSwitch Rx Bandwidth Counter 0 for port 13
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P13 807
/**
-* NVSwitch Tx Bandwidth Counter 1 for port 4
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P04 828
+ * NVSwitch Tx Bandwidth Counter 0 for port 14
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P14 808
/**
-* NVSwitch Rx Bandwidth Counter 1 for port 4
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P04 829
+ * NVSwitch Rx Bandwidth Counter 0 for port 14
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P14 809
/**
-* NVSwitch Tx Bandwidth Counter 1 for port 5
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P05 830
+ * NVSwitch Tx Bandwidth Counter 0 for port 15
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P15 810
/**
-* NVSwitch Rx Bandwidth Counter 1 for port 5
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P05 831
+ * NVSwitch Rx Bandwidth Counter 0 for port 15
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P15 811
/**
-* NVSwitch Tx Bandwidth Counter 1 for port 6
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P06 832
+ * NVSwitch Tx Bandwidth Counter 0 for port 16
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P16 812
/**
-* NVSwitch Rx Bandwidth Counter 1 for port 6
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P06 833
+ * NVSwitch Rx Bandwidth Counter 0 for port 16
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P16 813
/**
-* NVSwitch Tx Bandwidth Counter 1 for port 7
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P07 834
+ * NVSwitch Tx Bandwidth Counter 0 for port 17
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P17 814
/**
-* NVSwitch Rx Bandwidth Counter 1 for port 7
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P07 835
+ * NVSwitch Rx Bandwidth Counter 0 for port 17
+ *
+ *
+ *
+ * NVSwitch Tx and Rx Bandwidth Counter 1 for each port
+ * By default, Counter 1 counts packets.
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P17 815
/**
-* NVSwitch Tx Bandwidth Counter 1 for port 8
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P08 836
+ * NVSwitch Tx Bandwidth Counter 1 for port 0
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P00 820
/**
-* NVSwitch Rx Bandwidth Counter 1 for port 8
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P08 837
+ * NVSwitch Rx Bandwidth Counter 1 for port 0
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P00 821
/**
-* NVSwitch Tx Bandwidth Counter 1 for port 9
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P09 838
+ * NVSwitch Tx Bandwidth Counter 1 for port 1
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P01 822
/**
-* NVSwitch Rx Bandwidth Counter 1 for port 9
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P09 839
+ * NVSwitch Rx Bandwidth Counter 1 for port 1
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P01 823
/**
-* NVSwitch Tx Bandwidth Counter 0 for port 10
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P10 840
+ * NVSwitch Tx Bandwidth Counter 1 for port 2
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P02 824
/**
-* NVSwitch Rx Bandwidth Counter 1 for port 10
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P10 841
+ * NVSwitch Rx Bandwidth Counter 1 for port 2
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P02 825
/**
-* NVSwitch Tx Bandwidth Counter 1 for port 11
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P11 842
+ * NVSwitch Tx Bandwidth Counter 1 for port 3
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P03 826
/**
-* NVSwitch Rx Bandwidth Counter 1 for port 11
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P11 843
+ * NVSwitch Rx Bandwidth Counter 1 for port 3
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P03 827
/**
-* NVSwitch Tx Bandwidth Counter 1 for port 12
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P12 844
+ * NVSwitch Tx Bandwidth Counter 1 for port 4
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P04 828
/**
-* NVSwitch Rx Bandwidth Counter 1 for port 12
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P12 845
+ * NVSwitch Rx Bandwidth Counter 1 for port 4
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P04 829
/**
-* NVSwitch Tx Bandwidth Counter 0 for port 13
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P13 846
+ * NVSwitch Tx Bandwidth Counter 1 for port 5
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P05 830
/**
-* NVSwitch Rx Bandwidth Counter 1 for port 13
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P13 847
+ * NVSwitch Rx Bandwidth Counter 1 for port 5
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P05 831
/**
-* NVSwitch Tx Bandwidth Counter 1 for port 14
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P14 848
+ * NVSwitch Tx Bandwidth Counter 1 for port 6
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P06 832
/**
-* NVSwitch Rx Bandwidth Counter 1 for port 14
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P14 849
+ * NVSwitch Rx Bandwidth Counter 1 for port 6
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P06 833
/**
-* NVSwitch Tx Bandwidth Counter 1 for port 15
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P15 850
+ * NVSwitch Tx Bandwidth Counter 1 for port 7
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P07 834
/**
-* NVSwitch Rx Bandwidth Counter 1 for port 15
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P15 851
+ * NVSwitch Rx Bandwidth Counter 1 for port 7
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P07 835
/**
-* NVSwitch Tx Bandwidth Counter 1 for port 16
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P16 852
+ * NVSwitch Tx Bandwidth Counter 1 for port 8
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P08 836
/**
-* NVSwitch Rx Bandwidth Counter 1 for port 16
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P16 853
+ * NVSwitch Rx Bandwidth Counter 1 for port 8
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P08 837
/**
-* NVSwitch Tx Bandwidth Counter 1 for port 17
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P17 854
+ * NVSwitch Tx Bandwidth Counter 1 for port 9
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P09 838
+/**
+ * NVSwitch Rx Bandwidth Counter 1 for port 9
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P09 839
+
+/**
+ * NVSwitch Tx Bandwidth Counter 1 for port 10
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P10 840
+/**
+ * NVSwitch Rx Bandwidth Counter 1 for port 10
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P10 841
+
+/**
+ * NVSwitch Tx Bandwidth Counter 1 for port 11
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P11 842
+/**
+ * NVSwitch Rx Bandwidth Counter 1 for port 11
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P11 843
+
+/**
+ * NVSwitch Tx Bandwidth Counter 1 for port 12
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P12 844
+/**
+ * NVSwitch Rx Bandwidth Counter 1 for port 12
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P12 845
+
+/**
+ * NVSwitch Tx Bandwidth Counter 1 for port 13
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P13 846
+/**
+ * NVSwitch Rx Bandwidth Counter 1 for port 13
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P13 847
+
+/**
+ * NVSwitch Tx Bandwidth Counter 1 for port 14
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P14 848
+/**
+ * NVSwitch Rx Bandwidth Counter 1 for port 14
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P14 849
+
+/**
+ * NVSwitch Tx Bandwidth Counter 1 for port 15
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P15 850
/**
-* NVSwitch Rx Bandwidth Counter 1 for port 17
-*
-*
-*
-* NVSwitch error counters
-*/
-#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P17 855
+ * NVSwitch Rx Bandwidth Counter 1 for port 15
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P15 851
/**
-* NVSwitch fatal error information.
-* Note: value field indicates the specific SXid reported
-*/
-#define DCGM_FI_DEV_NVSWITCH_FATAL_ERRORS 856
+ * NVSwitch Tx Bandwidth Counter 1 for port 16
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P16 852
+/**
+ * NVSwitch Rx Bandwidth Counter 1 for port 16
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P16 853
/**
-* NVSwitch non fatal error information.
-* Note: value field indicates the specific SXid reported
-*/
-#define DCGM_FI_DEV_NVSWITCH_NON_FATAL_ERRORS 857
+ * NVSwitch Tx Bandwidth Counter 1 for port 17
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P17 854
+/**
+ * NVSwitch Rx Bandwidth Counter 1 for port 17
+ *
+ *
+ *
+ * NVSwitch error counters
+ */
+#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P17 855
+
+/**
+ * NVSwitch fatal error information.
+ * Note: value field indicates the specific SXid reported
+ */
+#define DCGM_FI_DEV_NVSWITCH_FATAL_ERRORS 856
+
+/**
+ * NVSwitch non fatal error information.
+ * Note: value field indicates the specific SXid reported
+ */
+#define DCGM_FI_DEV_NVSWITCH_NON_FATAL_ERRORS 857
/**
* Starting field ID of the NVSwitch instance
*/
-#define DCGM_FI_FIRST_NVSWITCH_FIELD_ID 700
+#define DCGM_FI_FIRST_NVSWITCH_FIELD_ID 700
/**
* Last field ID of the NVSwitch instance
*/
-#define DCGM_FI_LAST_NVSWITCH_FIELD_ID 860
+#define DCGM_FI_LAST_NVSWITCH_FIELD_ID 860
/**
- * For now max NVSwitch field Ids taken as difference of DCGM_FI_LAST_NVSWITCH_FIELD_ID and DCGM_FI_FIRST_NVSWITCH_FIELD_ID + 1 i.e. 200
+ * For now max NVSwitch field Ids taken as difference of DCGM_FI_LAST_NVSWITCH_FIELD_ID and
+ * DCGM_FI_FIRST_NVSWITCH_FIELD_ID + 1 i.e. 161 (860 - 700 + 1)
*/
-#define DCGM_FI_MAX_NVSWITCH_FIELDS DCGM_FI_LAST_NVSWITCH_FIELD_ID - DCGM_FI_FIRST_NVSWITCH_FIELD_ID + 1
+#define DCGM_FI_MAX_NVSWITCH_FIELDS (DCGM_FI_LAST_NVSWITCH_FIELD_ID - DCGM_FI_FIRST_NVSWITCH_FIELD_ID + 1) /* parenthesized so it expands as one value inside larger expressions */
/**
* Profiling Fields. These all start with DCGM_FI_PROF_*
*/
/**
- * Ratio of time the graphics engine is active. The graphics engine is
- * active if a graphics/compute context is bound and the graphics pipe or
+ * Ratio of time the graphics engine is active. The graphics engine is
+ * active if a graphics/compute context is bound and the graphics pipe or
* compute pipe is busy.
*/
-#define DCGM_FI_PROF_GR_ENGINE_ACTIVE 1001
+#define DCGM_FI_PROF_GR_ENGINE_ACTIVE 1001
/**
- * The ratio of cycles an SM has at least 1 warp assigned
- * (computed from the number of cycles and elapsed cycles)
+ * The ratio of cycles an SM has at least 1 warp assigned
+ * (computed from the number of cycles and elapsed cycles)
*/
-#define DCGM_FI_PROF_SM_ACTIVE 1002
+#define DCGM_FI_PROF_SM_ACTIVE 1002
/**
- * The ratio of number of warps resident on an SM.
- * (number of resident as a ratio of the theoretical
+ * The ratio of number of warps resident on an SM.
+ * (number of resident as a ratio of the theoretical
* maximum number of warps per elapsed cycle)
*/
-#define DCGM_FI_PROF_SM_OCCUPANCY 1003
+#define DCGM_FI_PROF_SM_OCCUPANCY 1003
/**
- * The ratio of cycles the tensor (HMMA) pipe is active
+ * The ratio of cycles the tensor (HMMA) pipe is active
* (off the peak sustained elapsed cycles)
*/
-#define DCGM_FI_PROF_PIPE_TENSOR_ACTIVE 1004
+#define DCGM_FI_PROF_PIPE_TENSOR_ACTIVE 1004
/**
- * The ratio of cycles the device memory interface is
+ * The ratio of cycles the device memory interface is
* active sending or receiving data.
*/
-#define DCGM_FI_PROF_DRAM_ACTIVE 1005
+#define DCGM_FI_PROF_DRAM_ACTIVE 1005
/**
* Ratio of cycles the fp64 pipe is active.
*/
-#define DCGM_FI_PROF_PIPE_FP64_ACTIVE 1006
+#define DCGM_FI_PROF_PIPE_FP64_ACTIVE 1006
/**
* Ratio of cycles the fp32 pipe is active.
*/
-#define DCGM_FI_PROF_PIPE_FP32_ACTIVE 1007
+#define DCGM_FI_PROF_PIPE_FP32_ACTIVE 1007
/**
* Ratio of cycles the fp16 pipe is active. This does not include HMMA.
*/
-#define DCGM_FI_PROF_PIPE_FP16_ACTIVE 1008
+#define DCGM_FI_PROF_PIPE_FP16_ACTIVE 1008
/**
* The number of bytes of active PCIe tx (transmit) data including both header and payload.
- *
+ *
* Note that this is from the perspective of the GPU, so copying data from device to host (DtoH)
* would be reflected in this metric.
*/
-#define DCGM_FI_PROF_PCIE_TX_BYTES 1009
+#define DCGM_FI_PROF_PCIE_TX_BYTES 1009
/**
* The number of bytes of active PCIe rx (read) data including both header and payload.
- *
+ *
* Note that this is from the perspective of the GPU, so copying data from host to device (HtoD)
* would be reflected in this metric.
*/
-#define DCGM_FI_PROF_PCIE_RX_BYTES 1010
+#define DCGM_FI_PROF_PCIE_RX_BYTES 1010
/**
* The number of bytes of active NvLink tx (transmit) data including both header and payload.
*/
-#define DCGM_FI_PROF_NVLINK_TX_BYTES 1011
+#define DCGM_FI_PROF_NVLINK_TX_BYTES 1011
/**
* The number of bytes of active NvLink rx (read) data including both header and payload.
*/
-#define DCGM_FI_PROF_NVLINK_RX_BYTES 1012
+#define DCGM_FI_PROF_NVLINK_RX_BYTES 1012
/**
* 1 greater than maximum fields above. This is the 1 greater than the maximum field id that could be allocated
*/
-#define DCGM_FI_MAX_FIELDS 1013
+#define DCGM_FI_MAX_FIELDS 1013
/** @} */
@@ -1903,11 +2099,11 @@ typedef unsigned int dcgm_field_eid_t;
*/
typedef struct
{
- char shortName[10]; /* Short name corresponding to field. This short name
- is used to identify columns in dmon output.*/
- char unit[4]; /* The unit of value. Eg: C(elsius), W(att), MB/s*/
- short width; /* Maximum width/number of digits that a value for field can have.*/
-} dcgm_field_output_format_t,*dcgm_field_output_format_p;
+ char shortName[10]; /*!< Short name corresponding to field. This short name is used to identify columns in dmon
+ output.*/
+ char unit[4]; /*!< The unit of value. Eg: C(elsius), W(att), MB/s*/
+ short width; /*!< Maximum width/number of digits that a value for field can have.*/
+} dcgm_field_output_format_t, *dcgm_field_output_format_p;
/**
* Structure to store meta data for the field
@@ -1915,15 +2111,18 @@ typedef struct
typedef struct
{
- unsigned short fieldId; /* Field identifier. DCGM_FI_? #define */
- char fieldType; /* Field type. DCGM_FT_? #define */
- unsigned char size; /* field size in bytes (raw value size). 0=variable (like DCGM_FT_STRING) */
- char tag[48]; /* Tag for this field for serialization like 'device_temperature' */
- int scope; /* Field scope. DCGM_FS_? #define of this field's association */
- int nvmlFieldId; /* Optional NVML field this DCGM field maps to. 0 = no mapping. Otherwise,
- this should be a NVML_FI_? #define from nvml.h */
-
- dcgm_field_output_format_p valueFormat; /* pointer to the structure that holds the formatting the values for fields */
+ unsigned short fieldId; /*!< Field identifier. DCGM_FI_? #define */
+ char fieldType; /*!< Field type. DCGM_FT_? #define */
+ unsigned char size; /*!< field size in bytes (raw value size). 0=variable (like DCGM_FT_STRING) */
+ char tag[48]; /*!< Tag for this field for serialization like 'device_temperature' */
+ int scope; /*!< Field scope. DCGM_FS_? #define of this field's association */
+ int nvmlFieldId; /*!< Optional NVML field this DCGM field maps to. 0 = no mapping.
+ Otherwise, this should be a NVML_FI_? #define from nvml.h */
+ dcgm_field_entity_group_t
+ entityLevel; /*!< Field entity level. DCGM_FE_? specifying at what level the field is queryable */
+
+ dcgm_field_output_format_p valueFormat; /*!< pointer to the structure that holds the formatting the
+ values for fields */
} dcgm_field_meta_t, *dcgm_field_meta_p;
/***************************************************************************************************/
@@ -1934,48 +2133,60 @@ typedef struct
/**
* Get a pointer to the metadata for a field by its field ID. See DCGM_FI_? for a list of field IDs.
- * @param fieldId IN: One of the field IDs (DCGM_FI_?)
+ *
+ * @param fieldId IN: One of the field IDs (DCGM_FI_?)
+ *
* @return
- * 0 On Failure
- * > 0 Pointer to field metadata structure if found.
+ * 0 On Failure
+ * >0 Pointer to field metadata structure if found.
+ *
*/
dcgm_field_meta_p DcgmFieldGetById(unsigned short fieldId);
/**
* Get a pointer to the metadata for a field by its field tag.
+ *
* @param tag IN: Tag for the field of interest
+ *
* @return
- * 0 On failure or not found
- * > 0 Pointer to field metadata structure if found
+ * 0 On failure or not found
+ * >0 Pointer to field metadata structure if found
+ *
*/
dcgm_field_meta_p DcgmFieldGetByTag(char *tag);
/**
* Initialize the DcgmFields module. Call this once from inside
* your program
- * @return
- * 0 On success
- * <0 On error
+ *
+ * @return
+ * 0 On success
+ * <0 On error
+ *
*/
int DcgmFieldsInit(void);
/**
* Terminates the DcgmFields module. Call this once from inside your program
- * @return
- * 0 On success
- * <0 On error
+ *
+ * @return
+ * 0 On success
+ * <0 On error
+ *
*/
int DcgmFieldsTerm(void);
/**
* Get the string version of a entityGroupId
*
- * Returns Pointer to a string like GPU/NvSwitch..etc
- * Null on error
+ * @returns
+ * - Pointer to a string like GPU/NvSwitch..etc
+ * - Null on error
+ *
*/
char *DcgmFieldsGetEntityGroupString(dcgm_field_entity_group_t entityGroupId);
-/** @} */
+/** @} */
#ifdef __cplusplus
@@ -1983,4 +2194,4 @@ char *DcgmFieldsGetEntityGroupString(dcgm_field_entity_group_t entityGroupId);
#endif
-#endif //DCGMFIELDS_H
+#endif // DCGMFIELDS_H
diff --git a/bindings/go/dcgm/dcgm_structs.h b/bindings/go/dcgm/dcgm_structs.h
index a882ce1..501de36 100644
--- a/bindings/go/dcgm/dcgm_structs.h
+++ b/bindings/go/dcgm/dcgm_structs.h
@@ -16,40 +16,41 @@
#ifndef DCGM_STRUCTS_H
#define DCGM_STRUCTS_H
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include "dcgm_fields.h"
+#include "dcgm_fields.h"
#include
+
/***************************************************************************************************/
-/** @defgroup nvmlReturnEnums Enums and Macros
+/** @defgroup dcgmReturnEnums Enums and Macros
* @{
*/
-/***************************************************************************************************/
+/***************************************************************************************************/
+
+/**
+ * Creates a unique version number for each struct
+ */
+#define MAKE_DCGM_VERSION(typeName, ver) (unsigned int)(sizeof(typeName) | ((unsigned long)(ver) << 24U))
/**
* Represents value of the field which can be returned by Host Engine in case the
* operation is not successful
- *
*/
#ifndef DCGM_BLANK_VALUES
#define DCGM_BLANK_VALUES
-
+
/**
- * Base value for 32 bits integer blank. can be used as an unspecified blank
+ * Base value for 32-bit integer blank. Can be used as an unspecified blank
*/
#define DCGM_INT32_BLANK 0x7ffffff0
-
+
/**
- * Base value for 64 bits integer blank. can be used as an unspecified blank
+ * Base value for 64-bit integer blank. Can be used as an unspecified blank
*/
#define DCGM_INT64_BLANK 0x7ffffffffffffff0
/**
* Base value for double blank. 2 ** 47. FP 64 has 52 bits of mantissa,
- * so 47 bits can still increment by 1 and represent each value from 0-15
+ * so 47 bits can still increment by 1 and represent each value from 0-15
*/
#define DCGM_FP64_BLANK 140737488355328.0
@@ -58,201 +59,192 @@ extern "C" {
*/
#define DCGM_STR_BLANK "<<>>"
-/**
- * Represents an error where INT32 data was not found
- */
-#define DCGM_INT32_NOT_FOUND (DCGM_INT32_BLANK+1)
-
-/**
- * Represents an error where INT64 data was not found
- */
-#define DCGM_INT64_NOT_FOUND (DCGM_INT64_BLANK+1)
-
-/**
- * Represents an error where FP64 data was not found
- */
-#define DCGM_FP64_NOT_FOUND (DCGM_FP64_BLANK+1.0)
-
-/**
- * Represents an error where STR data was not found
- */
-#define DCGM_STR_NOT_FOUND "<<>>"
-
-/**
- * Represents an error where fetching the INT32 value is not supported
- */
-#define DCGM_INT32_NOT_SUPPORTED (DCGM_INT32_BLANK+2)
-
-/**
- * Represents an error where fetching the INT64 value is not supported
- */
-#define DCGM_INT64_NOT_SUPPORTED (DCGM_INT64_BLANK+2)
-
-/**
- * Represents an error where fetching the FP64 value is not supported
- */
-#define DCGM_FP64_NOT_SUPPORTED (DCGM_FP64_BLANK+2.0)
-
-/**
- * Represents an error where fetching the STR value is not supported
- */
-#define DCGM_STR_NOT_SUPPORTED "<<>>"
-
-/**
- * Represents and error where fetching the INT32 value is not allowed with our current credentials
- */
-#define DCGM_INT32_NOT_PERMISSIONED (DCGM_INT32_BLANK+3)
-
-/**
- * Represents and error where fetching the INT64 value is not allowed with our current credentials
- */
-#define DCGM_INT64_NOT_PERMISSIONED (DCGM_INT64_BLANK+3)
-
-/**
- * Represents and error where fetching the FP64 value is not allowed with our current credentials
- */
-#define DCGM_FP64_NOT_PERMISSIONED (DCGM_FP64_BLANK+3.0)
-
-/**
- * Represents and error where fetching the STR value is not allowed with our current credentials
- */
-#define DCGM_STR_NOT_PERMISSIONED "<<>>"
-
-/**
- * Macro to check if a INT32 value is blank or not
+/**
+ * Represents an error where INT32 data was not found
+ */
+#define DCGM_INT32_NOT_FOUND (DCGM_INT32_BLANK + 1)
+
+/**
+ * Represents an error where INT64 data was not found
+ */
+#define DCGM_INT64_NOT_FOUND (DCGM_INT64_BLANK + 1)
+
+/**
+ * Represents an error where FP64 data was not found
+ */
+#define DCGM_FP64_NOT_FOUND (DCGM_FP64_BLANK + 1.0)
+
+/**
+ * Represents an error where STR data was not found
+ */
+#define DCGM_STR_NOT_FOUND "<<>>"
+
+/**
+ * Represents an error where fetching the INT32 value is not supported
+ */
+#define DCGM_INT32_NOT_SUPPORTED (DCGM_INT32_BLANK + 2)
+
+/**
+ * Represents an error where fetching the INT64 value is not supported
+ */
+#define DCGM_INT64_NOT_SUPPORTED (DCGM_INT64_BLANK + 2)
+
+/**
+ * Represents an error where fetching the FP64 value is not supported
+ */
+#define DCGM_FP64_NOT_SUPPORTED (DCGM_FP64_BLANK + 2.0)
+
+/**
+ * Represents an error where fetching the STR value is not supported
+ */
+#define DCGM_STR_NOT_SUPPORTED "<<>>"
+
+/**
+ * Represents an error where fetching the INT32 value is not allowed with our current credentials
+ */
+#define DCGM_INT32_NOT_PERMISSIONED (DCGM_INT32_BLANK + 3)
+
+/**
+ * Represents an error where fetching the INT64 value is not allowed with our current credentials
+ */
+#define DCGM_INT64_NOT_PERMISSIONED (DCGM_INT64_BLANK + 3)
+
+/**
+ * Represents an error where fetching the FP64 value is not allowed with our current credentials
+ */
+#define DCGM_FP64_NOT_PERMISSIONED (DCGM_FP64_BLANK + 3.0)
+
+/**
+ * Represents an error where fetching the STR value is not allowed with our current credentials
+ */
+#define DCGM_STR_NOT_PERMISSIONED "<<>>"
+
+/**
+ * Macro to check if a INT32 value is blank or not
*/
#define DCGM_INT32_IS_BLANK(val) (((val) >= DCGM_INT32_BLANK) ? 1 : 0)
-
-/**
- * Macro to check if a INT64 value is blank or not
- */
+
+/**
+ * Macro to check if a INT64 value is blank or not
+ */
#define DCGM_INT64_IS_BLANK(val) (((val) >= DCGM_INT64_BLANK) ? 1 : 0)
-
-/**
- * Macro to check if a FP64 value is blank or not
- */
+
+/**
+ * Macro to check if a FP64 value is blank or not
+ */
#define DCGM_FP64_IS_BLANK(val) (((val) >= DCGM_FP64_BLANK ? 1 : 0))
-
-/**
- * Macro to check if a STR value is blank or not
+
+/**
+ * Macro to check if a STR value is blank or not
* Works on (char *). Looks for <<< at first position and >>> inside string
- */
+ */
#define DCGM_STR_IS_BLANK(val) (val == strstr(val, "<<<") && strstr(val, ">>>"))
-#endif //DCGM_BLANK_VALUES
+#endif // DCGM_BLANK_VALUES
/**
* Max number of GPUs supported by DCGM
- */
-#define DCGM_MAX_NUM_DEVICES 16
+ */
+#define DCGM_MAX_NUM_DEVICES 32 /* DCGM 2.0 and newer = 32. DCGM 1.8 and older = 16. */
/**
* Number of NvLink links per GPU supported by DCGM
- * This is 6 for Volta and 4 for Pascal
+ * This is 12 for Ampere, 6 for Volta, and 4 for Pascal
+ */
+#define DCGM_NVLINK_MAX_LINKS_PER_GPU 12
+
+/**
+ * Maximum NvLink links pre-Ampere
*/
-#define DCGM_NVLINK_MAX_LINKS_PER_GPU 6
+#define DCGM_NVLINK_MAX_LINKS_PER_GPU_LEGACY1 6
/**
- * Max number of NvSwitches supported by DCGM
+ * Max number of NvSwitches supported by DCGM
**/
#define DCGM_MAX_NUM_SWITCHES 12
/**
* Number of NvLink links per NvSwitch supported by DCGM
*/
-#define DCGM_NVLINK_MAX_LINKS_PER_NVSWITCH 18
+#define DCGM_NVLINK_MAX_LINKS_PER_NVSWITCH 36
/**
* Maximum number of vGPU instances per physical GPU
*/
#define DCGM_MAX_VGPU_INSTANCES_PER_PGPU 32
-/**
- * Max number of vGPUs supported on DCGM
- */
-#define DCGM_MAX_NUM_VGPU_DEVICES DCGM_MAX_NUM_DEVICES * DCGM_MAX_VGPU_INSTANCES_PER_PGPU
-
/**
* Max length of the DCGM string field
*/
-#define DCGM_MAX_STR_LENGTH 256
+#define DCGM_MAX_STR_LENGTH 256
/**
* Max number of clocks supported for a device
*/
-#define DCGM_MAX_CLOCKS 256
+#define DCGM_MAX_CLOCKS 256
/**
* Max limit on the number of groups supported by DCGM
*/
-#define DCGM_MAX_NUM_GROUPS 64
+#define DCGM_MAX_NUM_GROUPS 64
/**
* Max number of active FBC sessions
*/
-#define DCGM_MAX_FBC_SESSIONS 256
-
+#define DCGM_MAX_FBC_SESSIONS 256
/**
- * Represents the size of a buffer that holds a vGPU type Name or vGPU class type or name of process running on vGPU instance.
+ * Represents the size of a buffer that holds a vGPU type Name or vGPU class type or name of process running on vGPU
+ * instance.
*/
-#define DCGM_VGPU_NAME_BUFFER_SIZE 64
+#define DCGM_VGPU_NAME_BUFFER_SIZE 64
/**
* Represents the size of a buffer that holds a vGPU license string
*/
-#define DCGM_GRID_LICENSE_BUFFER_SIZE 128
+#define DCGM_GRID_LICENSE_BUFFER_SIZE 128
/**
* Default compute mode -- multiple contexts per device
*/
-#define DCGM_CONFIG_COMPUTEMODE_DEFAULT 0
-
+#define DCGM_CONFIG_COMPUTEMODE_DEFAULT 0
+
/**
* Compute-prohibited mode -- no contexts per device
*/
-#define DCGM_CONFIG_COMPUTEMODE_PROHIBITED 1
-
+#define DCGM_CONFIG_COMPUTEMODE_PROHIBITED 1
+
/**
- * Compute-exclusive-process mode -- only one context per device, usable from multiple threads at
- * a time
+ * Compute-exclusive-process mode -- only one context per device, usable from multiple threads at a time
*/
-#define DCGM_CONFIG_COMPUTEMODE_EXCLUSIVE_PROCESS 2
-
+#define DCGM_CONFIG_COMPUTEMODE_EXCLUSIVE_PROCESS 2
/**
* Default Port Number for DCGM Host Engine
*/
#define DCGM_HE_PORT_NUMBER 5555
-
-/**
- * Creates a unique version number for each struct
- */
-#define MAKE_DCGM_VERSION(typeName,ver) (unsigned int)(sizeof(typeName) | ((ver)<<24))
-
-/***************************************************************************************************/
-
-
-
+#ifdef __cplusplus
+extern "C" {
+#endif
/**
* Operation mode for DCGM
- *
- * DCGM can run in auto-mode where it runs additional threads in the background to collect
+ *
+ * DCGM can run in auto-mode where it runs additional threads in the background to collect
* any metrics of interest and auto manages any operations needed for policy management.
- *
+ *
* DCGM can also operate in manual-mode where it's execution is controlled by the user. In
* this mode, the user has to periodically call APIs such as \ref dcgmPolicyTrigger and
* \ref dcgmUpdateAllFields which tells DCGM to wake up and perform data collection and
* operations needed for policy management.
*/
-typedef enum dcgmOperationMode_enum
+typedef enum dcgmOperationMode_enum
{
DCGM_OPERATION_MODE_AUTO = 1,
DCGM_OPERATION_MODE_MANUAL = 2
} dcgmOperationMode_t;
-
+
/**
* When more than one value is returned from a query, which order should it be returned in?
*/
@@ -262,12 +254,12 @@ typedef enum dcgmOrder_enum
DCGM_ORDER_DESCENDING = 2 //!< Data with latest (highest) timestamps returned first
} dcgmOrder_t;
-/**
- * Return values for DCGM API calls.
+/**
+ * Return values for DCGM API calls.
*/
typedef enum dcgmReturn_enum
{
- DCGM_ST_OK = 0, //!< Success
+ DCGM_ST_OK = 0, //!< Success
DCGM_ST_BADPARAM = -1, //!< A bad parameter was passed to a function
DCGM_ST_GENERIC_ERROR = -3, //!< A generic, unspecified error
DCGM_ST_MEMORY = -4, //!< An out of memory error occurred
@@ -287,166 +279,105 @@ typedef enum dcgmReturn_enum
DCGM_ST_GPU_IS_LOST = -18, //!< GPU is no longer reachable
DCGM_ST_RESET_REQUIRED = -19, //!< GPU requires a reset
DCGM_ST_FUNCTION_NOT_FOUND = -20, //!< The function that was requested was not found (bindings only error)
- DCGM_ST_CONNECTION_NOT_VALID = -21, //!< The connection to the host engine is not valid any longer
+ DCGM_ST_CONNECTION_NOT_VALID = -21, //!< The connection to the host engine is not valid any longer
DCGM_ST_GPU_NOT_SUPPORTED = -22, //!< This GPU is not supported by DCGM
- DCGM_ST_GROUP_INCOMPATIBLE = -23, //!< The GPUs of the provided group are not compatible with each other for the requested operation
- DCGM_ST_MAX_LIMIT = -24, //!< Max limit reached for the object
- DCGM_ST_LIBRARY_NOT_FOUND = -25, //!< DCGM library could not be found
- DCGM_ST_DUPLICATE_KEY = -26, //!< Duplicate key passed to a function
- DCGM_ST_GPU_IN_SYNC_BOOST_GROUP = -27, //! Maxwell, setting this implies autoBoost=0
-}dcgmConfigPerfStateSettings_t;
+ unsigned int syncBoost; //!< Sync Boost Mode (0: Disabled, 1 : Enabled, DCGM_INT32_BLANK : Ignored). Note that
+ //!< using this setting may result in lower clocks than targetClocks
+ dcgmClockSet_t targetClocks; //!< Target clocks. Set smClock and memClock to DCGM_INT32_BLANK to ignore/use
+ //!< compatible values. For GPUs > Maxwell, setting this implies autoBoost=0
+} dcgmConfigPerfStateSettings_t;
/**
- * Used to represents the power capping limit for each GPU in the group or to represent the power
+ * Used to represent the power capping limit for each GPU in the group or to represent the power
* budget for the entire group
*/
typedef struct
{
- dcgmConfigPowerLimitType_t type; //!< Flag to represent power cap for each GPU or power budget for the group of GPUs
- unsigned int val; //!< Power Limit in Watts (Set a value OR DCGM_INT32_BLANK to Ignore)
-}dcgmConfigPowerLimit_t;
+ dcgmConfigPowerLimitType_t type; //!< Flag to represent power cap for each GPU or power budget for the group of GPUs
+ unsigned int val; //!< Power Limit in Watts (Set a value OR DCGM_INT32_BLANK to Ignore)
+} dcgmConfigPowerLimit_t;
/**
* Structure to represent default and target configuration for a device
*/
typedef struct
{
- unsigned int version; //!< Version number (dcgmConfig_version)
- unsigned int gpuId; //!< GPU ID
- unsigned int eccMode; //!< ECC Mode (0: Disabled, 1 : Enabled, DCGM_INT32_BLANK : Ignored)
- unsigned int computeMode; //!< Compute Mode (One of DCGM_CONFIG_COMPUTEMODE_? OR DCGM_INT32_BLANK to Ignore)
- dcgmConfigPerfStateSettings_t perfState; //!< Performance State Settings (clocks / boost mode)
- dcgmConfigPowerLimit_t powerLimit; //!< Power Limits
-}dcgmConfig_v1;
+ unsigned int version; //!< Version number (dcgmConfig_version)
+ unsigned int gpuId; //!< GPU ID
+ unsigned int eccMode; //!< ECC Mode (0: Disabled, 1 : Enabled, DCGM_INT32_BLANK : Ignored)
+ unsigned int computeMode; //!< Compute Mode (One of DCGM_CONFIG_COMPUTEMODE_? OR DCGM_INT32_BLANK to Ignore)
+ dcgmConfigPerfStateSettings_t perfState; //!< Performance State Settings (clocks / boost mode)
+ dcgmConfigPowerLimit_t powerLimit; //!< Power Limits
+} dcgmConfig_v1;
/**
* Typedef for \ref dcgmConfig_v1
@@ -1272,56 +1234,28 @@ typedef dcgmConfig_v1 dcgmConfig_t;
*/
#define dcgmConfig_version dcgmConfig_version1
-/**
- * Structure to represent default and target vgpu configuration for a device
- */
-typedef struct
-{
- unsigned int version; //!< Version number (dcgmConfig_version)
- unsigned int gpuId; //!< GPU ID
- unsigned int eccMode; //!< ECC Mode (0: Disabled, 1 : Enabled, DCGM_INT32_BLANK : Ignored)
- unsigned int computeMode; //!< Compute Mode (One of DCGM_CONFIG_COMPUTEMODE_? OR DCGM_INT32_BLANK to Ignore)
- dcgmConfigPerfStateSettings_t perfState; //!< Performance State Settings (clocks / boost mode)
- dcgmConfigPowerLimit_t powerLimit; //!< Power Limits
-}dcgmVgpuConfig_v1;
-
-/**
- * Typedef for \ref dcgmVgpuConfig_v1
- */
-typedef dcgmVgpuConfig_v1 dcgmVgpuConfig_t;
-
-/**
- * Version 1 for \ref dcgmVgpuConfig_v1
- */
-#define dcgmVgpuConfig_version1 MAKE_DCGM_VERSION(dcgmVgpuConfig_v1, 1)
-
-/**
- * Latest version for \ref dcgmVgpuConfig_t
- */
-#define dcgmVgpuConfig_version dcgmVgpuConfig_version1
-
/**
* Represents a callback to receive updates from asynchronous functions.
* Currently the only implemented callback function is dcgmPolicyRegister
* and the void * data will be a pointer to dcgmPolicyCallbackResponse_t.
* Ex.
* dcgmPolicyCallbackResponse_t *callbackResponse = (dcgmPolicyCallbackResponse_t *) userData;
- *
+ *
*/
typedef int (*fpRecvUpdates)(void *userData);
/*Remove from doxygen documentation
*
- * Define the structure that contains specific policy information
+ * Define the structure that contains specific policy information
*/
-typedef struct
+typedef struct
{
// version must always be first
- unsigned int version; //!< Version number (dcgmPolicyViolation_version)
+ unsigned int version; //!< Version number (dcgmPolicyViolation_version)
- unsigned int notifyOnEccDbe; //!< true/false notification on ECC Double Bit Errors
- unsigned int notifyOnPciEvent; //!< true/false notification on PCI Events
- unsigned int notifyOnMaxRetiredPages; //!< number of retired pages to occur before notification
+ unsigned int notifyOnEccDbe; //!< true/false notification on ECC Double Bit Errors
+ unsigned int notifyOnPciEvent; //!< true/false notification on PCI Events
+ unsigned int notifyOnMaxRetiredPages; //!< number of retired pages to occur before notification
} dcgmPolicyViolation_v1;
/*Remove from doxygen documentation
@@ -1344,21 +1278,22 @@ typedef dcgmPolicyViolation_v1 dcgmPolicyViolation_t;
*/
#define dcgmPolicyViolation_version dcgmPolicyViolation_version1
-/**
+/**
* Enumeration for policy conditions.
- * When used as part of dcgmPolicy_t these have corresponding parameters to
+ * When used as part of dcgmPolicy_t these have corresponding parameters to
* allow them to be switched on/off or set specific violation thresholds
*/
typedef enum dcgmPolicyCondition_enum
{
// these are bitwise rather than sequential
- DCGM_POLICY_COND_DBE = 0x1, //!< Double bit errors -- boolean in dcgmPolicyConditionParms_t
- DCGM_POLICY_COND_PCI = 0x2, //!< PCI events/errors -- boolean in dcgmPolicyConditionParms_t
- DCGM_POLICY_COND_MAX_PAGES_RETIRED = 0x4, //!< Maximum number of retired pages -- number required in dcgmPolicyConditionParms_t
- DCGM_POLICY_COND_THERMAL = 0x8, //!< Thermal violation -- number required in dcgmPolicyConditionParms_t
- DCGM_POLICY_COND_POWER = 0x10, //!< Power violation -- number required in dcgmPolicyConditionParms_t
- DCGM_POLICY_COND_NVLINK = 0x20, //!< NVLINK errors -- boolean in dcgmPolicyConditionParms_t
- DCGM_POLICY_COND_XID = 0x40, //!< XID errors -- number required in dcgmPolicyConditionParms_t
+ DCGM_POLICY_COND_DBE = 0x1, //!< Double bit errors -- boolean in dcgmPolicyConditionParams_t
+ DCGM_POLICY_COND_PCI = 0x2, //!< PCI events/errors -- boolean in dcgmPolicyConditionParams_t
+ DCGM_POLICY_COND_MAX_PAGES_RETIRED = 0x4, //!< Maximum number of retired pages -- number
+ //!< required in dcgmPolicyConditionParams_t
+ DCGM_POLICY_COND_THERMAL = 0x8, //!< Thermal violation -- number required in dcgmPolicyConditionParams_t
+ DCGM_POLICY_COND_POWER = 0x10, //!< Power violation -- number required in dcgmPolicyConditionParams_t
+ DCGM_POLICY_COND_NVLINK = 0x20, //!< NVLINK errors -- boolean in dcgmPolicyConditionParams_t
+ DCGM_POLICY_COND_XID = 0x40, //!< XID errors -- number required in dcgmPolicyConditionParams_t
} dcgmPolicyCondition_t;
#define DCGM_POLICY_COND_MAX 7
@@ -1369,22 +1304,27 @@ typedef enum dcgmPolicyCondition_enum
* as well as a "val" which is a union of the possible value types. For example,
* to pass a true boolean: tag = BOOL, val.boolean = 1.
*/
-typedef struct dcgmPolicyConditionParms_st
+typedef struct dcgmPolicyConditionParams_st
{
- enum {BOOL, LLONG} tag;
- union {
- unsigned int boolean;
+ enum
+ {
+ BOOL,
+ LLONG
+ } tag;
+ union
+ {
+ unsigned int boolean;
unsigned long long llval;
} val;
-} dcgmPolicyConditionParms_t;
+} dcgmPolicyConditionParams_t;
/**
* Enumeration for policy modes
*/
typedef enum dcgmPolicyMode_enum
{
- DCGM_POLICY_MODE_AUTOMATED = 0, //!< automatic mode
- DCGM_POLICY_MODE_MANUAL = 1, //!< manual mode
+ DCGM_POLICY_MODE_AUTOMATED = 0, //!< automatic mode
+ DCGM_POLICY_MODE_MANUAL = 1, //!< manual mode
} dcgmPolicyMode_t;
/**
@@ -1392,7 +1332,7 @@ typedef enum dcgmPolicyMode_enum
*/
typedef enum dcgmPolicyIsolation_enum
{
- DCGM_POLICY_ISOLATION_NONE = 0, //!< no isolation of GPUs on error
+ DCGM_POLICY_ISOLATION_NONE = 0, //!< no isolation of GPUs on error
} dcgmPolicyIsolation_t;
/**
@@ -1400,8 +1340,8 @@ typedef enum dcgmPolicyIsolation_enum
*/
typedef enum dcgmPolicyAction_enum
{
- DCGM_POLICY_ACTION_NONE = 0, //!< no action
- DCGM_POLICY_ACTION_GPURESET = 1, //!< perform a GPU reset on violation
+ DCGM_POLICY_ACTION_NONE = 0, //!< no action
+ DCGM_POLICY_ACTION_GPURESET = 1, //!< Deprecated - perform a GPU reset on violation
} dcgmPolicyAction_t;
/**
@@ -1409,10 +1349,10 @@ typedef enum dcgmPolicyAction_enum
*/
typedef enum dcgmPolicyValidation_enum
{
- DCGM_POLICY_VALID_NONE = 0, //!< no validation after an action is performed
- DCGM_POLICY_VALID_SV_SHORT = 1, //!< run a short System Validation on the system after failure
- DCGM_POLICY_VALID_SV_MED = 2, //!< run a medium System Validation test after failure
- DCGM_POLICY_VALID_SV_LONG = 3, //!< run a extensive System Validation test after failure
+ DCGM_POLICY_VALID_NONE = 0, //!< no validation after an action is performed
+ DCGM_POLICY_VALID_SV_SHORT = 1, //!< run a short System Validation on the system after failure
+ DCGM_POLICY_VALID_SV_MED = 2, //!< run a medium System Validation test after failure
+ DCGM_POLICY_VALID_SV_LONG = 3, //!< run an extensive System Validation test after failure
} dcgmPolicyValidation_t;
/**
@@ -1420,33 +1360,33 @@ typedef enum dcgmPolicyValidation_enum
*/
typedef enum dcgmPolicyFailureResp_enum
{
- DCGM_POLICY_FAILURE_NONE = 0, //!< on failure of validation perform no action
+ DCGM_POLICY_FAILURE_NONE = 0, //!< on failure of validation perform no action
} dcgmPolicyFailureResp_t;
-/**
+/**
* Structure to fill when a user queries for policy violations
*/
-typedef struct
+typedef struct
{
- unsigned int gpuId; //!< gpu ID
- unsigned int violationOccurred; //!< a violation based on the bit values in \ref dcgmPolicyCondition_t
+ unsigned int gpuId; //!< gpu ID
+ unsigned int violationOccurred; //!< a violation based on the bit values in \ref dcgmPolicyCondition_t
} dcgmPolicyViolationNotify_t;
/**
- * Define the structure that specifies a policy to be enforced for a GPU
+ * Define the structure that specifies a policy to be enforced for a GPU
*/
-typedef struct
+typedef struct
{
// version must always be first
- unsigned int version; //!< version number (dcgmPolicy_version)
-
- dcgmPolicyCondition_t condition; //!< Condition(s) to access \ref dcgmPolicyCondition_t
- dcgmPolicyMode_t mode; //!< Mode of operation \ref dcgmPolicyMode_t
- dcgmPolicyIsolation_t isolation; //!< Isolation level after a policy violation \ref dcgmPolicyIsolation_t
- dcgmPolicyAction_t action; //!< Action to perform after a policy violation \ref dcgmPolicyAction_t action
- dcgmPolicyValidation_t validation; //!< Validation to perform after action is taken \ref dcgmPolicyValidation_t
- dcgmPolicyFailureResp_t response; //!< Failure to validation response \ref dcgmPolicyFailureResp_t
- dcgmPolicyConditionParms_t parms[DCGM_POLICY_COND_MAX]; //!< Parameters for the \a condition fields
+ unsigned int version; //!< version number (dcgmPolicy_version)
+
+ dcgmPolicyCondition_t condition; //!< Condition(s) to access \ref dcgmPolicyCondition_t
+ dcgmPolicyMode_t mode; //!< Mode of operation \ref dcgmPolicyMode_t
+ dcgmPolicyIsolation_t isolation; //!< Isolation level after a policy violation \ref dcgmPolicyIsolation_t
+ dcgmPolicyAction_t action; //!< Action to perform after a policy violation \ref dcgmPolicyAction_t action
+ dcgmPolicyValidation_t validation; //!< Validation to perform after action is taken \ref dcgmPolicyValidation_t
+ dcgmPolicyFailureResp_t response; //!< Failure to validation response \ref dcgmPolicyFailureResp_t
+ dcgmPolicyConditionParams_t parms[DCGM_POLICY_COND_MAX]; //!< Parameters for the \a condition fields
} dcgmPolicy_v1;
/**
@@ -1470,9 +1410,16 @@ typedef dcgmPolicy_v1 dcgmPolicy_t;
*/
typedef struct
{
- long long timestamp; //!< timestamp of the error
- enum {L1, L2, DEVICE, REGISTER, TEXTURE} location; //!< location of the error
- unsigned int numerrors; //!< number of errors
+ long long timestamp; //!< timestamp of the error
+ enum
+ {
+ L1,
+ L2,
+ DEVICE,
+ REGISTER,
+ TEXTURE
+ } location; //!< location of the error
+ unsigned int numerrors; //!< number of errors
} dcgmPolicyConditionDbe_t;
/**
@@ -1480,8 +1427,8 @@ typedef struct
*/
typedef struct
{
- long long timestamp; //!< timestamp of the error
- unsigned int counter; //!< value of the PCIe replay counter
+ long long timestamp; //!< timestamp of the error
+ unsigned int counter; //!< value of the PCIe replay counter
} dcgmPolicyConditionPci_t;
/**
@@ -1489,37 +1436,37 @@ typedef struct
*/
typedef struct
{
- long long timestamp; //!< timestamp of the error
- unsigned int sbepages; //!< number of pending pages due to SBE
- unsigned int dbepages; //!< number of pending pages due to DBE
+ long long timestamp; //!< timestamp of the error
+ unsigned int sbepages; //!< number of pending pages due to SBE
+ unsigned int dbepages; //!< number of pending pages due to DBE
} dcgmPolicyConditionMpr_t;
-/**
+/**
* Define the thermal policy violations return structure
*/
typedef struct
{
- long long timestamp; //!< timestamp of the error
- unsigned int thermalViolation; //!< Temperature reached that violated policy
+ long long timestamp; //!< timestamp of the error
+ unsigned int thermalViolation; //!< Temperature reached that violated policy
} dcgmPolicyConditionThermal_t;
-/**
+/**
* Define the power policy violations return structure
*/
typedef struct
{
- long long timestamp; //!< timestamp of the error
- unsigned int powerViolation; //!< Power value reached that violated policy
+ long long timestamp; //!< timestamp of the error
+ unsigned int powerViolation; //!< Power value reached that violated policy
} dcgmPolicyConditionPower_t;
-/**
+/**
* Define the nvlink policy violations return structure
*/
typedef struct
{
- long long timestamp; //!< timestamp of the error
- unsigned short fieldId; //!
+ * Every pair is separated by a colon char (:). Only the very first colon is considered as a separation.
+ * Values can contain colon chars. Values and Keys cannot contain semicolon chars.
+ * Usually defined keys are:
+ *
+ * version : DCGM Version.
+ * arch : Target DCGM Architecture.
+ * buildid : Build ID. Usually a sequential number.
+ * commit : Commit ID (Usually a git commit hash).
+ * author : Author of the commit above.
+ * branch : Branch (Usually a git branch that was used for the build).
+ * buildtype : Build Type.
+ * builddate : Date of the build.
+ * buildplatform : Platform where the build was made.
+ *
+ * Any or all keys may be absent.
+ * These values are for reference only and are not supposed to participate in any complicated logic.
+ */
+ char rawBuildInfoString[DCGM_MAX_STR_LENGTH * 2];
+} dcgmVersionInfo_v2;
+
+/**
+ * Version 2 of dcgmVersionInfo
+ */
+#define dcgmVersionInfo_version2 MAKE_DCGM_VERSION(dcgmVersionInfo_v2, 2)
+
+#define dcgmVersionInfo_version dcgmVersionInfo_version2
+typedef dcgmVersionInfo_v2 dcgmVersionInfo_t;
/** @} */
-#ifdef __cplusplus
+#ifdef __cplusplus
}
#endif
-#endif /* DCGM_STRUCTS_H */
+#endif /* DCGM_STRUCTS_H */
diff --git a/bindings/go/dcgm/device_info.go b/bindings/go/dcgm/device_info.go
index 8c61a55..bda27a1 100644
--- a/bindings/go/dcgm/device_info.go
+++ b/bindings/go/dcgm/device_info.go
@@ -120,7 +120,7 @@ func getPciBandwidth(gpuId uint) (int64, error) {
func getDeviceInfo(gpuid uint) (deviceInfo Device, err error) {
var device C.dcgmDeviceAttributes_t
- device.version = makeVersion1(unsafe.Sizeof(device))
+ device.version = makeVersion2(unsafe.Sizeof(device))
result := C.dcgmGetDeviceAttributes(handle.handle, C.uint(gpuid), &device)
if err = errorString(result); err != nil {
diff --git a/bindings/go/dcgm/health.go b/bindings/go/dcgm/health.go
index 5ce3abc..e611e72 100644
--- a/bindings/go/dcgm/health.go
+++ b/bindings/go/dcgm/health.go
@@ -48,8 +48,8 @@ func healthCheckByGpuId(gpuId uint) (deviceHealth DeviceHealth, err error) {
return
}
- var healthResults C.dcgmHealthResponse_v1
- healthResults.version = makeVersion1(unsafe.Sizeof(healthResults))
+ var healthResults C.dcgmHealthResponse_v4
+ healthResults.version = makeVersion2(unsafe.Sizeof(healthResults))
result := C.dcgmHealthCheck(handle.handle, groupId.handle, (*C.dcgmHealthResponse_t)(unsafe.Pointer(&healthResults)))
@@ -60,18 +60,15 @@ func healthCheckByGpuId(gpuId uint) (deviceHealth DeviceHealth, err error) {
status := healthStatus(int8(healthResults.overallHealth))
watches := []SystemWatch{}
- // only 1 gpu
- i := 0
-
// number of watches that encountred error/warning
- incidents := uint(healthResults.gpu[i].incidentCount)
+ incidents := uint(healthResults.incidentCount)
for j := uint(0); j < incidents; j++ {
watch := SystemWatch{
- Type: systemWatch(int(healthResults.gpu[i].systems[j].system)),
- Status: healthStatus(int8(healthResults.gpu[i].systems[j].health)),
+ Type: systemWatch(int(healthResults.incidents[j].system)),
+ Status: healthStatus(int8(healthResults.incidents[j].health)),
- Error: *stringPtr(&healthResults.gpu[i].systems[j].errorString[0]),
+ Error: *stringPtr(&healthResults.incidents[j].error.msg[0]),
}
watches = append(watches, watch)
}
diff --git a/bindings/go/dcgm/hostengine_status.go b/bindings/go/dcgm/hostengine_status.go
index 7848f09..4e6e6b9 100644
--- a/bindings/go/dcgm/hostengine_status.go
+++ b/bindings/go/dcgm/hostengine_status.go
@@ -24,7 +24,7 @@ func introspect() (engine DcgmStatus, err error) {
}
var memory C.dcgmIntrospectMemory_t
- memory.version = makeVersion1(unsafe.Sizeof(memory))
+ memory.version = makeVersion2(unsafe.Sizeof(memory))
waitIfNoData := 1
result = C.dcgmIntrospectGetHostengineMemoryUsage(handle.handle, &memory, C.int(waitIfNoData))
@@ -34,7 +34,7 @@ func introspect() (engine DcgmStatus, err error) {
var cpu C.dcgmIntrospectCpuUtil_t
- cpu.version = makeVersion1(unsafe.Sizeof(cpu))
+ cpu.version = makeVersion2(unsafe.Sizeof(cpu))
result = C.dcgmIntrospectGetHostengineCpuUtilization(handle.handle, &cpu, C.int(waitIfNoData))
if err = errorString(result); err != nil {
diff --git a/bindings/go/dcgm/policy.go b/bindings/go/dcgm/policy.go
index 9352529..06be22f 100644
--- a/bindings/go/dcgm/policy.go
+++ b/bindings/go/dcgm/policy.go
@@ -250,7 +250,7 @@ func ViolationRegistration(data unsafe.Pointer) int {
func setPolicy(groupId GroupHandle, condition C.dcgmPolicyCondition_t, paramList []policyIndex) (err error) {
var policy C.dcgmPolicy_t
- policy.version = makeVersion1(unsafe.Sizeof(policy))
+ policy.version = makeVersion2(unsafe.Sizeof(policy))
policy.mode = C.dcgmPolicyMode_t(C.DCGM_OPERATION_MODE_AUTO)
policy.action = C.DCGM_POLICY_ACTION_NONE
policy.isolation = C.DCGM_POLICY_ISOLATION_NONE
diff --git a/bindings/go/dcgm/process_info.go b/bindings/go/dcgm/process_info.go
index 16f7e33..64227cf 100644
--- a/bindings/go/dcgm/process_info.go
+++ b/bindings/go/dcgm/process_info.go
@@ -95,7 +95,7 @@ func watchPidFields(gpus ...uint) (groupId GroupHandle, err error) {
func getProcessInfo(groupId GroupHandle, pid uint) (processInfo []ProcessInfo, err error) {
var pidInfo C.dcgmPidInfo_t
- pidInfo.version = makeVersion1(unsafe.Sizeof(pidInfo))
+ pidInfo.version = makeVersion2(unsafe.Sizeof(pidInfo))
pidInfo.pid = C.uint(pid)
result := C.dcgmGetPidInfo(handle.handle, groupId.handle, &pidInfo)
diff --git a/bindings/go/dcgm/topology.go b/bindings/go/dcgm/topology.go
index cf1dbd5..f3afc38 100644
--- a/bindings/go/dcgm/topology.go
+++ b/bindings/go/dcgm/topology.go
@@ -97,7 +97,7 @@ func getCPUAffinity(busid string) (string, error) {
func getBusid(gpuid uint) (string, error) {
var device C.dcgmDeviceAttributes_t
- device.version = makeVersion1(unsafe.Sizeof(device))
+ device.version = makeVersion2(unsafe.Sizeof(device))
result := C.dcgmGetDeviceAttributes(handle.handle, C.uint(gpuid), &device)
if err := errorString(result); err != nil {
@@ -108,7 +108,7 @@ func getBusid(gpuid uint) (string, error) {
func getDeviceTopology(gpuid uint) (links []P2PLink, err error) {
var topology C.dcgmDeviceTopology_t
- topology.version = makeVersion1(unsafe.Sizeof(topology))
+ topology.version = makeVersion2(unsafe.Sizeof(topology))
result := C.dcgmGetDeviceTopology(handle.handle, C.uint(gpuid), &topology)
if result == C.DCGM_ST_NOT_SUPPORTED {
diff --git a/docker/Dockerfile.ubi8 b/docker/Dockerfile.ubi8
index 37e9c0d..9adf2da 100644
--- a/docker/Dockerfile.ubi8
+++ b/docker/Dockerfile.ubi8
@@ -6,7 +6,7 @@ COPY . .
RUN make binary check-format
-FROM registry.access.redhat.com/ubi8:latest
+FROM nvidia/cuda:11.0-base-ubi8
LABEL io.k8s.display-name="NVIDIA DCGM Exporter"
ARG DCGM_VERSION
@@ -19,9 +19,8 @@ COPY --from=builder /go/src/github.com/NVIDIA/gpu-monitoring-tools/dcgm-exporter
COPY etc/dcgm-exporter /etc/dcgm-exporter
ENV NVIDIA_VISIBLE_DEVICES=all
-
-RUN useradd dcgm-exporter
-USER dcgm-exporter
+# Required for DCP metrics
+ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility,compat32
ARG VERSION
@@ -35,4 +34,8 @@ LABEL description="See summary"
COPY ./LICENSE ./licenses/LICENSE
-ENTRYPOINT ["/usr/bin/dcgm-exporter"]
+ENV NO_SETCAP=
+COPY docker/docker-entrypoint.sh /usr/local/dcgm/docker-entrypoint.sh
+RUN chmod +x /usr/local/dcgm/docker-entrypoint.sh
+
+ENTRYPOINT ["/usr/local/dcgm/docker-entrypoint.sh"]
diff --git a/docker/Dockerfile.ubuntu18.04 b/docker/Dockerfile.ubuntu18.04
index 0eb50a7..4c61bd2 100644
--- a/docker/Dockerfile.ubuntu18.04
+++ b/docker/Dockerfile.ubuntu18.04
@@ -6,7 +6,7 @@ COPY . .
RUN make binary check-format
-FROM ubuntu:18.04
+FROM nvidia/cuda:11.0-base-ubuntu18.04
LABEL io.k8s.display-name="NVIDIA DCGM Exporter"
COPY --from=builder /go/src/github.com/NVIDIA/gpu-monitoring-tools/dcgm-exporter /usr/bin/
@@ -14,6 +14,7 @@ COPY etc/dcgm-exporter /etc/dcgm-exporter
ARG DCGM_VERSION
RUN apt-get update && apt-get install -y --no-install-recommends \
+ libcap2-bin \
libgomp1 \
wget && \
rm -rf /var/lib/apt/lists/* && \
@@ -21,9 +22,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
dpkg -i datacenter-gpu-manager_*.deb && \
rm -f datacenter-gpu-manager_*.deb
+# Required for DCP metrics
+ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility,compat32
+
ENV NVIDIA_VISIBLE_DEVICES=all
-RUN useradd dcgm-exporter
-USER dcgm-exporter
+ENV NO_SETCAP=
+COPY docker/docker-entrypoint.sh /usr/local/dcgm/docker-entrypoint.sh
+RUN chmod +x /usr/local/dcgm/docker-entrypoint.sh
-ENTRYPOINT ["/usr/bin/dcgm-exporter"]
+ENTRYPOINT ["/usr/local/dcgm/docker-entrypoint.sh"]
diff --git a/docker/docker-entrypoint.sh b/docker/docker-entrypoint.sh
new file mode 100644
index 0000000..d6c8ea6
--- /dev/null
+++ b/docker/docker-entrypoint.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Grant cap_sys_admin to dcgm-exporter unless NO_SETCAP is non-empty; revoke it again if the binary then fails to run
+DCGM_EXPORTER="$(readlink -f "$(which dcgm-exporter)")"
+if [ -z "${NO_SETCAP:-}" ]; then
+    setcap 'cap_sys_admin=+ep' "$DCGM_EXPORTER"
+
+    if ! "$DCGM_EXPORTER" -v 1>/dev/null 2>/dev/null; then
+        >&2 echo "dcgm-exporter doesn't have sufficient privileges to expose profiling metrics. To use dcgm-exporter for profiling metrics use --cap-add SYS_ADMIN"
+        setcap 'cap_sys_admin=-ep' "$DCGM_EXPORTER"
+    fi
+fi
+
+# Replace this shell with dcgm-exporter, forwarding the container's command line arguments unchanged
+set -- "$DCGM_EXPORTER" "$@"
+exec "$@"