From a26a2fe5e4a00bd3d6aa96746813ca29d66a9b35 Mon Sep 17 00:00:00 2001 From: Pramod Ramarao Date: Tue, 29 Sep 2020 21:04:21 +0000 Subject: [PATCH] Setcap only when the right permissions are passed in at runtime To support profiling metrics, dcgm-exporter needs at least cap_sys_admin. Making this the default prevents users from running the dcgm-exporter image when they don't provide this capability. --- Makefile | 4 +- README.md | 47 +- bindings/go/dcgm/admin.go | 2 +- bindings/go/dcgm/dcgm_agent.h | 1588 ++++++++------- bindings/go/dcgm/dcgm_errors.h | 636 +++--- bindings/go/dcgm/dcgm_fields.h | 2241 +++++++++++---------- bindings/go/dcgm/dcgm_structs.h | 2649 ++++++++++++------------- bindings/go/dcgm/device_info.go | 2 +- bindings/go/dcgm/health.go | 15 +- bindings/go/dcgm/hostengine_status.go | 4 +- bindings/go/dcgm/policy.go | 2 +- bindings/go/dcgm/process_info.go | 2 +- bindings/go/dcgm/topology.go | 4 +- docker/Dockerfile.ubi8 | 13 +- docker/Dockerfile.ubuntu18.04 | 13 +- docker/docker-entrypoint.sh | 17 + 16 files changed, 3820 insertions(+), 3419 deletions(-) create mode 100644 docker/docker-entrypoint.sh diff --git a/Makefile b/Makefile index a6d54aa..df61577 100644 --- a/Makefile +++ b/Makefile @@ -16,9 +16,9 @@ DOCKER ?= docker MKDIR ?= mkdir REGISTRY ?= nvidia -DCGM_VERSION := 1.7.2 +DCGM_VERSION := 2.0.10 GOLANG_VERSION := 1.14.2 -VERSION := 2.0.0-rc.7 +VERSION := 2.1.0-rc.1 FULL_VERSION := $(DCGM_VERSION)-$(VERSION) .PHONY: all binary install check-format diff --git a/README.md b/README.md index 34d7498..0a83eec 100644 --- a/README.md +++ b/README.md @@ -4,13 +4,13 @@ This Github repository contains Golang bindings for the following two libraries: - [NVIDIA Management Library (NVML)](https://docs.nvidia.com/deploy/nvml-api/nvml-api-reference.html#nvml-api-reference) is a C-based API for monitoring and managing NVIDIA GPU devices. 
-- [NVIDIA Data Center GPU Manager (DCGM)](https://developer.nvidia.com/data-center-gpu-manager-dcgm) is a set of tools for managing and monitoring NVIDIA GPUs in cluster environments. It's a low overhead tool suite that performs a variety of functions on each host system including active health monitoring, diagnostics, system validation, policies, power and clock management, group configuration and accounting. +- [NVIDIA Data Center GPU Manager (DCGM)](https://developer.nvidia.com/dcgm) is a set of tools for managing and monitoring NVIDIA GPUs in cluster environments. It's a low overhead tool suite that performs a variety of functions on each host system including active health monitoring, diagnostics, system validation, policies, power and clock management, group configuration and accounting. You will also find samples for both of these bindings in this repository. ## DCGM exporter -This Github repository also contains the DCGM exporter software. It exposes GPU metrics exporter for [Prometheus](https://prometheus.io/) leveraging [NVIDIA Data Center GPU Manager (DCGM)](https://developer.nvidia.com/data-center-gpu-manager-dcgm). +This Github repository also contains the DCGM exporter software. It exposes GPU metrics exporter for [Prometheus](https://prometheus.io/) leveraging [NVIDIA Data Center GPU Manager (DCGM)](https://developer.nvidia.com/dcgm). Find the installation and run instructions [here](https://github.com/NVIDIA/gpu-monitoring-tools/blob/master/exporters/prometheus-dcgm/README.md). @@ -60,48 +60,9 @@ DCGM_FI_DEV_MEM_CLOCK{gpu="0", UUID="GPU-604ac76c-d9cf-fef3-62e9-d92044ab6e52",c DCGM_FI_DEV_MEMORY_TEMP{gpu="0", UUID="GPU-604ac76c-d9cf-fef3-62e9-d92044ab6e52",container="",namespace="",pod=""} 9223372036854775794 ... 
-# If you are using the Prometheus operator -# Note on exporters here: -# https://github.com/coreos/prometheus-operator/blob/release-0.38/Documentation/user-guides/running-exporters.md - -$ helm repo add stable https://kubernetes-charts.storage.googleapis.com -$ helm install stable/prometheus-operator --generate-name \ - --set "prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false" -$ kubectl create -f \ - https://raw.githubusercontent.com/NVIDIA/gpu-monitoring-tools/2.0.0-rc.12/service-monitor.yaml - -# Note might take ~1-2 minutes for prometheus to pickup the metrics and display them -# You can also check in the WebUI the servce-discovery tab (in the Status category) -$ NAME=$(kubectl get svc -l app=prometheus-operator-prometheus -o jsonpath='{.items[0].metadata.name}') -$ kubectl port-forward $NAME 9090:9090 & -$ curl -sL http://127.0.01:9090/api/v1/query?query=DCGM_FI_DEV_MEMORY_TEMP" -{ - status: "success", - data: { - resultType: "vector", - result: [ - { - metric: { - UUID: "GPU-604ac76c-d9cf-fef3-62e9-d92044ab6e52", - __name__: "DCGM_FI_DEV_MEMORY_TEMP", - __container__: "", - __pod__: "", - __namespace__: "", - ... - pod: "dcgm-exporter-fn7fm", - service: "dcgm-exporter" - }, - value: [ - 1588399049.227, - "9223372036854776000" - ] - }, - ... - ] - } -} ``` - +To integrate `dcgm-exporter` with Prometheus and Grafana, see the full instructions in the [user guide](https://docs.nvidia.com/datacenter/cloud-native/kubernetes/dcgme2e.html#gpu-telemetry). +`dcgm-exporter` is deployed as part of the GPU Operator. To get started with integrating with Prometheus, check the Operator [user guide](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/getting-started.html#gpu-telemetry). 
### Building From source and Running on Bare Metal diff --git a/bindings/go/dcgm/admin.go b/bindings/go/dcgm/admin.go index 30bcacd..066f8b0 100644 --- a/bindings/go/dcgm/admin.go +++ b/bindings/go/dcgm/admin.go @@ -57,7 +57,7 @@ var ( func initDcgm(m mode, args ...string) (err error) { const ( - dcgmLib = "libdcgm.so.1" + dcgmLib = "libdcgm.so" ) lib := C.CString(dcgmLib) defer freeCString(lib) diff --git a/bindings/go/dcgm/dcgm_agent.h b/bindings/go/dcgm/dcgm_agent.h index e06dc58..3ade17e 100644 --- a/bindings/go/dcgm/dcgm_agent.h +++ b/bindings/go/dcgm/dcgm_agent.h @@ -10,22 +10,22 @@ */ #ifndef DCGM_AGENT_H -#define DCGM_AGENT_H +#define DCGM_AGENT_H -#ifdef __cplusplus +#include "dcgm_structs.h" + +#ifdef __cplusplus extern "C" { #endif -#include "dcgm_structs.h" - #define DECLDIR /***************************************************************************************************/ /** @defgroup DCGMAPI_Admin Administrative - * + * * This chapter describes the administration interfaces for DCGM. - * It is the user's responsibility to call \ref dcgmInit() before calling any other methods, - * and \ref dcgmShutdown() once DCGM is no longer being used. The APIs in Administrative module + * It is the user's responsibility to call \ref dcgmInit() before calling any other methods, + * and \ref dcgmShutdown() once DCGM is no longer being used. The APIs in Administrative module * can be broken down into following categories: * @{ */ @@ -33,17 +33,17 @@ extern "C" { /***************************************************************************************************/ /** @defgroup DCGMAPI_Admin_InitShut Init and Shutdown - * + * * Describes APIs to Initialize and Shutdown the DCGM Engine. * @{ */ /***************************************************************************************************/ - + /** * This method is used to initialize DCGM within this process. 
This must be called before * dcgmStartEmbedded() or dcgmConnect() - * - * * @return + * + * * @return * - \ref DCGM_ST_OK if DCGM has been properly initialized * - \ref DCGM_ST_INIT_ERROR if there was an error initializing the library */ @@ -52,8 +52,8 @@ dcgmReturn_t DECLDIR dcgmInit(void); /** * This method is used to shut down DCGM. Any embedded host engines or remote connections will automatically * be shut down as well. - * - * @return + * + * @return * - \ref DCGM_ST_OK if DCGM has been properly shut down * - \ref DCGM_ST_UNINITIALIZED if the library was not shut down properly */ @@ -68,8 +68,8 @@ dcgmReturn_t DECLDIR dcgmShutdown(void); * \ref dcgmUpdateAllFields which tells DCGM to wake up and perform data collection and * operations needed for policy management. * - * @param opMode IN : Collect data automatically or manually when asked by the user. - * @param pDcgmHandle OUT : DCGM Handle to use for API calls + * @param opMode IN: Collect data automatically or manually when asked by the user. + * @param pDcgmHandle OUT: DCGM Handle to use for API calls * * @return * - \ref DCGM_ST_OK if DCGM was started successfully within our process @@ -78,6 +78,24 @@ dcgmReturn_t DECLDIR dcgmShutdown(void); */ dcgmReturn_t DECLDIR dcgmStartEmbedded(dcgmOperationMode_t opMode, dcgmHandle_t *pDcgmHandle); +/** + * Start an embedded host engine agent within this process. + * + * The agent is loaded as a shared library. This mode is provided to avoid any + * extra jitter associated with an additional autonomous agent needs to be managed. In + * this mode, the user has to periodically call APIs such as \ref dcgmPolicyTrigger and + * \ref dcgmUpdateAllFields which tells DCGM to wake up and perform data collection and + * operations needed for policy management. + * + * @param params IN/OUT: See \ref dcgmStartEmbeddedV2Params_v1 for details. 
+ * + * @return + * - \ref DCGM_ST_OK if DCGM was started successfully within our process + * - \ref DCGM_ST_UNINITIALIZED if DCGM has not been initialized with \ref dcgmInit yet + * + */ +dcgmReturn_t DECLDIR dcgmStartEmbedded_v2(dcgmStartEmbeddedV2Params_v1 *params); + /** * Stop the embedded host engine within this process that was started with dcgmStartEmbedded * @@ -98,20 +116,19 @@ dcgmReturn_t DECLDIR dcgmStopEmbedded(dcgmHandle_t pDcgmHandle); * * NOTE: dcgmConnect_v2 provides additional connection options. * - * @param ipAddress IN : Valid IP address for the remote host engine to connect to. - * If ipAddress is specified as x.x.x.x it will attempt to connect to the default - * port specified by DCGM_HE_PORT_NUMBER - * If ipAddress is specified as x.x.x.x:yyyy it will attempt to connect to the - * port specified by yyyy - * @param pDcgmHandle OUT : DCGM Handle of the remote host engine + * @param ipAddress IN: Valid IP address for the remote host engine to connect to. + * If ipAddress is specified as x.x.x.x it will attempt to connect to the default + * port specified by DCGM_HE_PORT_NUMBER + * If ipAddress is specified as x.x.x.x:yyyy it will attempt to connect to the + * port specified by yyyy + * @param pDcgmHandle OUT: DCGM Handle of the remote host engine * * @return * - \ref DCGM_ST_OK if we successfully connected to the remote host engine * - \ref DCGM_ST_CONNECTION_NOT_VALID if the remote host engine could not be reached * - \ref DCGM_ST_UNINITIALIZED if DCGM has not been initialized with \ref dcgmInit. 
* - \ref DCGM_ST_BADPARAM if pDcgmHandle is NULL or ipAddress is invalid - * - \ref DCGM_ST_INIT_ERROR if DCGM encountered an error while initializing the remote - * client library + * - \ref DCGM_ST_INIT_ERROR if DCGM encountered an error while initializing the remote client library * - \ref DCGM_ST_UNINITIALIZED if DCGM has not been initialized with \ref dcgmInit */ dcgmReturn_t DECLDIR dcgmConnect(char *ipAddress, dcgmHandle_t *pDcgmHandle); @@ -120,24 +137,23 @@ dcgmReturn_t DECLDIR dcgmConnect(char *ipAddress, dcgmHandle_t *pDcgmHandle); * This method is used to connect to a stand-alone host engine process. Remote host engines are started * by running the nv-hostengine command. * - * @param ipAddress IN : Valid IP address for the remote host engine to connect to. - * If ipAddress is specified as x.x.x.x it will attempt to connect to the default - * port specified by DCGM_HE_PORT_NUMBER - * If ipAddress is specified as x.x.x.x:yyyy it will attempt to connect to the - * port specified by yyyy - * @param connectParams IN : Additional connection parameters. See \ref dcgmConnectV2Params_t for details. - * @param pDcgmHandle OUT : DCGM Handle of the remote host engine + * @param ipAddress IN: Valid IP address for the remote host engine to connect to. + * If ipAddress is specified as x.x.x.x it will attempt to connect to the default port + * specified by DCGM_HE_PORT_NUMBER. + * If ipAddress is specified as x.x.x.x:yyyy it will attempt to connect to the port + * specified by yyyy + * @param connectParams IN: Additional connection parameters. See \ref dcgmConnectV2Params_t for details. + * @param pDcgmHandle OUT: DCGM Handle of the remote host engine * * @return * - \ref DCGM_ST_OK if we successfully connected to the remote host engine * - \ref DCGM_ST_CONNECTION_NOT_VALID if the remote host engine could not be reached * - \ref DCGM_ST_UNINITIALIZED if DCGM has not been initialized with \ref dcgmInit. 
* - \ref DCGM_ST_BADPARAM if pDcgmHandle is NULL or ipAddress is invalid - * - \ref DCGM_ST_INIT_ERROR if DCGM encountered an error while initializing the remote - * client library + * - \ref DCGM_ST_INIT_ERROR if DCGM encountered an error while initializing the remote client library * - \ref DCGM_ST_UNINITIALIZED if DCGM has not been initialized with \ref dcgmInit */ - dcgmReturn_t DECLDIR dcgmConnect_v2(char *ipAddress, dcgmConnectV2Params_t *connectParams, dcgmHandle_t *pDcgmHandle); +dcgmReturn_t DECLDIR dcgmConnect_v2(char *ipAddress, dcgmConnectV2Params_t *connectParams, dcgmHandle_t *pDcgmHandle); /** * This method is used to disconnect from a stand-alone host engine process. @@ -166,14 +182,44 @@ dcgmReturn_t DECLDIR dcgmDisconnect(dcgmHandle_t pDcgmHandle); /** * This method is used to return information about the build environment where DCGM was built. * - * @param pVersionInfo OUT : Build environment information + * @param pVersionInfo OUT: Build environment information * * @return * - \ref DCGM_ST_OK if build information is sucessfully obtained * - \ref DCGM_ST_BADPARAM if pVersionInfo is null * - \ref DCGM_ST_VER_MISMATCH if the expected and provided versions of dcgmVersionInfo_t do not match */ -dcgmReturn_t DECLDIR dcgmVersionInfo(dcgmVersionInfo_t* pVersionInfo); +dcgmReturn_t DECLDIR dcgmVersionInfo(dcgmVersionInfo_t *pVersionInfo); + + +/** + * This method is used to set the logging severity on HostEngine for the specified logger + * + * @param pDcgmHandle IN: DCGM Handle + * @param logging IN: dcgmSettingsSetLoggingSeverity_t struct containing the target logger and severity + * + * @return + * - \ref DCGM_ST_OK Severity successfully set + * - \ref DCGM_ST_BADPARAM Bad logger/severity string + * - \ref DCGM_ST_VER_MISMATCH if the expected and provided versions of dcgmSettingsSetLoggingSeverity_t + * do not match + */ +dcgmReturn_t DECLDIR dcgmHostengineSetLoggingSeverity(dcgmHandle_t pDcgmHandle, + dcgmSettingsSetLoggingSeverity_t *logging); + +/** 
+ * This function is used to return whether or not the host engine considers itself healthy + * + * @param[in] pDcgmHandle - the handle to DCGM + * @param[out] heHealth - struct describing the health of the hostengine. if heHealth.hostengineHealth is 0, + * then the hostengine is healthy. Non-zero indicates not healthy with error codes + * determining the cause. + * + * @return + * - \ref DCGM_ST_OK Able to gauge health + * - \ref DCGM_ST_BADPARAM heHealth is not a valid pointer + */ +dcgmReturn_t DECLDIR dcgmHostengineIsHealthy(dcgmHandle_t pDcgmHandle, dcgmHostengineHealth_t *heHealth); /** @} */ // Closing DCGMAPI_Admin_Info @@ -185,7 +231,7 @@ dcgmReturn_t DECLDIR dcgmVersionInfo(dcgmVersionInfo_t* pVersionInfo); * @{ * This chapter describes the APIs used to identify set of GPUs on the node, grouping functions to * provide mechanism to operate on a group of GPUs, and status management APIs in - * order to get individual statuses for each operation. The APIs in System module can be + * order to get individual statuses for each operation. The APIs in System module can be * broken down into following categories: */ /***************************************************************************************************/ @@ -198,22 +244,24 @@ dcgmReturn_t DECLDIR dcgmVersionInfo(dcgmVersionInfo_t* pVersionInfo); /***************************************************************************************************/ /** - * This method is used to get identifiers corresponding to all the devices on the system. The - * identifier represents DCGM GPU Id corresponding to each GPU on the system and is immutable during + * This method is used to get identifiers corresponding to all the devices on the system. The + * identifier represents DCGM GPU Id corresponding to each GPU on the system and is immutable during * the lifespan of the engine. The list should be queried again if the engine is restarted. 
- * + * * The GPUs returned from this function include gpuIds of GPUs that are not supported by DCGM. * To only get gpuIds of GPUs that are supported by DCGM, use dcgmGetAllSupportedDevices(). * - * @param pDcgmHandle IN : DCGM Handle - * @param gpuIdList OUT : Array reference to fill GPU Ids present on the system. - * @param count OUT : Number of GPUs returned in \a gpuIdList. + * @param pDcgmHandle IN: DCGM Handle + * @param gpuIdList OUT: Array reference to fill GPU Ids present on the system. + * @param count OUT: Number of GPUs returned in \a gpuIdList. * - * @return + * @return * - \ref DCGM_ST_OK if the call was successful. * - \ref DCGM_ST_BADPARAM if \a gpuIdList or \a count were not valid. */ -dcgmReturn_t DECLDIR dcgmGetAllDevices(dcgmHandle_t pDcgmHandle, unsigned int gpuIdList[DCGM_MAX_NUM_DEVICES], int *count); +dcgmReturn_t DECLDIR dcgmGetAllDevices(dcgmHandle_t pDcgmHandle, + unsigned int gpuIdList[DCGM_MAX_NUM_DEVICES], + int *count); /** * This method is used to get identifiers corresponding to all the DCGM-supported devices on the system. The @@ -224,169 +272,191 @@ dcgmReturn_t DECLDIR dcgmGetAllDevices(dcgmHandle_t pDcgmHandle, unsigned int gp * To get gpuIds of all GPUs in the system, use dcgmGetAllDevices(). * * - * @param pDcgmHandle IN : DCGM Handle - * @param gpuIdList OUT : Array reference to fill GPU Ids present on the system. - * @param count OUT : Number of GPUs returned in \a gpuIdList. + * @param pDcgmHandle IN: DCGM Handle + * @param gpuIdList OUT: Array reference to fill GPU Ids present on the system. + * @param count OUT: Number of GPUs returned in \a gpuIdList. * * @return * - \ref DCGM_ST_OK if the call was successful. * - \ref DCGM_ST_BADPARAM if \a gpuIdList or \a count were not valid. 
*/ -dcgmReturn_t DECLDIR dcgmGetAllSupportedDevices(dcgmHandle_t pDcgmHandle, unsigned int gpuIdList[DCGM_MAX_NUM_DEVICES], int *count); +dcgmReturn_t DECLDIR dcgmGetAllSupportedDevices(dcgmHandle_t pDcgmHandle, + unsigned int gpuIdList[DCGM_MAX_NUM_DEVICES], + int *count); /** - * Gets device attributes corresponding to the \a gpuId. If operation is not successful for any of - * the requested fields then the field is populated with one of DCGM_BLANK_VALUES defined in + * Gets device attributes corresponding to the \a gpuId. If operation is not successful for any of + * the requested fields then the field is populated with one of DCGM_BLANK_VALUES defined in * dcgm_structs.h. - * - * @param pDcgmHandle IN : DCGM Handle - * @param gpuId IN : GPU Id corresponding to which the attributes - * should be fetched - * @param pDcgmAttr IN/OUT : Device attributes corresponding to \a gpuId.
- * pDcgmAttr->version should be set to - * \ref dcgmDeviceAttributes_version before this - * call. + * + * @param pDcgmHandle IN: DCGM Handle + * @param gpuId IN: GPU Id corresponding to which the attributes should be fetched + * @param pDcgmAttr IN/OUT: Device attributes corresponding to \a gpuId.
pDcgmAttr->version should be set to + * \ref dcgmDeviceAttributes_version before this call. * * @return * - \ref DCGM_ST_OK if the call was successful. * - \ref DCGM_ST_VER_MISMATCH if pDcgmAttr->version is not set or is invalid. */ -dcgmReturn_t DECLDIR dcgmGetDeviceAttributes(dcgmHandle_t pDcgmHandle, unsigned int gpuId, dcgmDeviceAttributes_t *pDcgmAttr); +dcgmReturn_t DECLDIR dcgmGetDeviceAttributes(dcgmHandle_t pDcgmHandle, + unsigned int gpuId, + dcgmDeviceAttributes_t *pDcgmAttr); /** * Gets the list of entities that exist for a given entity group. This API can be used in place of - * \ref dcgmGetAllDevices. - * + * \ref dcgmGetAllDevices. + * * @param dcgmHandle IN: DCGM Handle * @param entityGroup IN: Entity group to list entities of * @param entities OUT: Array of entities for entityGroup - * @param numEntities IN/OUT: Upon calling, this should be the number of entities that entityList[] - * can hold. Upon return, this will contain the number of entities actually - * saved to entityList. - * @param flags IN: Flags to modify the behavior of this request. + * @param numEntities IN/OUT: Upon calling, this should be the number of entities that entityList[] can hold. Upon + * return, this will contain the number of entities actually saved to entityList. + * @param flags IN: Flags to modify the behavior of this request. * See DCGM_GEGE_FLAG_* #defines in dcgm_structs.h - * + * * @return * - \ref DCGM_ST_OK if the call was successful. - * - \ref DCGM_ST_INSUFFICIENT_SIZE if numEntities was not large enough to hold the number of - * entities in the entityGroup. numEntities will contain - * the capacity needed to complete this request successfully. + * - \ref DCGM_ST_INSUFFICIENT_SIZE if numEntities was not large enough to hold the number of entities in the + * entityGroup. numEntities will contain the capacity needed to complete this + * request successfully. * - \ref DCGM_ST_NOT_SUPPORTED if the given entityGroup does not support enumeration. 
* - \ref DCGM_ST_BADPARAM if any parameter is invalid */ -dcgmReturn_t DECLDIR dcgmGetEntityGroupEntities(dcgmHandle_t dcgmHandle, dcgm_field_entity_group_t entityGroup, - dcgm_field_eid_t *entities, int *numEntities, unsigned int flags); +dcgmReturn_t DECLDIR dcgmGetEntityGroupEntities(dcgmHandle_t dcgmHandle, + dcgm_field_entity_group_t entityGroup, + dcgm_field_eid_t *entities, + int *numEntities, + unsigned int flags); + +/** + * Gets the hierarchy of GPUs, GPU Instances, and Compute Instances by populating a list of each entity with + * a reference to their parent + * + * @param dcgmHandle IN: DCGM Handle + * @param entities OUT: array of entities in the hierarchy + * @param numEntities IN/OUT: Upon calling, this should be the capacity of entities. + * Upon return, this will contain the number of entities actually saved to entities. + * + * @return + * - \ref DCGM_ST_OK if the call was successful. + * - \ref DCGM_ST_VER_MISMATCH if the struct version is incorrect + * - \ref DCGM_ST_BADPARAM if any parameter is invalid + */ +dcgmReturn_t DECLDIR dcgmGetGpuInstanceHierarchy(dcgmHandle_t dcgmHandle, dcgmMigHierarchy_v1 *hierarchy); /** * Get the NvLink link status for every NvLink in this system. This includes the NvLinks of both GPUs and * NvSwitches. Note that only NvSwitches and GPUs that are visible to the current environment will be * returned in this structure. - * + * * @param dcgmHandle IN: DCGM Handle * @param linkStatus OUT: Structure in which to store NvLink link statuses. .version should be set to - * dcgmNvLinkStatus_version1 before calling this. - * + * dcgmNvLinkStatus_version1 before calling this. + * * @return * - \ref DCGM_ST_OK if the call was successful. * - \ref DCGM_ST_NOT_SUPPORTED if the given entityGroup does not support enumeration. 
* - \ref DCGM_ST_BADPARAM if any parameter is invalid */ -dcgmReturn_t DECLDIR dcgmGetNvLinkLinkStatus(dcgmHandle_t dcgmHandle, dcgmNvLinkStatus_v1 *linkStatus); +dcgmReturn_t DECLDIR dcgmGetNvLinkLinkStatus(dcgmHandle_t dcgmHandle, dcgmNvLinkStatus_v2 *linkStatus); /** @} */ /***************************************************************************************************/ /** @defgroup DCGM_GROUPING Grouping - * The following APIs are used for group management. The user can create a group of entities and + * The following APIs are used for group management. The user can create a group of entities and * perform an operation on a group of entities. If grouping is not needed and the user wishes - * to run commands on all GPUs seen by DCGM then the user can use DCGM_GROUP_ALL_GPUS or + * to run commands on all GPUs seen by DCGM then the user can use DCGM_GROUP_ALL_GPUS or * DCGM_GROUP_ALL_NVSWITCHES in place of group IDs when needed. * @{ */ /***************************************************************************************************/ /** - * Used to create a entity group handle which can store one or more entity Ids as an opaque handle + * Used to create a entity group handle which can store one or more entity Ids as an opaque handle * returned in \a pDcgmGrpId. Instead of executing an operation separately for each entity, the - * DCGM group enables the user to execute same operation on all the entities present in the group as a + * DCGM group enables the user to execute same operation on all the entities present in the group as a * single API call. - * - * To create the group with all the entities present on the system, the \a type field should be - * specified as \a DCGM_GROUP_DEFAULT or \a DCGM_GROUP_ALL_NVSWITCHES. To create an empty group, - * the \a type field should be specified as \a DCGM_GROUP_EMPTY. 
The empty group can be updated - * with the desired set of entities using the APIs \ref dcgmGroupAddDevice, \ref dcgmGroupAddEntity, + * + * To create the group with all the entities present on the system, the \a type field should be + * specified as \a DCGM_GROUP_DEFAULT or \a DCGM_GROUP_ALL_NVSWITCHES. To create an empty group, + * the \a type field should be specified as \a DCGM_GROUP_EMPTY. The empty group can be updated + * with the desired set of entities using the APIs \ref dcgmGroupAddDevice, \ref dcgmGroupAddEntity, * \ref dcgmGroupRemoveDevice, and \ref dcgmGroupRemoveEntity. - * - * @param pDcgmHandle IN : DCGM Handle - * @param type IN : Type of Entity Group to be formed - * @param groupName IN : Desired name of the GPU group specified as NULL terminated C string - * @param pDcgmGrpId OUT : Reference to group ID - * @return - * - \ref DCGM_ST_OK if the group has been created - * - \ref DCGM_ST_BADPARAM if any of \a type, \a groupName, \a length or \a pDcgmGrpId - * is invalid - * - \ref DCGM_ST_MAX_LIMIT if number of groups on the system has reached the max limit - * \a DCGM_MAX_NUM_GROUPS - * - \ref DCGM_ST_INIT_ERROR if the library has not been successfully initialized + * + * @param pDcgmHandle IN: DCGM Handle + * @param type IN: Type of Entity Group to be formed + * @param groupName IN: Desired name of the GPU group specified as NULL terminated C string + * @param pDcgmGrpId OUT: Reference to group ID + * + * @return + * - \ref DCGM_ST_OK if the group has been created + * - \ref DCGM_ST_BADPARAM if any of \a type, \a groupName, \a length or \a pDcgmGrpId is invalid + * - \ref DCGM_ST_MAX_LIMIT if number of groups on the system has reached the max limit \a DCGM_MAX_NUM_GROUPS + * - \ref DCGM_ST_INIT_ERROR if the library has not been successfully initialized */ -dcgmReturn_t DECLDIR dcgmGroupCreate(dcgmHandle_t pDcgmHandle, dcgmGroupType_t type, char *groupName, - dcgmGpuGrp_t *pDcgmGrpId); +dcgmReturn_t DECLDIR dcgmGroupCreate(dcgmHandle_t 
pDcgmHandle, + dcgmGroupType_t type, + char *groupName, + dcgmGpuGrp_t *pDcgmGrpId); /** - * Used to destroy a group represented by \a groupId. + * Used to destroy a group represented by \a groupId. * Since DCGM group is a logical grouping of entities, the properties applied on the group stay intact * for the individual entities even after the group is destroyed. * - * @param pDcgmHandle IN : DCGM Handle - * @param groupId IN : Group ID + * @param pDcgmHandle IN: DCGM Handle + * @param groupId IN: Group ID * * @return * - \ref DCGM_ST_OK if the group has been destroyed * - \ref DCGM_ST_BADPARAM if \a groupId is invalid * - \ref DCGM_ST_INIT_ERROR if the library has not been successfully initialized - * - \ref DCGM_ST_NOT_CONFIGURED if entry corresponding to the group does not exists + * - \ref DCGM_ST_NOT_CONFIGURED if entry corresponding to the group does not exists */ dcgmReturn_t DECLDIR dcgmGroupDestroy(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId); /** * Used to add specified GPU Id to the group represented by \a groupId. 
- * - * @param pDcgmHandle IN : DCGM Handle - * @param groupId IN : Group Id to which device should be added - * @param gpuId IN : DCGM GPU Id - * @return - * - \ref DCGM_ST_OK if the GPU Id has been successfully added - * to the group + * + * @param pDcgmHandle IN: DCGM Handle + * @param groupId IN: Group Id to which device should be added + * @param gpuId IN: DCGM GPU Id + * + * @return + * - \ref DCGM_ST_OK if the GPU Id has been successfully added to the group * - \ref DCGM_ST_INIT_ERROR if the library has not been successfully initialized - * - \ref DCGM_ST_NOT_CONFIGURED if entry corresponding to the group (\a groupId) does not exists + * - \ref DCGM_ST_NOT_CONFIGURED if entry corresponding to the group (\a groupId) does not exists * - \ref DCGM_ST_BADPARAM if \a gpuId is invalid or already part of the specified group */ dcgmReturn_t dcgmGroupAddDevice(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, unsigned int gpuId); /** * Used to add specified entity to the group represented by \a groupId. 
- * - * @param pDcgmHandle IN : DCGM Handle - * @param groupId IN : Group Id to which device should be added - * @param entityGroupId IN : Entity group that entityId belongs to - * @param entityId IN : DCGM entityId - * @return - * - \ref DCGM_ST_OK if the entity has been successfully added - * to the group + * + * @param pDcgmHandle IN: DCGM Handle + * @param groupId IN: Group Id to which device should be added + * @param entityGroupId IN: Entity group that entityId belongs to + * @param entityId IN: DCGM entityId + * + * @return + * - \ref DCGM_ST_OK if the entity has been successfully added to the group * - \ref DCGM_ST_INIT_ERROR if the library has not been successfully initialized - * - \ref DCGM_ST_NOT_CONFIGURED if entry corresponding to the group (\a groupId) does not exists + * - \ref DCGM_ST_NOT_CONFIGURED if entry corresponding to the group (\a groupId) does not exists * - \ref DCGM_ST_BADPARAM if \a entityId is invalid or already part of the specified group */ -dcgmReturn_t dcgmGroupAddEntity(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, - dcgm_field_entity_group_t entityGroupId, +dcgmReturn_t dcgmGroupAddEntity(dcgmHandle_t pDcgmHandle, + dcgmGpuGrp_t groupId, + dcgm_field_entity_group_t entityGroupId, dcgm_field_eid_t entityId); /** * Used to remove specified GPU Id from the group represented by \a groupId. 
- * @param pDcgmHandle IN : DCGM Handle - * @param groupId IN : Group ID from which device should be removed - * @param gpuId IN : DCGM GPU Id - * @return + * @param pDcgmHandle IN: DCGM Handle + * @param groupId IN: Group ID from which device should be removed + * @param gpuId IN: DCGM GPU Id + * + * @return * - \ref DCGM_ST_OK if the GPU Id has been successfully removed from the group * - \ref DCGM_ST_INIT_ERROR if the library has not been successfully initialized * - \ref DCGM_ST_NOT_CONFIGURED if entry corresponding to the group (\a groupId) does not exists @@ -396,30 +466,31 @@ dcgmReturn_t dcgmGroupRemoveDevice(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupI /** * Used to remove specified entity from the group represented by \a groupId. - * @param pDcgmHandle IN : DCGM Handle - * @param groupId IN : Group ID from which device should be removed - * @param entityGroupId IN : Entity group that entityId belongs to - * @param entityId IN : DCGM entityId + * @param pDcgmHandle IN: DCGM Handle + * @param groupId IN: Group ID from which device should be removed + * @param entityGroupId IN: Entity group that entityId belongs to + * @param entityId IN: DCGM entityId * - * @return + * @return * - \ref DCGM_ST_OK if the entity has been successfully removed from the group * - \ref DCGM_ST_INIT_ERROR if the library has not been successfully initialized * - \ref DCGM_ST_NOT_CONFIGURED if entry corresponding to the group (\a groupId) does not exists * - \ref DCGM_ST_BADPARAM if \a entityId is invalid or not part of the specified group */ -dcgmReturn_t dcgmGroupRemoveEntity(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, - dcgm_field_entity_group_t entityGroupId, +dcgmReturn_t dcgmGroupRemoveEntity(dcgmHandle_t pDcgmHandle, + dcgmGpuGrp_t groupId, + dcgm_field_entity_group_t entityGroupId, dcgm_field_eid_t entityId); - /** - * Used to get information corresponding to the group represented by \a groupId. 
The information - * returned in \a pDcgmGroupInfo consists of group name, and the list of entities present in the + * Used to get information corresponding to the group represented by \a groupId. The information + * returned in \a pDcgmGroupInfo consists of group name, and the list of entities present in the * group. - * - * @param pDcgmHandle IN : DCGM Handle - * @param groupId IN : Group ID for which information to be fetched - * @param pDcgmGroupInfo OUT : Group Information + * + * @param pDcgmHandle IN: DCGM Handle + * @param groupId IN: Group ID for which information to be fetched + * @param pDcgmGroupInfo OUT: Group Information + * * @return * - \ref DCGM_ST_OK if the group info is successfully received. * - \ref DCGM_ST_BADPARAM if any of \a groupId or \a pDcgmGroupInfo is invalid. @@ -431,12 +502,13 @@ dcgmReturn_t dcgmGroupGetInfo(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, dc /** * Used to get the Ids of all groups of entities. The information returned is a list of group ids - * in \a groupIdList as well as a count of how many ids there are in \a count. Please allocate enough + * in \a groupIdList as well as a count of how many ids there are in \a count. Please allocate enough * memory for \a groupIdList. Memory of size MAX_NUM_GROUPS should be allocated for \a groupIdList. 
* - * @param pDcgmHandle IN : DCGM Handle - * @param groupIdList OUT : List of Group Ids - * @param count OUT : The number of Group ids in the list + * @param pDcgmHandle IN: DCGM Handle + * @param groupIdList OUT: List of Group Ids + * @param count OUT: The number of Group ids in the list + * * @return * - \ref DCGM_ST_OK if the ids of the groups were successfully retrieved * - \ref DCGM_ST_BADPARAM if either of the \a groupIdList or \a count is null @@ -457,11 +529,11 @@ dcgmReturn_t dcgmGroupGetAllIds(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupIdLi * Used to create a group of fields and return the handle in dcgmFieldGroupId * * @param dcgmHandle IN: DCGM handle - * @param numFieldIds IN: Number of field IDs that are being provided in fieldIds[]. Must be - * between 1 and DCGM_MAX_FIELD_IDS_PER_FIELD_GROUP. + * @param numFieldIds IN: Number of field IDs that are being provided in fieldIds[]. Must be between 1 and + * DCGM_MAX_FIELD_IDS_PER_FIELD_GROUP. * @param fieldIds IN: Field IDs to be added to the newly-created field group - * @param fieldGroupName IN: Unique name for this group of fields. This must not be the same - * as any existing field groups. + * @param fieldGroupName IN: Unique name for this group of fields. This must not be the same as any existing field + * groups. 
* @param dcgmFieldGroupId OUT: Handle to the newly-created field group * * @return @@ -471,8 +543,11 @@ dcgmReturn_t dcgmGroupGetAllIds(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupIdLi * - \ref DCGM_ST_MAX_LIMIT if too many field groups already exist * */ -dcgmReturn_t dcgmFieldGroupCreate(dcgmHandle_t dcgmHandle, int numFieldIds, unsigned short *fieldIds, - char *fieldGroupName, dcgmFieldGrp_t *dcgmFieldGroupId); +dcgmReturn_t dcgmFieldGroupCreate(dcgmHandle_t dcgmHandle, + int numFieldIds, + unsigned short *fieldIds, + char *fieldGroupName, + dcgmFieldGrp_t *dcgmFieldGroupId); /** * Used to remove a field group that was created with \ref dcgmFieldGroupCreate @@ -493,11 +568,9 @@ dcgmReturn_t dcgmFieldGroupDestroy(dcgmHandle_t dcgmHandle, dcgmFieldGrp_t dcgmF * Used to get information about a field group that was created with \ref dcgmFieldGroupCreate. * * @param dcgmHandle IN: DCGM handle - * @param fieldGroupInfo IN/OUT: Info about all of the field groups that - * exist.
.version should be set to - * \ref dcgmFieldGroupInfo_version before this - * call
.fieldGroupId should contain the - * fieldGroupId you are interested in querying + * @param fieldGroupInfo IN/OUT: Info about the field group being queried.
+ * .version should be set to \ref dcgmFieldGroupInfo_version before this call
+ * .fieldGroupId should contain the fieldGroupId you are interested in querying * information for. * * @return @@ -509,16 +582,12 @@ dcgmReturn_t dcgmFieldGroupDestroy(dcgmHandle_t dcgmHandle, dcgmFieldGrp_t dcgmF */ dcgmReturn_t dcgmFieldGroupGetInfo(dcgmHandle_t dcgmHandle, dcgmFieldGroupInfo_t *fieldGroupInfo); - /** * Used to get information about all field groups in the system. * * @param dcgmHandle IN: DCGM handle - * @param allGroupInfo IN/OUT: Info about all of the field groups that - * exist.
- * .version should be set to - * \ref dcgmAllFieldGroup_version before - * this call. + * @param allGroupInfo IN/OUT: Info about all of the field groups that exist.
+ * .version should be set to \ref dcgmAllFieldGroup_version before this call. * * @return * - \ref DCGM_ST_OK if the field group info was successfully returned @@ -532,7 +601,6 @@ dcgmReturn_t dcgmFieldGroupGetAll(dcgmHandle_t dcgmHandle, dcgmAllFieldGroup_t * /** @} */ - /***************************************************************************************************/ /** @defgroup DCGMAPI_ST Status handling * The following APIs are used to manage statuses for multiple operations on one or more GPUs. @@ -541,66 +609,76 @@ dcgmReturn_t dcgmFieldGroupGetAll(dcgmHandle_t dcgmHandle, dcgmAllFieldGroup_t * /***************************************************************************************************/ /** - * Creates reference to DCGM status handler which can be used to get the statuses for multiple + * Creates reference to DCGM status handler which can be used to get the statuses for multiple * operations on one or more devices. - * - * The multiple statuses are useful when the operations are performed at group level. The status + * + * The multiple statuses are useful when the operations are performed at group level. The status * handle provides a mechanism to access error attributes for the failed operations. - * - * The number of errors stored behind the opaque handle can be accessed using the the API - * \ref dcgmStatusGetCount. The errors are accessed from the opaque handle \a statusHandle - * using the API \ref dcgmStatusPopError. The user can invoke \ref dcgmStatusPopError + * + * The number of errors stored behind the opaque handle can be accessed using the the API + * \ref dcgmStatusGetCount. The errors are accessed from the opaque handle \a statusHandle + * using the API \ref dcgmStatusPopError. The user can invoke \ref dcgmStatusPopError * for the number of errors or until all the errors are fetched. 
- * - * When the status handle is not required any further then it should be deleted using the API + * + * When the status handle is not required any further then it should be deleted using the API * \ref dcgmStatusDestroy. - * @param statusHandle OUT : Reference to handle for list of statuses - * @return + * @param statusHandle OUT: Reference to handle for list of statuses + * + * @return * - \ref DCGM_ST_OK if the status handle is successfully created * - \ref DCGM_ST_BADPARAM if \a statusHandle is invalid + * */ dcgmReturn_t dcgmStatusCreate(dcgmStatus_t *statusHandle); /** * Used to destroy status handle created using \ref dcgmStatusCreate. - * @param statusHandle IN : Handle to list of statuses + * @param statusHandle IN: Handle to list of statuses + * * @return * - \ref DCGM_ST_OK if the status handle is successfully created * - \ref DCGM_ST_BADPARAM if \a statusHandle is invalid + * */ dcgmReturn_t dcgmStatusDestroy(dcgmStatus_t statusHandle); /** * Used to get count of error entries stored inside the opaque handle \a statusHandle. - * @param statusHandle IN : Handle to list of statuses - * @param count OUT : Number of error entries present in the list of statuses - * @return + * @param statusHandle IN: Handle to list of statuses + * @param count OUT: Number of error entries present in the list of statuses + * + * @return * - \ref DCGM_ST_OK if the error count is successfully received * - \ref DCGM_ST_BADPARAM if any of \a statusHandle or \a count is invalid + * */ dcgmReturn_t dcgmStatusGetCount(dcgmStatus_t statusHandle, unsigned int *count); /** - * Used to iterate through the list of errors maintained behind \a statusHandle. The method pops the - * first error from the list of DCGM statuses. In order to iterate through all the errors, the user + * Used to iterate through the list of errors maintained behind \a statusHandle. The method pops the + * first error from the list of DCGM statuses. 
In order to iterate through all the errors, the user * can invoke this API for the number of errors or until all the errors are fetched. - * @param statusHandle IN : Handle to list of statuses - * @param pDcgmErrorInfo OUT : First error from the list of statuses + * @param statusHandle IN: Handle to list of statuses + * @param pDcgmErrorInfo OUT: First error from the list of statuses + * * @return * - \ref DCGM_ST_OK if the error entry is successfully fetched * - \ref DCGM_ST_BADPARAM if any of \a statusHandle or \a pDcgmErrorInfo is invalid * - \ref DCGM_ST_NO_DATA if the status handle list is empty + * */ dcgmReturn_t dcgmStatusPopError(dcgmStatus_t statusHandle, dcgmErrorInfo_t *pDcgmErrorInfo); /** * Used to clear all the errors in the status handle created by the API - * \ref dcgmStatusCreate. After one set of operation, the \a statusHandle + * \ref dcgmStatusCreate. After one set of operation, the \a statusHandle * can be cleared and reused for the next set of operation. - * @param statusHandle IN : Handle to list of statuses - * @return + * @param statusHandle IN: Handle to list of statuses + * + * @return * - \ref DCGM_ST_OK if the errors are successfully cleared * - \ref DCGM_ST_BADPARAM if \a statusHandle is invalid + * */ dcgmReturn_t dcgmStatusClear(dcgmStatus_t statusHandle); @@ -618,7 +696,6 @@ dcgmReturn_t dcgmStatusClear(dcgmStatus_t statusHandle); */ /***************************************************************************************************/ - /***************************************************************************************************/ /** @defgroup DCGMAPI_DC_Setup Setup and management * Describes APIs to Get/Set configuration on the group of GPUs. @@ -627,100 +704,98 @@ dcgmReturn_t dcgmStatusClear(dcgmStatus_t statusHandle); /***************************************************************************************************/ /** - * Used to set configuration for the group of one or more GPUs identified by \a groupId. 
- * - * The configuration settings specified in \a pDeviceConfig are applied to all the GPUs in the - * group. Since DCGM group is a logical grouping of GPUs, the configuration settings stays intact - * for the individual GPUs even after the group is destroyed. - * - * If the user wishes to ignore the configuration of one or more properties in the input - * \a pDeviceConfig then the property should be specified as one of \a DCGM_INT32_BLANK, - * \a DCGM_INT64_BLANK, \a DCGM_FP64_BLANK or \a DCGM_STR_BLANK based on the data type of the - * property to be ignored. - * - * If any of the properties fail to be configured for any of the GPUs in the group then the API - * returns an error. The status handle \a statusHandle should be further evaluated to access error - * attributes for the failed operations. Please refer to status management APIs at \ref DCGMAPI_ST - * to access the error attributes. - * - * To find out valid supported clock values that can be passed to dcgmConfigSet, look at the device - * attributes of a GPU in the group using the API dcgmGetDeviceAttributes. - - * @param pDcgmHandle IN : DCGM Handle - * - * @param groupId IN : Group ID representing collection of one or more GPUs. Look - * at \ref dcgmGroupCreate for details on creating the - * group. - * @param pDeviceConfig IN : Pointer to memory to hold desired configuration to be - * applied for all the GPU in the group represented by - * \a groupId. The caller must populate the version field of - * \a pDeviceConfig. - * @param statusHandle IN/OUT : Resulting error status for multiple operations. Pass it as - * NULL if the detailed error information is not needed. - * Look at \ref dcgmStatusCreate for details on creating - * status handle. - - * @return - * - \ref DCGM_ST_OK if the configuration has been successfully set. - * - \ref DCGM_ST_BADPARAM if any of \a groupId or \a pDeviceConfig is invalid. - * - \ref DCGM_ST_VER_MISMATCH if \a pDeviceConfig has the incorrect version. 
- * - \ref DCGM_ST_GENERIC_ERROR if an unknown error has occurred. - */ -dcgmReturn_t DECLDIR dcgmConfigSet(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, dcgmConfig_t *pDeviceConfig, - dcgmStatus_t statusHandle); - -/** - * Used to get configuration for all the GPUs present in the group. - * - * This API can get the most recent target or desired configuration set by \ref dcgmConfigSet. - * Set type as \a DCGM_CONFIG_TARGET_STATE to get target configuration. The target configuration - * properties are maintained by DCGM and are automatically enforced after a GPU reset or - * reinitialization is completed. - * - * The method can also be used to get the actual configuration state for the GPUs in the group. - * Set type as \a DCGM_CONFIG_CURRENT_STATE to get the actually configuration state. Ideally, the - * actual configuration state will be exact same as the target configuration state. - * - * If any of the property in the target configuration is unknown then the property value in the - * output is populated as one of DCGM_INT32_BLANK, DCGM_INT64_BLANK, DCGM_FP64_BLANK or - * DCGM_STR_BLANK based on the data type of the property. - * - * If any of the property in the current configuration state is not supported then the property - * value in the output is populated as one of DCGM_INT32_NOT_SUPPORTED, DCGM_INT64_NOT_SUPPORTED, - * DCGM_FP64_NOT_SUPPORTED or DCGM_STR_NOT_SUPPORTED based on the data type of the property. - * - * If any of the properties can't be fetched for any of the GPUs in the group then the API returns - * an error. The status handle \a statusHandle should be further evaluated to access error - * attributes for the failed operations. Please refer to status management APIs at \ref DCGMAPI_ST - * to access the error attributes. - * - * @param pDcgmHandle IN : DCGM Handle - * @param groupId IN : Group ID representing collection of one or more GPUs. Look - * at \ref dcgmGroupCreate for details on creating the - * group. 
- * @param type IN : Type of configuration values to be fetched. - * @param count IN : The number of entries that \a deviceConfigList array can - * store. - * @param deviceConfigList OUT : Pointer to memory to hold requested configuration - * corresponding to all the GPUs in the group (\a groupId). The - * size of the memory must be greater than or equal to hold - * output information for the number of GPUs present in the - * group (\a groupId). - * @param statusHandle IN/OUT : Resulting error status for multiple operations. Pass it as - * NULL if the detailed error information is not needed. - * Look at \ref dcgmStatusCreate for details on creating - * status handle. - - * @return - * - \ref DCGM_ST_OK if the configuration has been successfully fetched. - * - \ref DCGM_ST_BADPARAM if any of \a groupId, \a type, \a count, - * or \a deviceConfigList is invalid. - * - \ref DCGM_ST_NOT_CONFIGURED if the target configuration is not already set. - * - \ref DCGM_ST_VER_MISMATCH if \a deviceConfigList has the incorrect version. - * - \ref DCGM_ST_GENERIC_ERROR if an unknown error has occurred. - */ -dcgmReturn_t DECLDIR dcgmConfigGet(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, dcgmConfigType_t type, int count, - dcgmConfig_t deviceConfigList[], dcgmStatus_t statusHandle); +* Used to set configuration for the group of one or more GPUs identified by \a groupId. +* +* The configuration settings specified in \a pDeviceConfig are applied to all the GPUs in the +* group. Since DCGM group is a logical grouping of GPUs, the configuration settings stays intact +* for the individual GPUs even after the group is destroyed. +* +* If the user wishes to ignore the configuration of one or more properties in the input +* \a pDeviceConfig then the property should be specified as one of \a DCGM_INT32_BLANK, +* \a DCGM_INT64_BLANK, \a DCGM_FP64_BLANK or \a DCGM_STR_BLANK based on the data type of the +* property to be ignored. 
+* +* If any of the properties fail to be configured for any of the GPUs in the group then the API +* returns an error. The status handle \a statusHandle should be further evaluated to access error +* attributes for the failed operations. Please refer to status management APIs at \ref DCGMAPI_ST +* to access the error attributes. +* +* To find out valid supported clock values that can be passed to dcgmConfigSet, look at the device +* attributes of a GPU in the group using the API dcgmGetDeviceAttributes. + +* @param pDcgmHandle IN: DCGM Handle +* @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate +* for details on creating the group. +* @param pDeviceConfig IN: Pointer to memory to hold desired configuration to be applied for all the GPU in the +* group represented by \a groupId. +* The caller must populate the version field of \a pDeviceConfig. +* @param statusHandle IN/OUT: Resulting error status for multiple operations. Pass it as NULL if the detailed +* error information is not needed. +* Look at \ref dcgmStatusCreate for details on creating status handle. + +* @return +* - \ref DCGM_ST_OK if the configuration has been successfully set. +* - \ref DCGM_ST_BADPARAM if any of \a groupId or \a pDeviceConfig is invalid. +* - \ref DCGM_ST_VER_MISMATCH if \a pDeviceConfig has the incorrect version. +* - \ref DCGM_ST_GENERIC_ERROR if an unknown error has occurred. +* +*/ +dcgmReturn_t DECLDIR dcgmConfigSet(dcgmHandle_t pDcgmHandle, + dcgmGpuGrp_t groupId, + dcgmConfig_t *pDeviceConfig, + dcgmStatus_t statusHandle); + +/** +* Used to get configuration for all the GPUs present in the group. +* +* This API can get the most recent target or desired configuration set by \ref dcgmConfigSet. +* Set type as \a DCGM_CONFIG_TARGET_STATE to get target configuration. The target configuration +* properties are maintained by DCGM and are automatically enforced after a GPU reset or +* reinitialization is completed. 
+* +* The method can also be used to get the actual configuration state for the GPUs in the group. +* Set type as \a DCGM_CONFIG_CURRENT_STATE to get the actually configuration state. Ideally, the +* actual configuration state will be exact same as the target configuration state. +* +* If any of the property in the target configuration is unknown then the property value in the +* output is populated as one of DCGM_INT32_BLANK, DCGM_INT64_BLANK, DCGM_FP64_BLANK or +* DCGM_STR_BLANK based on the data type of the property. +* +* If any of the property in the current configuration state is not supported then the property +* value in the output is populated as one of DCGM_INT32_NOT_SUPPORTED, DCGM_INT64_NOT_SUPPORTED, +* DCGM_FP64_NOT_SUPPORTED or DCGM_STR_NOT_SUPPORTED based on the data type of the property. +* +* If any of the properties can't be fetched for any of the GPUs in the group then the API returns +* an error. The status handle \a statusHandle should be further evaluated to access error +* attributes for the failed operations. Please refer to status management APIs at \ref DCGMAPI_ST +* to access the error attributes. +* +* @param pDcgmHandle IN: DCGM Handle +* @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate +* for details on creating the group. +* @param type IN: Type of configuration values to be fetched. +* @param count IN: The number of entries that \a deviceConfigList array can store. +* @param deviceConfigList OUT: Pointer to memory to hold requested configuration corresponding to all the GPUs in +* the group (\a groupId). The size of the memory must be greater than or equal to hold +* output information for the number of GPUs present in the group (\a groupId). +* @param statusHandle IN/OUT: Resulting error status for multiple operations. Pass it as NULL if the detailed +* error information is not needed. +* Look at \ref dcgmStatusCreate for details on creating status handle. 
+ +* @return +* - \ref DCGM_ST_OK if the configuration has been successfully fetched. +* - \ref DCGM_ST_BADPARAM if any of \a groupId, \a type, \a count, or \a deviceConfigList is invalid. +* - \ref DCGM_ST_NOT_CONFIGURED if the target configuration is not already set. +* - \ref DCGM_ST_VER_MISMATCH if \a deviceConfigList has the incorrect version. +* - \ref DCGM_ST_GENERIC_ERROR if an unknown error has occurred. +* +*/ +dcgmReturn_t DECLDIR dcgmConfigGet(dcgmHandle_t pDcgmHandle, + dcgmGpuGrp_t groupId, + dcgmConfigType_t type, + int count, + dcgmConfig_t deviceConfigList[], + dcgmStatus_t statusHandle); /** @} */ // Closing for DCGMAPI_DC_Setup @@ -732,47 +807,45 @@ dcgmReturn_t DECLDIR dcgmConfigGet(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupI */ /***************************************************************************************************/ - /** * Used to enforce previously set configuration for all the GPUs present in the group. - * + * * This API provides a mechanism to the users to manually enforce the configuration at any point of - * time. The configuration can only be enforced if it's already configured using the API \ref + * time. The configuration can only be enforced if it's already configured using the API \ref * dcgmConfigSet. - * - * If any of the properties can't be enforced for any of the GPUs in the group then the API returns - * an error. The status handle \a statusHandle should be further evaluated to access error - * attributes for the failed operations. Please refer to status management APIs at \ref DCGMAPI_ST + * + * If any of the properties can't be enforced for any of the GPUs in the group then the API returns + * an error. The status handle \a statusHandle should be further evaluated to access error + * attributes for the failed operations. Please refer to status management APIs at \ref DCGMAPI_ST * to access the error attributes. 
- * - * @param pDcgmHandle IN : DCGM Handle - * - * @param groupId IN : Group ID representing collection of one or more GPUs. Look at - * \ref dcgmGroupCreate for details on creating the group. - * Alternatively, pass in the group id as \a DCGM_GROUP_ALL_GPUS - * to perform operation on all the GPUs. - * @param statusHandle IN/OUT : Resulting error status for multiple operations. Pass it as - * NULL if the detailed error information is not needed. - * Look at \ref dcgmStatusCreate for details on creating - * status handle. + * + * @param pDcgmHandle IN: DCGM Handle + * @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate + * for details on creating the group. Alternatively, pass in the group id as + * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs. + * @param statusHandle IN/OUT: Resulting error status for multiple operations. Pass it as NULL if the detailed + * error information is not needed. Look at \ref dcgmStatusCreate for details on + * creating status handle. + * * @return * - \ref DCGM_ST_OK if the configuration has been successfully enforced. * - \ref DCGM_ST_BADPARAM if \a groupId is invalid. * - \ref DCGM_ST_NOT_CONFIGURED if the target configuration is not already set. * - \ref DCGM_ST_GENERIC_ERROR if an unknown error has occurred. 
+ * */ dcgmReturn_t DECLDIR dcgmConfigEnforce(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, dcgmStatus_t statusHandle); /** @} */ // Closing for DCGMAPI_DC_MI /** @} */ // Closing for DCGMAPI_DC - + /***************************************************************************************************/ /** @defgroup DCGMAPI_FI Field APIs - * + * * These APIs are responsible for watching, unwatching, and updating specific fields as defined * by DCGM_FI_* - * + * * @{ */ /***************************************************************************************************/ @@ -784,11 +857,10 @@ dcgmReturn_t DECLDIR dcgmConfigEnforce(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t gr * To force a field update cycle, call dcgmUpdateAllFields(1). * * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID representing collection of one or more entities. Look at - * \ref dcgmGroupCreate for details on creating the group. - * Alternatively, pass in the group id as \a DCGM_GROUP_ALL_GPUS - * to perform operation on all the GPUs or \a DCGM_GROUP_ALL_NVSWITCHES to - * to perform the operation on all NvSwitches. + * @param groupId IN: Group ID representing collection of one or more entities. Look at \ref dcgmGroupCreate + * for details on creating the group. Alternatively, pass in the group id as + * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs or + * \a DCGM_GROUP_ALL_NVSWITCHES to perform the operation on all NvSwitches. * @param fieldGroupId IN: Fields to watch.
* @param updateFreq IN: How often to update this field in usec * @param maxKeepAge IN: How long to keep data for this field in seconds @@ -797,183 +869,223 @@ dcgmReturn_t DECLDIR dcgmConfigEnforce(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t gr * @return * - \ref DCGM_ST_OK if the call was successful * - \ref DCGM_ST_BADPARAM if a parameter is invalid + * */ -dcgmReturn_t dcgmWatchFields(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, dcgmFieldGrp_t fieldGroupId, - long long updateFreq, double maxKeepAge, int maxKeepSamples); +dcgmReturn_t dcgmWatchFields(dcgmHandle_t pDcgmHandle, + dcgmGpuGrp_t groupId, + dcgmFieldGrp_t fieldGroupId, + long long updateFreq, + double maxKeepAge, + int maxKeepSamples); /** * Request that DCGM stop recording updates for a given field collection. * * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID representing collection of one or more entities. Look at - * \ref dcgmGroupCreate for details on creating the group. - * Alternatively, pass in the group id as \a DCGM_GROUP_ALL_GPUS - * to perform operation on all the GPUs or \a DCGM_GROUP_ALL_NVSWITCHES to - * to perform the operation on all NvSwitches. + * @param groupId IN: Group ID representing collection of one or more entities. Look at \ref dcgmGroupCreate + * for details on creating the group. Alternatively, pass in the group id as + * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs or + * \a DCGM_GROUP_ALL_NVSWITCHES to perform the operation on all NvSwitches. * @param fieldGroupId IN: Fields to unwatch.
* * @return * - \ref DCGM_ST_OK if the call was successful * - \ref DCGM_ST_BADPARAM if a parameter is invalid + * */ - dcgmReturn_t dcgmUnwatchFields(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, dcgmFieldGrp_t fieldGroupId); +dcgmReturn_t dcgmUnwatchFields(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, dcgmFieldGrp_t fieldGroupId); /** * Request updates for all field values that have updated since a given timestamp - * - * This version only works with GPU entities. Use \ref dcgmGetValuesSince_v2 for entity groups + * + * This version only works with GPU entities. Use \ref dcgmGetValuesSince_v2 for entity groups * containing NvSwitches. * * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID representing collection of one or more GPUs. Look at - * \ref dcgmGroupCreate for details on creating the group. - * Alternatively, pass in the group id as \a DCGM_GROUP_ALL_GPUS - * to perform operation on all the GPUs. + * @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate for + * details on creating the group. Alternatively, pass in the group id as + * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs. * @param fieldGroupId IN: Fields to return data for - * @param sinceTimestamp IN: Timestamp to request values since in usec since 1970. This will - * be returned in nextSinceTimestamp for subsequent calls - * 0 = request all data + * @param sinceTimestamp IN: Timestamp to request values since in usec since 1970. This will be returned in + * nextSinceTimestamp for subsequent calls 0 = request all data * @param nextSinceTimestamp OUT: Timestamp to use for sinceTimestamp on next call to this function - * @param enumCB IN: Callback to invoke for every field value update. Note that - * multiple updates can be returned in each invocation + * @param enumCB IN: Callback to invoke for every field value update. 
Note that multiple updates can be + * returned in each invocation * @param userData IN: User data pointer to pass to the userData field of enumCB. * * @return * - \ref DCGM_ST_OK if the call was successful * - \ref DCGM_ST_NOT_SUPPORTED if one of the entities was from a non-GPU type * - \ref DCGM_ST_BADPARAM if a parameter is invalid + * */ - -dcgmReturn_t dcgmGetValuesSince(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, dcgmFieldGrp_t fieldGroupId, - long long sinceTimestamp, long long *nextSinceTimestamp, - dcgmFieldValueEnumeration_f enumCB, void *userData); +dcgmReturn_t dcgmGetValuesSince(dcgmHandle_t pDcgmHandle, + dcgmGpuGrp_t groupId, + dcgmFieldGrp_t fieldGroupId, + long long sinceTimestamp, + long long *nextSinceTimestamp, + dcgmFieldValueEnumeration_f enumCB, + void *userData); /** * Request updates for all field values that have updated since a given timestamp - * + * * This version works with non-GPU entities like NvSwitches * * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID representing collection of one or more entities. Look at - * \ref dcgmGroupCreate for details on creating the group. - * Alternatively, pass in the group id as \a DCGM_GROUP_ALL_GPUS - * to perform operation on all the GPUs or \a DCGM_GROUP_ALL_NVSWITCHES to - * perform the operation on all NvSwitches. + * @param groupId IN: Group ID representing collection of one or more entities. Look at \ref dcgmGroupCreate + * for details on creating the group. Alternatively, pass in the group id as + * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs or + * \a DCGM_GROUP_ALL_NVSWITCHES to perform the operation on all NvSwitches. * @param fieldGroupId IN: Fields to return data for - * @param sinceTimestamp IN: Timestamp to request values since in usec since 1970. This will - * be returned in nextSinceTimestamp for subsequent calls - * 0 = request all data + * @param sinceTimestamp IN: Timestamp to request values since in usec since 1970. 
This will be returned in + * nextSinceTimestamp for subsequent calls 0 = request all data * @param nextSinceTimestamp OUT: Timestamp to use for sinceTimestamp on next call to this function - * @param enumCB IN: Callback to invoke for every field value update. Note that - * multiple updates can be returned in each invocation + * @param enumCB IN: Callback to invoke for every field value update. Note that multiple updates can be + * returned in each invocation * @param userData IN: User data pointer to pass to the userData field of enumCB. * * @return * - \ref DCGM_ST_OK if the call was successful * - \ref DCGM_ST_BADPARAM if a parameter is invalid + * */ - -dcgmReturn_t dcgmGetValuesSince_v2(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, dcgmFieldGrp_t fieldGroupId, - long long sinceTimestamp, long long *nextSinceTimestamp, - dcgmFieldValueEntityEnumeration_f enumCB, void *userData); +dcgmReturn_t dcgmGetValuesSince_v2(dcgmHandle_t pDcgmHandle, + dcgmGpuGrp_t groupId, + dcgmFieldGrp_t fieldGroupId, + long long sinceTimestamp, + long long *nextSinceTimestamp, + dcgmFieldValueEntityEnumeration_f enumCB, + void *userData); /** * Request latest cached field value for a field value collection - * - * This version only works with GPU entities. Use \ref dcgmGetLatestValues_v2 for entity groups + * + * This version only works with GPU entities. Use \ref dcgmGetLatestValues_v2 for entity groups * containing NvSwitches. * * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID representing collection of one or more GPUs. Look at - * \ref dcgmGroupCreate for details on creating the group. - * Alternatively, pass in the group id as \a DCGM_GROUP_ALL_GPUS - * to perform operation on all the GPUs. + * @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate for + * details on creating the group. Alternatively, pass in the group id as + * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs. 
* @param fieldGroupId IN: Fields to return data for. - * @param enumCB IN: Callback to invoke for every field value update. Note that - * multiple updates can be returned in each invocation + * @param enumCB IN: Callback to invoke for every field value update. Note that multiple updates can be + * returned in each invocation * @param userData IN: User data pointer to pass to the userData field of enumCB. * + * @return * - \ref DCGM_ST_OK if the call was successful * - \ref DCGM_ST_NOT_SUPPORTED if one of the entities was from a non-GPU type * - \ref DCGM_ST_BADPARAM if a parameter is invalid + * */ -dcgmReturn_t dcgmGetLatestValues(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, dcgmFieldGrp_t fieldGroupId, - dcgmFieldValueEnumeration_f enumCB, void *userData); +dcgmReturn_t dcgmGetLatestValues(dcgmHandle_t pDcgmHandle, + dcgmGpuGrp_t groupId, + dcgmFieldGrp_t fieldGroupId, + dcgmFieldValueEnumeration_f enumCB, + void *userData); /** * Request latest cached field value for a field value collection - * + * * This version works with non-GPU entities like NvSwitches * * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID representing collection of one or more entities. Look at - * \ref dcgmGroupCreate for details on creating the group. - * Alternatively, pass in the group id as \a DCGM_GROUP_ALL_GPUS - * to perform operation on all the GPUs or \a DCGM_GROUP_ALL_NVSWITCHES to - * perform the operation on all NvSwitches. + * @param groupId IN: Group ID representing collection of one or more entities. Look at \ref dcgmGroupCreate + * for details on creating the group. Alternatively, pass in the group id as + * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs or + * \a DCGM_GROUP_ALL_NVSWITCHES to perform the operation on all NvSwitches. * @param fieldGroupId IN: Fields to return data for. - * @param enumCB IN: Callback to invoke for every field value update. 
Note that - * multiple updates can be returned in each invocation + * @param enumCB IN: Callback to invoke for every field value update. Note that multiple updates can be + * returned in each invocation * @param userData IN: User data pointer to pass to the userData field of enumCB. * + * @return * - \ref DCGM_ST_OK if the call was successful * - \ref DCGM_ST_NOT_SUPPORTED if one of the entities was from a non-GPU type * - \ref DCGM_ST_BADPARAM if a parameter is invalid + * */ -dcgmReturn_t dcgmGetLatestValues_v2(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, dcgmFieldGrp_t fieldGroupId, - dcgmFieldValueEntityEnumeration_f enumCB, void *userData); +dcgmReturn_t dcgmGetLatestValues_v2(dcgmHandle_t pDcgmHandle, + dcgmGpuGrp_t groupId, + dcgmFieldGrp_t fieldGroupId, + dcgmFieldValueEntityEnumeration_f enumCB, + void *userData); /** * Request latest cached field value for a GPU * - * @param pDcgmHandle IN: DCGM Handle - * @param gpuId IN: Gpu ID representing the GPU for which the fields are being requested. - * @param fields IN: Field IDs to return data for. See the definitions in dcgm_fields.h that - * start with DCGM_FI_. - * @param count IN: Number of field IDs in fields[] array. - * @param values OUT: Latest field values for the fields in fields[]. + * @param pDcgmHandle IN: DCGM Handle + * @param gpuId IN: Gpu ID representing the GPU for which the fields are being requested. + * @param fields IN: Field IDs to return data for. See the definitions in dcgm_fields.h that start with DCGM_FI_. + * @param count IN: Number of field IDs in fields[] array. + * @param values OUT: Latest field values for the fields in fields[]. 
+ * */ -dcgmReturn_t dcgmGetLatestValuesForFields(dcgmHandle_t pDcgmHandle, int gpuId, unsigned short fields[], - unsigned int count, dcgmFieldValue_v1 values[]); +dcgmReturn_t dcgmGetLatestValuesForFields(dcgmHandle_t pDcgmHandle, + int gpuId, + unsigned short fields[], + unsigned int count, + dcgmFieldValue_v1 values[]); /** * Request latest cached field value for a group of fields for a specific entity * - * @param pDcgmHandle IN: DCGM Handle - * @param entityGroup IN: entity_group_t (e.g. switch) - * @param entityId IN: entity ID representing the rntity for which the fields are being requested. - * @param fields IN: Field IDs to return data for. See the definitions in dcgm_fields.h that - * start with DCGM_FI_. - * @param count IN: Number of field IDs in fields[] array. - * @param values OUT: Latest field values for the fields in fields[]. - */ -dcgmReturn_t dcgmEntityGetLatestValues(dcgmHandle_t pDcgmHandle, dcgm_field_entity_group_t entityGroup, - int entityId, unsigned short fields[], unsigned int count, + * @param pDcgmHandle IN: DCGM Handle + * @param entityGroup IN: entity_group_t (e.g. switch) + * @param entityId IN: entity ID representing the entity for which the fields are being requested. + * @param fields IN: Field IDs to return data for. See the definitions in dcgm_fields.h that start with DCGM_FI_. + * @param count IN: Number of field IDs in fields[] array. + * @param values OUT: Latest field values for the fields in fields[]. + * + */ +dcgmReturn_t dcgmEntityGetLatestValues(dcgmHandle_t pDcgmHandle, + dcgm_field_entity_group_t entityGroup, + int entityId, + unsigned short fields[], + unsigned int count, dcgmFieldValue_v1 values[]); /** * Request the latest cached or live field value for a list of fields for a group of entities * * Note: The returned entities are not guaranteed to be in any order. Reordering can occur internally - * in order to optimize calls to the NVIDIA driver.
- * - * @param pDcgmHandle IN: DCGM Handle - * @param entities IN: List of entities to get values for - * @param entityCount IN: Number of entries in entities[] - * @param fields IN: Field IDs to return data for. See the definitions in dcgm_fields.h that - * start with DCGM_FI_. - * @param fieldCount IN: Number of field IDs in fields[] array. - * @param flags IN: Optional flags that affect how this request is processed. Pass - * \ref DCGM_FV_FLAG_LIVE_DATA here to retrieve a live driver value rather - * than a cached value. See that flag's documentation for caveats. - * @param values OUT: Latest field values for the fields requested. This must be able to hold - * entityCount * fieldCount field value records. - */ -dcgmReturn_t dcgmEntitiesGetLatestValues(dcgmHandle_t pDcgmHandle, dcgmGroupEntityPair_t entities[], - unsigned int entityCount, unsigned short fields[], - unsigned int fieldCount, unsigned int flags, + * in order to optimize calls to the NVIDIA driver. + * + * @param pDcgmHandle IN: DCGM Handle + * @param entities IN: List of entities to get values for + * @param entityCount IN: Number of entries in entities[] + * @param fields IN: Field IDs to return data for. See the definitions in dcgm_fields.h that start with DCGM_FI_. + * @param fieldCount IN: Number of field IDs in fields[] array. + * @param flags IN: Optional flags that affect how this request is processed. Pass \ref DCGM_FV_FLAG_LIVE_DATA + * here to retrieve a live driver value rather than a cached value. See that flag's + * documentation for caveats. + * @param values OUT: Latest field values for the fields requested. This must be able to hold entityCount * + * fieldCount field value records. 
+ * + */ +dcgmReturn_t dcgmEntitiesGetLatestValues(dcgmHandle_t pDcgmHandle, + dcgmGroupEntityPair_t entities[], + unsigned int entityCount, + unsigned short fields[], + unsigned int fieldCount, + unsigned int flags, dcgmFieldValue_v2 values[]); +/*************************************************************************/ +/** + * Get a summary of the values for a field id over a period of time. + * + * @param pDcgmHandle IN: DCGM Handle + * @param request IN/OUT: a pointer to the struct detailing the request and containing the response + * + * @return + * - \ref DCGM_ST_OK if the call was successful + * - \ref DCGM_ST_FIELD_UNSUPPORTED_BY_API if the field is not int64 or double type + * + */ +dcgmReturn_t DECLDIR dcgmGetFieldSummary(dcgmHandle_t pDcgmHandle, dcgmFieldSummaryRequest_t *request); + /** @} */ /***************************************************************************************************/ @@ -984,19 +1096,20 @@ dcgmReturn_t dcgmEntitiesGetLatestValues(dcgmHandle_t pDcgmHandle, dcgmGroupEnti /** * This method is used to tell the DCGM module to update all the fields being watched. - * + * * Note: If the if the operation mode was set to manual mode (DCGM_OPERATION_MODE_MANUAL) during * initialization (\ref dcgmInit), this method must be caused periodically to allow field value watches * the opportunity to gather samples. - * + * * @param pDcgmHandle IN: DCGM Handle - * @param waitForUpdate IN: Whether or not to wait for the update loop to - * complete before returning to the caller 1=wait. 0=do not wait. + * @param waitForUpdate IN: Whether or not to wait for the update loop to complete before returning to the + * caller 1=wait. 0=do not wait. 
* - * @return + * @return * - \ref DCGM_ST_OK if the call was successful * - \ref DCGM_ST_BADPARAM if \a waitForUpdate is invalid * - \ref DCGM_ST_GENERIC_ERROR if an unspecified DCGM error occurs + * */ dcgmReturn_t dcgmUpdateAllFields(dcgmHandle_t pDcgmHandle, int waitForUpdate); @@ -1005,7 +1118,7 @@ dcgmReturn_t dcgmUpdateAllFields(dcgmHandle_t pDcgmHandle, int waitForUpdate); /***************************************************************************************************/ /** @defgroup DCGMAPI_PROCESS_STATS Process Statistics - * Describes APIs to investigate statistics such as accounting, performance and errors during the + * Describes APIs to investigate statistics such as accounting, performance and errors during the * lifetime of a GPU process * @{ */ @@ -1018,10 +1131,9 @@ dcgmReturn_t dcgmUpdateAllFields(dcgmHandle_t pDcgmHandle, int waitForUpdate); * To force a field update cycle, call dcgmUpdateAllFields(1). * * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID representing collection of one or more GPUs. Look at - * \ref dcgmGroupCreate for details on creating the group. - * Alternatively, pass in the group id as \a DCGM_GROUP_ALL_GPUS - * to perform operation on all the GPUs. + * @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate for + * details on creating the group. Alternatively, pass in the group id as + * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs. * @param updateFreq IN: How often to update this field in usec * @param maxKeepAge IN: How long to keep data for this field in seconds * @param maxKeepSamples IN: Maximum number of samples to keep. 
0=no limit @@ -1029,14 +1141,16 @@ dcgmReturn_t dcgmUpdateAllFields(dcgmHandle_t pDcgmHandle, int waitForUpdate); * @return * - \ref DCGM_ST_OK if the call was successful * - \ref DCGM_ST_BADPARAM if a parameter is invalid - * - \ref DCGM_ST_REQUIRES_ROOT if the host engine is being run as non-root, and - * accounting mode could not be enabled (requires root). - * Run "nvidia-smi -am 1" as root on the node before starting - * DCGM to fix this. + * - \ref DCGM_ST_REQUIRES_ROOT if the host engine is being run as non-root, and accounting mode could not + * be enabled (requires root). Run "nvidia-smi -am 1" as root on the node + * before starting DCGM to fix this. + * */ - -dcgmReturn_t dcgmWatchPidFields(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, - long long updateFreq, double maxKeepAge, int maxKeepSamples); +dcgmReturn_t dcgmWatchPidFields(dcgmHandle_t pDcgmHandle, + dcgmGpuGrp_t groupId, + long long updateFreq, + double maxKeepAge, + int maxKeepSamples); /** * @@ -1048,13 +1162,12 @@ dcgmReturn_t dcgmWatchPidFields(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, * * @param pDcgmHandle IN: DCGM Handle * @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate - * for details on creating the group. Alternatively, pass in the group id as \a DCGM_GROUP_ALL_GPUS - * to perform operation on all the GPUs. - * @param pidInfo IN/OUT: Structure to return information about pid in. - * pidInfo->pid must be set to the pid in question. + * for details on creating the group. Alternatively, pass in the group id as + * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs. + * @param pidInfo IN/OUT: Structure to return information about pid in. pidInfo->pid must be set to the pid in question. * pidInfo->version should be set to dcgmPidInfo_version. 
* - * @return + * @return * - \ref DCGM_ST_OK if the call was successful * - \ref DCGM_ST_NO_DATA if the PID did not run on any GPU * @@ -1065,9 +1178,9 @@ dcgmReturn_t dcgmGetPidInfo(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, dcgm /***************************************************************************************************/ /** @defgroup DCGMAPI_JOB_STATS Job Statistics - * The client can invoke DCGM APIs to start and stop collecting the stats at the process boundaries - * (during prologue and epilogue). This will enable DCGM to monitor all the PIDs while the job is - * in progress, and provide a summary of active processes and resource usage during the window of + * The client can invoke DCGM APIs to start and stop collecting the stats at the process boundaries + * (during prologue and epilogue). This will enable DCGM to monitor all the PIDs while the job is + * in progress, and provide a summary of active processes and resource usage during the window of * interest. * @{ */ @@ -1080,10 +1193,9 @@ dcgmReturn_t dcgmGetPidInfo(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, dcgm * To force a field update cycle, call dcgmUpdateAllFields(1). * * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID representing collection of one or more GPUs. Look at - * \ref dcgmGroupCreate for details on creating the group. - * Alternatively, pass in the group id as \a DCGM_GROUP_ALL_GPUS - * to perform operation on all the GPUs. + * @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate for + * details on creating the group. Alternatively, pass in the group id as + * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs. * @param updateFreq IN: How often to update this field in usec * @param maxKeepAge IN: How long to keep data for this field in seconds * @param maxKeepSamples IN: Maximum number of samples to keep. 
0=no limit @@ -1095,59 +1207,65 @@ dcgmReturn_t dcgmGetPidInfo(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, dcgm * accounting mode could not be enabled (requires root). * Run "nvidia-smi -am 1" as root on the node before starting * DCGM to fix this. + * */ - -dcgmReturn_t dcgmWatchJobFields(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, - long long updateFreq, double maxKeepAge, int maxKeepSamples); - +dcgmReturn_t dcgmWatchJobFields(dcgmHandle_t pDcgmHandle, + dcgmGpuGrp_t groupId, + long long updateFreq, + double maxKeepAge, + int maxKeepSamples); /** * This API is used by the client to notify DCGM about the job to be started. Should be invoked as * part of job prologue - * - * @param pDcgmHandle IN : DCGM Handle - * @param groupId IN : Group ID representing collection of one or more GPUs. Look at - * \ref dcgmGroupCreate for details on creating the group. - * Alternatively, pass in the group id as \a DCGM_GROUP_ALL_GPUS - * to perform operation on all the GPUs. - * @param jobId IN : User provided string to represent the job - * @return + * + * @param pDcgmHandle IN: DCGM Handle + * @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate for + * details on creating the group. Alternatively, pass in the group id as + * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs. + * @param jobId IN: User provided string to represent the job + * + * @return * - \ref DCGM_ST_OK if the call was successful * - \ref DCGM_ST_BADPARAM if a parameter is invalid * - \ref DCGM_ST_DUPLICATE_KEY if the specified \a jobId is already in use + * */ dcgmReturn_t dcgmJobStartStats(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, char jobId[64]); /** - * This API is used by the clients to notify DCGM to stop collecting stats for the job represented + * This API is used by the clients to notify DCGM to stop collecting stats for the job represented * by job id. Should be invoked as part of job epilogue. 
* The job Id remains available to view the stats at any point but cannot be used to start a new job. * You must call dcgmWatchJobFields() before this call to enable watching of job - * - * @param pDcgmHandle IN : DCGM Handle - * @param jobId IN : User provided string to represent the job + * + * @param pDcgmHandle IN: DCGM Handle + * @param jobId IN: User provided string to represent the job + * * @return * - \ref DCGM_ST_OK if the call was successful * - \ref DCGM_ST_BADPARAM if a parameter is invalid * - \ref DCGM_ST_NO_DATA if \a jobId is not a valid job identifier. + * */ dcgmReturn_t dcgmJobStopStats(dcgmHandle_t pDcgmHandle, char jobId[64]); /** - * Get stats for the job identified by DCGM generated job id. The stats can be retrieved at any + * Get stats for the job identified by DCGM generated job id. The stats can be retrieved at any * point when the job is in process. * If you want to reuse this jobId, call \ref dcgmJobRemove after this call. - * - * @param pDcgmHandle IN : DCGM Handle - * @param jobId IN : User provided string to represent the job - * @param pJobInfo IN/OUT : Structure to return information about the - * job.
.version should be set to - * \ref dcgmJobInfo_version before this call. + * + * @param pDcgmHandle IN: DCGM Handle + * @param jobId IN: User provided string to represent the job + * @param pJobInfo IN/OUT: Structure to return information about the job.
.version should be set to + * \ref dcgmJobInfo_version before this call. + * * @return * - \ref DCGM_ST_OK if the call was successful * - \ref DCGM_ST_BADPARAM if a parameter is invalid * - \ref DCGM_ST_NO_DATA if \a jobId is not a valid job identifier. * - \ref DCGM_ST_VER_MISMATCH if .version is not set or is invalid. + * */ dcgmReturn_t dcgmJobGetStats(dcgmHandle_t pDcgmHandle, char jobId[64], dcgmJobInfo_t *pJobInfo); @@ -1156,13 +1274,14 @@ dcgmReturn_t dcgmJobGetStats(dcgmHandle_t pDcgmHandle, char jobId[64], dcgmJobIn * be able to call dcgmJobGetStats() on this jobId. However, you will be able to reuse jobId after * this call. * - * @param pDcgmHandle IN : DCGM Handle - * @param jobId IN : User provided string to represent the job + * @param pDcgmHandle IN: DCGM Handle + * @param jobId IN: User provided string to represent the job * * @return * - \ref DCGM_ST_OK if the call was successful * - \ref DCGM_ST_BADPARAM if a parameter is invalid * - \ref DCGM_ST_NO_DATA if \a jobId is not a valid job identifier. + * */ dcgmReturn_t dcgmJobRemove(dcgmHandle_t pDcgmHandle, char jobId[64]); @@ -1171,7 +1290,7 @@ dcgmReturn_t dcgmJobRemove(dcgmHandle_t pDcgmHandle, char jobId[64]); * be able to call dcgmJobGetStats() any jobs until you call dcgmJobStartStats again. * You will be able to reuse any previously-used jobIds after this call. * - * @param pDcgmHandle IN : DCGM Handle + * @param pDcgmHandle IN: DCGM Handle * * @return * - \ref DCGM_ST_OK if the call was successful @@ -1185,7 +1304,7 @@ dcgmReturn_t dcgmJobRemoveAll(dcgmHandle_t pDcgmHandle); /** @defgroup DCGMAPI_HM Health Monitor * * This chapter describes the methods that handle the GPU health monitor. 
- * + * * @{ */ /***************************************************************************************************/ @@ -1193,40 +1312,53 @@ dcgmReturn_t dcgmJobRemoveAll(dcgmHandle_t pDcgmHandle); /** * Enable the DCGM health check system for the given systems defined in \ref dcgmHealthSystems_t * - * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID representing collection of one or more entities. Look - * at \ref dcgmGroupCreate for details on creating the group. - * Alternatively, pass in the group id as \a DCGM_GROUP_ALL_GPUS - * to perform operation on all the GPUs or \a DCGM_GROUP_ALL_NVSWITCHES - * to perform operation on all the NvSwitches. - * @param systems IN: An enum representing systems that should be enabled for health - * checks logically OR'd together. Refer to \ref dcgmHealthSystems_t - * for details. + * @param pDcgmHandle IN: DCGM Handle + * @param groupId IN: Group ID representing collection of one or more entities. Look at \ref dcgmGroupCreate + * for details on creating the group. Alternatively, pass in the group id as + * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs or + * \a DCGM_GROUP_ALL_NVSWITCHES to perform operation on all the NvSwitches. + * @param systems IN: An enum representing systems that should be enabled for health checks logically OR'd + * together. Refer to \ref dcgmHealthSystems_t for details. * * @return * - \ref DCGM_ST_OK if the call was successful * - \ref DCGM_ST_BADPARAM if a parameter is invalid + * */ - dcgmReturn_t dcgmHealthSet(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, dcgmHealthSystems_t systems); /** - * Retrieve the current state of the DCGM health check system + * Enable the DCGM health check system for the given systems defined in \ref dcgmHealthSystems_t + * + * Since DCGM 2.0 * * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID representing collection of one or more entities. Look - * at \ref dcgmGroupCreate for details on creating the group. 
- * Alternatively, pass in the group id as \a DCGM_GROUP_ALL_GPUS - * to perform operation on all the GPUs or \a DCGM_GROUP_ALL_NVSWITCHES - * to perform operation on all the NvSwitches. - * @param systems OUT: An integer representing the enabled systems for the given group - * Refer to \ref dcgmHealthSystems_t for details. + * @param params IN: Parameters to use when setting health watches. See + * \ref dcgmHealthSetParams_v2 for the description of each parameter. * * @return * - \ref DCGM_ST_OK if the call was successful * - \ref DCGM_ST_BADPARAM if a parameter is invalid */ - + +dcgmReturn_t dcgmHealthSet_v2(dcgmHandle_t pDcgmHandle, dcgmHealthSetParams_v2 *params); + +/** + * Retrieve the current state of the DCGM health check system + * + * @param pDcgmHandle IN: DCGM Handle + * @param groupId IN: Group ID representing collection of one or more entities. Look at \ref dcgmGroupCreate + * for details on creating the group. Alternatively, pass in the group id as + * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs or + * \a DCGM_GROUP_ALL_NVSWITCHES to perform operation on all the NvSwitches. + * @param systems OUT: An integer representing the enabled systems for the given group. Refer to + * \ref dcgmHealthSystems_t for details. + * + * @return + * - \ref DCGM_ST_OK if the call was successful + * - \ref DCGM_ST_BADPARAM if a parameter is invalid + * + */ dcgmReturn_t dcgmHealthGet(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, dcgmHealthSystems_t *systems); @@ -1236,7 +1368,7 @@ dcgmReturn_t dcgmHealthGet(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, dcgmH * about all of the enabled watches within a group is created but no error results are * provided. On subsequent calls, any error information will be returned. * - * + * * @param pDcgmHandle IN: DCGM Handle * @param groupId IN: Group ID representing a collection of one or more entities.
* Refer to \ref dcgmGroupCreate for details on creating a group @@ -1247,8 +1379,8 @@ dcgmReturn_t dcgmHealthGet(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, dcgmH * - \ref DCGM_ST_OK if the call was successful * - \ref DCGM_ST_BADPARAM if a parameter is invalid * - \ref DCGM_ST_VER_MISMATCH if results->version is not dcgmHealthResponse_version + * */ - dcgmReturn_t dcgmHealthCheck(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, dcgmHealthResponse_t *results); /** @} */ @@ -1258,14 +1390,14 @@ dcgmReturn_t dcgmHealthCheck(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, dcg * * This chapter describes the methods that handle system policy management and violation settings. * The APIs in Policies module can be broken down into following categories: - * + * * @{ */ /***************************************************************************************************/ /***************************************************************************************************/ /** @defgroup DCGMAPI_PO_Setup Setup and Management - * Describes APIs for setting up policies and registering callbacks to receive notification in + * Describes APIs for setting up policies and registering callbacks to receive notification in * case specific policy condition has been violated. * @{ */ @@ -1275,98 +1407,99 @@ dcgmReturn_t dcgmHealthCheck(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, dcg * Set the current violation policy inside the policy manager. Given the conditions within the * \ref dcgmPolicy_t structure, if a violation has occurred, subsequent action(s) may be performed to * either report or contain the failure. - * - * This API is only supported on Tesla GPUs and will return DCGM_ST_NOT_SUPPORTED if any non-Tesla GPUs - * are part of the GPU group specified in groupId. - * - * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate - * for details on creating the group. 
Alternatively, pass in the group id as \a DCGM_GROUP_ALL_GPUS - * to perform operation on all the GPUs. - * @param policy IN: A reference to \ref dcgmPolicy_t that will be applied to all - * GPUs in the group. - * @param statusHandle IN/OUT: Resulting status for the operation. Pass it as NULL if - * the detailed error information is not needed. - * Refer to \ref dcgmStatusCreate for details on - * creating a status handle. - * @return + * + * @param pDcgmHandle IN: DCGM Handle + * @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate for + * details on creating the group. Alternatively, pass in the group id as + * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs. + * @param policy IN: A reference to \ref dcgmPolicy_t that will be applied to all GPUs in the group. + * @param statusHandle IN/OUT: Resulting status for the operation. Pass it as NULL if the detailed error information + * is not needed. Refer to \ref dcgmStatusCreate for details on creating a status handle. + * + * @return * - \ref DCGM_ST_OK if the call was successful * - \ref DCGM_ST_BADPARAM if \a groupId or \a policy is invalid - * - \ref DCGM_ST_NOT_SUPPORTED if any non-Tesla GPUs are part of the GPU group specified in groupId + * - \ref DCGM_ST_NOT_SUPPORTED if any unsupported GPUs are part of the GPU group specified in groupId * - DCGM_ST_* a different error has occurred and is stored in \a statusHandle. * Refer to \ref dcgmReturn_t + * */ -dcgmReturn_t dcgmPolicySet(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, dcgmPolicy_t *policy, dcgmStatus_t statusHandle); +dcgmReturn_t dcgmPolicySet(dcgmHandle_t pDcgmHandle, + dcgmGpuGrp_t groupId, + dcgmPolicy_t *policy, + dcgmStatus_t statusHandle); /** - * Get the current violation policy inside the policy manager. Given a groupId, a number of + * Get the current violation policy inside the policy manager. Given a groupId, a number of * policy structures are retrieved. 
* - * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate - * for details on creating the group. Alternatively, pass in the group id as \a DCGM_GROUP_ALL_GPUS - * to perform operation on all the GPUs. - * @param count IN: The size of the policy array. This is the maximum number of policies - * that will be retrieved and ultimately should correspond to the number - * of GPUs specified in the group. - * @param policy OUT: A reference to \ref dcgmPolicy_t that will used as storage for the - * current policies applied to each GPU in the group. - * @param statusHandle IN/OUT: Resulting status for the operation. Pass it as NULL if - * the detailed error information for the operation is not - * needed. Refer to \ref dcgmStatusCreate - * for details on creating a status handle. - * - * @return + * @param pDcgmHandle IN: DCGM Handle + * @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate for + * details on creating the group. Alternatively, pass in the group id as + * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs. + * @param count IN: The size of the policy array. This is the maximum number of policies that will be + * retrieved and ultimately should correspond to the number of GPUs specified in the + * group. + * @param policy OUT: A reference to \ref dcgmPolicy_t that will be used as storage for the current policies + * applied to each GPU in the group. + * @param statusHandle IN/OUT: Resulting status for the operation. Pass it as NULL if the detailed error information + * for the operation is not needed. Refer to \ref dcgmStatusCreate for details on + * creating a status handle. + * + * @return * - \ref DCGM_ST_OK if the call was successful * - \ref DCGM_ST_BADPARAM if \a groupId or \a policy is invalid * - DCGM_ST_* a different error has occurred and is stored in \a statusHandle.
* Refer to \ref dcgmReturn_t + * */ -dcgmReturn_t dcgmPolicyGet(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, int count, - dcgmPolicy_t *policy, dcgmStatus_t statusHandle); +dcgmReturn_t dcgmPolicyGet(dcgmHandle_t pDcgmHandle, + dcgmGpuGrp_t groupId, + int count, + dcgmPolicy_t *policy, + dcgmStatus_t statusHandle); /** - * Register a function to be called when a specific policy condition (see \ref dcgmPolicyCondition_t) has been - * violated. This callback(s) will be called automatically when in DCGM_OPERATION_MODE_AUTO mode and only after + * Register a function to be called when a specific policy condition (see \ref dcgmPolicyCondition_t) has been + * violated. This callback(s) will be called automatically when in DCGM_OPERATION_MODE_AUTO mode and only after * dcgmPolicyTrigger when in DCGM_OPERATION_MODE_MANUAL mode. All callbacks are made within a separate thread. * - * This API is only supported on Tesla GPUs and will return DCGM_ST_NOT_SUPPORTED if any non-Tesla GPUs - * are part of the GPU group specified in groupId. + * @param pDcgmHandle IN: DCGM Handle + * @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate for + * details on creating the group. Alternatively, pass in the group id as + * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs. + * @param condition IN: The set of conditions specified as an OR'd list (see \ref dcgmPolicyCondition_t) for + * which to register a callback function + * @param beginCallback IN: A reference to a function that should be called should a violation occur. + * This function will be called prior to any actions specified by the policy are taken. + * @param finishCallback IN: A reference to a function that should be called should a violation occur. + * This function will be called after any action specified by the policy are completed. * - * @param pDcgmHandle IN: DCGM Handle - * - * @param groupId IN: Group ID representing collection of one or more GPUs. 
Look at \ref dcgmGroupCreate - * for details on creating the group. Alternatively, pass in the group id as \a DCGM_GROUP_ALL_GPUS - * to perform operation on all the GPUs. - * @param condition IN: The set of conditions specified as an OR'd list (see \ref dcgmPolicyCondition_t) - * for which to register a callback function - * @param beginCallback IN: A reference to a function that should be called should a violation occur. This - * function will be called prior to any actions specified by the policy are taken. - * @param finishCallback IN: A reference to a function that should be called should a violation occur. This - * function will be called after any action specified by the policy are completed. - * - * @return + * @return * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_BADPARAM if \a groupId, \a condition, is invalid, \a beginCallback, or \a finishCallback is NULL - * - \ref DCGM_ST_NOT_SUPPORTED if any non-Tesla GPUs are part of the GPU group specified in groupId + * - \ref DCGM_ST_BADPARAM if \a groupId, \a condition, is invalid, \a beginCallback, or + * \a finishCallback is NULL + * - \ref DCGM_ST_NOT_SUPPORTED if any unsupported GPUs are part of the GPU group specified in groupId * */ -dcgmReturn_t dcgmPolicyRegister(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, dcgmPolicyCondition_t condition, - fpRecvUpdates beginCallback, fpRecvUpdates finishCallback); +dcgmReturn_t dcgmPolicyRegister(dcgmHandle_t pDcgmHandle, + dcgmGpuGrp_t groupId, + dcgmPolicyCondition_t condition, + fpRecvUpdates beginCallback, + fpRecvUpdates finishCallback); /** * Unregister a function to be called for a specific policy condition (see \ref dcgmPolicyCondition_t). * This function will unregister all callbacks for a given condition and handle. - * - * @param pDcgmHandle IN: DCGM Handle * - * @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate - * for details on creating the group. 
Alternatively, pass in the group id as \a DCGM_GROUP_ALL_GPUS - * to perform operation on all the GPUs. - * @param condition IN: The set of conditions specified as an OR'd list (see \ref dcgmPolicyCondition_t) - * for which to unregister a callback function + * @param pDcgmHandle IN: DCGM Handle + * @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate for + * details on creating the group. Alternatively, pass in the group id as + * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs. + * @param condition IN: The set of conditions specified as an OR'd list (see \ref dcgmPolicyCondition_t) for + * which to unregister a callback function * - * @return + * @return * - \ref DCGM_ST_OK if the call was successful * - \ref DCGM_ST_BADPARAM if \a groupId, \a condition, is invalid or \a callback is NULL * @@ -1377,7 +1510,7 @@ dcgmReturn_t dcgmPolicyUnregister(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId /***************************************************************************************************/ /** @defgroup DCGMAPI_PO_MI Manual Invocation - * Describes APIs which can be used to perform direct actions (e.g. Perform GPU Reset, Run Health + * Describes APIs which can be used to perform direct actions (e.g. Perform GPU Reset, Run Health * Diagnostics) on a group of GPUs. * @{ */ @@ -1387,62 +1520,60 @@ dcgmReturn_t dcgmPolicyUnregister(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId * Inform the action manager to perform a manual validation of a group of GPUs on the system * * *************************************** DEPRECATED *************************************** - * - * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate - * for details on creating the group. Alternatively, pass in the group id as \a DCGM_GROUP_ALL_GPUS - * to perform operation on all the GPUs. 
- * @param validate IN: The validation to perform after the action. - * @param response OUT: Result of the validation process. Refer to \ref dcgmDiagResponse_t for details. - * - * - * @return + * + * @param pDcgmHandle IN: DCGM Handle + * @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate for + * details on creating the group. Alternatively, pass in the group id as + * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs. + * @param validate IN: The validation to perform after the action. + * @param response OUT: Result of the validation process. Refer to \ref dcgmDiagResponse_t for details. + * + * @return * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_NOT_SUPPORTED if running the specified \a validate is not supported. This is usually due to the - * Tesla recommended driver not being installed on the system. + * - \ref DCGM_ST_NOT_SUPPORTED if running the specified \a validate is not supported. This is usually due + * to the Tesla recommended driver not being installed on the system. * - \ref DCGM_ST_BADPARAM if \a groupId, \a validate, or \a statusHandle is invalid * - \ref DCGM_ST_GENERIC_ERROR an internal error has occurred - * - \ref DCGM_ST_GROUP_INCOMPATIBLE if \a groupId refers to a group of non-homogeneous GPUs. This is currently not allowed. + * - \ref DCGM_ST_GROUP_INCOMPATIBLE if \a groupId refers to a group of non-homogeneous GPUs. This is currently + * not allowed. 
+ * */ -dcgmReturn_t dcgmActionValidate(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, dcgmPolicyValidation_t validate, +dcgmReturn_t dcgmActionValidate(dcgmHandle_t pDcgmHandle, + dcgmGpuGrp_t groupId, + dcgmPolicyValidation_t validate, dcgmDiagResponse_t *response); /** * Inform the action manager to perform a manual validation of a group of GPUs on the system - * - * @param pDcgmHandle IN: DCGM Handle - * @param drd IN: Contains the group id, test names, test parameters, struct version, and - * the validation that should be performed. Look at \ref dcgmGroupCreate - * for details on creating the group. Alternatively, pass in the group id - * as \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs. - * @param response OUT: Result of the validation process. Refer to \ref dcgmDiagResponse_t for details. - * - * - * @return + * + * @param pDcgmHandle IN: DCGM Handle + * @param drd IN: Contains the group id, test names, test parameters, struct version, and the validation + * that should be performed. Look at \ref dcgmGroupCreate for details on creating the + * group. Alternatively, pass in the group id as \a DCGM_GROUP_ALL_GPUS to perform + * operation on all the GPUs. + * @param response OUT: Result of the validation process. Refer to \ref dcgmDiagResponse_t for details. + * + * @return * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_NOT_SUPPORTED if running the specified \a validate is not supported. This is usually due to the - * Tesla recommended driver not being installed on the system. + * - \ref DCGM_ST_NOT_SUPPORTED if running the specified \a validate is not supported. This is usually + * due to the Tesla recommended driver not being installed on the system. * - \ref DCGM_ST_BADPARAM if \a groupId, \a validate, or \a statusHandle is invalid * - \ref DCGM_ST_GENERIC_ERROR an internal error has occurred - * - \ref DCGM_ST_GROUP_INCOMPATIBLE if \a groupId refers to a group of non-homogeneous GPUs. This is currently not allowed. 
+ * - \ref DCGM_ST_GROUP_INCOMPATIBLE if \a groupId refers to a group of non-homogeneous GPUs. This is + * currently not allowed. */ -dcgmReturn_t dcgmActionValidate_v2(dcgmHandle_t pDcgmHandle, dcgmRunDiag_t *drd, dcgmDiagResponse_t *response); +dcgmReturn_t dcgmActionValidate_v2(dcgmHandle_t pDcgmHandle, dcgmRunDiag_v6 *drd, dcgmDiagResponse_t *response); /** * Run a diagnostic on a group of GPUs * - * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID representing collection of one - * or more GPUs. Look at \ref dcgmGroupCreate - * for details on creating the group. - * Alternatively, pass in the group id as - * \a DCGM_GROUP_ALL_GPUS to perform - * operation on all the GPUs. - * @param diagLevel IN: Diagnostic level to run - * @param diagResponse IN/OUT: Result of running the DCGM diagnostic.
- * .version should be set to - * \ref dcgmDiagResponse_version before this - * call. + * @param pDcgmHandle IN: DCGM Handle + * @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate + * for details on creating the group. Alternatively, pass in the group id as + * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs. + * @param diagLevel IN: Diagnostic level to run + * @param diagResponse IN/OUT: Result of running the DCGM diagnostic.
+ * .version should be set to \ref dcgmDiagResponse_version before this call. * * @return * - \ref DCGM_ST_OK if the call was successful @@ -1450,10 +1581,14 @@ dcgmReturn_t dcgmActionValidate_v2(dcgmHandle_t pDcgmHandle, dcgmRunDiag_t *drd, * Tesla recommended driver not being installed on the system. * - \ref DCGM_ST_BADPARAM if a provided parameter is invalid or missing * - \ref DCGM_ST_GENERIC_ERROR an internal error has occurred - * - \ref DCGM_ST_GROUP_INCOMPATIBLE if \a groupId refers to a group of non-homogeneous GPUs. This is currently not allowed. + * - \ref DCGM_ST_GROUP_INCOMPATIBLE if \a groupId refers to a group of non-homogeneous GPUs. This is + * currently not allowed. * - \ref DCGM_ST_VER_MISMATCH if .version is not set or is invalid. + * */ -dcgmReturn_t dcgmRunDiagnostic(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, dcgmDiagnosticLevel_t diagLevel, +dcgmReturn_t dcgmRunDiagnostic(dcgmHandle_t pDcgmHandle, + dcgmGpuGrp_t groupId, + dcgmDiagnosticLevel_t diagLevel, dcgmDiagResponse_t *diagResponse); /** @} */ // Closing for DCGMAPI_PO_MI @@ -1469,14 +1604,14 @@ dcgmReturn_t dcgmRunDiagnostic(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, d /** * Inform the policy manager loop to perform an iteration and trigger the callbacks of any * registered functions. Callback functions will be called from a separate thread as the calling function. - * - * Note: The GPU monitoring and management agent must call this method periodically if the operation - * mode is set to manual mode (DCGM_OPERATION_MODE_MANUAL) during initialization + * + * Note: The GPU monitoring and management agent must call this method periodically if the operation + * mode is set to manual mode (DCGM_OPERATION_MODE_MANUAL) during initialization * (\ref dcgmInit). - * + * * @param pDcgmHandle IN: DCGM Handle - * - * @return + * + * @return * - \ref DCGM_ST_OK If the call was successful * - DCGM_ST_GENERIC_ERROR The policy manager was unable to perform another iteration. 
*/ @@ -1495,29 +1630,34 @@ dcgmReturn_t dcgmPolicyTrigger(dcgmHandle_t pDcgmHandle); * * @param pDcgmHandle IN: DCGM Handle * @param gpuId IN: GPU Id corresponding to which topology information should be fetched - * @param pDcgmDeviceTopology IN/OUT: Topology information corresponding to \a gpuId. pDcgmDeviceTopology->version - * must be set to dcgmDeviceTopology_version before this call. + * @param pDcgmDeviceTopology IN/OUT: Topology information corresponding to \a gpuId. pDcgmDeviceTopology->version must + * be set to dcgmDeviceTopology_version before this call. * @return * - \ref DCGM_ST_OK if the call was successful. * - \ref DCGM_ST_BADPARAM if \a gpuId or \a pDcgmDeviceTopology were not valid. * - \ref DCGM_ST_VER_MISMATCH if pDcgmDeviceTopology->version was not set to dcgmDeviceTopology_version. + * */ - -dcgmReturn_t DECLDIR dcgmGetDeviceTopology(dcgmHandle_t pDcgmHandle, unsigned int gpuId, dcgmDeviceTopology_t *pDcgmDeviceTopology); +dcgmReturn_t DECLDIR dcgmGetDeviceTopology(dcgmHandle_t pDcgmHandle, + unsigned int gpuId, + dcgmDeviceTopology_t *pDcgmDeviceTopology); /** * Gets group topology corresponding to the \a groupId. * * @param pDcgmHandle IN: DCGM Handle * @param groupId IN: GroupId corresponding to which topology information should be fetched - * @param pDcgmGroupTopology IN/OUT: Topology information corresponding to \a groupId. pDcgmgroupTopology->version - * must be set to dcgmGroupTopology_version. + * @param pDcgmGroupTopology IN/OUT: Topology information corresponding to \a groupId. pDcgmgroupTopology->version must + * be set to dcgmGroupTopology_version. * @return * - \ref DCGM_ST_OK if the call was successful. * - \ref DCGM_ST_BADPARAM if \a groupId or \a pDcgmGroupTopology were not valid. * - \ref DCGM_ST_VER_MISMATCH if pDcgmgroupTopology->version was not set to dcgmGroupTopology_version. 
+ * */ -dcgmReturn_t DECLDIR dcgmGetGroupTopology(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, dcgmGroupTopology_t *pDcgmGroupTopology); +dcgmReturn_t DECLDIR dcgmGetGroupTopology(dcgmHandle_t pDcgmHandle, + dcgmGpuGrp_t groupId, + dcgmGroupTopology_t *pDcgmGroupTopology); /** @} */ // Closing for DCGMAPI_Topo @@ -1529,14 +1669,16 @@ dcgmReturn_t DECLDIR dcgmGetGroupTopology(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t /***************************************************************************************************/ /** - * Toggle the state of introspection metadata gathering in DCGM. Metadata gathering will increase the memory usage of DCGM - * so that it can store the metadata it gathers. + * Toggle the state of introspection metadata gathering in DCGM. Metadata gathering will increase the memory usage + * of DCGM so that it can store the metadata it gathers. * * @param pDcgmHandle IN: DCGM Handle * @param enabledState IN: The state to set gathering of introspection data to + * * @return * - \ref DCGM_ST_OK if the call was successful * - \ref DCGM_ST_BADPARAM enabledState is an invalid state for metadata gathering + * */ dcgmReturn_t DECLDIR dcgmIntrospectToggleState(dcgmHandle_t pDcgmHandle, dcgmIntrospectState_t enabledState); @@ -1545,19 +1687,19 @@ dcgmReturn_t DECLDIR dcgmIntrospectToggleState(dcgmHandle_t pDcgmHandle, dcgmInt * Get the current amount of memory used to store the given field collection. * * @param pDcgmHandle IN: DCGM Handle - * @param context IN: see \ref dcgmIntrospectContext_t. This identifies the level - * of fields to do introspection for (ex: all fields, field groups) - * context->version must be set to dcgmIntrospectContext_version prior - * to this call. - * @param memoryInfo IN/OUT: see \ref dcgmIntrospectFullMemory_t. memoryInfo->version must be set - * to dcgmIntrospectFullMemory_version prior to this call. 
- * @param waitIfNoData IN: if no metadata has been gathered, should this call block until data - * has been gathered (1), or should this call just return DCGM_ST_NO_DATA (0). + * @param context IN: see \ref dcgmIntrospectContext_t. This identifies the level of fields to do + * introspection for (ex: all fields, field groups) context->version must be + * set to dcgmIntrospectContext_version prior to this call. + * @param memoryInfo IN/OUT: see \ref dcgmIntrospectFullMemory_t. memoryInfo->version must be set to + * dcgmIntrospectFullMemory_version prior to this call. + * @param waitIfNoData IN: if no metadata has been gathered, should this call block until data has been + * gathered (1), or should this call just return DCGM_ST_NO_DATA (0). * @return * - \ref DCGM_ST_OK if the call was successful * - \ref DCGM_ST_NOT_CONFIGURED if metadata gathering state is \a DCGM_INTROSPECT_STATE_DISABLED * - \ref DCGM_ST_NO_DATA if \a waitIfNoData is false and metadata has not been gathered yet * - \ref DCGM_ST_VER_MISMATCH if context->version or memoryInfo->version is 0 or invalid. + * */ dcgmReturn_t DECLDIR dcgmIntrospectGetFieldsMemoryUsage(dcgmHandle_t pDcgmHandle, dcgmIntrospectContext_t *context, @@ -1571,17 +1713,17 @@ dcgmReturn_t DECLDIR dcgmIntrospectGetFieldsMemoryUsage(dcgmHandle_t pDcgmHandle * This measurement represents both the resident set size (what is currently in RAM) and * the swapped memory that belongs to the process. * - * @param pDcgmHandle IN: DCGM Handle - * @param memoryInfo IN/OUT: see \ref dcgmIntrospectMemory_t. memoryInfo->version must be set to - * dcgmIntrospectMemory_version prior to this call. - * @param waitIfNoData IN: if no metadata is gathered wait till this occurs (!0) - * or return DCGM_ST_NO_DATA (0) + * @param pDcgmHandle IN: DCGM Handle + * @param memoryInfo IN/OUT: see \ref dcgmIntrospectMemory_t. memoryInfo->version must be set to + * dcgmIntrospectMemory_version prior to this call. 
+ * @param waitIfNoData IN: if no metadata is gathered wait till this occurs (!0) or return DCGM_ST_NO_DATA (0) * * @return * - \ref DCGM_ST_OK if the call was successful * - \ref DCGM_ST_NOT_CONFIGURED if metadata gathering state is \a DCGM_INTROSPECT_STATE_DISABLED * - \ref DCGM_ST_NO_DATA if \a waitIfNoData is false and metadata has not been gathered yet * - \ref DCGM_ST_VER_MISMATCH if memoryInfo->version is 0 or invalid. + * */ dcgmReturn_t DECLDIR dcgmIntrospectGetHostengineMemoryUsage(dcgmHandle_t pDcgmHandle, dcgmIntrospectMemory_t *memoryInfo, @@ -1592,20 +1734,20 @@ dcgmReturn_t DECLDIR dcgmIntrospectGetHostengineMemoryUsage(dcgmHandle_t pDcgmHa * Get introspection info relating to execution time needed to update the fields * identified by \a context. * - * @param pDcgmHandle IN: DCGM Handle - * @param context IN: see \ref dcgmIntrospectContext_t. This identifies the level - * of fields to do introspection for (ex: all fields, field group ) - * context->version must be set to dcgmIntrospectContext_version prior - * to this call. - * @param execTime IN/OUT: see \ref dcgmIntrospectFullFieldsExecTime_t. execTime->version must be - * set to dcgmIntrospectFullFieldsExecTime_version prior to this call. - * @param waitIfNoData IN: if no metadata is gathered, wait until data has been gathered (1) - * or return DCGM_ST_NO_DATA (0) + * @param pDcgmHandle IN: DCGM Handle + * @param context IN: see \ref dcgmIntrospectContext_t. This identifies the level of fields to do + * introspection for (ex: all fields, field group ) context->version must be set to + * dcgmIntrospectContext_version prior to this call. + * @param execTime IN/OUT: see \ref dcgmIntrospectFullFieldsExecTime_t. execTime->version must be set to + * dcgmIntrospectFullFieldsExecTime_version prior to this call. 
+ * @param waitIfNoData IN: if no metadata is gathered, wait until data has been gathered (1) or return + * DCGM_ST_NO_DATA (0) * @return * - \ref DCGM_ST_OK if the call was successful * - \ref DCGM_ST_NOT_CONFIGURED if metadata gathering state is \a DCGM_INTROSPECT_STATE_DISABLED * - \ref DCGM_ST_NO_DATA if \a waitIfNoData is false and metadata has not been gathered yet * - \ref DCGM_ST_VER_MISMATCH if context->version or execTime->version is 0 or invalid. + * */ dcgmReturn_t DECLDIR dcgmIntrospectGetFieldsExecTime(dcgmHandle_t pDcgmHandle, dcgmIntrospectContext_t *context, @@ -1613,20 +1755,20 @@ dcgmReturn_t DECLDIR dcgmIntrospectGetFieldsExecTime(dcgmHandle_t pDcgmHandle, int waitIfNoData); /*************************************************************************/ -/* +/** * Retrieve the CPU utilization of the DCGM hostengine process. * - * @param pDcgmHandle IN: DCGM Handle - * @param cpuUtil IN/OUT: see \ref dcgmIntrospectCpuUtil_t. cpuUtil->version must be set to - * dcgmIntrospectCpuUtil_version prior to this call. - * @param waitIfNoData IN: if no metadata is gathered wait till this occurs (!0) - * or return DCGM_ST_NO_DATA (0) + * @param pDcgmHandle IN: DCGM Handle + * @param cpuUtil IN/OUT: see \ref dcgmIntrospectCpuUtil_t. cpuUtil->version must be set to + * dcgmIntrospectCpuUtil_version prior to this call. + * @param waitIfNoData IN: if no metadata is gathered wait till this occurs (!0) or return DCGM_ST_NO_DATA (0) * * @return * - \ref DCGM_ST_OK if the call was successful * - \ref DCGM_ST_NOT_CONFIGURED if metadata gathering state is \a DCGM_INTROSPECT_STATE_DISABLED * - \ref DCGM_ST_NO_DATA if \a waitIfNoData is false and metadata has not been gathered yet * - \ref DCGM_ST_VER_MISMATCH if cpuUtil->version or execTime->version is 0 or invalid. 
+ * */ dcgmReturn_t DECLDIR dcgmIntrospectGetHostengineCpuUtilization(dcgmHandle_t pDcgmHandle, dcgmIntrospectCpuUtil_t *cpuUtil, @@ -1638,13 +1780,13 @@ dcgmReturn_t DECLDIR dcgmIntrospectGetHostengineCpuUtilization(dcgmHandle_t pDcg * all DCGM introspection data. This is normally performed automatically on an * interval of 1 second. * - * @param pDcgmHandle IN: DCGM Handle - * @param waitForUpdate IN: Whether or not to wait for the update loop to - * complete before returning to the caller + * @param pDcgmHandle IN: DCGM Handle + * @param waitForUpdate IN: Whether or not to wait for the update loop to complete before returning to the caller * * @return * - \ref DCGM_ST_OK if the call was successful * - \ref DCGM_ST_BADPARAM if \a waitForUpdate is invalid + * */ dcgmReturn_t DECLDIR dcgmIntrospectUpdateAll(dcgmHandle_t pDcgmHandle, int waitForUpdate); @@ -1659,35 +1801,30 @@ dcgmReturn_t DECLDIR dcgmIntrospectUpdateAll(dcgmHandle_t pDcgmHandle, int waitF /*************************************************************************/ /** - * Get the best group of gpus from the specified bitmask according to topological proximity: cpuAffinity, NUMA + * Get the best group of gpus from the specified bitmask according to topological proximity: cpuAffinity, NUMA * node, and NVLink. * - * @param pDcgmHandle IN: DCGM Handle - * @param inputGpuIds IN: a bitmask of which GPUs DCGM should consider. If some of the GPUs on the system - * are already in use, they shouldn't be included in the bitmask. 0 means that all - * of the GPUs in the system should be considered. - * @param numGpus IN: the number of GPUs that are desired from inputGpuIds. If this number is greater - * than the number of healthy GPUs in inputGpuIds, then less than numGpus gpus will - * be specified in outputGpuIds. - * @param outputGpuIds OUT: a bitmask of numGpus or fewer GPUs from inputGpuIds that represent the best - * placement available from inputGpuIds. 
- * @param hintFlags IN: a bitmask of DCGM_TOPO_HINT_F_ #defines of hints that should be taken into - * account when assigning outputGpuIds. + * @param pDcgmHandle IN: DCGM Handle + * @param inputGpuIds IN: a bitmask of which GPUs DCGM should consider. If some of the GPUs on the system are + * already in use, they shouldn't be included in the bitmask. 0 means that all of the GPUs + * in the system should be considered. + * @param numGpus IN: the number of GPUs that are desired from inputGpuIds. If this number is greater than + * the number of healthy GPUs in inputGpuIds, then less than numGpus gpus will be + * specified in outputGpuIds. + * @param outputGpuIds OUT: a bitmask of numGpus or fewer GPUs from inputGpuIds that represent the best placement + * available from inputGpuIds. + * @param hintFlags IN: a bitmask of DCGM_TOPO_HINT_F_ #defines of hints that should be taken into account when + * assigning outputGpuIds. * * @return * - \ref DCGM_ST_OK if the call was successful - */ -dcgmReturn_t DECLDIR dcgmSelectGpusByTopology(dcgmHandle_t pDcgmHandle, uint64_t inputGpuIds, uint32_t numGpus, - uint64_t *outputGpuIds, uint64_t hintFlags); - -/*************************************************************************/ -/** - * Get a summary of the values for a field id over a period of time. 
* - * @param pDcgmHandle IN: DCGM Handle - * @param request IN / OUT: a pointer to the struct detailing the request and containing the response */ -dcgmReturn_t DECLDIR dcgmGetFieldSummary(dcgmHandle_t pDcgmHandle, dcgmFieldSummaryRequest_t *request); +dcgmReturn_t DECLDIR dcgmSelectGpusByTopology(dcgmHandle_t pDcgmHandle, + uint64_t inputGpuIds, + uint32_t numGpus, + uint64_t *outputGpuIds, + uint64_t hintFlags); /** @} */ // Closing for DCGMAPI_TOPOLOGY @@ -1706,15 +1843,15 @@ dcgmReturn_t DECLDIR dcgmGetFieldSummary(dcgmHandle_t pDcgmHandle, dcgmFieldSumm * You can also pass --blacklist-modules to the nv-hostengine binary to make sure modules * get blacklisted immediately after the host engine starts up. * - * @param pDcgmHandle IN: DCGM Handle - * @param moduleId IN: ID of the module to blacklist. Use \ref dcgmModuleGetStatuses to get a - * list of valid module IDs. + * @param pDcgmHandle IN: DCGM Handle + * @param moduleId IN: ID of the module to blacklist. Use \ref dcgmModuleGetStatuses to get a list of valid + * module IDs. * * @return - * - \ref DCGM_ST_OK if the module has been blacklisted. - * - \ref DCGM_ST_IN_USE if the module has already been loaded and cannot be blacklisted. - * - \ref DCGM_ST_BADPARAM if a parameter is missing or bad. - * + * - \ref DCGM_ST_OK if the module has been blacklisted. + * - \ref DCGM_ST_IN_USE if the module has already been loaded and cannot be blacklisted. + * - \ref DCGM_ST_BADPARAM if a parameter is missing or bad. + * */ dcgmReturn_t DECLDIR dcgmModuleBlacklist(dcgmHandle_t pDcgmHandle, dcgmModuleId_t moduleId); @@ -1722,13 +1859,14 @@ dcgmReturn_t DECLDIR dcgmModuleBlacklist(dcgmHandle_t pDcgmHandle, dcgmModuleId_ /** * Get the status of all of the DCGM modules. * - * @param pDcgmHandle IN: DCGM Handle - * @param moduleStatuses OUT: Module statuses. .version should be set to dcgmModuleStatuses_version upon calling. + * @param pDcgmHandle IN: DCGM Handle + * @param moduleStatuses OUT: Module statuses.
+ * .version should be set to dcgmModuleStatuses_version upon calling. * * @return - * - \ref DCGM_ST_OK if the request succeeds. - * - \ref DCGM_ST_BADPARAM if a parameter is missing or bad. - * + * - \ref DCGM_ST_OK if the request succeeds. + * - \ref DCGM_ST_BADPARAM if a parameter is missing or bad. + * */ dcgmReturn_t DECLDIR dcgmModuleGetStatuses(dcgmHandle_t pDcgmHandle, dcgmModuleGetStatuses_t *moduleStatuses); @@ -1744,78 +1882,134 @@ dcgmReturn_t DECLDIR dcgmModuleGetStatuses(dcgmHandle_t pDcgmHandle, dcgmModuleG /*************************************************************************/ /** * Get all of the profiling metric groups for a given GPU group. - * + * * Profiling metrics are watched in groups of fields that are all watched together. For instance, if you want * to watch DCGM_FI_PROF_GR_ENGINE_ACTIVITY, this might also be in the same group as DCGM_FI_PROF_SM_EFFICIENCY. * Watching this group would result in DCGM storing values for both of these metrics. - * + * * Some groups cannot be watched concurrently as others as they utilize the same hardware resource. For instance, * you may not be able to watch DCGM_FI_PROF_TENSOR_OP_UTIL at the same time as DCGM_FI_PROF_GR_ENGINE_ACTIVITY * on your hardware. At the same time, you may be able to watch DCGM_FI_PROF_TENSOR_OP_UTIL at the same time as * DCGM_FI_PROF_NVLINK_TX_DATA. - * + * * Metrics that can be watched concurrently will have different .majorId fields in their dcgmProfMetricGroupInfo_t - * + * * See \ref dcgmGroupCreate for details on creating a GPU group * See \ref dcgmProfWatchFields to actually watch a metric group * - * @param pDcgmHandle IN: DCGM Handle - * @param metricGroups IN/OUT: Metric groups supported for metricGroups->groupId. - * metricGroups->version should be set to dcgmProfGetMetricGroups_version upon calling. + * @param pDcgmHandle IN: DCGM Handle + * @param metricGroups IN/OUT: Metric groups supported for metricGroups->groupId.
+ * metricGroups->version should be set to dcgmProfGetMetricGroups_version upon calling. * * @return - * - \ref DCGM_ST_OK if the request succeeds. - * - \ref DCGM_ST_BADPARAM if a parameter is missing or bad. - * - \ref DCGM_ST_GROUP_INCOMPATIBLE if metricGroups->groupId's GPUs are not identical GPUs. - * - \ref DCGM_ST_NOT_SUPPORTED if profiling metrics are not supported for the given GPU group. - * + * - \ref DCGM_ST_OK if the request succeeds. + * - \ref DCGM_ST_BADPARAM if a parameter is missing or bad. + * - \ref DCGM_ST_GROUP_INCOMPATIBLE if metricGroups->groupId's GPUs are not identical GPUs. + * - \ref DCGM_ST_NOT_SUPPORTED if profiling metrics are not supported for the given GPU group. + * */ -dcgmReturn_t DECLDIR dcgmProfGetSupportedMetricGroups(dcgmHandle_t pDcgmHandle, dcgmProfGetMetricGroups_t *metricGroups); +dcgmReturn_t DECLDIR dcgmProfGetSupportedMetricGroups(dcgmHandle_t pDcgmHandle, + dcgmProfGetMetricGroups_t *metricGroups); /** * Request that DCGM start recording updates for a given list of profiling field IDs. - * + * * Once metrics have been watched by this API, any of the normal DCGM field-value retrieval APIs can be used on * the underlying fieldIds of this metric group. See \ref dcgmGetLatestValues_v2, \ref dcgmGetLatestValuesForFields, * \ref dcgmEntityGetLatestValues, and \ref dcgmEntitiesGetLatestValues. * - * @param pDcgmHandle IN: DCGM Handle - * @param watchFields IN: Details of which metric groups to watch for which GPUs. See \ref dcgmProfWatchFields_v1 - * for details of what should be put in each struct member. - * watchFields->version should be set to dcgmProfWatchFields_version upon calling. + * @param pDcgmHandle IN: DCGM Handle + * @param watchFields IN: Details of which metric groups to watch for which GPUs. See \ref dcgmProfWatchFields_v1 + * for details of what should be put in each struct member. watchFields->version should be + * set to dcgmProfWatchFields_version upon calling. 
* * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_BADPARAM if a parameter is invalid - * - \ref DCGM_ST_NOT_SUPPORTED if profiling metric group metricGroupTag is not supported - * for the given GPU group. - * - \ref DCGM_ST_GROUP_INCOMPATIBLE if groupId's GPUs are not identical GPUs. Profiling metrics are only - * support for homogenous groups of GPUs. - * - \ref DCGM_ST_PROFILING_MULTI_PASS if any of the metric groups could not be watched concurrently due to - * requiring the hardware to gather them with multiple passes + * - \ref DCGM_ST_OK if the call was successful + * - \ref DCGM_ST_BADPARAM if a parameter is invalid + * - \ref DCGM_ST_NOT_SUPPORTED if profiling metric group metricGroupTag is not supported for the given + * GPU group. + * - \ref DCGM_ST_GROUP_INCOMPATIBLE if groupId's GPUs are not identical GPUs. Profiling metrics are only + * supported for homogeneous groups of GPUs. + * - \ref DCGM_ST_PROFILING_MULTI_PASS if any of the metric groups could not be watched concurrently due to + * requiring the hardware to gather them with multiple passes + * */ - dcgmReturn_t dcgmProfWatchFields(dcgmHandle_t pDcgmHandle, dcgmProfWatchFields_t *watchFields); /** * Request that DCGM stop recording updates for all profiling field IDs for all GPUs * - * @param pDcgmHandle IN: DCGM Handle - * @param unwatchFields IN: Details of which metric groups to unwatch for which GPUs. See \ref dcgmProfUnwatchFields_v1 - * for details of what should be put in each struct member. - * unwatchFields->version should be set to dcgmProfUnwatchFields_version upon calling. + * @param pDcgmHandle IN: DCGM Handle + * @param unwatchFields IN: Details of which metric groups to unwatch for which GPUs. See \ref + * dcgmProfUnwatchFields_v1 for details of what should be put in each struct member. + * unwatchFields->version should be set to dcgmProfUnwatchFields_version upon calling.
* * @return * - \ref DCGM_ST_OK if the call was successful * - \ref DCGM_ST_BADPARAM if a parameter is invalid + * */ - dcgmReturn_t dcgmProfUnwatchFields(dcgmHandle_t pDcgmHandle, dcgmProfUnwatchFields_t *unwatchFields); +dcgmReturn_t dcgmProfUnwatchFields(dcgmHandle_t pDcgmHandle, dcgmProfUnwatchFields_t *unwatchFields); +/** + * Pause profiling activities in DCGM. This should be used when you are monitoring profiling fields + * from DCGM but want to be able to still run developer tools like nvprof, nsight systems, and nsight compute. + * Profiling fields start with DCGM_FI_PROF_ and are in the field ID range 1001-1012. + * + * Call this API before you launch one of those tools and dcgmProfResume() after the tool has completed. + * + * DCGM will save BLANK values while profiling is paused. + * + * Calling this while profiling activities are already paused is fine and will be treated as a no-op. + * + * @param pDcgmHandle IN: DCGM Handle + * + * @return + * - \ref DCGM_ST_OK If the call was successful. + * - \ref DCGM_ST_BADPARAM if a parameter is invalid. + * + */ +dcgmReturn_t dcgmProfPause(dcgmHandle_t pDcgmHandle); + +/** + * Resume profiling activities in DCGM that were previously paused with dcgmProfPause(). + * + * Call this API after you have completed running other NVIDIA developer tools to reenable DCGM + * profiling metrics. + * + * DCGM will save BLANK values while profiling is paused. + * + * Calling this while profiling activities have already been resumed is fine and will be treated as a no-op. + * + * @param pDcgmHandle IN: DCGM Handle + * + * @return + * - \ref DCGM_ST_OK If the call was successful. + * - \ref DCGM_ST_BADPARAM if a parameter is invalid. + * + */ +dcgmReturn_t dcgmProfResume(dcgmHandle_t pDcgmHandle); /** @} */ // Closing for DCGMAPI_PROFILING -#ifdef __cplusplus +/** + * Adds fake GPU instances and/or compute instances for testing purposes. 
The entity IDs specified for + * the GPU instances and compute instances are only guaranteed to be used by DCGM if MIG mode is not active. + * + * NOTE: this API will not work on a real system reading actual values from NVML, and it may even cause + * the real instances to malfunction. This API is for testing purposes only. + * + * @param pDcgmHandle IN: DCGM Handle + * @param hierarchy + * + * @return + * - \ref DCGM_ST_OK + * + */ +dcgmReturn_t dcgmAddFakeInstances(dcgmHandle_t pDcgmHandle, dcgmMigHierarchy_v1 *hierarchy); + +#ifdef __cplusplus } #endif -#endif /* DCGM_AGENT_H */ +#endif /* DCGM_AGENT_H */ diff --git a/bindings/go/dcgm/dcgm_errors.h b/bindings/go/dcgm/dcgm_errors.h index fbdfd5d..619bdb7 100644 --- a/bindings/go/dcgm/dcgm_errors.h +++ b/bindings/go/dcgm/dcgm_errors.h @@ -7,100 +7,104 @@ */ typedef enum dcgmError_enum { - DCGM_FR_OK = 0, //!< No error - DCGM_FR_UNKNOWN = 1, //!< Unknown error code - DCGM_FR_UNRECOGNIZED = 2, //!< Unrecognized error code - DCGM_FR_PCI_REPLAY_RATE = 3, //!< Unacceptable rate of PCI errors - DCGM_FR_VOLATILE_DBE_DETECTED = 4, //!< Uncorrectable volatile double bit error - DCGM_FR_VOLATILE_SBE_DETECTED = 5, //!< Unacceptable rate of volatile single bit errors - DCGM_FR_PENDING_PAGE_RETIREMENTS = 6, //!< Pending page retirements detected - DCGM_FR_RETIRED_PAGES_LIMIT = 7, //!< Unacceptable total page retirements detected - DCGM_FR_RETIRED_PAGES_DBE_LIMIT = 8, //!< Unacceptable total page retirements due to uncorrectable errors - DCGM_FR_CORRUPT_INFOROM = 9, //!< Corrupt inforom found - DCGM_FR_CLOCK_THROTTLE_THERMAL = 10, //!< Clocks being throttled due to overheating - DCGM_FR_POWER_UNREADABLE = 11, //!< Cannot get a reading for power from NVML - DCGM_FR_CLOCK_THROTTLE_POWER = 12, //!< Clock being throttled due to power restrictions - DCGM_FR_NVLINK_ERROR_THRESHOLD = 13, //!< Unacceptable rate of NVLink errors - DCGM_FR_NVLINK_DOWN = 14, //!< NVLink is down - DCGM_FR_NVSWITCH_FATAL_ERROR = 15, //!< Fatal errors on the 
NVSwitch - DCGM_FR_NVSWITCH_NON_FATAL_ERROR = 16, //!< Non-fatal errors on the NVSwitch - DCGM_FR_NVSWITCH_DOWN = 17, //!< NVSwitch is down - DCGM_FR_NO_ACCESS_TO_FILE = 18, //!< Cannot access a file - DCGM_FR_NVML_API = 19, //!< Error occurred on an NVML API - DCGM_FR_DEVICE_COUNT_MISMATCH = 20, //!< Disagreement in GPU count between /dev and NVML - DCGM_FR_BAD_PARAMETER = 21, //!< Bad parameter passed to API - DCGM_FR_CANNOT_OPEN_LIB = 22, //!< Cannot open a library that must be accessed - DCGM_FR_BLACKLISTED_DRIVER = 23, //!< A blacklisted driver (nouveau) is active - DCGM_FR_NVML_LIB_BAD = 24, //!< The NVML library is missing expected functions - DCGM_FR_GRAPHICS_PROCESSES = 25, //!< Graphics processes are active on this GPU - DCGM_FR_HOSTENGINE_CONN = 26, //!< Unstable connection to nv-hostengine (daemonized DCGM) - DCGM_FR_FIELD_QUERY = 27, //!< Error querying a field from DCGM - DCGM_FR_BAD_CUDA_ENV = 28, //!< The environment has variables that hurt CUDA - DCGM_FR_PERSISTENCE_MODE = 29, //!< Persistence mode is disabled - DCGM_FR_LOW_BANDWIDTH = 30, //!< The bandwidth is unacceptably low - DCGM_FR_HIGH_LATENCY = 31, //!< Latency is too high - DCGM_FR_CANNOT_GET_FIELD_TAG = 32, //!< Cannot find a tag for a field - DCGM_FR_FIELD_VIOLATION = 33, //!< The value for the specified error field is above 0 - DCGM_FR_FIELD_THRESHOLD = 34, //!< The value for the specified field is above the threshold - DCGM_FR_FIELD_VIOLATION_DBL = 35, //!< The value for the specified error field is above 0 - DCGM_FR_FIELD_THRESHOLD_DBL = 36, //!< The value for the specified field is above the threshold - DCGM_FR_UNSUPPORTED_FIELD_TYPE = 37, //!< Field type cannot be supported - DCGM_FR_FIELD_THRESHOLD_TS = 38, //!< The value for the specified field is above the threshold - DCGM_FR_FIELD_THRESHOLD_TS_DBL = 39, //!< The value for the specified field is above the threshold - DCGM_FR_THERMAL_VIOLATIONS = 40, //!< Thermal violations detected - DCGM_FR_THERMAL_VIOLATIONS_TS = 41, //!< 
Thermal violations detected with a timestamp - DCGM_FR_TEMP_VIOLATION = 42, //!< Temperature is too high - DCGM_FR_THROTTLING_VIOLATION = 43, //!< Non-benign clock throttling is occurring - DCGM_FR_INTERNAL = 44, //!< An internal error was detected - DCGM_FR_PCIE_GENERATION = 45, //!< PCIe generation is too low - DCGM_FR_PCIE_WIDTH = 46, //!< PCIe width is too low - DCGM_FR_ABORTED = 47, //!< Test was aborted by a user signal - DCGM_FR_TEST_DISABLED = 48, //!< This test is disabled for this GPU - DCGM_FR_CANNOT_GET_STAT = 49, //!< Cannot get telemetry for a needed value - DCGM_FR_STRESS_LEVEL = 50, //!< Stress level is too low (bad performance) - DCGM_FR_CUDA_API = 51, //!< Error calling the specified CUDA API - DCGM_FR_FAULTY_MEMORY = 52, //!< Faulty memory detected on this GPU - DCGM_FR_CANNOT_SET_WATCHES = 53, //!< Unable to set field watches in DCGM - DCGM_FR_CUDA_UNBOUND = 54, //!< CUDA context is no longer bound - DCGM_FR_ECC_DISABLED = 55, //!< ECC memory is disabled right now - DCGM_FR_MEMORY_ALLOC = 56, //!< Cannot allocate memory on the GPU - DCGM_FR_CUDA_DBE = 57, //!< CUDA detected unrecovable double-bit error - DCGM_FR_MEMORY_MISMATCH = 58, //!< Memory error detected - DCGM_FR_CUDA_DEVICE = 59, //!< No CUDA device discoverable for existing GPU - DCGM_FR_ECC_UNSUPPORTED = 60, //!< ECC memory is unsupported by this SKU - DCGM_FR_ECC_PENDING = 61, //!< ECC memory is in a pending state - DCGM_FR_MEMORY_BANDWIDTH = 62, //!< Memory bandwidth is too low - DCGM_FR_TARGET_POWER = 63, //!< Cannot hit the target power draw - DCGM_FR_API_FAIL = 64, //!< The specified API call failed - DCGM_FR_API_FAIL_GPU = 65, //!< The specified API call failed for the specified GPU - DCGM_FR_CUDA_CONTEXT = 66, //!< Cannot create a CUDA context on this GPU - DCGM_FR_DCGM_API = 67, //!< DCGM API failure - DCGM_FR_CONCURRENT_GPUS = 68, //!< Need multiple GPUs to run this test - DCGM_FR_TOO_MANY_ERRORS = 69, //!< More errors than fit in the return struct - 
DCGM_FR_NVLINK_CRC_ERROR_THRESHOLD = 70, //!< More than 100 CRC errors are happening per second - DCGM_FR_NVLINK_ERROR_CRITICAL = 71, //!< NVLink error for a field that should always be 0 - DCGM_FR_ENFORCED_POWER_LIMIT = 72, //!< The enforced power limit is too low to hit the target - DCGM_FR_MEMORY_ALLOC_HOST = 73, //!< Cannot allocate memory on the host - DCGM_FR_GPU_OP_MODE = 74, //!< Bad GPU operating mode for running plugin - DCGM_FR_NO_MEMORY_CLOCKS = 75, //!< No memory clocks with the needed MHz were found - DCGM_FR_NO_GRAPHICS_CLOCKS = 76, //!< No graphics clocks with the needed MHz were found - DCGM_FR_HAD_TO_RESTORE_STATE = 77, //!< Note that we had to restore a GPU's state - DCGM_FR_ERROR_SENTINEL = 78, //!< MUST BE THE LAST ERROR CODE + DCGM_FR_OK = 0, //!< No error + DCGM_FR_UNKNOWN = 1, //!< Unknown error code + DCGM_FR_UNRECOGNIZED = 2, //!< Unrecognized error code + DCGM_FR_PCI_REPLAY_RATE = 3, //!< Unacceptable rate of PCI errors + DCGM_FR_VOLATILE_DBE_DETECTED = 4, //!< Uncorrectable volatile double bit error + DCGM_FR_VOLATILE_SBE_DETECTED = 5, //!< Unacceptable rate of volatile single bit errors + DCGM_FR_PENDING_PAGE_RETIREMENTS = 6, //!< Pending page retirements detected + DCGM_FR_RETIRED_PAGES_LIMIT = 7, //!< Unacceptable total page retirements detected + DCGM_FR_RETIRED_PAGES_DBE_LIMIT = 8, //!< Unacceptable total page retirements due to uncorrectable errors + DCGM_FR_CORRUPT_INFOROM = 9, //!< Corrupt inforom found + DCGM_FR_CLOCK_THROTTLE_THERMAL = 10, //!< Clocks being throttled due to overheating + DCGM_FR_POWER_UNREADABLE = 11, //!< Cannot get a reading for power from NVML + DCGM_FR_CLOCK_THROTTLE_POWER = 12, //!< Clock being throttled due to power restrictions + DCGM_FR_NVLINK_ERROR_THRESHOLD = 13, //!< Unacceptable rate of NVLink errors + DCGM_FR_NVLINK_DOWN = 14, //!< NVLink is down + DCGM_FR_NVSWITCH_FATAL_ERROR = 15, //!< Fatal errors on the NVSwitch + DCGM_FR_NVSWITCH_NON_FATAL_ERROR = 16, //!< Non-fatal errors on the NVSwitch + 
DCGM_FR_NVSWITCH_DOWN = 17, //!< NVSwitch is down + DCGM_FR_NO_ACCESS_TO_FILE = 18, //!< Cannot access a file + DCGM_FR_NVML_API = 19, //!< Error occurred on an NVML API + DCGM_FR_DEVICE_COUNT_MISMATCH = 20, //!< Disagreement in GPU count between /dev and NVML + DCGM_FR_BAD_PARAMETER = 21, //!< Bad parameter passed to API + DCGM_FR_CANNOT_OPEN_LIB = 22, //!< Cannot open a library that must be accessed + DCGM_FR_BLACKLISTED_DRIVER = 23, //!< A blacklisted driver (nouveau) is active + DCGM_FR_NVML_LIB_BAD = 24, //!< The NVML library is missing expected functions + DCGM_FR_GRAPHICS_PROCESSES = 25, //!< Graphics processes are active on this GPU + DCGM_FR_HOSTENGINE_CONN = 26, //!< Unstable connection to nv-hostengine (daemonized DCGM) + DCGM_FR_FIELD_QUERY = 27, //!< Error querying a field from DCGM + DCGM_FR_BAD_CUDA_ENV = 28, //!< The environment has variables that hurt CUDA + DCGM_FR_PERSISTENCE_MODE = 29, //!< Persistence mode is disabled + DCGM_FR_LOW_BANDWIDTH = 30, //!< The bandwidth is unacceptably low + DCGM_FR_HIGH_LATENCY = 31, //!< Latency is too high + DCGM_FR_CANNOT_GET_FIELD_TAG = 32, //!< Cannot find a tag for a field + DCGM_FR_FIELD_VIOLATION = 33, //!< The value for the specified error field is above 0 + DCGM_FR_FIELD_THRESHOLD = 34, //!< The value for the specified field is above the threshold + DCGM_FR_FIELD_VIOLATION_DBL = 35, //!< The value for the specified error field is above 0 + DCGM_FR_FIELD_THRESHOLD_DBL = 36, //!< The value for the specified field is above the threshold + DCGM_FR_UNSUPPORTED_FIELD_TYPE = 37, //!< Field type cannot be supported + DCGM_FR_FIELD_THRESHOLD_TS = 38, //!< The value for the specified field is above the threshold + DCGM_FR_FIELD_THRESHOLD_TS_DBL = 39, //!< The value for the specified field is above the threshold + DCGM_FR_THERMAL_VIOLATIONS = 40, //!< Thermal violations detected + DCGM_FR_THERMAL_VIOLATIONS_TS = 41, //!< Thermal violations detected with a timestamp + DCGM_FR_TEMP_VIOLATION = 42, //!< Temperature is 
too high + DCGM_FR_THROTTLING_VIOLATION = 43, //!< Non-benign clock throttling is occurring + DCGM_FR_INTERNAL = 44, //!< An internal error was detected + DCGM_FR_PCIE_GENERATION = 45, //!< PCIe generation is too low + DCGM_FR_PCIE_WIDTH = 46, //!< PCIe width is too low + DCGM_FR_ABORTED = 47, //!< Test was aborted by a user signal + DCGM_FR_TEST_DISABLED = 48, //!< This test is disabled for this GPU + DCGM_FR_CANNOT_GET_STAT = 49, //!< Cannot get telemetry for a needed value + DCGM_FR_STRESS_LEVEL = 50, //!< Stress level is too low (bad performance) + DCGM_FR_CUDA_API = 51, //!< Error calling the specified CUDA API + DCGM_FR_FAULTY_MEMORY = 52, //!< Faulty memory detected on this GPU + DCGM_FR_CANNOT_SET_WATCHES = 53, //!< Unable to set field watches in DCGM + DCGM_FR_CUDA_UNBOUND = 54, //!< CUDA context is no longer bound + DCGM_FR_ECC_DISABLED = 55, //!< ECC memory is disabled right now + DCGM_FR_MEMORY_ALLOC = 56, //!< Cannot allocate memory on the GPU + DCGM_FR_CUDA_DBE = 57, //!< CUDA detected unrecovable double-bit error + DCGM_FR_MEMORY_MISMATCH = 58, //!< Memory error detected + DCGM_FR_CUDA_DEVICE = 59, //!< No CUDA device discoverable for existing GPU + DCGM_FR_ECC_UNSUPPORTED = 60, //!< ECC memory is unsupported by this SKU + DCGM_FR_ECC_PENDING = 61, //!< ECC memory is in a pending state + DCGM_FR_MEMORY_BANDWIDTH = 62, //!< Memory bandwidth is too low + DCGM_FR_TARGET_POWER = 63, //!< Cannot hit the target power draw + DCGM_FR_API_FAIL = 64, //!< The specified API call failed + DCGM_FR_API_FAIL_GPU = 65, //!< The specified API call failed for the specified GPU + DCGM_FR_CUDA_CONTEXT = 66, //!< Cannot create a CUDA context on this GPU + DCGM_FR_DCGM_API = 67, //!< DCGM API failure + DCGM_FR_CONCURRENT_GPUS = 68, //!< Need multiple GPUs to run this test + DCGM_FR_TOO_MANY_ERRORS = 69, //!< More errors than fit in the return struct + DCGM_FR_NVLINK_CRC_ERROR_THRESHOLD = 70, //!< More than 100 CRC errors are happening per second + 
DCGM_FR_NVLINK_ERROR_CRITICAL = 71, //!< NVLink error for a field that should always be 0 + DCGM_FR_ENFORCED_POWER_LIMIT = 72, //!< The enforced power limit is too low to hit the target + DCGM_FR_MEMORY_ALLOC_HOST = 73, //!< Cannot allocate memory on the host + DCGM_FR_GPU_OP_MODE = 74, //!< Bad GPU operating mode for running plugin + DCGM_FR_NO_MEMORY_CLOCKS = 75, //!< No memory clocks with the needed MHz were found + DCGM_FR_NO_GRAPHICS_CLOCKS = 76, //!< No graphics clocks with the needed MHz were found + DCGM_FR_HAD_TO_RESTORE_STATE = 77, //!< Note that we had to restore a GPU's state + DCGM_FR_L1TAG_UNSUPPORTED = 78, //!< L1TAG test is unsupported by this SKU + DCGM_FR_L1TAG_MISCOMPARE = 79, //!< L1TAG test failed on a miscompare + DCGM_FR_ROW_REMAP_FAILURE = 80, //!< Row remapping failed (Ampere or newer GPUs) + DCGM_FR_UNCONTAINED_ERROR = 81, //!< Uncontained error - XID 95 + DCGM_FR_ERROR_SENTINEL = 82, //!< MUST BE THE LAST ERROR CODE } dcgmError_t; typedef enum dcgmErrorSeverity_enum { - DCGM_ERROR_MONITOR = 0, // Can perform workload, but needs to be monitored. - DCGM_ERROR_ISOLATE = 1, // Cannot perform workload. GPU should be isolated. - DCGM_ERROR_UNKNOWN = 2, // This error code is not recognized + DCGM_ERROR_MONITOR = 0, //!< Can perform workload, but needs to be monitored. + DCGM_ERROR_ISOLATE = 1, //!< Cannot perform workload. GPU should be isolated. + DCGM_ERROR_UNKNOWN = 2, //!< This error code is not recognized } dcgmErrorSeverity_t; typedef struct { - dcgmError_t errorId; - const char *msgFormat; - const char *suggestion; - int severity; + dcgmError_t errorId; + const char *msgFormat; + const char *suggestion; + int severity; } dcgm_error_meta_t; extern dcgm_error_meta_t dcgmErrorMeta[]; @@ -108,286 +112,336 @@ extern dcgm_error_meta_t dcgmErrorMeta[]; /* Standard message for running a field diagnostic */ #define TRIAGE_RUN_FIELD_DIAG_MSG "Run a field diagnostic on the GPU." 
-#define DEBUG_COOLING_MSG "Verify that the cooling on this machine is functional, including external, "\ - "thermal material interface, fans, and any other components." +#define DEBUG_COOLING_MSG \ + "Verify that the cooling on this machine is functional, including external, " \ + "thermal material interface, fans, and any other components." /* * Messages for the error codes. All messages must be defined in the ERROR_CODE_MSG format * where is the actual message. */ -#define DCGM_FR_OK_MSG "The operation completed successfully." -#define DCGM_FR_UNKNOWN_MSG "Unknown error." -#define DCGM_FR_UNRECOGNIZED_MSG "Unrecognized error code." +#define DCGM_FR_OK_MSG "The operation completed successfully." +#define DCGM_FR_UNKNOWN_MSG "Unknown error." +#define DCGM_FR_UNRECOGNIZED_MSG "Unrecognized error code." // replay limit, gpu id, replay errors detected -#define DCGM_FR_PCI_REPLAY_RATE_MSG "Detected more than %u PCIe replays per minute for GPU %u : %d" +#define DCGM_FR_PCI_REPLAY_RATE_MSG "Detected more than %u PCIe replays per minute for GPU %u : %d" // dbes deteced, gpu id -#define DCGM_FR_VOLATILE_DBE_DETECTED_MSG "Detected %d volatile double-bit ECC error(s) in GPU %u." +#define DCGM_FR_VOLATILE_DBE_DETECTED_MSG "Detected %d volatile double-bit ECC error(s) in GPU %u." // sbe limit, gpu id, sbes detected -#define DCGM_FR_VOLATILE_SBE_DETECTED_MSG "More than %u single-bit ECC error(s) detected in GPU %u Volatile SBEs: %lld" +#define DCGM_FR_VOLATILE_SBE_DETECTED_MSG "More than %u single-bit ECC error(s) detected in GPU %u Volatile SBEs: %lld" // gpu id -#define DCGM_FR_PENDING_PAGE_RETIREMENTS_MSG "A pending retired page has been detected in GPU %u." +#define DCGM_FR_PENDING_PAGE_RETIREMENTS_MSG "A pending retired page has been detected in GPU %u." // retired pages detected, gpud id -#define DCGM_FR_RETIRED_PAGES_LIMIT_MSG "%u or more retired pages have been detected in GPU %u. 
" +#define DCGM_FR_RETIRED_PAGES_LIMIT_MSG "%u or more retired pages have been detected in GPU %u. " // retired pages due to dbes detected, gpu id -#define DCGM_FR_RETIRED_PAGES_DBE_LIMIT_MSG "An excess of %u retired pages due to DBEs have been detected and" \ - " more than one page has been retired due to DBEs in the past" \ - " week in GPU %u." +#define DCGM_FR_RETIRED_PAGES_DBE_LIMIT_MSG \ + "An excess of %u retired pages due to DBEs have been detected and" \ + " more than one page has been retired due to DBEs in the past" \ + " week in GPU %u." // gpu id -#define DCGM_FR_CORRUPT_INFOROM_MSG "A corrupt InfoROM has been detected in GPU %u." +#define DCGM_FR_CORRUPT_INFOROM_MSG "A corrupt InfoROM has been detected in GPU %u." // gpu id -#define DCGM_FR_CLOCK_THROTTLE_THERMAL_MSG "Detected clock throttling due to thermal violation in GPU %u." +#define DCGM_FR_CLOCK_THROTTLE_THERMAL_MSG "Detected clock throttling due to thermal violation in GPU %u." // gpu id -#define DCGM_FR_POWER_UNREADABLE_MSG "Cannot reliably read the power usage for GPU %u." +#define DCGM_FR_POWER_UNREADABLE_MSG "Cannot reliably read the power usage for GPU %u." // gpu id -#define DCGM_FR_CLOCK_THROTTLE_POWER_MSG "Detected clock throttling due to power violation in GPU %u." +#define DCGM_FR_CLOCK_THROTTLE_POWER_MSG "Detected clock throttling due to power violation in GPU %u." 
// nvlink errors detected, nvlink id, error threshold -#define DCGM_FR_NVLINK_ERROR_THRESHOLD_MSG "Detected %ld %s NvLink errors on GPU %u's NVLink which exceeds "\ - "threshold of %u" +#define DCGM_FR_NVLINK_ERROR_THRESHOLD_MSG \ + "Detected %ld %s NvLink errors on GPU %u's NVLink which exceeds " \ + "threshold of %u" // gpu id, nvlink id -#define DCGM_FR_NVLINK_DOWN_MSG "GPU %u's NvLink link %d is currently down" +#define DCGM_FR_NVLINK_DOWN_MSG "GPU %u's NvLink link %d is currently down" // nvswitch id, nvlink id -#define DCGM_FR_NVSWITCH_FATAL_ERROR_MSG "Detected fatal errors on NvSwitch %u link %u" +#define DCGM_FR_NVSWITCH_FATAL_ERROR_MSG "Detected fatal errors on NvSwitch %u link %u" // nvswitch id, nvlink id -#define DCGM_FR_NVSWITCH_NON_FATAL_ERROR_MSG "Detected nonfatal errors on NvSwitch %u link %u" +#define DCGM_FR_NVSWITCH_NON_FATAL_ERROR_MSG "Detected nonfatal errors on NvSwitch %u link %u" // nvswitch id, nvlink port -#define DCGM_FR_NVSWITCH_DOWN_MSG "NvSwitch physical ID %u's NvLink port %d is currently down." +#define DCGM_FR_NVSWITCH_DOWN_MSG "NvSwitch physical ID %u's NvLink port %d is currently down." // file path, error detail -#define DCGM_FR_NO_ACCESS_TO_FILE_MSG "File %s could not be accessed directly: %s" +#define DCGM_FR_NO_ACCESS_TO_FILE_MSG "File %s could not be accessed directly: %s" // purpose for communicating with NVML, NVML error as string, NVML error -#define DCGM_FR_NVML_API_MSG "Error calling NVML API %s: %s" -#define DCGM_FR_DEVICE_COUNT_MISMATCH_MSG "The number of devices NVML returns is different than the number "\ - "of devices in /dev." +#define DCGM_FR_NVML_API_MSG "Error calling NVML API %s: %s" +#define DCGM_FR_DEVICE_COUNT_MISMATCH_MSG \ + "The number of devices NVML returns is different than the number " \ + "of devices in /dev." 
// function name -#define DCGM_FR_BAD_PARAMETER_MSG "Bad parameter to function %s cannot be processed" +#define DCGM_FR_BAD_PARAMETER_MSG "Bad parameter to function %s cannot be processed" // library name, error returned from dlopen -#define DCGM_FR_CANNOT_OPEN_LIB_MSG "Cannot open library %s: '%s'" +#define DCGM_FR_CANNOT_OPEN_LIB_MSG "Cannot open library %s: '%s'" // the name of the blacklisted driver -#define DCGM_FR_BLACKLISTED_DRIVER_MSG "Found blacklisted driver: %s" +#define DCGM_FR_BLACKLISTED_DRIVER_MSG "Found blacklisted driver: %s" // the name of the function that wasn't found -#define DCGM_FR_NVML_LIB_BAD_MSG "Cannot get pointer to %s from libnvidia-ml.so" -#define DCGM_FR_GRAPHICS_PROCESSES_MSG "NVVS has detected graphics processes running on at least one "\ - "GPU. This may cause some tests to fail." +#define DCGM_FR_NVML_LIB_BAD_MSG "Cannot get pointer to %s from libnvidia-ml.so" +#define DCGM_FR_GRAPHICS_PROCESSES_MSG \ + "NVVS has detected graphics processes running on at least one " \ + "GPU. This may cause some tests to fail." // error message from the API call -#define DCGM_FR_HOSTENGINE_CONN_MSG "Could not connect to the host engine: '%s'" +#define DCGM_FR_HOSTENGINE_CONN_MSG "Could not connect to the host engine: '%s'" // field name, gpu id -#define DCGM_FR_FIELD_QUERY_MSG "Could not query field %s for GPU %u" +#define DCGM_FR_FIELD_QUERY_MSG "Could not query field %s for GPU %u" // environment variable name -#define DCGM_FR_BAD_CUDA_ENV_MSG "Found CUDA performance-limiting environment variable '%s'." +#define DCGM_FR_BAD_CUDA_ENV_MSG "Found CUDA performance-limiting environment variable '%s'." // gpu id -#define DCGM_FR_PERSISTENCE_MODE_MSG "Persistence mode for GPU %u is currently disabled. The DCGM "\ - "diagnostic requires peristence mode to be enabled." +#define DCGM_FR_PERSISTENCE_MODE_MSG \ + "Persistence mode for GPU %u is currently disabled. The DCGM " \ + "diagnostic requires peristence mode to be enabled." 
// gpu id, direction (d2h, e.g.), measured bandwidth, expected bandwidth -#define DCGM_FR_LOW_BANDWIDTH_MSG "Bandwidth of GPU %u in direction %s of %.2f did not exceed "\ - "minimum required bandwidth of %.2f." +#define DCGM_FR_LOW_BANDWIDTH_MSG \ + "Bandwidth of GPU %u in direction %s of %.2f did not exceed " \ + "minimum required bandwidth of %.2f." // gpu id, direction (d2h, e.g.), measured latency, expected latency -#define DCGM_FR_HIGH_LATENCY_MSG "Latency type %s of GPU %u value %.2f exceeded maximum allowed "\ - "latency of %.2f." +#define DCGM_FR_HIGH_LATENCY_MSG \ + "Latency type %s of GPU %u value %.2f exceeded maximum allowed " \ + "latency of %.2f." // field id -#define DCGM_FR_CANNOT_GET_FIELD_TAG_MSG "Unable to get field information for field id %hu" +#define DCGM_FR_CANNOT_GET_FIELD_TAG_MSG "Unable to get field information for field id %hu" // field value, field name, gpu id (this message is for fields that should always have a 0 value) -#define DCGM_FR_FIELD_VIOLATION_MSG "Detected %ld %s for GPU %u" +#define DCGM_FR_FIELD_VIOLATION_MSG "Detected %ld %s for GPU %u" // field value, field name, gpu id, allowable threshold -#define DCGM_FR_FIELD_THRESHOLD_MSG "Detected %ld %s for GPU %u which is above the threshold %ld" +#define DCGM_FR_FIELD_THRESHOLD_MSG "Detected %ld %s for GPU %u which is above the threshold %ld" // field value, field name, gpu id (same as DCGM_FR_FIELD_VIOLATION, but it's a double) -#define DCGM_FR_FIELD_VIOLATION_DBL_MSG "Detected %.1f %s for GPU %u" +#define DCGM_FR_FIELD_VIOLATION_DBL_MSG "Detected %.1f %s for GPU %u" // field value, field name, gpu id, allowable threshold (same as DCGM_FR_FIELD_THRESHOLD, but it's a double) -#define DCGM_FR_FIELD_THRESHOLD_DBL_MSG "Detected %.1f %s for GPU %u which is above the threshold %.1f" +#define DCGM_FR_FIELD_THRESHOLD_DBL_MSG "Detected %.1f %s for GPU %u which is above the threshold %.1f" // field name -#define DCGM_FR_UNSUPPORTED_FIELD_TYPE_MSG "Field %s is not supported by this API 
because it is neither an "\ - "int64 nor a double type." +#define DCGM_FR_UNSUPPORTED_FIELD_TYPE_MSG \ + "Field %s is not supported by this API because it is neither an " \ + "int64 nor a double type." // field name, allowable threshold, observed value, seconds -#define DCGM_FR_FIELD_THRESHOLD_TS_MSG "%s met or exceeded the threshold of %lu per second: %lu at "\ - "%.1f seconds into the test." +#define DCGM_FR_FIELD_THRESHOLD_TS_MSG \ + "%s met or exceeded the threshold of %lu per second: %lu at " \ + "%.1f seconds into the test." // field name, allowable threshold, observed value, seconds (same as DCGM_FR_FIELD_THRESHOLD, but it's a double) -#define DCGM_FR_FIELD_THRESHOLD_TS_DBL_MSG "%s met or exceeded the threshold of %.1f per second: %.1f at "\ - "%.1f seconds into the test." +#define DCGM_FR_FIELD_THRESHOLD_TS_DBL_MSG \ + "%s met or exceeded the threshold of %.1f per second: %.1f at " \ + "%.1f seconds into the test." // total seconds of violation, gpu id -#define DCGM_FR_THERMAL_VIOLATIONS_MSG "There were thermal violations totaling %lu seconds for GPU %u" +#define DCGM_FR_THERMAL_VIOLATIONS_MSG "There were thermal violations totaling %lu seconds for GPU %u" // total seconds of violations, first instance, gpu id -#define DCGM_FR_THERMAL_VIOLATIONS_TS_MSG "Thermal violations totaling %lu samples started at %.1f seconds "\ - "into the test for GPU %u" +#define DCGM_FR_THERMAL_VIOLATIONS_TS_MSG \ + "Thermal violations totaling %lu samples started at %.1f seconds " \ + "into the test for GPU %u" // observed temperature, gpu id, max allowed temperature -#define DCGM_FR_TEMP_VIOLATION_MSG "Temperature %lld of GPU %u exceeded user-specified maximum "\ - "allowed temperature %lld" +#define DCGM_FR_TEMP_VIOLATION_MSG \ + "Temperature %lld of GPU %u exceeded user-specified maximum " \ + "allowed temperature %lld" // gpu id, seconds into test, details about throttling -#define DCGM_FR_THROTTLING_VIOLATION_MSG "Clocks are being throttled for GPU %u because of clock "\ - 
"throttling starting %.1f seconds into the test. %s" +#define DCGM_FR_THROTTLING_VIOLATION_MSG \ + "Clocks are being throttled for GPU %u because of clock " \ + "throttling starting %.1f seconds into the test. %s" // details about error -#define DCGM_FR_INTERNAL_MSG "There was an internal error during the test: '%s'" +#define DCGM_FR_INTERNAL_MSG "There was an internal error during the test: '%s'" // gpu id, PCIe generation, minimum allowed, parameter to control -#define DCGM_FR_PCIE_GENERATION_MSG "GPU %u is running at PCI link generation %d, which is below "\ - "the minimum allowed link generation of %d (parameter '%s')" +#define DCGM_FR_PCIE_GENERATION_MSG \ + "GPU %u is running at PCI link generation %d, which is below " \ + "the minimum allowed link generation of %d (parameter '%s')" // gpu id, PCIe width, minimum allowed, parameter to control -#define DCGM_FR_PCIE_WIDTH_MSG "GPU %u is running at PCI link width %dX, which is below the "\ - "minimum allowed link generation of %d (parameter '%s')" -#define DCGM_FR_ABORTED_MSG "Test was aborted early due to user signal" +#define DCGM_FR_PCIE_WIDTH_MSG \ + "GPU %u is running at PCI link width %dX, which is below the " \ + "minimum allowed link generation of %d (parameter '%s')" +#define DCGM_FR_ABORTED_MSG "Test was aborted early due to user signal" // Test name -#define DCGM_FR_TEST_DISABLED_MSG "The %s test is skipped for this GPU." +#define DCGM_FR_TEST_DISABLED_MSG "The %s test is skipped for this GPU." 
// stat name, gpu id -#define DCGM_FR_CANNOT_GET_STAT_MSG "Unable to generate / collect stat %s for GPU %u" +#define DCGM_FR_CANNOT_GET_STAT_MSG "Unable to generate / collect stat %s for GPU %u" // observed value, minimum allowed, gpu id -#define DCGM_FR_STRESS_LEVEL_MSG "Max stress level of %.1f did not reach desired stress level of "\ - "%.1f for GPU %u" +#define DCGM_FR_STRESS_LEVEL_MSG \ + "Max stress level of %.1f did not reach desired stress level of " \ + "%.1f for GPU %u" // CUDA API name -#define DCGM_FR_CUDA_API_MSG "Error using CUDA API %s" +#define DCGM_FR_CUDA_API_MSG "Error using CUDA API %s" // count, gpu id -#define DCGM_FR_FAULTY_MEMORY_MSG "Found %d faulty memory elements on GPU %u" +#define DCGM_FR_FAULTY_MEMORY_MSG "Found %d faulty memory elements on GPU %u" // error detail -#define DCGM_FR_CANNOT_SET_WATCHES_MSG "Unable to add field watches to DCGM: %s" +#define DCGM_FR_CANNOT_SET_WATCHES_MSG "Unable to add field watches to DCGM: %s" // gpu id -#define DCGM_FR_CUDA_UNBOUND_MSG "Cuda GPU %d is no longer bound to a CUDA context...Aborting" +#define DCGM_FR_CUDA_UNBOUND_MSG "Cuda GPU %d is no longer bound to a CUDA context...Aborting" // Test name, gpu id -#define DCGM_FR_ECC_DISABLED_MSG "Skipping test %s because ECC is not enabled on GPU %u" +#define DCGM_FR_ECC_DISABLED_MSG "Skipping test %s because ECC is not enabled on GPU %u" // percentage of memory we tried to allocate, gpu id -#define DCGM_FR_MEMORY_ALLOC_MSG "Couldn't allocate at least %.1f%% of GPU memory on GPU %u" +#define DCGM_FR_MEMORY_ALLOC_MSG "Couldn't allocate at least %.1f%% of GPU memory on GPU %u" // gpu id -#define DCGM_FR_CUDA_DBE_MSG "CUDA APIs have indicated that a double-bit ECC error has "\ - "occured on GPU %u." +#define DCGM_FR_CUDA_DBE_MSG \ + "CUDA APIs have indicated that a double-bit ECC error has " \ + "occured on GPU %u." // gpu id -#define DCGM_FR_MEMORY_MISMATCH_MSG "A memory mismatch was detected on GPU %u, but no error was "\ - "reported by CUDA or NVML." 
+#define DCGM_FR_MEMORY_MISMATCH_MSG \ + "A memory mismatch was detected on GPU %u, but no error was " \ + "reported by CUDA or NVML." // gpu id, error detail -#define DCGM_FR_CUDA_DEVICE_MSG "Unable to find a corresponding CUDA device for GPU %u: '%s'" -#define DCGM_FR_ECC_UNSUPPORTED_MSG "This card does not support ECC Memory. Skipping test." +#define DCGM_FR_CUDA_DEVICE_MSG "Unable to find a corresponding CUDA device for GPU %u: '%s'" +#define DCGM_FR_ECC_UNSUPPORTED_MSG "ECC Memory is not turned on or is unsupported. Skipping test." // gpu id -#define DCGM_FR_ECC_PENDING_MSG "ECC memory for GPU %u is in a pending state." +#define DCGM_FR_ECC_PENDING_MSG "ECC memory for GPU %u is in a pending state." // gpu id, observed bandwidth, required, test name -#define DCGM_FR_MEMORY_BANDWIDTH_MSG "GPU %u only achieved a memory bandwidth of %.2f GB/s, failing "\ - "to meet %.2f GB/s for test %d" +#define DCGM_FR_MEMORY_BANDWIDTH_MSG \ + "GPU %u only achieved a memory bandwidth of %.2f GB/s, failing " \ + "to meet %.2f GB/s for test %d" // power draw observed, field tag, minimum power draw required, gpu id -#define DCGM_FR_TARGET_POWER_MSG "Max power of %.1f did not reach desired power minimum %s of "\ - "%.1f for GPU %u" +#define DCGM_FR_TARGET_POWER_MSG \ + "Max power of %.1f did not reach desired power minimum %s of " \ + "%.1f for GPU %u" // API name, error detail -#define DCGM_FR_API_FAIL_MSG "API call %s failed: '%s'" +#define DCGM_FR_API_FAIL_MSG "API call %s failed: '%s'" // API name, gpu id, error detail -#define DCGM_FR_API_FAIL_GPU_MSG "API call %s failed for GPU %u: '%s'" +#define DCGM_FR_API_FAIL_GPU_MSG "API call %s failed for GPU %u: '%s'" // gpu id, error detail -#define DCGM_FR_CUDA_CONTEXT_MSG "GPU %u failed to create a CUDA context: %s" +#define DCGM_FR_CUDA_CONTEXT_MSG "GPU %u failed to create a CUDA context: %s" // DCGM API name -#define DCGM_FR_DCGM_API_MSG "Error using DCGM API %s" -#define DCGM_FR_CONCURRENT_GPUS_MSG "Unable to run concurrent pair 
bandwidth test without 2 or more "\ - "gpus. Skipping" -#define DCGM_FR_TOO_MANY_ERRORS_MSG "This API can only return up to four errors per system. "\ - "Additional errors were found for this system that couldn't be "\ - "communicated." -#define DCGM_FR_NVLINK_CRC_ERROR_THRESHOLD_MSG "%.1f %s NvLink errors found occuring per second on GPU %u, "\ - "exceeding the limit of 100 per second." -#define DCGM_FR_NVLINK_ERROR_CRITICAL_MSG "Detected %ld %s NvLink errors on GPU %u's NVLink (should be 0)" -#define DCGM_FR_ENFORCED_POWER_LIMIT_MSG "Enforced power limit on GPU %u set to %.1f, which is too low to "\ - "attempt to achieve target power %.1f" -#define DCGM_FR_MEMORY_ALLOC_HOST_MSG "Cannot allocate %zu bytes on the host" -#define DCGM_FR_GPU_OP_MODE_MSG "Skipping plugin due to a GPU being in GPU Operating Mode: LOW_DP." -#define DCGM_FR_NO_MEMORY_CLOCKS_MSG "No memory clocks <= %u MHZ were found in %u supported memory clocks." -#define DCGM_FR_NO_GRAPHICS_CLOCKS_MSG "No graphics clocks <= %u MHZ were found in %u supported graphics clocks for memory clock %u MHZ." -#define DCGM_FR_HAD_TO_RESTORE_STATE_MSG "Had to restore GPU state on NVML GPU(s): %s" - +#define DCGM_FR_DCGM_API_MSG "Error using DCGM API %s" +#define DCGM_FR_CONCURRENT_GPUS_MSG \ + "Unable to run concurrent pair bandwidth test without 2 or more " \ + "gpus. Skipping" +#define DCGM_FR_TOO_MANY_ERRORS_MSG \ + "This API can only return up to four errors per system. " \ + "Additional errors were found for this system that couldn't be " \ + "communicated." +#define DCGM_FR_NVLINK_CRC_ERROR_THRESHOLD_MSG \ + "%.1f %s NvLink errors found occuring per second on GPU %u, " \ + "exceeding the limit of 100 per second." 
+#define DCGM_FR_NVLINK_ERROR_CRITICAL_MSG "Detected %ld %s NvLink errors on GPU %u's NVLink (should be 0)" +#define DCGM_FR_ENFORCED_POWER_LIMIT_MSG \ + "Enforced power limit on GPU %u set to %.1f, which is too low to " \ + "attempt to achieve target power %.1f" +#define DCGM_FR_MEMORY_ALLOC_HOST_MSG "Cannot allocate %zu bytes on the host" +#define DCGM_FR_GPU_OP_MODE_MSG "Skipping plugin due to a GPU being in GPU Operating Mode: LOW_DP." +#define DCGM_FR_NO_MEMORY_CLOCKS_MSG "No memory clocks <= %u MHZ were found in %u supported memory clocks." +#define DCGM_FR_NO_GRAPHICS_CLOCKS_MSG \ + "No graphics clocks <= %u MHZ were found in %u supported graphics clocks for memory clock %u MHZ." +#define DCGM_FR_HAD_TO_RESTORE_STATE_MSG "Had to restore GPU state on NVML GPU(s): %s" +#define DCGM_FR_L1TAG_UNSUPPORTED_MSG "This card does not support the L1 cache test. Skipping test." +#define DCGM_FR_L1TAG_MISCOMPARE_MSG "Detected a miscompare failure in the L1 cache." +#define DCGM_FR_ROW_REMAP_FAILURE_MSG "Row remapping failed." +#define DCGM_FR_UNCONTAINED_ERROR_MSG "GPU had an uncontained error (XID 95)" /* * Suggestions for next steps for the corresponding error message */ -#define DCGM_FR_OK_NEXT "N/A" -#define DCGM_FR_UNKNOWN_NEXT "" -#define DCGM_FR_UNRECOGNIZED_NEXT "" -#define DCGM_FR_PCI_REPLAY_RATE_NEXT "Reconnect PCIe card. Run system side PCIE diagnostic utilities "\ - "to verify hops off the GPU board. If issue is on the board, run "\ - "the field diagnostic." -#define DCGM_FR_VOLATILE_DBE_DETECTED_NEXT "Drain the GPU and reset it or reboot the node." -#define DCGM_FR_VOLATILE_SBE_DETECTED_NEXT "Monitor - this GPU can still perform workload." -#define DCGM_FR_PENDING_PAGE_RETIREMENTS_NEXT "If volatile double bit errors exist, drain the GPU and reset it "\ - "or reboot the node. Otherwise, monitor - GPU can still perform "\ - "workload." 
-#define DCGM_FR_RETIRED_PAGES_LIMIT_NEXT TRIAGE_RUN_FIELD_DIAG_MSG -#define DCGM_FR_RETIRED_PAGES_DBE_LIMIT_NEXT TRIAGE_RUN_FIELD_DIAG_MSG -#define DCGM_FR_CORRUPT_INFOROM_NEXT "Flash the InfoROM to clear this corruption." -#define DCGM_FR_CLOCK_THROTTLE_THERMAL_NEXT DEBUG_COOLING_MSG -#define DCGM_FR_POWER_UNREADABLE_NEXT "" -#define DCGM_FR_CLOCK_THROTTLE_POWER_NEXT "Monitor the power conditions. This GPU can still perform workload." -#define DCGM_FR_NVLINK_ERROR_THRESHOLD_NEXT "Monitor the NVLink. It can still perform workload." -#define DCGM_FR_NVLINK_DOWN_NEXT TRIAGE_RUN_FIELD_DIAG_MSG -#define DCGM_FR_NVSWITCH_FATAL_ERROR_NEXT TRIAGE_RUN_FIELD_DIAG_MSG -#define DCGM_FR_NVSWITCH_NON_FATAL_ERROR_NEXT "Monitor the NVSwitch. It can still perform workload." -#define DCGM_FR_NVSWITCH_DOWN_NEXT "" -#define DCGM_FR_NO_ACCESS_TO_FILE_NEXT "Check relevant permissions, access, and existence of the file." -#define DCGM_FR_NVML_API_NEXT "Check the error condition and ensure that appropriate libraries "\ - "are present and accessible." -#define DCGM_FR_DEVICE_COUNT_MISMATCH_NEXT "Check for the presence of cgroups, operating system blocks, and "\ - "or unsupported / older cards" -#define DCGM_FR_BAD_PARAMETER_NEXT "" -#define DCGM_FR_CANNOT_OPEN_LIB_NEXT "Check for the existence of the library and set LD_LIBRARY_PATH "\ - "if needed." -#define DCGM_FR_BLACKLISTED_DRIVER_NEXT "Please load the appropriate driver." -#define DCGM_FR_NVML_LIB_BAD_NEXT "Make sure that the required version of libnvidia-ml.so "\ - "is present and accessible on the system." -#define DCGM_FR_GRAPHICS_PROCESSES_NEXT "Stop the graphics processes or run this diagnostic on a server "\ - "that is not being used for display purposes." -#define DCGM_FR_HOSTENGINE_CONN_NEXT "If hostengine is run separately, please ensure that it is up "\ - "and responsive." -#define DCGM_FR_FIELD_QUERY_NEXT "" -#define DCGM_FR_BAD_CUDA_ENV_NEXT "Please unset this environment variable to address test failures." 
-#define DCGM_FR_PERSISTENCE_MODE_NEXT "Enable persistence mode by running \"nvidia-smi -i -pm "\ - "1 \" as root." -#define DCGM_FR_LOW_BANDWIDTH_NEXT "Verify that your minimum bandwidth setting is appropriate for "\ - "the topology of each GPU. If so, and errors are consistent, "\ - "please run a field diagnostic." -#define DCGM_FR_HIGH_LATENCY_NEXT "Verify that your maximum latency setting is appropriate for "\ - "the topology of each GPU. If so, and errors are consistent, "\ - "please run a field diagnostic." -#define DCGM_FR_CANNOT_GET_FIELD_TAG_NEXT "" -#define DCGM_FR_FIELD_VIOLATION_NEXT "" -#define DCGM_FR_FIELD_THRESHOLD_NEXT "" -#define DCGM_FR_FIELD_VIOLATION_DBL_NEXT "" -#define DCGM_FR_FIELD_THRESHOLD_DBL_NEXT "" -#define DCGM_FR_UNSUPPORTED_FIELD_TYPE_NEXT "" -#define DCGM_FR_FIELD_THRESHOLD_TS_NEXT "" -#define DCGM_FR_FIELD_THRESHOLD_TS_DBL_NEXT "" -#define DCGM_FR_THERMAL_VIOLATIONS_NEXT DEBUG_COOLING_MSG -#define DCGM_FR_THERMAL_VIOLATIONS_TS_NEXT DEBUG_COOLING_MSG -#define DCGM_FR_TEMP_VIOLATION_NEXT "Verify that the user-specified temperature maximum is set "\ - "correctly. If it is, check the cooling for this GPU and node: "\ - DEBUG_COOLING_MSG -#define DCGM_FR_THROTTLING_VIOLATION_NEXT "" -#define DCGM_FR_INTERNAL_NEXT "" -#define DCGM_FR_PCIE_GENERATION_NEXT "" -#define DCGM_FR_PCIE_WIDTH_NEXT "" -#define DCGM_FR_ABORTED_NEXT "" -#define DCGM_FR_TEST_DISABLED_NEXT "" -#define DCGM_FR_CANNOT_GET_STAT_NEXT "If running a standalone nv-hostengine, verify that it is up "\ - "and responsive." -#define DCGM_FR_STRESS_LEVEL_NEXT "" -#define DCGM_FR_CUDA_API_NEXT "" -#define DCGM_FR_FAULTY_MEMORY_NEXT TRIAGE_RUN_FIELD_DIAG_MSG -#define DCGM_FR_CANNOT_SET_WATCHES_NEXT "" -#define DCGM_FR_CUDA_UNBOUND_NEXT "" -#define DCGM_FR_ECC_DISABLED_NEXT "Enable ECC memory by running \"nvidia-smi -i -e 1\" "\ - "to enable. This may require a GPU reset or reboot to take effect." 
-#define DCGM_FR_MEMORY_ALLOC_NEXT "" -#define DCGM_FR_CUDA_DBE_NEXT TRIAGE_RUN_FIELD_DIAG_MSG -#define DCGM_FR_MEMORY_MISMATCH_NEXT TRIAGE_RUN_FIELD_DIAG_MSG -#define DCGM_FR_CUDA_DEVICE_NEXT "Make sure CUDA_VISIBLE_DEVICES is not preventing visibility of "\ - "this GPU. Also check if CUDA libraries are compatible and "\ - "correctly installed." -#define DCGM_FR_ECC_UNSUPPORTED_NEXT "" -#define DCGM_FR_ECC_PENDING_NEXT "Reboot to complete activation of the ECC memory." -#define DCGM_FR_MEMORY_BANDWIDTH_NEXT "" -#define DCGM_FR_TARGET_POWER_NEXT "Verify that the clock speeds and GPU utilization are high." -#define DCGM_FR_API_FAIL_NEXT "" -#define DCGM_FR_API_FAIL_GPU_NEXT "" -#define DCGM_FR_CUDA_CONTEXT_NEXT "Please make sure the correct driver version is installed and "\ - "verify that no conflicting libraries are present." +#define DCGM_FR_OK_NEXT "N/A" +#define DCGM_FR_UNKNOWN_NEXT "" +#define DCGM_FR_UNRECOGNIZED_NEXT "" +#define DCGM_FR_PCI_REPLAY_RATE_NEXT \ + "Reconnect PCIe card. Run system side PCIE diagnostic utilities " \ + "to verify hops off the GPU board. If issue is on the board, run " \ + "the field diagnostic." +#define DCGM_FR_VOLATILE_DBE_DETECTED_NEXT "Drain the GPU and reset it or reboot the node." +#define DCGM_FR_VOLATILE_SBE_DETECTED_NEXT "Monitor - this GPU can still perform workload." +#define DCGM_FR_PENDING_PAGE_RETIREMENTS_NEXT \ + "If volatile double bit errors exist, drain the GPU and reset it " \ + "or reboot the node. Otherwise, monitor - GPU can still perform " \ + "workload." +#define DCGM_FR_RETIRED_PAGES_LIMIT_NEXT TRIAGE_RUN_FIELD_DIAG_MSG +#define DCGM_FR_RETIRED_PAGES_DBE_LIMIT_NEXT TRIAGE_RUN_FIELD_DIAG_MSG +#define DCGM_FR_CORRUPT_INFOROM_NEXT "Flash the InfoROM to clear this corruption." +#define DCGM_FR_CLOCK_THROTTLE_THERMAL_NEXT DEBUG_COOLING_MSG +#define DCGM_FR_POWER_UNREADABLE_NEXT "" +#define DCGM_FR_CLOCK_THROTTLE_POWER_NEXT "Monitor the power conditions. This GPU can still perform workload." 
+#define DCGM_FR_NVLINK_ERROR_THRESHOLD_NEXT "Monitor the NVLink. It can still perform workload." +#define DCGM_FR_NVLINK_DOWN_NEXT TRIAGE_RUN_FIELD_DIAG_MSG +#define DCGM_FR_NVSWITCH_FATAL_ERROR_NEXT TRIAGE_RUN_FIELD_DIAG_MSG +#define DCGM_FR_NVSWITCH_NON_FATAL_ERROR_NEXT "Monitor the NVSwitch. It can still perform workload." +#define DCGM_FR_NVSWITCH_DOWN_NEXT "" +#define DCGM_FR_NO_ACCESS_TO_FILE_NEXT "Check relevant permissions, access, and existence of the file." +#define DCGM_FR_NVML_API_NEXT \ + "Check the error condition and ensure that appropriate libraries " \ + "are present and accessible." +#define DCGM_FR_DEVICE_COUNT_MISMATCH_NEXT \ + "Check for the presence of cgroups, operating system blocks, and " \ + "or unsupported / older cards" +#define DCGM_FR_BAD_PARAMETER_NEXT "" +#define DCGM_FR_CANNOT_OPEN_LIB_NEXT \ + "Check for the existence of the library and set LD_LIBRARY_PATH " \ + "if needed." +#define DCGM_FR_BLACKLISTED_DRIVER_NEXT "Please load the appropriate driver." +#define DCGM_FR_NVML_LIB_BAD_NEXT \ + "Make sure that the required version of libnvidia-ml.so " \ + "is present and accessible on the system." +#define DCGM_FR_GRAPHICS_PROCESSES_NEXT \ + "Stop the graphics processes or run this diagnostic on a server " \ + "that is not being used for display purposes." +#define DCGM_FR_HOSTENGINE_CONN_NEXT \ + "If hostengine is run separately, please ensure that it is up " \ + "and responsive." +#define DCGM_FR_FIELD_QUERY_NEXT "" +#define DCGM_FR_BAD_CUDA_ENV_NEXT "Please unset this environment variable to address test failures." +#define DCGM_FR_PERSISTENCE_MODE_NEXT \ + "Enable persistence mode by running \"nvidia-smi -i -pm " \ + "1 \" as root." +#define DCGM_FR_LOW_BANDWIDTH_NEXT \ + "Verify that your minimum bandwidth setting is appropriate for " \ + "the topology of each GPU. If so, and errors are consistent, " \ + "please run a field diagnostic." 
+#define DCGM_FR_HIGH_LATENCY_NEXT \ + "Verify that your maximum latency setting is appropriate for " \ + "the topology of each GPU. If so, and errors are consistent, " \ + "please run a field diagnostic." +#define DCGM_FR_CANNOT_GET_FIELD_TAG_NEXT "" +#define DCGM_FR_FIELD_VIOLATION_NEXT "" +#define DCGM_FR_FIELD_THRESHOLD_NEXT "" +#define DCGM_FR_FIELD_VIOLATION_DBL_NEXT "" +#define DCGM_FR_FIELD_THRESHOLD_DBL_NEXT "" +#define DCGM_FR_UNSUPPORTED_FIELD_TYPE_NEXT "" +#define DCGM_FR_FIELD_THRESHOLD_TS_NEXT "" +#define DCGM_FR_FIELD_THRESHOLD_TS_DBL_NEXT "" +#define DCGM_FR_THERMAL_VIOLATIONS_NEXT DEBUG_COOLING_MSG +#define DCGM_FR_THERMAL_VIOLATIONS_TS_NEXT DEBUG_COOLING_MSG +#define DCGM_FR_TEMP_VIOLATION_NEXT \ + "Verify that the user-specified temperature maximum is set " \ + "correctly. If it is, check the cooling for this GPU and node: " DEBUG_COOLING_MSG +#define DCGM_FR_THROTTLING_VIOLATION_NEXT "" +#define DCGM_FR_INTERNAL_NEXT "" +#define DCGM_FR_PCIE_GENERATION_NEXT "" +#define DCGM_FR_PCIE_WIDTH_NEXT "" +#define DCGM_FR_ABORTED_NEXT "" +#define DCGM_FR_TEST_DISABLED_NEXT "" +#define DCGM_FR_CANNOT_GET_STAT_NEXT \ + "If running a standalone nv-hostengine, verify that it is up " \ + "and responsive." +#define DCGM_FR_STRESS_LEVEL_NEXT "" +#define DCGM_FR_CUDA_API_NEXT "" +#define DCGM_FR_FAULTY_MEMORY_NEXT TRIAGE_RUN_FIELD_DIAG_MSG +#define DCGM_FR_CANNOT_SET_WATCHES_NEXT "" +#define DCGM_FR_CUDA_UNBOUND_NEXT "" +#define DCGM_FR_ECC_DISABLED_NEXT \ + "Enable ECC memory by running \"nvidia-smi -i -e 1\" " \ + "to enable. This may require a GPU reset or reboot to take effect." +#define DCGM_FR_MEMORY_ALLOC_NEXT "" +#define DCGM_FR_CUDA_DBE_NEXT TRIAGE_RUN_FIELD_DIAG_MSG +#define DCGM_FR_MEMORY_MISMATCH_NEXT TRIAGE_RUN_FIELD_DIAG_MSG +#define DCGM_FR_CUDA_DEVICE_NEXT \ + "Make sure CUDA_VISIBLE_DEVICES is not preventing visibility of " \ + "this GPU. Also check if CUDA libraries are compatible and " \ + "correctly installed." 
+#define DCGM_FR_ECC_UNSUPPORTED_NEXT "" +#define DCGM_FR_ECC_PENDING_NEXT "Reboot to complete activation of the ECC memory." +#define DCGM_FR_MEMORY_BANDWIDTH_NEXT "" +#define DCGM_FR_TARGET_POWER_NEXT "Verify that the clock speeds and GPU utilization are high." +#define DCGM_FR_API_FAIL_NEXT "" +#define DCGM_FR_API_FAIL_GPU_NEXT "" +#define DCGM_FR_CUDA_CONTEXT_NEXT \ + "Please make sure the correct driver version is installed and " \ + "verify that no conflicting libraries are present." #define DCGM_FR_DCGM_API_NEXT "" #define DCGM_FR_CONCURRENT_GPUS_NEXT "" #define DCGM_FR_TOO_MANY_ERRORS_NEXT "" #define DCGM_FR_NVLINK_CRC_ERROR_THRESHOLD_NEXT TRIAGE_RUN_FIELD_DIAG_MSG #define DCGM_FR_NVLINK_ERROR_CRITICAL_NEXT TRIAGE_RUN_FIELD_DIAG_MSG -#define DCGM_FR_ENFORCED_POWER_LIMIT_NEXT "If this enforced power limit is necessary, then this test "\ - "cannot be run. If it is unnecessary, then raise the enforced "\ - "power limit setting to be able to run this test." -#define DCGM_FR_MEMORY_ALLOC_HOST_NEXT "Manually kill processes or restart your machine." -#define DCGM_FR_GPU_OP_MODE_NEXT "Fix by running nvidia-smi as root with: nvidia-smi --gom=0 -i "\ - "" -#define DCGM_FR_NO_MEMORY_CLOCKS_NEXT "" -#define DCGM_FR_NO_GRAPHICS_CLOCKS_NEXT "" -#define DCGM_FR_HAD_TO_RESTORE_STATE_NEXT "" +#define DCGM_FR_ENFORCED_POWER_LIMIT_NEXT \ + "If this enforced power limit is necessary, then this test " \ + "cannot be run. If it is unnecessary, then raise the enforced " \ + "power limit setting to be able to run this test." +#define DCGM_FR_MEMORY_ALLOC_HOST_NEXT "Manually kill processes or restart your machine." 
+#define DCGM_FR_GPU_OP_MODE_NEXT \ + "Fix by running nvidia-smi as root with: nvidia-smi --gom=0 -i " \ + "" +#define DCGM_FR_NO_MEMORY_CLOCKS_NEXT "" +#define DCGM_FR_NO_GRAPHICS_CLOCKS_NEXT "" +#define DCGM_FR_HAD_TO_RESTORE_STATE_NEXT "" +#define DCGM_FR_L1TAG_UNSUPPORTED_NEXT "" +#define DCGM_FR_L1TAG_MISCOMPARE_NEXT TRIAGE_RUN_FIELD_DIAG_MSG +#define DCGM_FR_ROW_REMAP_FAILURE_NEXT DCGM_FR_VOLATILE_DBE_DETECTED_NEXT +#define DCGM_FR_UNCONTAINED_ERROR_NEXT DCGM_FR_VOLATILE_DBE_DETECTED_NEXT dcgmErrorSeverity_t dcgmErrorGetPriorityByCode(unsigned int code); const char *dcgmErrorGetFormatMsgByCode(unsigned int code); diff --git a/bindings/go/dcgm/dcgm_fields.h b/bindings/go/dcgm/dcgm_fields.h index b514766..8bbf799 100644 --- a/bindings/go/dcgm/dcgm_fields.h +++ b/bindings/go/dcgm/dcgm_fields.h @@ -15,70 +15,79 @@ #ifdef __cplusplus extern "C" { #endif - + /***************************************************************************************************/ /** @defgroup dcgmFieldTypes Field Types * Field Types are a single byte. * @{ */ -/***************************************************************************************************/ +/***************************************************************************************************/ /** * Blob of binary data representing a structure */ -#define DCGM_FT_BINARY 'b' - +#define DCGM_FT_BINARY 'b' + /** * 8-byte double precision */ -#define DCGM_FT_DOUBLE 'd' - +#define DCGM_FT_DOUBLE 'd' + /** * 8-byte signed integer */ -#define DCGM_FT_INT64 'i' - +#define DCGM_FT_INT64 'i' + /** * Null-terminated ASCII Character string */ -#define DCGM_FT_STRING 's' - +#define DCGM_FT_STRING 's' + /** * 8-byte signed integer usec since 1970 */ #define DCGM_FT_TIMESTAMP 't' - -/** @} */ - + +/** @} */ + /***************************************************************************************************/ /** @defgroup dcgmFieldScope Field Scope * Represents field association with entity scope or global scope. 
* @{ */ -/***************************************************************************************************/ +/***************************************************************************************************/ /** * Field is global (ex: driver version) */ -#define DCGM_FS_GLOBAL 0 +#define DCGM_FS_GLOBAL 0 /** * Field is associated with an entity (GPU, VGPU...etc) */ -#define DCGM_FS_ENTITY 1 +#define DCGM_FS_ENTITY 1 /** * Field is associated with a device. Deprecated. Use DCGM_FS_ENTITY */ -#define DCGM_FS_DEVICE DCGM_FS_ENTITY +#define DCGM_FS_DEVICE DCGM_FS_ENTITY + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup dcgmFieldConstants Field Constants + * Constants that represent contents of individual field values. + * @{ + */ +/***************************************************************************************************/ /** * DCGM_FI_DEV_CUDA_COMPUTE_CAPABILITY is 16 bits of major version followed by * 16 bits of the minor version. These macros separate the two. */ -#define DCGM_CUDA_COMPUTE_CAPABILITY_MAJOR(x) ((uint64_t)(x) & 0xFFFF0000) -#define DCGM_CUDA_COMPUTE_CAPABILITY_MINOR(x) ((uint64_t)(x) & 0x0000FFFF) +#define DCGM_CUDA_COMPUTE_CAPABILITY_MAJOR(x) ((uint64_t)(x)&0xFFFF0000) +#define DCGM_CUDA_COMPUTE_CAPABILITY_MINOR(x) ((uint64_t)(x)&0x0000FFFF) /** * DCGM_FI_DEV_CLOCK_THROTTLE_REASONS is a bitmap of why the clock is throttled. 
@@ -88,23 +97,23 @@ extern "C" { /** Nothing is running on the GPU and the clocks are dropping to Idle state * \note This limiter may be removed in a later release */ -#define DCGM_CLOCKS_THROTTLE_REASON_GPU_IDLE 0x0000000000000001LL +#define DCGM_CLOCKS_THROTTLE_REASON_GPU_IDLE 0x0000000000000001LL /** GPU clocks are limited by current setting of applications clocks */ -#define DCGM_CLOCKS_THROTTLE_REASON_CLOCKS_SETTING 0x0000000000000002LL -/** SW Power Scaling algorithm is reducing the clocks below requested clocks +#define DCGM_CLOCKS_THROTTLE_REASON_CLOCKS_SETTING 0x0000000000000002LL +/** SW Power Scaling algorithm is reducing the clocks below requested clocks */ -#define DCGM_CLOCKS_THROTTLE_REASON_SW_POWER_CAP 0x0000000000000004LL +#define DCGM_CLOCKS_THROTTLE_REASON_SW_POWER_CAP 0x0000000000000004LL /** HW Slowdown (reducing the core clocks by a factor of 2 or more) is engaged * - *This is an indicator of: - * - temperature being too high - * - External Power Brake Assertion is triggered (e.g. by the system power supply) - * - Power draw is too high and Fast Trigger protection is reducing the clocks - * - May be also reported during PState or clock change - * - This behavior may be removed in a later release. - */ -#define DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN 0x0000000000000008LL + * This is an indicator of: + * - temperature being too high + * - External Power Brake Assertion is triggered (e.g. by the system power supply) + * - Power draw is too high and Fast Trigger protection is reducing the clocks + * - May be also reported during PState or clock change + * - This behavior may be removed in a later release. + */ +#define DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN 0x0000000000000008LL /** Sync Boost * * This GPU has been added to a Sync boost group with nvidia-smi or DCGM in @@ -113,29 +122,42 @@ extern "C" { * the throttle reasons for other GPUs in the system to see why those GPUs are * holding this one at lower clocks. 
*/ -#define DCGM_CLOCKS_THROTTLE_REASON_SYNC_BOOST 0x0000000000000010LL +#define DCGM_CLOCKS_THROTTLE_REASON_SYNC_BOOST 0x0000000000000010LL /** SW Thermal Slowdown * * This is an indicator of one or more of the following: * - Current GPU temperature above the GPU Max Operating Temperature * - Current memory temperature above the Memory Max Operating Temperature */ -#define DCGM_CLOCKS_THROTTLE_REASON_SW_THERMAL 0x0000000000000020LL +#define DCGM_CLOCKS_THROTTLE_REASON_SW_THERMAL 0x0000000000000020LL /** HW Thermal Slowdown (reducing the core clocks by a factor of 2 or more) is engaged * * This is an indicator of: * - temperature being too high */ -#define DCGM_CLOCKS_THROTTLE_REASON_HW_THERMAL 0x0000000000000040LL +#define DCGM_CLOCKS_THROTTLE_REASON_HW_THERMAL 0x0000000000000040LL /** HW Power Brake Slowdown (reducing the core clocks by a factor of 2 or more) is engaged * * This is an indicator of: * - External Power Brake Assertion being triggered (e.g. by the system power supply) */ -#define DCGM_CLOCKS_THROTTLE_REASON_HW_POWER_BRAKE 0x0000000000000080LL +#define DCGM_CLOCKS_THROTTLE_REASON_HW_POWER_BRAKE 0x0000000000000080LL /** GPU clocks are limited by current setting of Display clocks */ -#define DCGM_CLOCKS_THROTTLE_REASON_DISPLAY_CLOCKS 0x0000000000000100LL +#define DCGM_CLOCKS_THROTTLE_REASON_DISPLAY_CLOCKS 0x0000000000000100LL + +/** + * GPU virtualization mode types for DCGM_FI_DEV_VIRTUAL_MODE + */ +typedef enum +{ + DCGM_GPU_VIRTUALIZATION_MODE_NONE = 0, //!< Represents Bare Metal GPU + DCGM_GPU_VIRTUALIZATION_MODE_PASSTHROUGH = 1, //!< Device is associated with GPU-Passthrough + DCGM_GPU_VIRTUALIZATION_MODE_VGPU = 2, //!< Device is associated with vGPU inside virtual machine. 
+ DCGM_GPU_VIRTUALIZATION_MODE_HOST_VGPU = 3, //!< Device is associated with VGX hypervisor in vGPU mode + DCGM_GPU_VIRTUALIZATION_MODE_HOST_VSGA = 4, //!< Device is associated with VGX hypervisor in vSGA mode +} dcgmGpuVirtualizationMode_t; + /** @} */ @@ -151,12 +173,14 @@ extern "C" { */ typedef enum dcgm_field_entity_group_t { - DCGM_FE_NONE = 0, /** Field is not associated with an entity. Field scope should be DCGM_FS_GLOBAL */ - DCGM_FE_GPU, /** Field is associated with a GPU entity */ - DCGM_FE_VGPU, /** Field is associated with a VGPU entity */ - DCGM_FE_SWITCH, /** Field is associated with a Switch entity */ - - DCGM_FE_COUNT /** Number of elements in this enumeration. Keep this entry last */ + DCGM_FE_NONE = 0, /*!< Field is not associated with an entity. Field scope should be DCGM_FS_GLOBAL */ + DCGM_FE_GPU, /*!< Field is associated with a GPU entity */ + DCGM_FE_VGPU, /*!< Field is associated with a VGPU entity */ + DCGM_FE_SWITCH, /*!< Field is associated with a Switch entity */ + DCGM_FE_GPU_I, /*!< Field is associated with a GPU Instance entity */ + DCGM_FE_GPU_CI, /*!< Field is associated with a GPU Compute Instance entity */ + + DCGM_FE_COUNT /*!< Number of elements in this enumeration. 
Keep this entry last */ } dcgm_field_entity_group_t; /** @@ -172,98 +196,106 @@ typedef unsigned int dcgm_field_eid_t; * @{ */ /***************************************************************************************************/ - + /** * NULL field - */ -#define DCGM_FI_UNKNOWN 0 - + */ +#define DCGM_FI_UNKNOWN 0 + /** * Driver Version */ -#define DCGM_FI_DRIVER_VERSION 1 - +#define DCGM_FI_DRIVER_VERSION 1 + /* Underlying NVML version */ -#define DCGM_FI_NVML_VERSION 2 - +#define DCGM_FI_NVML_VERSION 2 + /* * Process Name */ -#define DCGM_FI_PROCESS_NAME 3 - +#define DCGM_FI_PROCESS_NAME 3 + /** * Number of Devices on the node - */ -#define DCGM_FI_DEV_COUNT 4 + */ +#define DCGM_FI_DEV_COUNT 4 + +/** + * Cuda Driver Version + * Retrieves a number with the major value in the thousands place and the minor value in the hundreds place. + * CUDA 11.1 = 11100 + */ +#define DCGM_FI_CUDA_DRIVER_VERSION 5 + /** * Name of the GPU device */ -#define DCGM_FI_DEV_NAME 50 - +#define DCGM_FI_DEV_NAME 50 + /** * Device Brand */ -#define DCGM_FI_DEV_BRAND 51 - +#define DCGM_FI_DEV_BRAND 51 + /** * NVML index of this GPU */ -#define DCGM_FI_DEV_NVML_INDEX 52 +#define DCGM_FI_DEV_NVML_INDEX 52 /** * Device Serial Number */ -#define DCGM_FI_DEV_SERIAL 53 +#define DCGM_FI_DEV_SERIAL 53 /** * UUID corresponding to the device */ -#define DCGM_FI_DEV_UUID 54 +#define DCGM_FI_DEV_UUID 54 /** * Device node minor number /dev/nvidia# */ -#define DCGM_FI_DEV_MINOR_NUMBER 55 +#define DCGM_FI_DEV_MINOR_NUMBER 55 /** * OEM inforom version */ -#define DCGM_FI_DEV_OEM_INFOROM_VER 56 +#define DCGM_FI_DEV_OEM_INFOROM_VER 56 /** * PCI attributes for the device */ -#define DCGM_FI_DEV_PCI_BUSID 57 +#define DCGM_FI_DEV_PCI_BUSID 57 /** * The combined 16-bit device id and 16-bit vendor id */ -#define DCGM_FI_DEV_PCI_COMBINED_ID 58 - +#define DCGM_FI_DEV_PCI_COMBINED_ID 58 + /** * The 32-bit Sub System Device ID */ -#define DCGM_FI_DEV_PCI_SUBSYS_ID 59 +#define DCGM_FI_DEV_PCI_SUBSYS_ID 59 /** * 
Topology of all GPUs on the system via PCI (static) */ -#define DCGM_FI_GPU_TOPOLOGY_PCI 60 +#define DCGM_FI_GPU_TOPOLOGY_PCI 60 /** * Topology of all GPUs on the system via NVLINK (static) */ -#define DCGM_FI_GPU_TOPOLOGY_NVLINK 61 +#define DCGM_FI_GPU_TOPOLOGY_NVLINK 61 /** * Affinity of all GPUs on the system (static) */ -#define DCGM_FI_GPU_TOPOLOGY_AFFINITY 62 +#define DCGM_FI_GPU_TOPOLOGY_AFFINITY 62 /** * Cuda compute capability for the device. - * The major version is the upper 32 bits and + * The major version is the upper 32 bits and * the minor version is the lower 32 bits. */ #define DCGM_FI_DEV_CUDA_COMPUTE_CAPABILITY 63 @@ -271,103 +303,114 @@ typedef unsigned int dcgm_field_eid_t; /** * Compute mode for the device */ -#define DCGM_FI_DEV_COMPUTE_MODE 65 +#define DCGM_FI_DEV_COMPUTE_MODE 65 +/** + * Persistence mode for the device + * Boolean: 0 is disabled, 1 is enabled + */ +#define DCGM_FI_DEV_PERSISTENCE_MODE 66 + +/** + * MIG mode for the device + * Boolean: 0 is disabled, 1 is enabled + */ +#define DCGM_FI_DEV_MIG_MODE 67 /** * Device CPU affinity. part 1/8 = cpus 0 - 63 */ -#define DCGM_FI_DEV_CPU_AFFINITY_0 70 - +#define DCGM_FI_DEV_CPU_AFFINITY_0 70 + /** * Device CPU affinity. part 1/8 = cpus 64 - 127 - */ -#define DCGM_FI_DEV_CPU_AFFINITY_1 71 - + */ +#define DCGM_FI_DEV_CPU_AFFINITY_1 71 + /** * Device CPU affinity. part 2/8 = cpus 128 - 191 - */ -#define DCGM_FI_DEV_CPU_AFFINITY_2 72 + */ +#define DCGM_FI_DEV_CPU_AFFINITY_2 72 /** * Device CPU affinity. 
part 3/8 = cpus 192 - 255 */ -#define DCGM_FI_DEV_CPU_AFFINITY_3 73 +#define DCGM_FI_DEV_CPU_AFFINITY_3 73 /** * ECC inforom version */ -#define DCGM_FI_DEV_ECC_INFOROM_VER 80 +#define DCGM_FI_DEV_ECC_INFOROM_VER 80 /** * Power management object inforom version */ -#define DCGM_FI_DEV_POWER_INFOROM_VER 81 +#define DCGM_FI_DEV_POWER_INFOROM_VER 81 /** * Inforom image version */ -#define DCGM_FI_DEV_INFOROM_IMAGE_VER 82 +#define DCGM_FI_DEV_INFOROM_IMAGE_VER 82 /** * Inforom configuration checksum */ -#define DCGM_FI_DEV_INFOROM_CONFIG_CHECK 83 +#define DCGM_FI_DEV_INFOROM_CONFIG_CHECK 83 /** * Reads the infoROM from the flash and verifies the checksums */ -#define DCGM_FI_DEV_INFOROM_CONFIG_VALID 84 +#define DCGM_FI_DEV_INFOROM_CONFIG_VALID 84 /** * VBIOS version of the device */ -#define DCGM_FI_DEV_VBIOS_VERSION 85 +#define DCGM_FI_DEV_VBIOS_VERSION 85 /** * Total BAR1 of the GPU in MB */ -#define DCGM_FI_DEV_BAR1_TOTAL 90 +#define DCGM_FI_DEV_BAR1_TOTAL 90 /** - * Sync boost settings on the node + * Deprecated - Sync boost settings on the node */ -#define DCGM_FI_SYNC_BOOST 91 +#define DCGM_FI_SYNC_BOOST 91 /** * Used BAR1 of the GPU in MB */ -#define DCGM_FI_DEV_BAR1_USED 92 +#define DCGM_FI_DEV_BAR1_USED 92 /** * Free BAR1 of the GPU in MB */ -#define DCGM_FI_DEV_BAR1_FREE 93 +#define DCGM_FI_DEV_BAR1_FREE 93 /** * SM clock for the device */ -#define DCGM_FI_DEV_SM_CLOCK 100 +#define DCGM_FI_DEV_SM_CLOCK 100 /** * Memory clock for the device */ -#define DCGM_FI_DEV_MEM_CLOCK 101 +#define DCGM_FI_DEV_MEM_CLOCK 101 /** * Video encoder/decoder clock for the device */ -#define DCGM_FI_DEV_VIDEO_CLOCK 102 +#define DCGM_FI_DEV_VIDEO_CLOCK 102 /** * SM Application clocks */ -#define DCGM_FI_DEV_APP_SM_CLOCK 110 +#define DCGM_FI_DEV_APP_SM_CLOCK 110 /** * Memory Application clocks */ -#define DCGM_FI_DEV_APP_MEM_CLOCK 111 +#define DCGM_FI_DEV_APP_MEM_CLOCK 111 /** * Current clock throttle reasons (bitmask of DCGM_CLOCKS_THROTTLE_REASON_*) @@ -377,42 +420,53 @@ typedef 
unsigned int dcgm_field_eid_t; /** * Maximum supported SM clock for the device */ -#define DCGM_FI_DEV_MAX_SM_CLOCK 113 +#define DCGM_FI_DEV_MAX_SM_CLOCK 113 /** * Maximum supported Memory clock for the device */ -#define DCGM_FI_DEV_MAX_MEM_CLOCK 114 +#define DCGM_FI_DEV_MAX_MEM_CLOCK 114 /** * Maximum supported Video encoder/decoder clock for the device */ -#define DCGM_FI_DEV_MAX_VIDEO_CLOCK 115 +#define DCGM_FI_DEV_MAX_VIDEO_CLOCK 115 /** * Auto-boost for the device (1 = enabled. 0 = disabled) */ -#define DCGM_FI_DEV_AUTOBOOST 120 +#define DCGM_FI_DEV_AUTOBOOST 120 /** * Supported clocks for the device */ -#define DCGM_FI_DEV_SUPPORTED_CLOCKS 130 +#define DCGM_FI_DEV_SUPPORTED_CLOCKS 130 /** * Memory temperature for the device */ -#define DCGM_FI_DEV_MEMORY_TEMP 140 +#define DCGM_FI_DEV_MEMORY_TEMP 140 /** * Current temperature readings for the device, in degrees C */ -#define DCGM_FI_DEV_GPU_TEMP 150 +#define DCGM_FI_DEV_GPU_TEMP 150 + +/** + * Maximum operating temperature for the memory of this GPU + */ +#define DCGM_FI_DEV_MEM_MAX_OP_TEMP 151 + +/** + * Maximum operating temperature for this GPU + */ +#define DCGM_FI_DEV_GPU_MAX_OP_TEMP 152 + /** * Power usage for the device in Watts */ -#define DCGM_FI_DEV_POWER_USAGE 155 +#define DCGM_FI_DEV_POWER_USAGE 155 /** * Total energy consumption for the GPU in mJ since the driver was last reloaded @@ -422,72 +476,76 @@ typedef unsigned int dcgm_field_eid_t; /** * Slowdown temperature for the device */ -#define DCGM_FI_DEV_SLOWDOWN_TEMP 158 +#define DCGM_FI_DEV_SLOWDOWN_TEMP 158 /** * Shutdown temperature for the device */ -#define DCGM_FI_DEV_SHUTDOWN_TEMP 159 +#define DCGM_FI_DEV_SHUTDOWN_TEMP 159 /** * Current Power limit for the device */ -#define DCGM_FI_DEV_POWER_MGMT_LIMIT 160 +#define DCGM_FI_DEV_POWER_MGMT_LIMIT 160 /** * Minimum power management limit for the device */ -#define DCGM_FI_DEV_POWER_MGMT_LIMIT_MIN 161 +#define DCGM_FI_DEV_POWER_MGMT_LIMIT_MIN 161 /** * Maximum power management limit for the 
device */ -#define DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX 162 +#define DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX 162 /** * Default power management limit for the device */ -#define DCGM_FI_DEV_POWER_MGMT_LIMIT_DEF 163 +#define DCGM_FI_DEV_POWER_MGMT_LIMIT_DEF 163 /** * Effective power limit that the driver enforces after taking into account all limiters */ -#define DCGM_FI_DEV_ENFORCED_POWER_LIMIT 164 +#define DCGM_FI_DEV_ENFORCED_POWER_LIMIT 164 /** * Performance state (P-State) 0-15. 0=highest */ -#define DCGM_FI_DEV_PSTATE 190 +#define DCGM_FI_DEV_PSTATE 190 /** * Fan speed for the device in percent 0-100 */ -#define DCGM_FI_DEV_FAN_SPEED 191 +#define DCGM_FI_DEV_FAN_SPEED 191 /** * PCIe Tx utilization information + * + * Deprecated: Use DCGM_FI_PROF_PCIE_TX_BYTES instead. */ -#define DCGM_FI_DEV_PCIE_TX_THROUGHPUT 200 - +#define DCGM_FI_DEV_PCIE_TX_THROUGHPUT 200 + /** * PCIe Rx utilization information - */ -#define DCGM_FI_DEV_PCIE_RX_THROUGHPUT 201 - + * + * Deprecated: Use DCGM_FI_PROF_PCIE_RX_BYTES instead. + */ +#define DCGM_FI_DEV_PCIE_RX_THROUGHPUT 201 + /** * PCIe replay counter */ -#define DCGM_FI_DEV_PCIE_REPLAY_COUNTER 202 +#define DCGM_FI_DEV_PCIE_REPLAY_COUNTER 202 /** * GPU Utilization */ -#define DCGM_FI_DEV_GPU_UTIL 203 +#define DCGM_FI_DEV_GPU_UTIL 203 /** * Memory Utilization */ -#define DCGM_FI_DEV_MEM_COPY_UTIL 204 +#define DCGM_FI_DEV_MEM_COPY_UTIL 204 /** * Process accounting stats. @@ -496,17 +554,17 @@ typedef unsigned int dcgm_field_eid_t; * enable accounting ahead of time. Accounting mode can be enabled by * running "nvidia-smi -am 1" as root on the same node the host engine is running on. 
*/ -#define DCGM_FI_DEV_ACCOUNTING_DATA 205 +#define DCGM_FI_DEV_ACCOUNTING_DATA 205 /** * Encoder Utilization */ -#define DCGM_FI_DEV_ENC_UTIL 206 +#define DCGM_FI_DEV_ENC_UTIL 206 /** * Decoder Utilization */ -#define DCGM_FI_DEV_DEC_UTIL 207 +#define DCGM_FI_DEV_DEC_UTIL 207 /** * Memory utilization samples @@ -516,57 +574,57 @@ typedef unsigned int dcgm_field_eid_t; /* * SM utilization samples */ -#define DCGM_FI_DEV_GPU_UTIL_SAMPLES 211 +#define DCGM_FI_DEV_GPU_UTIL_SAMPLES 211 /** * Graphics processes running on the GPU. */ -#define DCGM_FI_DEV_GRAPHICS_PIDS 220 +#define DCGM_FI_DEV_GRAPHICS_PIDS 220 /** * Compute processes running on the GPU. */ -#define DCGM_FI_DEV_COMPUTE_PIDS 221 +#define DCGM_FI_DEV_COMPUTE_PIDS 221 /** * XID errors. The value is the specific XID error */ -#define DCGM_FI_DEV_XID_ERRORS 230 +#define DCGM_FI_DEV_XID_ERRORS 230 /** * PCIe Max Link Generation */ -#define DCGM_FI_DEV_PCIE_MAX_LINK_GEN 235 +#define DCGM_FI_DEV_PCIE_MAX_LINK_GEN 235 /** * PCIe Max Link Width */ -#define DCGM_FI_DEV_PCIE_MAX_LINK_WIDTH 236 +#define DCGM_FI_DEV_PCIE_MAX_LINK_WIDTH 236 /** * PCIe Current Link Generation */ -#define DCGM_FI_DEV_PCIE_LINK_GEN 237 +#define DCGM_FI_DEV_PCIE_LINK_GEN 237 /** * PCIe Current Link Width */ -#define DCGM_FI_DEV_PCIE_LINK_WIDTH 238 +#define DCGM_FI_DEV_PCIE_LINK_WIDTH 238 /** * Power Violation time in usec */ -#define DCGM_FI_DEV_POWER_VIOLATION 240 +#define DCGM_FI_DEV_POWER_VIOLATION 240 /** * Thermal Violation time in usec */ -#define DCGM_FI_DEV_THERMAL_VIOLATION 241 +#define DCGM_FI_DEV_THERMAL_VIOLATION 241 /** * Sync Boost Violation time in usec */ -#define DCGM_FI_DEV_SYNC_BOOST_VIOLATION 242 +#define DCGM_FI_DEV_SYNC_BOOST_VIOLATION 242 /** * Board violation limit. @@ -576,7 +634,7 @@ typedef unsigned int dcgm_field_eid_t; /** *Low utilisation violation limit. */ -#define DCGM_FI_DEV_LOW_UTIL_VIOLATION 244 +#define DCGM_FI_DEV_LOW_UTIL_VIOLATION 244 /** *Reliability violation limit. 
@@ -596,1301 +654,1439 @@ typedef unsigned int dcgm_field_eid_t; /** * Total Frame Buffer of the GPU in MB */ -#define DCGM_FI_DEV_FB_TOTAL 250 +#define DCGM_FI_DEV_FB_TOTAL 250 /** * Free Frame Buffer in MB */ -#define DCGM_FI_DEV_FB_FREE 251 +#define DCGM_FI_DEV_FB_FREE 251 /** * Used Frame Buffer in MB */ -#define DCGM_FI_DEV_FB_USED 252 +#define DCGM_FI_DEV_FB_USED 252 /** * Current ECC mode for the device */ -#define DCGM_FI_DEV_ECC_CURRENT 300 - +#define DCGM_FI_DEV_ECC_CURRENT 300 + /** * Pending ECC mode for the device - */ -#define DCGM_FI_DEV_ECC_PENDING 301 - + */ +#define DCGM_FI_DEV_ECC_PENDING 301 + /** * Total single bit volatile ECC errors - */ -#define DCGM_FI_DEV_ECC_SBE_VOL_TOTAL 310 - + */ +#define DCGM_FI_DEV_ECC_SBE_VOL_TOTAL 310 + /** * Total double bit volatile ECC errors - */ -#define DCGM_FI_DEV_ECC_DBE_VOL_TOTAL 311 - + */ +#define DCGM_FI_DEV_ECC_DBE_VOL_TOTAL 311 + /** * Total single bit aggregate (persistent) ECC errors * Note: monotonically increasing - */ -#define DCGM_FI_DEV_ECC_SBE_AGG_TOTAL 312 - + */ +#define DCGM_FI_DEV_ECC_SBE_AGG_TOTAL 312 + /** * Total double bit aggregate (persistent) ECC errors * Note: monotonically increasing - */ -#define DCGM_FI_DEV_ECC_DBE_AGG_TOTAL 313 - + */ +#define DCGM_FI_DEV_ECC_DBE_AGG_TOTAL 313 + /** * L1 cache single bit volatile ECC errors */ -#define DCGM_FI_DEV_ECC_SBE_VOL_L1 314 - +#define DCGM_FI_DEV_ECC_SBE_VOL_L1 314 + /** * L1 cache double bit volatile ECC errors */ -#define DCGM_FI_DEV_ECC_DBE_VOL_L1 315 - +#define DCGM_FI_DEV_ECC_DBE_VOL_L1 315 + /** * L2 cache single bit volatile ECC errors */ -#define DCGM_FI_DEV_ECC_SBE_VOL_L2 316 - +#define DCGM_FI_DEV_ECC_SBE_VOL_L2 316 + /** * L2 cache double bit volatile ECC errors */ -#define DCGM_FI_DEV_ECC_DBE_VOL_L2 317 - +#define DCGM_FI_DEV_ECC_DBE_VOL_L2 317 + /** * Device memory single bit volatile ECC errors */ -#define DCGM_FI_DEV_ECC_SBE_VOL_DEV 318 +#define DCGM_FI_DEV_ECC_SBE_VOL_DEV 318 /** * Device memory double bit volatile ECC 
errors */ -#define DCGM_FI_DEV_ECC_DBE_VOL_DEV 319 - +#define DCGM_FI_DEV_ECC_DBE_VOL_DEV 319 + /** * Register file single bit volatile ECC errors */ -#define DCGM_FI_DEV_ECC_SBE_VOL_REG 320 - +#define DCGM_FI_DEV_ECC_SBE_VOL_REG 320 + /** * Register file double bit volatile ECC errors - */ -#define DCGM_FI_DEV_ECC_DBE_VOL_REG 321 - + */ +#define DCGM_FI_DEV_ECC_DBE_VOL_REG 321 + /** * Texture memory single bit volatile ECC errors - */ -#define DCGM_FI_DEV_ECC_SBE_VOL_TEX 322 - + */ +#define DCGM_FI_DEV_ECC_SBE_VOL_TEX 322 + /** * Texture memory double bit volatile ECC errors - */ -#define DCGM_FI_DEV_ECC_DBE_VOL_TEX 323 - + */ +#define DCGM_FI_DEV_ECC_DBE_VOL_TEX 323 + /** * L1 cache single bit aggregate (persistent) ECC errors * Note: monotonically increasing - */ -#define DCGM_FI_DEV_ECC_SBE_AGG_L1 324 - + */ +#define DCGM_FI_DEV_ECC_SBE_AGG_L1 324 + /** * L1 cache double bit aggregate (persistent) ECC errors * Note: monotonically increasing - */ -#define DCGM_FI_DEV_ECC_DBE_AGG_L1 325 - + */ +#define DCGM_FI_DEV_ECC_DBE_AGG_L1 325 + /** * L2 cache single bit aggregate (persistent) ECC errors * Note: monotonically increasing - */ -#define DCGM_FI_DEV_ECC_SBE_AGG_L2 326 + */ +#define DCGM_FI_DEV_ECC_SBE_AGG_L2 326 /** * L2 cache double bit aggregate (persistent) ECC errors * Note: monotonically increasing - */ -#define DCGM_FI_DEV_ECC_DBE_AGG_L2 327 - + */ +#define DCGM_FI_DEV_ECC_DBE_AGG_L2 327 + /** * Device memory single bit aggregate (persistent) ECC errors * Note: monotonically increasing - */ -#define DCGM_FI_DEV_ECC_SBE_AGG_DEV 328 - + */ +#define DCGM_FI_DEV_ECC_SBE_AGG_DEV 328 + /** * Device memory double bit aggregate (persistent) ECC errors * Note: monotonically increasing - */ -#define DCGM_FI_DEV_ECC_DBE_AGG_DEV 329 - + */ +#define DCGM_FI_DEV_ECC_DBE_AGG_DEV 329 + /** * Register File single bit aggregate (persistent) ECC errors * Note: monotonically increasing - */ -#define DCGM_FI_DEV_ECC_SBE_AGG_REG 330 - + */ +#define DCGM_FI_DEV_ECC_SBE_AGG_REG 
330 + /** * Register File double bit aggregate (persistent) ECC errors * Note: monotonically increasing */ -#define DCGM_FI_DEV_ECC_DBE_AGG_REG 331 - +#define DCGM_FI_DEV_ECC_DBE_AGG_REG 331 + /** * Texture memory single bit aggregate (persistent) ECC errors * Note: monotonically increasing */ -#define DCGM_FI_DEV_ECC_SBE_AGG_TEX 332 +#define DCGM_FI_DEV_ECC_SBE_AGG_TEX 332 /** * Texture memory double bit aggregate (persistent) ECC errors * Note: monotonically increasing - */ -#define DCGM_FI_DEV_ECC_DBE_AGG_TEX 333 - + */ +#define DCGM_FI_DEV_ECC_DBE_AGG_TEX 333 + /** * Number of retired pages because of single bit errors * Note: monotonically increasing */ -#define DCGM_FI_DEV_RETIRED_SBE 390 +#define DCGM_FI_DEV_RETIRED_SBE 390 /** * Number of retired pages because of double bit errors * Note: monotonically increasing */ -#define DCGM_FI_DEV_RETIRED_DBE 391 +#define DCGM_FI_DEV_RETIRED_DBE 391 /** * Number of pages pending retirement */ -#define DCGM_FI_DEV_RETIRED_PENDING 392 +#define DCGM_FI_DEV_RETIRED_PENDING 392 + +/** + * Number of remapped rows for uncorrectable errors + */ +#define DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS 393 + +/** + * Number of remapped rows for correctable errors + */ +#define DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS 394 + +/** + * Whether remapping of rows has failed + */ +#define DCGM_FI_DEV_ROW_REMAP_FAILURE 395 /* -* NV Link flow control CRC Error Counter for Lane 0 -*/ -#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L0 400 + * NV Link flow control CRC Error Counter for Lane 0 + */ +#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L0 400 /* -* NV Link flow control CRC Error Counter for Lane 1 -*/ -#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L1 401 + * NV Link flow control CRC Error Counter for Lane 1 + */ +#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L1 401 /* -* NV Link flow control CRC Error Counter for Lane 2 -*/ -#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L2 402 + * NV Link flow control CRC Error Counter for Lane 2 + 
*/ +#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L2 402 /* -* NV Link flow control CRC Error Counter for Lane 3 -*/ -#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L3 403 + * NV Link flow control CRC Error Counter for Lane 3 + */ +#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L3 403 /* -* NV Link flow control CRC Error Counter for Lane 4 -*/ -#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L4 404 + * NV Link flow control CRC Error Counter for Lane 4 + */ +#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L4 404 /* -* NV Link flow control CRC Error Counter for Lane 5 -*/ -#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L5 405 + * NV Link flow control CRC Error Counter for Lane 5 + */ +#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L5 405 /* -* NV Link flow control CRC Error Counter total for all Lanes -*/ -#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL 409 + * NV Link flow control CRC Error Counter total for all Lanes + */ +#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL 409 /* -* NV Link data CRC Error Counter for Lane 0 -*/ -#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L0 410 + * NV Link data CRC Error Counter for Lane 0 + */ +#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L0 410 /* -* NV Link data CRC Error Counter for Lane 1 -*/ -#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L1 411 + * NV Link data CRC Error Counter for Lane 1 + */ +#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L1 411 /* -* NV Link data CRC Error Counter for Lane 2 -*/ -#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L2 412 + * NV Link data CRC Error Counter for Lane 2 + */ +#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L2 412 /* -* NV Link data CRC Error Counter for Lane 3 -*/ -#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L3 413 + * NV Link data CRC Error Counter for Lane 3 + */ +#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L3 413 /* -* NV Link data CRC Error Counter for Lane 4 -*/ -#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L4 414 + * NV Link data CRC 
Error Counter for Lane 4 + */ +#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L4 414 /* -* NV Link data CRC Error Counter for Lane 5 -*/ -#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L5 415 + * NV Link data CRC Error Counter for Lane 5 + */ +#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L5 415 /* -* NV Link data CRC Error Counter total for all Lanes -*/ -#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL 419 + * NV Link data CRC Error Counter total for all Lanes + */ +#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL 419 /* -* NV Link Replay Error Counter for Lane 0 -*/ -#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L0 420 + * NV Link Replay Error Counter for Lane 0 + */ +#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L0 420 /* -* NV Link Replay Error Counter for Lane 1 -*/ -#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L1 421 + * NV Link Replay Error Counter for Lane 1 + */ +#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L1 421 /* -* NV Link Replay Error Counter for Lane 2 -*/ -#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L2 422 + * NV Link Replay Error Counter for Lane 2 + */ +#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L2 422 /* -* NV Link Replay Error Counter for Lane 3 -*/ -#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L3 423 + * NV Link Replay Error Counter for Lane 3 + */ +#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L3 423 /* -* NV Link Replay Error Counter for Lane 4 -*/ -#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L4 424 + * NV Link Replay Error Counter for Lane 4 + */ +#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L4 424 /* -* NV Link Replay Error Counter for Lane 5 -*/ -#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L5 425 + * NV Link Replay Error Counter for Lane 5 + */ +#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L5 425 /* -* NV Link Replay Error Counter total for all Lanes -*/ -#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL 429 + * NV Link Replay Error Counter total for all Lanes + */ +#define 
DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL 429 /* -* NV Link Recovery Error Counter for Lane 0 -*/ -#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L0 430 + * NV Link Recovery Error Counter for Lane 0 + */ +#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L0 430 /* -* NV Link Recovery Error Counter for Lane 1 -*/ -#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L1 431 + * NV Link Recovery Error Counter for Lane 1 + */ +#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L1 431 /* -* NV Link Recovery Error Counter for Lane 2 -*/ -#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L2 432 + * NV Link Recovery Error Counter for Lane 2 + */ +#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L2 432 /* -* NV Link Recovery Error Counter for Lane 3 -*/ -#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L3 433 + * NV Link Recovery Error Counter for Lane 3 + */ +#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L3 433 /* -* NV Link Recovery Error Counter for Lane 4 -*/ -#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L4 434 + * NV Link Recovery Error Counter for Lane 4 + */ +#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L4 434 /* -* NV Link Recovery Error Counter for Lane 5 -*/ -#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L5 435 + * NV Link Recovery Error Counter for Lane 5 + */ +#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L5 435 /* -* NV Link Recovery Error Counter total for all Lanes -*/ -#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL 439 + * NV Link Recovery Error Counter total for all Lanes + */ +#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL 439 /* -* NV Link Bandwidth Counter for Lane 0 -*/ -#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L0 440 + * NV Link Bandwidth Counter for Lane 0 - Not supported in DCGM 2.0 + */ +#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L0 440 /* -* NV Link Bandwidth Counter for Lane 1 -*/ -#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L1 441 + * NV Link Bandwidth Counter for Lane 1 - Not supported in DCGM 2.0 + */ +#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L1 441 
/* -* NV Link Bandwidth Counter for Lane 2 -*/ -#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L2 442 + * NV Link Bandwidth Counter for Lane 2 - Not supported in DCGM 2.0 + */ +#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L2 442 /* -* NV Link Bandwidth Counter for Lane 3 -*/ -#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L3 443 + * NV Link Bandwidth Counter for Lane 3 - Not supported in DCGM 2.0 + */ +#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L3 443 /* -* NV Link Bandwidth Counter for Lane 4 -*/ -#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L4 444 + * NV Link Bandwidth Counter for Lane 4 - Not supported in DCGM 2.0 + */ +#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L4 444 /* -* NV Link Bandwidth Counter for Lane 5 -*/ -#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L5 445 + * NV Link Bandwidth Counter for Lane 5 - Not supported in DCGM 2.0 + */ +#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L5 445 /* -* NV Link Bandwidth Counter total for all Lanes -*/ -#define DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL 449 + * NV Link Bandwidth Counter total for all Lanes + */ +#define DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL 449 /* -* GPU NVLink error information -*/ -#define DCGM_FI_DEV_GPU_NVLINK_ERRORS 450 + * GPU NVLink error information + */ +#define DCGM_FI_DEV_GPU_NVLINK_ERRORS 450 + +/* + * NV Link flow control CRC Error Counter for Lane 6 + */ +#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L6 451 + +/* + * NV Link flow control CRC Error Counter for Lane 7 + */ +#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L7 452 + +/* + * NV Link flow control CRC Error Counter for Lane 8 + */ +#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L8 453 + +/* + * NV Link flow control CRC Error Counter for Lane 9 + */ +#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L9 454 + +/* + * NV Link flow control CRC Error Counter for Lane 10 + */ +#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L10 455 + +/* + * NV Link flow control CRC Error Counter for Lane 11 + */ +#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L11 456 + +/* + * NV Link data CRC Error Counter for Lane 6 + 
*/ +#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L6 457 + +/* + * NV Link data CRC Error Counter for Lane 7 + */ +#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L7 458 + +/* + * NV Link data CRC Error Counter for Lane 8 + */ +#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L8 459 + +/* + * NV Link data CRC Error Counter for Lane 9 + */ +#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L9 460 + +/* + * NV Link data CRC Error Counter for Lane 10 + */ +#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L10 461 + +/* + * NV Link data CRC Error Counter for Lane 11 + */ +#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L11 462 + +/* + * NV Link Replay Error Counter for Lane 6 + */ +#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L6 463 + +/* + * NV Link Replay Error Counter for Lane 7 + */ +#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L7 464 + +/* + * NV Link Replay Error Counter for Lane 8 + */ +#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L8 465 + +/* + * NV Link Replay Error Counter for Lane 9 + */ +#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L9 466 + +/* + * NV Link Replay Error Counter for Lane 10 + */ +#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L10 467 + +/* + * NV Link Replay Error Counter for Lane 11 + */ +#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L11 468 + +/* + * NV Link Recovery Error Counter for Lane 6 + */ +#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L6 469 + +/* + * NV Link Recovery Error Counter for Lane 7 + */ +#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L7 470 + +/* + * NV Link Recovery Error Counter for Lane 8 + */ +#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L8 471 + +/* + * NV Link Recovery Error Counter for Lane 9 + */ +#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L9 472 + +/* + * NV Link Recovery Error Counter for Lane 10 + */ +#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L10 473 + +/* + * NV Link Recovery Error Counter for Lane 11 + */ +#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L11 474 /** - * Virtualization Mode 
corresponding to the GPU + * Virtualization Mode corresponding to the GPU. + * + * One of DCGM_GPU_VIRTUALIZATION_MODE_* constants. */ -#define DCGM_FI_DEV_VIRTUAL_MODE 500 +#define DCGM_FI_DEV_VIRTUAL_MODE 500 /** * Includes Count and Static info of vGPU types supported on a device */ -#define DCGM_FI_DEV_SUPPORTED_TYPE_INFO 501 +#define DCGM_FI_DEV_SUPPORTED_TYPE_INFO 501 /** * Includes Count and currently Creatable vGPU types on a device */ -#define DCGM_FI_DEV_CREATABLE_VGPU_TYPE_IDS 502 +#define DCGM_FI_DEV_CREATABLE_VGPU_TYPE_IDS 502 /** * Includes Count and currently Active vGPU Instances on a device */ -#define DCGM_FI_DEV_VGPU_INSTANCE_IDS 503 +#define DCGM_FI_DEV_VGPU_INSTANCE_IDS 503 /** * Utilization values for vGPUs running on the device */ -#define DCGM_FI_DEV_VGPU_UTILIZATIONS 504 +#define DCGM_FI_DEV_VGPU_UTILIZATIONS 504 /** * Utilization values for processes running within vGPU VMs using the device */ -#define DCGM_FI_DEV_VGPU_PER_PROCESS_UTILIZATION 505 +#define DCGM_FI_DEV_VGPU_PER_PROCESS_UTILIZATION 505 /** * Current encoder statistics for a given device */ -#define DCGM_FI_DEV_ENC_STATS 506 +#define DCGM_FI_DEV_ENC_STATS 506 /** * Statistics of current active frame buffer capture sessions on a given device */ -#define DCGM_FI_DEV_FBC_STATS 507 +#define DCGM_FI_DEV_FBC_STATS 507 /** * Information about active frame buffer capture sessions on a target device */ -#define DCGM_FI_DEV_FBC_SESSIONS_INFO 508 +#define DCGM_FI_DEV_FBC_SESSIONS_INFO 508 /** * VM ID of the vGPU instance */ -#define DCGM_FI_DEV_VGPU_VM_ID 520 +#define DCGM_FI_DEV_VGPU_VM_ID 520 /** * VM name of the vGPU instance */ -#define DCGM_FI_DEV_VGPU_VM_NAME 521 +#define DCGM_FI_DEV_VGPU_VM_NAME 521 /** * vGPU type of the vGPU instance */ -#define DCGM_FI_DEV_VGPU_TYPE 522 +#define DCGM_FI_DEV_VGPU_TYPE 522 /** * UUID of the vGPU instance */ -#define DCGM_FI_DEV_VGPU_UUID 523 +#define DCGM_FI_DEV_VGPU_UUID 523 /** * Driver version of the vGPU instance */ -#define 
DCGM_FI_DEV_VGPU_DRIVER_VERSION 524 +#define DCGM_FI_DEV_VGPU_DRIVER_VERSION 524 /** * Memory usage of the vGPU instance */ -#define DCGM_FI_DEV_VGPU_MEMORY_USAGE 525 +#define DCGM_FI_DEV_VGPU_MEMORY_USAGE 525 /** * License status of the vGPU instance */ -#define DCGM_FI_DEV_VGPU_LICENSE_STATUS 526 +#define DCGM_FI_DEV_VGPU_LICENSE_STATUS 526 /** * Frame rate limit of the vGPU instance */ -#define DCGM_FI_DEV_VGPU_FRAME_RATE_LIMIT 527 +#define DCGM_FI_DEV_VGPU_FRAME_RATE_LIMIT 527 /** * Current encoder statistics of the vGPU instance */ -#define DCGM_FI_DEV_VGPU_ENC_STATS 528 +#define DCGM_FI_DEV_VGPU_ENC_STATS 528 /** * Information about all active encoder sessions on the vGPU instance */ -#define DCGM_FI_DEV_VGPU_ENC_SESSIONS_INFO 529 +#define DCGM_FI_DEV_VGPU_ENC_SESSIONS_INFO 529 /** * Statistics of current active frame buffer capture sessions on the vGPU instance */ -#define DCGM_FI_DEV_VGPU_FBC_STATS 530 +#define DCGM_FI_DEV_VGPU_FBC_STATS 530 /** * Information about active frame buffer capture sessions on the vGPU instance */ -#define DCGM_FI_DEV_VGPU_FBC_SESSIONS_INFO 531 +#define DCGM_FI_DEV_VGPU_FBC_SESSIONS_INFO 531 /** * Starting field ID of the vGPU instance */ -#define DCGM_FI_FIRST_VGPU_FIELD_ID 520 +#define DCGM_FI_FIRST_VGPU_FIELD_ID 520 /** * Last field ID of the vGPU instance */ -#define DCGM_FI_LAST_VGPU_FIELD_ID 570 +#define DCGM_FI_LAST_VGPU_FIELD_ID 570 /** * For now max vGPU field Ids taken as difference of DCGM_FI_LAST_VGPU_FIELD_ID and DCGM_FI_LAST_VGPU_FIELD_ID i.e. 50 */ -#define DCGM_FI_MAX_VGPU_FIELDS DCGM_FI_LAST_VGPU_FIELD_ID - DCGM_FI_FIRST_VGPU_FIELD_ID +#define DCGM_FI_MAX_VGPU_FIELDS DCGM_FI_LAST_VGPU_FIELD_ID - DCGM_FI_FIRST_VGPU_FIELD_ID /** * Starting ID for all the internal fields */ -#define DCGM_FI_INTERNAL_FIELDS_0_START 600 +#define DCGM_FI_INTERNAL_FIELDS_0_START 600 /** * Last ID for all the internal fields */ /** -*

 

-*

 

-*

 

-*

NVSwitch entity field IDs start here.

-*

 

-*

 

-*

NVSwitch latency bins for port 0

-*/ - -#define DCGM_FI_INTERNAL_FIELDS_0_END 699 - - -/** -*

Low latency bin

-*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P00 700 -/** -* Medium latency bin -*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P00 701 -/** -* High latency bin -*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P00 702 -/** -* Max latency bin -*

 

-*

 

-*

NVSwitch latency bins for port 1

-*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P00 703 - -/** -*

Low latency bin

-*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P01 704 -/** -* Medium latency bin -*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P01 705 -/** -* High latency bin -*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P01 706 -/** -* Max latency bin -*

 

-*

 

-*

NVSwitch latency bins for port 2

-*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P01 707 - -/** -*

Low latency bin

-*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P02 708 -/** -* Medium latency bin -*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P02 709 -/** -* High latency bin -*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P02 710 -/** -* Max latency bin -*

 

-*

 

-*

NVSwitch latency bins for port 3

-*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P02 711 - -/** -*

Low latency bin

-*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P03 712 -/** -* Medium latency bin -*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P03 713 -/** -* High latency bin -*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P03 714 -/** -* Max latency bin -*

 

-*

 

-*

NVSwitch latency bins for port 4

-*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P03 715 - -/** -*

Low latency bin

-*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P04 716 -/** -* Medium latency bin -*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P04 717 -/** -* High latency bin -*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P04 718 -/** -* Max latency bin -*

 

-*

 

-*

NVSwitch latency bins for port 5

-*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P04 719 - -/** -*

Low latency bin

-*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P05 720 -/** -* Medium latency bin -*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P05 721 -/** -* High latency bin -*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P05 722 -/** -* Max latency bin -*

 

-*

 

-*

NVSwitch latency bins for port 6

-*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P05 723 - -/** -*

Low latency bin

-*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P06 724 -/** -* Medium latency bin -*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P06 725 -/** -* High latency bin -*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P06 726 -/** -* Max latency bin -*

 

-*

 

-*

NVSwitch latency bins for port 7

-*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P06 727 - -/** -*

Low latency bin

-*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P07 728 -/** -* Medium latency bin -*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P07 729 -/** -* High latency bin -*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P07 730 -/** -* Max latency bin -*

 

-*

 

-*

NVSwitch latency bins for port 8

-*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P07 731 - -/** -*

Low latency bin

-*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P08 732 -/** -* Medium latency bin -*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P08 733 -/** -* High latency bin -*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P08 734 -/** -* Max latency bin -*

 

-*

 

-*

NVSwitch latency bins for port 9

-*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P08 735 - -/** -*

Low latency bin

-*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P09 736 -/** -* Medium latency bin -*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P09 737 -/** -* High latency bin -*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P09 738 -/** -* Max latency bin -*

 

-*

 

-*

NVSwitch latency bins for port 10

-*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P09 739 - -/** -*

Low latency bin

-*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P10 740 -/** -* Medium latency bin -*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P10 741 -/** -* High latency bin -*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P10 742 -/** -* Max latency bin -*

 

-*

 

-*

NVSwitch latency bins for port 11

-*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P10 743 - -/** -*

Low latency bin

-*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P11 744 -/** -* Medium latency bin -*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P11 745 -/** -* High latency bin -*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P11 746 -/** -* Max latency bin -*

 

-*

 

-*

NVSwitch latency bins for port 12

-*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P11 747 - -/** -*

Low latency bin

-*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P12 748 -/** -* Medium latency bin -*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P12 749 -/** -* High latency bin -*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P12 750 -/** -* Max latency bin -*

 

-*

 

-*

NVSwitch latency bins for port 13

-*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P12 751 - -/** -*

Low latency bin

-*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P13 752 -/** -* Medium latency bin -*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P13 753 -/** -* High latency bin -*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P13 754 -/** -* Max latency bin -*

 

-*

 

-*

NVSwitch latency bins for port 14

-*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P13 755 - -/** -*

Low latency bin

-*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P14 756 -/** -* Medium latency bin -*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P14 757 -/** -* High latency bin -*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P14 758 -/** -* Max latency bin -*

 

-*

 

-*

NVSwitch latency bins for port 15

-*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P14 759 - -/** -*

Low latency bin

-*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P15 760 -/** -* Medium latency bin -*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P15 761 -/** -* High latency bin -*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P15 762 -/** -* Max latency bin -*

 

-*

 

-*

NVSwitch latency bins for port 16

-*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P15 763 - -/** -*

Low latency bin

-*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P16 764 -/** -* Medium latency bin -*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P16 765 -/** -* High latency bin -*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P16 766 -/** -* Max latency bin -*

 

-*

 

-*

NVSwitch latency bins for port 17

-*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P16 767 - -/** -*

Low latency bin

-*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P17 768 -/** -* Medium latency bin -*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P17 769 -/** -* High latency bin -*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P17 770 -/** -*

Max latency bin

-*

 

-*

 

-*

 

-*

NVSwitch Tx and Rx Counter 0 for each port

-*

By default, Counter 0 counts bytes.

-*/ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P17 771 - -/** -*

NVSwitch Tx Bandwidth Counter 0 for port 0

-*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P00 780 -/** -* NVSwitch Rx Bandwidth Counter 0 for port 0 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P00 781 - -/** -* NVSwitch Tx Bandwidth Counter 0 for port 1 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P01 782 -/** -* NVSwitch Rx Bandwidth Counter 0 for port 1 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P01 783 - -/** -* NVSwitch Tx Bandwidth Counter 0 for port 2 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P02 784 -/** -* NVSwitch Rx Bandwidth Counter 0 for port 2 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P02 785 - -/** -* NVSwitch Tx Bandwidth Counter 0 for port 3 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P03 786 + *

 

+ *

 

+ *

 

+ *

NVSwitch entity field IDs start here.

+ *

 

+ *

 

+ *

NVSwitch latency bins for port 0

+ */ + +#define DCGM_FI_INTERNAL_FIELDS_0_END 699 + + +/** + *

Low latency bin

+ */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P00 700 +/** + * Medium latency bin + */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P00 701 +/** + * High latency bin + */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P00 702 /** -* NVSwitch Rx Bandwidth Counter 0 for port 3 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P03 787 + * Max latency bin + *

 

+ *

 

+ *

NVSwitch latency bins for port 1

+ */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P00 703 /** -* NVSwitch Tx Bandwidth Counter 0 for port 4 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P04 788 + *

Low latency bin

+ */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P01 704 +/** + * Medium latency bin + */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P01 705 +/** + * High latency bin + */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P01 706 /** -* NVSwitch Rx Bandwidth Counter 0 for port 4 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P04 789 + * Max latency bin + *

 

+ *

 

+ *

NVSwitch latency bins for port 2

+ */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P01 707 /** -* NVSwitch Tx Bandwidth Counter 0 for port 5 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P05 790 + *

Low latency bin

+ */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P02 708 /** -* NVSwitch Rx Bandwidth Counter 0 for port 5 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P05 791 + * Medium latency bin + */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P02 709 +/** + * High latency bin + */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P02 710 +/** + * Max latency bin + *

 

+ *

 

+ *

NVSwitch latency bins for port 3

+ */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P02 711 /** -* NVSwitch Tx Bandwidth Counter 0 for port 6 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P06 792 + *

Low latency bin

+ */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P03 712 /** -* NVSwitch Rx Bandwidth Counter 0 for port 6 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P06 793 + * Medium latency bin + */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P03 713 +/** + * High latency bin + */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P03 714 +/** + * Max latency bin + *

 

+ *

 

+ *

NVSwitch latency bins for port 4

+ */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P03 715 + +/** + *

Low latency bin

+ */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P04 716 +/** + * Medium latency bin + */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P04 717 +/** + * High latency bin + */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P04 718 +/** + * Max latency bin + *

 

+ *

 

+ *

NVSwitch latency bins for port 5

+ */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P04 719 + +/** + *

Low latency bin

+ */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P05 720 +/** + * Medium latency bin + */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P05 721 +/** + * High latency bin + */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P05 722 +/** + * Max latency bin + *

 

+ *

 

+ *

NVSwitch latency bins for port 6

+ */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P05 723 + +/** + *

Low latency bin

+ */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P06 724 +/** + * Medium latency bin + */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P06 725 +/** + * High latency bin + */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P06 726 +/** + * Max latency bin + *

 

+ *

 

+ *

NVSwitch latency bins for port 7

+ */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P06 727 + +/** + *

Low latency bin

+ */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P07 728 +/** + * Medium latency bin + */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P07 729 +/** + * High latency bin + */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P07 730 +/** + * Max latency bin + *

 

+ *

 

+ *

NVSwitch latency bins for port 8

+ */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P07 731 + +/** + *

Low latency bin

+ */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P08 732 +/** + * Medium latency bin + */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P08 733 +/** + * High latency bin + */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P08 734 +/** + * Max latency bin + *

 

+ *

 

+ *

NVSwitch latency bins for port 9

+ */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P08 735 + +/** + *

Low latency bin

+ */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P09 736 +/** + * Medium latency bin + */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P09 737 +/** + * High latency bin + */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P09 738 +/** + * Max latency bin + *

 

+ *

 

+ *

NVSwitch latency bins for port 10

+ */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P09 739 + +/** + *

Low latency bin

+ */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P10 740 +/** + * Medium latency bin + */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P10 741 +/** + * High latency bin + */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P10 742 +/** + * Max latency bin + *

 

+ *

 

+ *

NVSwitch latency bins for port 11

+ */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P10 743 + +/** + *

Low latency bin

+ */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P11 744 +/** + * Medium latency bin + */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P11 745 +/** + * High latency bin + */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P11 746 +/** + * Max latency bin + *

 

+ *

 

+ *

NVSwitch latency bins for port 12

+ */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P11 747 + +/** + *

Low latency bin

+ */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P12 748 +/** + * Medium latency bin + */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P12 749 +/** + * High latency bin + */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P12 750 +/** + * Max latency bin + *

 

+ *

 

+ *

NVSwitch latency bins for port 13

+ */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P12 751 + +/** + *

Low latency bin

+ */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P13 752 +/** + * Medium latency bin + */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P13 753 +/** + * High latency bin + */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P13 754 +/** + * Max latency bin + *

 

+ *

 

+ *

NVSwitch latency bins for port 14

+ */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P13 755 + +/** + *

Low latency bin

+ */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P14 756 +/** + * Medium latency bin + */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P14 757 +/** + * High latency bin + */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P14 758 +/** + * Max latency bin + *

 

+ *

 

+ *

NVSwitch latency bins for port 15

+ */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P14 759 + +/** + *

Low latency bin

+ */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P15 760 +/** + * Medium latency bin + */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P15 761 +/** + * High latency bin + */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P15 762 +/** + * Max latency bin + *

 

+ *

 

+ *

NVSwitch latency bins for port 16

+ */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P15 763 + +/** + *

Low latency bin

+ */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P16 764 +/** + * Medium latency bin + */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P16 765 +/** + * High latency bin + */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P16 766 +/** + * Max latency bin + *

 

+ *

 

+ *

NVSwitch latency bins for port 17

+ */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P16 767 /** -* NVSwitch Tx Bandwidth Counter 0 for port 7 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P07 794 + *

Low latency bin

+ */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P17 768 +/** + * Medium latency bin + */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P17 769 +/** + * High latency bin + */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P17 770 /** -* NVSwitch Rx Bandwidth Counter 0 for port 7 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P07 795 + *

Max latency bin

+ *

 

+ *

 

+ *

 

+ *

NVSwitch Tx and Rx Counter 0 for each port

+ *

By default, Counter 0 counts bytes.

+ */ +#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P17 771 /** -* NVSwitch Tx Bandwidth Counter 0 for port 8 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P08 796 + *

NVSwitch Tx Bandwidth Counter 0 for port 0

+ */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P00 780 /** -* NVSwitch Rx Bandwidth Counter 0 for port 8 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P08 797 + * NVSwitch Rx Bandwidth Counter 0 for port 0 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P00 781 /** -* NVSwitch Tx Bandwidth Counter 0 for port 9 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P09 798 + * NVSwitch Tx Bandwidth Counter 0 for port 1 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P01 782 /** -* NVSwitch Rx Bandwidth Counter 0 for port 9 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P09 799 + * NVSwitch Rx Bandwidth Counter 0 for port 1 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P01 783 /** -* NVSwitch Tx Bandwidth Counter 0 for port 10 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P10 800 + * NVSwitch Tx Bandwidth Counter 0 for port 2 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P02 784 /** -* NVSwitch Rx Bandwidth Counter 0 for port 10 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P10 801 + * NVSwitch Rx Bandwidth Counter 0 for port 2 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P02 785 /** -* NVSwitch Tx Bandwidth Counter 0 for port 11 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P11 802 + * NVSwitch Tx Bandwidth Counter 0 for port 3 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P03 786 /** -* NVSwitch Rx Bandwidth Counter 0 for port 11 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P11 803 - + * NVSwitch Rx Bandwidth Counter 0 for port 3 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P03 787 + /** -* NVSwitch Tx Bandwidth Counter 0 for port 12 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P12 804 + * NVSwitch Tx Bandwidth Counter 0 for port 4 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P04 788 /** -* NVSwitch Rx Bandwidth Counter 0 for port 12 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P12 805 + * NVSwitch Rx Bandwidth Counter 0 for port 4 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P04 789 /** -* NVSwitch Tx 
Bandwidth Counter 0 for port 13 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P13 806 + * NVSwitch Tx Bandwidth Counter 0 for port 5 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P05 790 /** -* NVSwitch Rx Bandwidth Counter 0 for port 13 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P13 807 + * NVSwitch Rx Bandwidth Counter 0 for port 5 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P05 791 /** -* NVSwitch Tx Bandwidth Counter 0 for port 14 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P14 808 + * NVSwitch Tx Bandwidth Counter 0 for port 6 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P06 792 /** -* NVSwitch Rx Bandwidth Counter 0 for port 14 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P14 809 + * NVSwitch Rx Bandwidth Counter 0 for port 6 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P06 793 /** -* NVSwitch Tx Bandwidth Counter 0 for port 15 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P15 810 + * NVSwitch Tx Bandwidth Counter 0 for port 7 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P07 794 /** -* NVSwitch Rx Bandwidth Counter 0 for port 15 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P15 811 + * NVSwitch Rx Bandwidth Counter 0 for port 7 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P07 795 /** -* NVSwitch Tx Bandwidth Counter 0 for port 16 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P16 812 + * NVSwitch Tx Bandwidth Counter 0 for port 8 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P08 796 /** -* NVSwitch Rx Bandwidth Counter 0 for port 16 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P16 813 + * NVSwitch Rx Bandwidth Counter 0 for port 8 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P08 797 /** -* NVSwitch Tx Bandwidth Counter 0 for port 17 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P17 814 + * NVSwitch Tx Bandwidth Counter 0 for port 9 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P09 798 /** -*

NVSwitch Rx Bandwidth Counter 0 for port 17

-*

 

-*

 

-*

 

-*

NVSwitch Tx and RX Bandwidth Counter 1 for each port

-*

By default, Counter 1 counts packets.

-*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P17 815 + * NVSwitch Rx Bandwidth Counter 0 for port 9 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P09 799 /** -*

NVSwitch Tx Bandwidth Counter 1 for port 0

-*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P00 820 + * NVSwitch Tx Bandwidth Counter 0 for port 10 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P10 800 /** -* NVSwitch Rx Bandwidth Counter 1 for port 0 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P00 821 + * NVSwitch Rx Bandwidth Counter 0 for port 10 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P10 801 /** -* NVSwitch Tx Bandwidth Counter 1 for port 1 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P01 822 + * NVSwitch Tx Bandwidth Counter 0 for port 11 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P11 802 /** -* NVSwitch Rx Bandwidth Counter 1 for port 1 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P01 823 + * NVSwitch Rx Bandwidth Counter 0 for port 11 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P11 803 /** -* NVSwitch Tx Bandwidth Counter 1 for port 2 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P02 824 + * NVSwitch Tx Bandwidth Counter 0 for port 12 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P12 804 /** -* NVSwitch Rx Bandwidth Counter 1 for port 2 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P02 825 + * NVSwitch Rx Bandwidth Counter 0 for port 12 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P12 805 /** -* NVSwitch Tx Bandwidth Counter 1 for port 3 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P03 826 + * NVSwitch Tx Bandwidth Counter 0 for port 13 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P13 806 /** -* NVSwitch Rx Bandwidth Counter 1 for port 3 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P03 827 + * NVSwitch Rx Bandwidth Counter 0 for port 13 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P13 807 /** -* NVSwitch Tx Bandwidth Counter 1 for port 4 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P04 828 + * NVSwitch Tx Bandwidth Counter 0 for port 14 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P14 808 /** -* NVSwitch Rx Bandwidth Counter 1 for port 4 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P04 829 + * NVSwitch Rx Bandwidth 
Counter 0 for port 14 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P14 809 /** -* NVSwitch Tx Bandwidth Counter 1 for port 5 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P05 830 + * NVSwitch Tx Bandwidth Counter 0 for port 15 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P15 810 /** -* NVSwitch Rx Bandwidth Counter 1 for port 5 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P05 831 + * NVSwitch Rx Bandwidth Counter 0 for port 15 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P15 811 /** -* NVSwitch Tx Bandwidth Counter 1 for port 6 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P06 832 + * NVSwitch Tx Bandwidth Counter 0 for port 16 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P16 812 /** -* NVSwitch Rx Bandwidth Counter 1 for port 6 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P06 833 + * NVSwitch Rx Bandwidth Counter 0 for port 16 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P16 813 /** -* NVSwitch Tx Bandwidth Counter 1 for port 7 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P07 834 + * NVSwitch Tx Bandwidth Counter 0 for port 17 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P17 814 /** -* NVSwitch Rx Bandwidth Counter 1 for port 7 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P07 835 + *

NVSwitch Rx Bandwidth Counter 0 for port 17

+ *

 

+ *

 

+ *

 

+ *

NVSwitch Tx and Rx Bandwidth Counter 1 for each port

+ *

By default, Counter 1 counts packets.

+ */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P17 815 /** -* NVSwitch Tx Bandwidth Counter 1 for port 8 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P08 836 + *

NVSwitch Tx Bandwidth Counter 1 for port 0

+ */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P00 820 /** -* NVSwitch Rx Bandwidth Counter 1 for port 8 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P08 837 + * NVSwitch Rx Bandwidth Counter 1 for port 0 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P00 821 /** -* NVSwitch Tx Bandwidth Counter 1 for port 9 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P09 838 + * NVSwitch Tx Bandwidth Counter 1 for port 1 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P01 822 /** -* NVSwitch Rx Bandwidth Counter 1 for port 9 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P09 839 + * NVSwitch Rx Bandwidth Counter 1 for port 1 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P01 823 /** -* NVSwitch Tx Bandwidth Counter 0 for port 10 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P10 840 + * NVSwitch Tx Bandwidth Counter 1 for port 2 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P02 824 /** -* NVSwitch Rx Bandwidth Counter 1 for port 10 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P10 841 + * NVSwitch Rx Bandwidth Counter 1 for port 2 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P02 825 /** -* NVSwitch Tx Bandwidth Counter 1 for port 11 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P11 842 + * NVSwitch Tx Bandwidth Counter 1 for port 3 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P03 826 /** -* NVSwitch Rx Bandwidth Counter 1 for port 11 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P11 843 + * NVSwitch Rx Bandwidth Counter 1 for port 3 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P03 827 /** -* NVSwitch Tx Bandwidth Counter 1 for port 12 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P12 844 + * NVSwitch Tx Bandwidth Counter 1 for port 4 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P04 828 /** -* NVSwitch Rx Bandwidth Counter 1 for port 12 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P12 845 + * NVSwitch Rx Bandwidth Counter 1 for port 4 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P04 829 /** -* NVSwitch Tx 
Bandwidth Counter 0 for port 13 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P13 846 + * NVSwitch Tx Bandwidth Counter 1 for port 5 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P05 830 /** -* NVSwitch Rx Bandwidth Counter 1 for port 13 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P13 847 + * NVSwitch Rx Bandwidth Counter 1 for port 5 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P05 831 /** -* NVSwitch Tx Bandwidth Counter 1 for port 14 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P14 848 + * NVSwitch Tx Bandwidth Counter 1 for port 6 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P06 832 /** -* NVSwitch Rx Bandwidth Counter 1 for port 14 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P14 849 + * NVSwitch Rx Bandwidth Counter 1 for port 6 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P06 833 /** -* NVSwitch Tx Bandwidth Counter 1 for port 15 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P15 850 + * NVSwitch Tx Bandwidth Counter 1 for port 7 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P07 834 /** -* NVSwitch Rx Bandwidth Counter 1 for port 15 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P15 851 + * NVSwitch Rx Bandwidth Counter 1 for port 7 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P07 835 /** -* NVSwitch Tx Bandwidth Counter 1 for port 16 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P16 852 + * NVSwitch Tx Bandwidth Counter 1 for port 8 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P08 836 /** -* NVSwitch Rx Bandwidth Counter 1 for port 16 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P16 853 + * NVSwitch Rx Bandwidth Counter 1 for port 8 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P08 837 /** -* NVSwitch Tx Bandwidth Counter 1 for port 17 -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P17 854 + * NVSwitch Tx Bandwidth Counter 1 for port 9 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P09 838 +/** + * NVSwitch Rx Bandwidth Counter 1 for port 9 + */ +#define 
DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P09 839 + +/** + * NVSwitch Tx Bandwidth Counter 1 for port 10 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P10 840 +/** + * NVSwitch Rx Bandwidth Counter 1 for port 10 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P10 841 + +/** + * NVSwitch Tx Bandwidth Counter 1 for port 11 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P11 842 +/** + * NVSwitch Rx Bandwidth Counter 1 for port 11 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P11 843 + +/** + * NVSwitch Tx Bandwidth Counter 1 for port 12 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P12 844 +/** + * NVSwitch Rx Bandwidth Counter 1 for port 12 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P12 845 + +/** + * NVSwitch Tx Bandwidth Counter 1 for port 13 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P13 846 +/** + * NVSwitch Rx Bandwidth Counter 1 for port 13 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P13 847 + +/** + * NVSwitch Tx Bandwidth Counter 1 for port 14 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P14 848 +/** + * NVSwitch Rx Bandwidth Counter 1 for port 14 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P14 849 + +/** + * NVSwitch Tx Bandwidth Counter 1 for port 15 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P15 850 /** -* NVSwitch Rx Bandwidth Counter 1 for port 17 -*

 

-*

 

-*

 

-* NVSwitch error counters -*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P17 855 + * NVSwitch Rx Bandwidth Counter 1 for port 15 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P15 851 /** -* NVSwitch fatal error information. -* Note: value field indicates the specific SXid reported -*/ -#define DCGM_FI_DEV_NVSWITCH_FATAL_ERRORS 856 + * NVSwitch Tx Bandwidth Counter 1 for port 16 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P16 852 +/** + * NVSwitch Rx Bandwidth Counter 1 for port 16 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P16 853 /** -* NVSwitch non fatal error information. -* Note: value field indicates the specific SXid reported -*/ -#define DCGM_FI_DEV_NVSWITCH_NON_FATAL_ERRORS 857 + * NVSwitch Tx Bandwidth Counter 1 for port 17 + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P17 854 +/** + * NVSwitch Rx Bandwidth Counter 1 for port 17 + *

 

+ *

 

+ *

 

+ * NVSwitch error counters + */ +#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P17 855 + +/** + * NVSwitch fatal error information. + * Note: value field indicates the specific SXid reported + */ +#define DCGM_FI_DEV_NVSWITCH_FATAL_ERRORS 856 + +/** + * NVSwitch non fatal error information. + * Note: value field indicates the specific SXid reported + */ +#define DCGM_FI_DEV_NVSWITCH_NON_FATAL_ERRORS 857 /** * Starting field ID of the NVSwitch instance */ -#define DCGM_FI_FIRST_NVSWITCH_FIELD_ID 700 +#define DCGM_FI_FIRST_NVSWITCH_FIELD_ID 700 /** * Last field ID of the NVSwitch instance */ -#define DCGM_FI_LAST_NVSWITCH_FIELD_ID 860 +#define DCGM_FI_LAST_NVSWITCH_FIELD_ID 860 /** - * For now max NVSwitch field Ids taken as difference of DCGM_FI_LAST_NVSWITCH_FIELD_ID and DCGM_FI_FIRST_NVSWITCH_FIELD_ID + 1 i.e. 200 + * For now max NVSwitch field Ids taken as difference of DCGM_FI_LAST_NVSWITCH_FIELD_ID and + * DCGM_FI_FIRST_NVSWITCH_FIELD_ID + 1 i.e. 161 */ -#define DCGM_FI_MAX_NVSWITCH_FIELDS DCGM_FI_LAST_NVSWITCH_FIELD_ID - DCGM_FI_FIRST_NVSWITCH_FIELD_ID + 1 +#define DCGM_FI_MAX_NVSWITCH_FIELDS DCGM_FI_LAST_NVSWITCH_FIELD_ID - DCGM_FI_FIRST_NVSWITCH_FIELD_ID + 1 /** * Profiling Fields. These all start with DCGM_FI_PROF_* */ /** - * Ratio of time the graphics engine is active. The graphics engine is - * active if a graphics/compute context is bound and the graphics pipe or + * Ratio of time the graphics engine is active. The graphics engine is + * active if a graphics/compute context is bound and the graphics pipe or * compute pipe is busy.
*/ -#define DCGM_FI_PROF_GR_ENGINE_ACTIVE 1001 +#define DCGM_FI_PROF_GR_ENGINE_ACTIVE 1001 /** - * The ratio of cycles an SM has at least 1 warp assigned - * (computed from the number of cycles and elapsed cycles) + * The ratio of cycles an SM has at least 1 warp assigned + * (computed from the number of cycles and elapsed cycles) */ -#define DCGM_FI_PROF_SM_ACTIVE 1002 +#define DCGM_FI_PROF_SM_ACTIVE 1002 /** - * The ratio of number of warps resident on an SM. - * (number of resident as a ratio of the theoretical + * The ratio of number of warps resident on an SM. + * (number of resident as a ratio of the theoretical * maximum number of warps per elapsed cycle) */ -#define DCGM_FI_PROF_SM_OCCUPANCY 1003 +#define DCGM_FI_PROF_SM_OCCUPANCY 1003 /** - * The ratio of cycles the tensor (HMMA) pipe is active + * The ratio of cycles the tensor (HMMA) pipe is active * (off the peak sustained elapsed cycles) */ -#define DCGM_FI_PROF_PIPE_TENSOR_ACTIVE 1004 +#define DCGM_FI_PROF_PIPE_TENSOR_ACTIVE 1004 /** - * The ratio of cycles the device memory interface is + * The ratio of cycles the device memory interface is * active sending or receiving data. */ -#define DCGM_FI_PROF_DRAM_ACTIVE 1005 +#define DCGM_FI_PROF_DRAM_ACTIVE 1005 /** * Ratio of cycles the fp64 pipe is active. */ -#define DCGM_FI_PROF_PIPE_FP64_ACTIVE 1006 +#define DCGM_FI_PROF_PIPE_FP64_ACTIVE 1006 /** * Ratio of cycles the fp32 pipe is active. */ -#define DCGM_FI_PROF_PIPE_FP32_ACTIVE 1007 +#define DCGM_FI_PROF_PIPE_FP32_ACTIVE 1007 /** * Ratio of cycles the fp16 pipe is active. This does not include HMMA. */ -#define DCGM_FI_PROF_PIPE_FP16_ACTIVE 1008 +#define DCGM_FI_PROF_PIPE_FP16_ACTIVE 1008 /** * The number of bytes of active PCIe tx (transmit) data including both header and payload. - * + * * Note that this is from the perspective of the GPU, so copying data from device to host (DtoH) * would be reflected in this metric. 
*/ -#define DCGM_FI_PROF_PCIE_TX_BYTES 1009 +#define DCGM_FI_PROF_PCIE_TX_BYTES 1009 /** * The number of bytes of active PCIe rx (read) data including both header and payload. - * + * * Note that this is from the perspective of the GPU, so copying data from host to device (HtoD) * would be reflected in this metric. */ -#define DCGM_FI_PROF_PCIE_RX_BYTES 1010 +#define DCGM_FI_PROF_PCIE_RX_BYTES 1010 /** * The number of bytes of active NvLink tx (transmit) data including both header and payload. */ -#define DCGM_FI_PROF_NVLINK_TX_BYTES 1011 +#define DCGM_FI_PROF_NVLINK_TX_BYTES 1011 /** * The number of bytes of active NvLink rx (read) data including both header and payload. */ -#define DCGM_FI_PROF_NVLINK_RX_BYTES 1012 +#define DCGM_FI_PROF_NVLINK_RX_BYTES 1012 /** * 1 greater than maximum fields above. This is the 1 greater than the maximum field id that could be allocated */ -#define DCGM_FI_MAX_FIELDS 1013 +#define DCGM_FI_MAX_FIELDS 1013 /** @} */ @@ -1903,11 +2099,11 @@ typedef unsigned int dcgm_field_eid_t; */ typedef struct { - char shortName[10]; /* Short name corresponding to field. This short name - is used to identify columns in dmon output.*/ - char unit[4]; /* The unit of value. Eg: C(elsius), W(att), MB/s*/ - short width; /* Maximum width/number of digits that a value for field can have.*/ -} dcgm_field_output_format_t,*dcgm_field_output_format_p; + char shortName[10]; /*!< Short name corresponding to field. This short name is used to identify columns in dmon + output.*/ + char unit[4]; /*!< The unit of value. Eg: C(elsius), W(att), MB/s*/ + short width; /*!< Maximum width/number of digits that a value for field can have.*/ +} dcgm_field_output_format_t, *dcgm_field_output_format_p; /** * Structure to store meta data for the field @@ -1915,15 +2111,18 @@ typedef struct typedef struct { - unsigned short fieldId; /* Field identifier. DCGM_FI_? #define */ - char fieldType; /* Field type. DCGM_FT_? 
#define */ - unsigned char size; /* field size in bytes (raw value size). 0=variable (like DCGM_FT_STRING) */ - char tag[48]; /* Tag for this field for serialization like 'device_temperature' */ - int scope; /* Field scope. DCGM_FS_? #define of this field's association */ - int nvmlFieldId; /* Optional NVML field this DCGM field maps to. 0 = no mapping. Otherwise, - this should be a NVML_FI_? #define from nvml.h */ - - dcgm_field_output_format_p valueFormat; /* pointer to the structure that holds the formatting the values for fields */ + unsigned short fieldId; /*!< Field identifier. DCGM_FI_? #define */ + char fieldType; /*!< Field type. DCGM_FT_? #define */ + unsigned char size; /*!< field size in bytes (raw value size). 0=variable (like DCGM_FT_STRING) */ + char tag[48]; /*!< Tag for this field for serialization like 'device_temperature' */ + int scope; /*!< Field scope. DCGM_FS_? #define of this field's association */ + int nvmlFieldId; /*!< Optional NVML field this DCGM field maps to. 0 = no mapping. + Otherwise, this should be a NVML_FI_? #define from nvml.h */ + dcgm_field_entity_group_t + entityLevel; /*!< Field entity level. DCGM_FE_? specifying at what level the field is queryable */ + + dcgm_field_output_format_p valueFormat; /*!< pointer to the structure that holds the formatting the + values for fields */ } dcgm_field_meta_t, *dcgm_field_meta_p; /***************************************************************************************************/ @@ -1934,48 +2133,60 @@ typedef struct /** * Get a pointer to the metadata for a field by its field ID. See DCGM_FI_? for a list of field IDs. - * @param fieldId IN: One of the field IDs (DCGM_FI_?) + * + * @param fieldId IN: One of the field IDs (DCGM_FI_?) + * * @return - * 0 On Failure - * > 0 Pointer to field metadata structure if found. + * 0 On Failure + * >0 Pointer to field metadata structure if found. 
+ * */ dcgm_field_meta_p DcgmFieldGetById(unsigned short fieldId); /** * Get a pointer to the metadata for a field by its field tag. + * * @param tag IN: Tag for the field of interest + * * @return - * 0 On failure or not found - * > 0 Pointer to field metadata structure if found + * 0 On failure or not found + * >0 Pointer to field metadata structure if found + * */ dcgm_field_meta_p DcgmFieldGetByTag(char *tag); /** * Initialize the DcgmFields module. Call this once from inside * your program - * @return - * 0 On success - * <0 On error + * + * @return + * 0 On success + * <0 On error + * */ int DcgmFieldsInit(void); /** * Terminates the DcgmFields module. Call this once from inside your program - * @return - * 0 On success - * <0 On error + * + * @return + * 0 On success + * <0 On error + * */ int DcgmFieldsTerm(void); /** * Get the string version of a entityGroupId * - * Returns Pointer to a string like GPU/NvSwitch..etc - * Null on error + * @returns + * - Pointer to a string like GPU/NvSwitch..etc + * - Null on error + * */ char *DcgmFieldsGetEntityGroupString(dcgm_field_entity_group_t entityGroupId); -/** @} */ +/** @} */ #ifdef __cplusplus @@ -1983,4 +2194,4 @@ char *DcgmFieldsGetEntityGroupString(dcgm_field_entity_group_t entityGroupId); #endif -#endif //DCGMFIELDS_H +#endif // DCGMFIELDS_H diff --git a/bindings/go/dcgm/dcgm_structs.h b/bindings/go/dcgm/dcgm_structs.h index a882ce1..501de36 100644 --- a/bindings/go/dcgm/dcgm_structs.h +++ b/bindings/go/dcgm/dcgm_structs.h @@ -16,40 +16,41 @@ #ifndef DCGM_STRUCTS_H #define DCGM_STRUCTS_H -#ifdef __cplusplus -extern "C" { -#endif - -#include "dcgm_fields.h" +#include "dcgm_fields.h" #include + /***************************************************************************************************/ -/** @defgroup nvmlReturnEnums Enums and Macros +/** @defgroup dcgmReturnEnums Enums and Macros * @{ */ -/***************************************************************************************************/ 
+/***************************************************************************************************/ + +/** + * Creates a unique version number for each struct + */ +#define MAKE_DCGM_VERSION(typeName, ver) (unsigned int)(sizeof(typeName) | ((unsigned long)(ver) << 24U)) /** * Represents value of the field which can be returned by Host Engine in case the * operation is not successful - * */ #ifndef DCGM_BLANK_VALUES #define DCGM_BLANK_VALUES - + /** - * Base value for 32 bits integer blank. can be used as an unspecified blank + * Base value for 32 bits integer blank. can be used as an unspecified blank */ #define DCGM_INT32_BLANK 0x7ffffff0 - + /** - * Base value for 64 bits integer blank. can be used as an unspecified blank + * Base value for 64 bits integer blank. can be used as an unspecified blank */ #define DCGM_INT64_BLANK 0x7ffffffffffffff0 /** * Base value for double blank. 2 ** 47. FP 64 has 52 bits of mantissa, - * so 47 bits can still increment by 1 and represent each value from 0-15 + * so 47 bits can still increment by 1 and represent each value from 0-15 */ #define DCGM_FP64_BLANK 140737488355328.0 @@ -58,201 +59,192 @@ extern "C" { */ #define DCGM_STR_BLANK "<<>>" -/** - * Represents an error where INT32 data was not found - */ -#define DCGM_INT32_NOT_FOUND (DCGM_INT32_BLANK+1) - -/** - * Represents an error where INT64 data was not found - */ -#define DCGM_INT64_NOT_FOUND (DCGM_INT64_BLANK+1) - -/** - * Represents an error where FP64 data was not found - */ -#define DCGM_FP64_NOT_FOUND (DCGM_FP64_BLANK+1.0) - -/** - * Represents an error where STR data was not found - */ -#define DCGM_STR_NOT_FOUND "<<>>" - -/** - * Represents an error where fetching the INT32 value is not supported - */ -#define DCGM_INT32_NOT_SUPPORTED (DCGM_INT32_BLANK+2) - -/** - * Represents an error where fetching the INT64 value is not supported - */ -#define DCGM_INT64_NOT_SUPPORTED (DCGM_INT64_BLANK+2) - -/** - * Represents an error where fetching the FP64 value is not 
supported - */ -#define DCGM_FP64_NOT_SUPPORTED (DCGM_FP64_BLANK+2.0) - -/** - * Represents an error where fetching the STR value is not supported - */ -#define DCGM_STR_NOT_SUPPORTED "<<>>" - -/** - * Represents and error where fetching the INT32 value is not allowed with our current credentials - */ -#define DCGM_INT32_NOT_PERMISSIONED (DCGM_INT32_BLANK+3) - -/** - * Represents and error where fetching the INT64 value is not allowed with our current credentials - */ -#define DCGM_INT64_NOT_PERMISSIONED (DCGM_INT64_BLANK+3) - -/** - * Represents and error where fetching the FP64 value is not allowed with our current credentials - */ -#define DCGM_FP64_NOT_PERMISSIONED (DCGM_FP64_BLANK+3.0) - -/** - * Represents and error where fetching the STR value is not allowed with our current credentials - */ -#define DCGM_STR_NOT_PERMISSIONED "<<>>" - -/** - * Macro to check if a INT32 value is blank or not +/** + * Represents an error where INT32 data was not found + */ +#define DCGM_INT32_NOT_FOUND (DCGM_INT32_BLANK + 1) + +/** + * Represents an error where INT64 data was not found + */ +#define DCGM_INT64_NOT_FOUND (DCGM_INT64_BLANK + 1) + +/** + * Represents an error where FP64 data was not found + */ +#define DCGM_FP64_NOT_FOUND (DCGM_FP64_BLANK + 1.0) + +/** + * Represents an error where STR data was not found + */ +#define DCGM_STR_NOT_FOUND "<<>>" + +/** + * Represents an error where fetching the INT32 value is not supported + */ +#define DCGM_INT32_NOT_SUPPORTED (DCGM_INT32_BLANK + 2) + +/** + * Represents an error where fetching the INT64 value is not supported + */ +#define DCGM_INT64_NOT_SUPPORTED (DCGM_INT64_BLANK + 2) + +/** + * Represents an error where fetching the FP64 value is not supported + */ +#define DCGM_FP64_NOT_SUPPORTED (DCGM_FP64_BLANK + 2.0) + +/** + * Represents an error where fetching the STR value is not supported + */ +#define DCGM_STR_NOT_SUPPORTED "<<>>" + +/** + * Represents and error where fetching the INT32 value is not allowed with our 
current credentials + */ +#define DCGM_INT32_NOT_PERMISSIONED (DCGM_INT32_BLANK + 3) + +/** + * Represents and error where fetching the INT64 value is not allowed with our current credentials + */ +#define DCGM_INT64_NOT_PERMISSIONED (DCGM_INT64_BLANK + 3) + +/** + * Represents and error where fetching the FP64 value is not allowed with our current credentials + */ +#define DCGM_FP64_NOT_PERMISSIONED (DCGM_FP64_BLANK + 3.0) + +/** + * Represents and error where fetching the STR value is not allowed with our current credentials + */ +#define DCGM_STR_NOT_PERMISSIONED "<<>>" + +/** + * Macro to check if a INT32 value is blank or not */ #define DCGM_INT32_IS_BLANK(val) (((val) >= DCGM_INT32_BLANK) ? 1 : 0) - -/** - * Macro to check if a INT64 value is blank or not - */ + +/** + * Macro to check if a INT64 value is blank or not + */ #define DCGM_INT64_IS_BLANK(val) (((val) >= DCGM_INT64_BLANK) ? 1 : 0) - -/** - * Macro to check if a FP64 value is blank or not - */ + +/** + * Macro to check if a FP64 value is blank or not + */ #define DCGM_FP64_IS_BLANK(val) (((val) >= DCGM_FP64_BLANK ? 1 : 0)) - -/** - * Macro to check if a STR value is blank or not + +/** + * Macro to check if a STR value is blank or not * Works on (char *). Looks for <<< at first position and >>> inside string - */ + */ #define DCGM_STR_IS_BLANK(val) (val == strstr(val, "<<<") && strstr(val, ">>>")) -#endif //DCGM_BLANK_VALUES +#endif // DCGM_BLANK_VALUES /** * Max number of GPUs supported by DCGM - */ -#define DCGM_MAX_NUM_DEVICES 16 + */ +#define DCGM_MAX_NUM_DEVICES 32 /* DCGM 2.0 and newer = 32. DCGM 1.8 and older = 16. 
*/ /** * Number of NvLink links per GPU supported by DCGM - * This is 6 for Volta and 4 for Pascal + * This is 12 for Ampere, 6 for Volta, and 4 for Pascal + */ +#define DCGM_NVLINK_MAX_LINKS_PER_GPU 12 + +/** + * Maximum NvLink links pre-Ampere */ -#define DCGM_NVLINK_MAX_LINKS_PER_GPU 6 +#define DCGM_NVLINK_MAX_LINKS_PER_GPU_LEGACY1 6 /** - * Max number of NvSwitches supported by DCGM + * Max number of NvSwitches supported by DCGM **/ #define DCGM_MAX_NUM_SWITCHES 12 /** * Number of NvLink links per NvSwitch supported by DCGM */ -#define DCGM_NVLINK_MAX_LINKS_PER_NVSWITCH 18 +#define DCGM_NVLINK_MAX_LINKS_PER_NVSWITCH 36 /** * Maximum number of vGPU instances per physical GPU */ #define DCGM_MAX_VGPU_INSTANCES_PER_PGPU 32 -/** - * Max number of vGPUs supported on DCGM - */ -#define DCGM_MAX_NUM_VGPU_DEVICES DCGM_MAX_NUM_DEVICES * DCGM_MAX_VGPU_INSTANCES_PER_PGPU - /** * Max length of the DCGM string field */ -#define DCGM_MAX_STR_LENGTH 256 +#define DCGM_MAX_STR_LENGTH 256 /** * Max number of clocks supported for a device */ -#define DCGM_MAX_CLOCKS 256 +#define DCGM_MAX_CLOCKS 256 /** * Max limit on the number of groups supported by DCGM */ -#define DCGM_MAX_NUM_GROUPS 64 +#define DCGM_MAX_NUM_GROUPS 64 /** * Max number of active FBC sessions */ -#define DCGM_MAX_FBC_SESSIONS 256 - +#define DCGM_MAX_FBC_SESSIONS 256 /** - * Represents the size of a buffer that holds a vGPU type Name or vGPU class type or name of process running on vGPU instance. + * Represents the size of a buffer that holds a vGPU type Name or vGPU class type or name of process running on vGPU + * instance. 
*/ -#define DCGM_VGPU_NAME_BUFFER_SIZE 64 +#define DCGM_VGPU_NAME_BUFFER_SIZE 64 /** * Represents the size of a buffer that holds a vGPU license string */ -#define DCGM_GRID_LICENSE_BUFFER_SIZE 128 +#define DCGM_GRID_LICENSE_BUFFER_SIZE 128 /** * Default compute mode -- multiple contexts per device */ -#define DCGM_CONFIG_COMPUTEMODE_DEFAULT 0 - +#define DCGM_CONFIG_COMPUTEMODE_DEFAULT 0 + /** * Compute-prohibited mode -- no contexts per device */ -#define DCGM_CONFIG_COMPUTEMODE_PROHIBITED 1 - +#define DCGM_CONFIG_COMPUTEMODE_PROHIBITED 1 + /** - * Compute-exclusive-process mode -- only one context per device, usable from multiple threads at - * a time + * Compute-exclusive-process mode -- only one context per device, usable from multiple threads at a time */ -#define DCGM_CONFIG_COMPUTEMODE_EXCLUSIVE_PROCESS 2 - +#define DCGM_CONFIG_COMPUTEMODE_EXCLUSIVE_PROCESS 2 /** * Default Port Number for DCGM Host Engine */ #define DCGM_HE_PORT_NUMBER 5555 - -/** - * Creates a unique version number for each struct - */ -#define MAKE_DCGM_VERSION(typeName,ver) (unsigned int)(sizeof(typeName) | ((ver)<<24)) - -/***************************************************************************************************/ - - - +#ifdef __cplusplus +extern "C" { +#endif /** * Operation mode for DCGM - * - * DCGM can run in auto-mode where it runs additional threads in the background to collect + * + * DCGM can run in auto-mode where it runs additional threads in the background to collect * any metrics of interest and auto manages any operations needed for policy management. - * + * * DCGM can also operate in manual-mode where it's execution is controlled by the user. In * this mode, the user has to periodically call APIs such as \ref dcgmPolicyTrigger and * \ref dcgmUpdateAllFields which tells DCGM to wake up and perform data collection and * operations needed for policy management. 
*/ -typedef enum dcgmOperationMode_enum +typedef enum dcgmOperationMode_enum { DCGM_OPERATION_MODE_AUTO = 1, DCGM_OPERATION_MODE_MANUAL = 2 } dcgmOperationMode_t; - + /** * When more than one value is returned from a query, which order should it be returned in? */ @@ -262,12 +254,12 @@ typedef enum dcgmOrder_enum DCGM_ORDER_DESCENDING = 2 //!< Data with latest (highest) timestamps returned first } dcgmOrder_t; -/** - * Return values for DCGM API calls. +/** + * Return values for DCGM API calls. */ typedef enum dcgmReturn_enum { - DCGM_ST_OK = 0, //!< Success + DCGM_ST_OK = 0, //!< Success DCGM_ST_BADPARAM = -1, //!< A bad parameter was passed to a function DCGM_ST_GENERIC_ERROR = -3, //!< A generic, unspecified error DCGM_ST_MEMORY = -4, //!< An out of memory error occurred @@ -287,166 +279,105 @@ typedef enum dcgmReturn_enum DCGM_ST_GPU_IS_LOST = -18, //!< GPU is no longer reachable DCGM_ST_RESET_REQUIRED = -19, //!< GPU requires a reset DCGM_ST_FUNCTION_NOT_FOUND = -20, //!< The function that was requested was not found (bindings only error) - DCGM_ST_CONNECTION_NOT_VALID = -21, //!< The connection to the host engine is not valid any longer + DCGM_ST_CONNECTION_NOT_VALID = -21, //!< The connection to the host engine is not valid any longer DCGM_ST_GPU_NOT_SUPPORTED = -22, //!< This GPU is not supported by DCGM - DCGM_ST_GROUP_INCOMPATIBLE = -23, //!< The GPUs of the provided group are not compatible with each other for the requested operation - DCGM_ST_MAX_LIMIT = -24, //!< Max limit reached for the object - DCGM_ST_LIBRARY_NOT_FOUND = -25, //!< DCGM library could not be found - DCGM_ST_DUPLICATE_KEY = -26, //!< Duplicate key passed to a function - DCGM_ST_GPU_IN_SYNC_BOOST_GROUP = -27, //! Maxwell, setting this implies autoBoost=0 -}dcgmConfigPerfStateSettings_t; + unsigned int syncBoost; //!< Sync Boost Mode (0: Disabled, 1 : Enabled, DCGM_INT32_BLANK : Ignored). 
Note that + //!< using this setting may result in lower clocks than targetClocks + dcgmClockSet_t targetClocks; //!< Target clocks. Set smClock and memClock to DCGM_INT32_BLANK to ignore/use + //!< compatible values. For GPUs > Maxwell, setting this implies autoBoost=0 +} dcgmConfigPerfStateSettings_t; /** - * Used to represents the power capping limit for each GPU in the group or to represent the power + * Used to represents the power capping limit for each GPU in the group or to represent the power * budget for the entire group */ typedef struct { - dcgmConfigPowerLimitType_t type; //!< Flag to represent power cap for each GPU or power budget for the group of GPUs - unsigned int val; //!< Power Limit in Watts (Set a value OR DCGM_INT32_BLANK to Ignore) -}dcgmConfigPowerLimit_t; + dcgmConfigPowerLimitType_t type; //!< Flag to represent power cap for each GPU or power budget for the group of GPUs + unsigned int val; //!< Power Limit in Watts (Set a value OR DCGM_INT32_BLANK to Ignore) +} dcgmConfigPowerLimit_t; /** * Structure to represent default and target configuration for a device */ typedef struct { - unsigned int version; //!< Version number (dcgmConfig_version) - unsigned int gpuId; //!< GPU ID - unsigned int eccMode; //!< ECC Mode (0: Disabled, 1 : Enabled, DCGM_INT32_BLANK : Ignored) - unsigned int computeMode; //!< Compute Mode (One of DCGM_CONFIG_COMPUTEMODE_? OR DCGM_INT32_BLANK to Ignore) - dcgmConfigPerfStateSettings_t perfState; //!< Performance State Settings (clocks / boost mode) - dcgmConfigPowerLimit_t powerLimit; //!< Power Limits -}dcgmConfig_v1; + unsigned int version; //!< Version number (dcgmConfig_version) + unsigned int gpuId; //!< GPU ID + unsigned int eccMode; //!< ECC Mode (0: Disabled, 1 : Enabled, DCGM_INT32_BLANK : Ignored) + unsigned int computeMode; //!< Compute Mode (One of DCGM_CONFIG_COMPUTEMODE_? 
OR DCGM_INT32_BLANK to Ignore) + dcgmConfigPerfStateSettings_t perfState; //!< Performance State Settings (clocks / boost mode) + dcgmConfigPowerLimit_t powerLimit; //!< Power Limits +} dcgmConfig_v1; /** * Typedef for \ref dcgmConfig_v1 @@ -1272,56 +1234,28 @@ typedef dcgmConfig_v1 dcgmConfig_t; */ #define dcgmConfig_version dcgmConfig_version1 -/** - * Structure to represent default and target vgpu configuration for a device - */ -typedef struct -{ - unsigned int version; //!< Version number (dcgmConfig_version) - unsigned int gpuId; //!< GPU ID - unsigned int eccMode; //!< ECC Mode (0: Disabled, 1 : Enabled, DCGM_INT32_BLANK : Ignored) - unsigned int computeMode; //!< Compute Mode (One of DCGM_CONFIG_COMPUTEMODE_? OR DCGM_INT32_BLANK to Ignore) - dcgmConfigPerfStateSettings_t perfState; //!< Performance State Settings (clocks / boost mode) - dcgmConfigPowerLimit_t powerLimit; //!< Power Limits -}dcgmVgpuConfig_v1; - -/** - * Typedef for \ref dcgmVgpuConfig_v1 - */ -typedef dcgmVgpuConfig_v1 dcgmVgpuConfig_t; - -/** - * Version 1 for \ref dcgmVgpuConfig_v1 - */ -#define dcgmVgpuConfig_version1 MAKE_DCGM_VERSION(dcgmVgpuConfig_v1, 1) - -/** - * Latest version for \ref dcgmVgpuConfig_t - */ -#define dcgmVgpuConfig_version dcgmVgpuConfig_version1 - /** * Represents a callback to receive updates from asynchronous functions. * Currently the only implemented callback function is dcgmPolicyRegister * and the void * data will be a pointer to dcgmPolicyCallbackResponse_t. * Ex. 
* dcgmPolicyCallbackResponse_t *callbackResponse = (dcgmPolicyCallbackResponse_t *) userData; - * + * */ typedef int (*fpRecvUpdates)(void *userData); /*Remove from doxygen documentation * - * Define the structure that contains specific policy information + * Define the structure that contains specific policy information */ -typedef struct +typedef struct { // version must always be first - unsigned int version; //!< Version number (dcgmPolicyViolation_version) + unsigned int version; //!< Version number (dcgmPolicyViolation_version) - unsigned int notifyOnEccDbe; //!< true/false notification on ECC Double Bit Errors - unsigned int notifyOnPciEvent; //!< true/false notification on PCI Events - unsigned int notifyOnMaxRetiredPages; //!< number of retired pages to occur before notification + unsigned int notifyOnEccDbe; //!< true/false notification on ECC Double Bit Errors + unsigned int notifyOnPciEvent; //!< true/false notification on PCI Events + unsigned int notifyOnMaxRetiredPages; //!< number of retired pages to occur before notification } dcgmPolicyViolation_v1; /*Remove from doxygen documentation @@ -1344,21 +1278,22 @@ typedef dcgmPolicyViolation_v1 dcgmPolicyViolation_t; */ #define dcgmPolicyViolation_version dcgmPolicyViolation_version1 -/** +/** * Enumeration for policy conditions. 
- * When used as part of dcgmPolicy_t these have corresponding parameters to + * When used as part of dcgmPolicy_t these have corresponding parameters to * allow them to be switched on/off or set specific violation thresholds */ typedef enum dcgmPolicyCondition_enum { // these are bitwise rather than sequential - DCGM_POLICY_COND_DBE = 0x1, //!< Double bit errors -- boolean in dcgmPolicyConditionParms_t - DCGM_POLICY_COND_PCI = 0x2, //!< PCI events/errors -- boolean in dcgmPolicyConditionParms_t - DCGM_POLICY_COND_MAX_PAGES_RETIRED = 0x4, //!< Maximum number of retired pages -- number required in dcgmPolicyConditionParms_t - DCGM_POLICY_COND_THERMAL = 0x8, //!< Thermal violation -- number required in dcgmPolicyConditionParms_t - DCGM_POLICY_COND_POWER = 0x10, //!< Power violation -- number required in dcgmPolicyConditionParms_t - DCGM_POLICY_COND_NVLINK = 0x20, //!< NVLINK errors -- boolean in dcgmPolicyConditionParms_t - DCGM_POLICY_COND_XID = 0x40, //!< XID errors -- number required in dcgmPolicyConditionParms_t + DCGM_POLICY_COND_DBE = 0x1, //!< Double bit errors -- boolean in dcgmPolicyConditionParams_t + DCGM_POLICY_COND_PCI = 0x2, //!< PCI events/errors -- boolean in dcgmPolicyConditionParams_t + DCGM_POLICY_COND_MAX_PAGES_RETIRED = 0x4, //!< Maximum number of retired pages -- number + //!< required in dcgmPolicyConditionParams_t + DCGM_POLICY_COND_THERMAL = 0x8, //!< Thermal violation -- number required in dcgmPolicyConditionParams_t + DCGM_POLICY_COND_POWER = 0x10, //!< Power violation -- number required in dcgmPolicyConditionParams_t + DCGM_POLICY_COND_NVLINK = 0x20, //!< NVLINK errors -- boolean in dcgmPolicyConditionParams_t + DCGM_POLICY_COND_XID = 0x40, //!< XID errors -- number required in dcgmPolicyConditionParams_t } dcgmPolicyCondition_t; #define DCGM_POLICY_COND_MAX 7 @@ -1369,22 +1304,27 @@ typedef enum dcgmPolicyCondition_enum * as well as a "val" which is a union of the possible value types. 
For example, * to pass a true boolean: tag = BOOL, val.boolean = 1. */ -typedef struct dcgmPolicyConditionParms_st +typedef struct dcgmPolicyConditionParams_st { - enum {BOOL, LLONG} tag; - union { - unsigned int boolean; + enum + { + BOOL, + LLONG + } tag; + union + { + unsigned int boolean; unsigned long long llval; } val; -} dcgmPolicyConditionParms_t; +} dcgmPolicyConditionParams_t; /** * Enumeration for policy modes */ typedef enum dcgmPolicyMode_enum { - DCGM_POLICY_MODE_AUTOMATED = 0, //!< automatic mode - DCGM_POLICY_MODE_MANUAL = 1, //!< manual mode + DCGM_POLICY_MODE_AUTOMATED = 0, //!< automatic mode + DCGM_POLICY_MODE_MANUAL = 1, //!< manual mode } dcgmPolicyMode_t; /** @@ -1392,7 +1332,7 @@ typedef enum dcgmPolicyMode_enum */ typedef enum dcgmPolicyIsolation_enum { - DCGM_POLICY_ISOLATION_NONE = 0, //!< no isolation of GPUs on error + DCGM_POLICY_ISOLATION_NONE = 0, //!< no isolation of GPUs on error } dcgmPolicyIsolation_t; /** @@ -1400,8 +1340,8 @@ typedef enum dcgmPolicyIsolation_enum */ typedef enum dcgmPolicyAction_enum { - DCGM_POLICY_ACTION_NONE = 0, //!< no action - DCGM_POLICY_ACTION_GPURESET = 1, //!< perform a GPU reset on violation + DCGM_POLICY_ACTION_NONE = 0, //!< no action + DCGM_POLICY_ACTION_GPURESET = 1, //!< Deprecated - perform a GPU reset on violation } dcgmPolicyAction_t; /** @@ -1409,10 +1349,10 @@ typedef enum dcgmPolicyAction_enum */ typedef enum dcgmPolicyValidation_enum { - DCGM_POLICY_VALID_NONE = 0, //!< no validation after an action is performed - DCGM_POLICY_VALID_SV_SHORT = 1, //!< run a short System Validation on the system after failure - DCGM_POLICY_VALID_SV_MED = 2, //!< run a medium System Validation test after failure - DCGM_POLICY_VALID_SV_LONG = 3, //!< run a extensive System Validation test after failure + DCGM_POLICY_VALID_NONE = 0, //!< no validation after an action is performed + DCGM_POLICY_VALID_SV_SHORT = 1, //!< run a short System Validation on the system after failure + DCGM_POLICY_VALID_SV_MED = 2, 
//!< run a medium System Validation test after failure + DCGM_POLICY_VALID_SV_LONG = 3, //!< run a extensive System Validation test after failure } dcgmPolicyValidation_t; /** @@ -1420,33 +1360,33 @@ typedef enum dcgmPolicyValidation_enum */ typedef enum dcgmPolicyFailureResp_enum { - DCGM_POLICY_FAILURE_NONE = 0, //!< on failure of validation perform no action + DCGM_POLICY_FAILURE_NONE = 0, //!< on failure of validation perform no action } dcgmPolicyFailureResp_t; -/** +/** * Structure to fill when a user queries for policy violations */ -typedef struct +typedef struct { - unsigned int gpuId; //!< gpu ID - unsigned int violationOccurred; //!< a violation based on the bit values in \ref dcgmPolicyCondition_t + unsigned int gpuId; //!< gpu ID + unsigned int violationOccurred; //!< a violation based on the bit values in \ref dcgmPolicyCondition_t } dcgmPolicyViolationNotify_t; /** - * Define the structure that specifies a policy to be enforced for a GPU + * Define the structure that specifies a policy to be enforced for a GPU */ -typedef struct +typedef struct { // version must always be first - unsigned int version; //!< version number (dcgmPolicy_version) - - dcgmPolicyCondition_t condition; //!< Condition(s) to access \ref dcgmPolicyCondition_t - dcgmPolicyMode_t mode; //!< Mode of operation \ref dcgmPolicyMode_t - dcgmPolicyIsolation_t isolation; //!< Isolation level after a policy violation \ref dcgmPolicyIsolation_t - dcgmPolicyAction_t action; //!< Action to perform after a policy violation \ref dcgmPolicyAction_t action - dcgmPolicyValidation_t validation; //!< Validation to perform after action is taken \ref dcgmPolicyValidation_t - dcgmPolicyFailureResp_t response; //!< Failure to validation response \ref dcgmPolicyFailureResp_t - dcgmPolicyConditionParms_t parms[DCGM_POLICY_COND_MAX]; //!< Parameters for the \a condition fields + unsigned int version; //!< version number (dcgmPolicy_version) + + dcgmPolicyCondition_t condition; //!< Condition(s) to access 
\ref dcgmPolicyCondition_t + dcgmPolicyMode_t mode; //!< Mode of operation \ref dcgmPolicyMode_t + dcgmPolicyIsolation_t isolation; //!< Isolation level after a policy violation \ref dcgmPolicyIsolation_t + dcgmPolicyAction_t action; //!< Action to perform after a policy violation \ref dcgmPolicyAction_t action + dcgmPolicyValidation_t validation; //!< Validation to perform after action is taken \ref dcgmPolicyValidation_t + dcgmPolicyFailureResp_t response; //!< Failure to validation response \ref dcgmPolicyFailureResp_t + dcgmPolicyConditionParams_t parms[DCGM_POLICY_COND_MAX]; //!< Parameters for the \a condition fields } dcgmPolicy_v1; /** @@ -1470,9 +1410,16 @@ typedef dcgmPolicy_v1 dcgmPolicy_t; */ typedef struct { - long long timestamp; //!< timestamp of the error - enum {L1, L2, DEVICE, REGISTER, TEXTURE} location; //!< location of the error - unsigned int numerrors; //!< number of errors + long long timestamp; //!< timestamp of the error + enum + { + L1, + L2, + DEVICE, + REGISTER, + TEXTURE + } location; //!< location of the error + unsigned int numerrors; //!< number of errors } dcgmPolicyConditionDbe_t; /** @@ -1480,8 +1427,8 @@ typedef struct */ typedef struct { - long long timestamp; //!< timestamp of the error - unsigned int counter; //!< value of the PCIe replay counter + long long timestamp; //!< timestamp of the error + unsigned int counter; //!< value of the PCIe replay counter } dcgmPolicyConditionPci_t; /** @@ -1489,37 +1436,37 @@ typedef struct */ typedef struct { - long long timestamp; //!< timestamp of the error - unsigned int sbepages; //!< number of pending pages due to SBE - unsigned int dbepages; //!< number of pending pages due to DBE + long long timestamp; //!< timestamp of the error + unsigned int sbepages; //!< number of pending pages due to SBE + unsigned int dbepages; //!< number of pending pages due to DBE } dcgmPolicyConditionMpr_t; -/** +/** * Define the thermal policy violations return structure */ typedef struct { - long long 
timestamp; //!< timestamp of the error - unsigned int thermalViolation; //!< Temperature reached that violated policy + long long timestamp; //!< timestamp of the error + unsigned int thermalViolation; //!< Temperature reached that violated policy } dcgmPolicyConditionThermal_t; -/** +/** * Define the power policy violations return structure */ typedef struct { - long long timestamp; //!< timestamp of the error - unsigned int powerViolation; //!< Power value reached that violated policy + long long timestamp; //!< timestamp of the error + unsigned int powerViolation; //!< Power value reached that violated policy } dcgmPolicyConditionPower_t; -/** +/** * Define the nvlink policy violations return structure */ typedef struct { - long long timestamp; //!< timestamp of the error - unsigned short fieldId; //! + * Every pair is separated by a colon char (:). Only the very first colon is considered as a separation.
+ * Values can contain colon chars. Values and Keys cannot contain semicolon chars.
+ * Usually defined keys are: + *

+ * version : DCGM Version.
+ * arch : Target DCGM Architecture.
+ * buildid : Build ID. Usually a sequential number.
+ * commit : Commit ID (Usually a git commit hash).
+ * author : Author of the commit above.
+ * branch : Branch (Usually a git branch that was used for the build).
+ * buildtype : Build Type.
+ * builddate : Date of the build.
+ * buildplatform : Platform where the build was made.
+ *

+ * Any or all keys may be absent.
+ * These values are for reference only and are not supposed to participate in any complicated logic.
+ */ + char rawBuildInfoString[DCGM_MAX_STR_LENGTH * 2]; +} dcgmVersionInfo_v2; + +/** + * Version 2 of the dcgmVersionInfo_v2 + */ +#define dcgmVersionInfo_version2 MAKE_DCGM_VERSION(dcgmVersionInfo_v2, 2) + +#define dcgmVersionInfo_version dcgmVersionInfo_version2 +typedef dcgmVersionInfo_v2 dcgmVersionInfo_t; /** @} */ -#ifdef __cplusplus +#ifdef __cplusplus } #endif -#endif /* DCGM_STRUCTS_H */ +#endif /* DCGM_STRUCTS_H */ diff --git a/bindings/go/dcgm/device_info.go b/bindings/go/dcgm/device_info.go index 8c61a55..bda27a1 100644 --- a/bindings/go/dcgm/device_info.go +++ b/bindings/go/dcgm/device_info.go @@ -120,7 +120,7 @@ func getPciBandwidth(gpuId uint) (int64, error) { func getDeviceInfo(gpuid uint) (deviceInfo Device, err error) { var device C.dcgmDeviceAttributes_t - device.version = makeVersion1(unsafe.Sizeof(device)) + device.version = makeVersion2(unsafe.Sizeof(device)) result := C.dcgmGetDeviceAttributes(handle.handle, C.uint(gpuid), &device) if err = errorString(result); err != nil { diff --git a/bindings/go/dcgm/health.go b/bindings/go/dcgm/health.go index 5ce3abc..e611e72 100644 --- a/bindings/go/dcgm/health.go +++ b/bindings/go/dcgm/health.go @@ -48,8 +48,8 @@ func healthCheckByGpuId(gpuId uint) (deviceHealth DeviceHealth, err error) { return } - var healthResults C.dcgmHealthResponse_v1 - healthResults.version = makeVersion1(unsafe.Sizeof(healthResults)) + var healthResults C.dcgmHealthResponse_v4 + healthResults.version = makeVersion2(unsafe.Sizeof(healthResults)) result := C.dcgmHealthCheck(handle.handle, groupId.handle, (*C.dcgmHealthResponse_t)(unsafe.Pointer(&healthResults))) @@ -60,18 +60,15 @@ func healthCheckByGpuId(gpuId uint) (deviceHealth DeviceHealth, err error) { status := healthStatus(int8(healthResults.overallHealth)) watches := []SystemWatch{} - // only 1 gpu - i := 0 - // number of watches that encountred error/warning - incidents := uint(healthResults.gpu[i].incidentCount) + incidents := uint(healthResults.incidentCount) for j 
:= uint(0); j < incidents; j++ { watch := SystemWatch{ - Type: systemWatch(int(healthResults.gpu[i].systems[j].system)), - Status: healthStatus(int8(healthResults.gpu[i].systems[j].health)), + Type: systemWatch(int(healthResults.incidents[j].system)), + Status: healthStatus(int8(healthResults.incidents[j].health)), - Error: *stringPtr(&healthResults.gpu[i].systems[j].errorString[0]), + Error: *stringPtr(&healthResults.incidents[j].error.msg[0]), } watches = append(watches, watch) } diff --git a/bindings/go/dcgm/hostengine_status.go b/bindings/go/dcgm/hostengine_status.go index 7848f09..4e6e6b9 100644 --- a/bindings/go/dcgm/hostengine_status.go +++ b/bindings/go/dcgm/hostengine_status.go @@ -24,7 +24,7 @@ func introspect() (engine DcgmStatus, err error) { } var memory C.dcgmIntrospectMemory_t - memory.version = makeVersion1(unsafe.Sizeof(memory)) + memory.version = makeVersion2(unsafe.Sizeof(memory)) waitIfNoData := 1 result = C.dcgmIntrospectGetHostengineMemoryUsage(handle.handle, &memory, C.int(waitIfNoData)) @@ -34,7 +34,7 @@ func introspect() (engine DcgmStatus, err error) { var cpu C.dcgmIntrospectCpuUtil_t - cpu.version = makeVersion1(unsafe.Sizeof(cpu)) + cpu.version = makeVersion2(unsafe.Sizeof(cpu)) result = C.dcgmIntrospectGetHostengineCpuUtilization(handle.handle, &cpu, C.int(waitIfNoData)) if err = errorString(result); err != nil { diff --git a/bindings/go/dcgm/policy.go b/bindings/go/dcgm/policy.go index 9352529..06be22f 100644 --- a/bindings/go/dcgm/policy.go +++ b/bindings/go/dcgm/policy.go @@ -250,7 +250,7 @@ func ViolationRegistration(data unsafe.Pointer) int { func setPolicy(groupId GroupHandle, condition C.dcgmPolicyCondition_t, paramList []policyIndex) (err error) { var policy C.dcgmPolicy_t - policy.version = makeVersion1(unsafe.Sizeof(policy)) + policy.version = makeVersion2(unsafe.Sizeof(policy)) policy.mode = C.dcgmPolicyMode_t(C.DCGM_OPERATION_MODE_AUTO) policy.action = C.DCGM_POLICY_ACTION_NONE policy.isolation = 
C.DCGM_POLICY_ISOLATION_NONE diff --git a/bindings/go/dcgm/process_info.go b/bindings/go/dcgm/process_info.go index 16f7e33..64227cf 100644 --- a/bindings/go/dcgm/process_info.go +++ b/bindings/go/dcgm/process_info.go @@ -95,7 +95,7 @@ func watchPidFields(gpus ...uint) (groupId GroupHandle, err error) { func getProcessInfo(groupId GroupHandle, pid uint) (processInfo []ProcessInfo, err error) { var pidInfo C.dcgmPidInfo_t - pidInfo.version = makeVersion1(unsafe.Sizeof(pidInfo)) + pidInfo.version = makeVersion2(unsafe.Sizeof(pidInfo)) pidInfo.pid = C.uint(pid) result := C.dcgmGetPidInfo(handle.handle, groupId.handle, &pidInfo) diff --git a/bindings/go/dcgm/topology.go b/bindings/go/dcgm/topology.go index cf1dbd5..f3afc38 100644 --- a/bindings/go/dcgm/topology.go +++ b/bindings/go/dcgm/topology.go @@ -97,7 +97,7 @@ func getCPUAffinity(busid string) (string, error) { func getBusid(gpuid uint) (string, error) { var device C.dcgmDeviceAttributes_t - device.version = makeVersion1(unsafe.Sizeof(device)) + device.version = makeVersion2(unsafe.Sizeof(device)) result := C.dcgmGetDeviceAttributes(handle.handle, C.uint(gpuid), &device) if err := errorString(result); err != nil { @@ -108,7 +108,7 @@ func getBusid(gpuid uint) (string, error) { func getDeviceTopology(gpuid uint) (links []P2PLink, err error) { var topology C.dcgmDeviceTopology_t - topology.version = makeVersion1(unsafe.Sizeof(topology)) + topology.version = makeVersion2(unsafe.Sizeof(topology)) result := C.dcgmGetDeviceTopology(handle.handle, C.uint(gpuid), &topology) if result == C.DCGM_ST_NOT_SUPPORTED { diff --git a/docker/Dockerfile.ubi8 b/docker/Dockerfile.ubi8 index 37e9c0d..9adf2da 100644 --- a/docker/Dockerfile.ubi8 +++ b/docker/Dockerfile.ubi8 @@ -6,7 +6,7 @@ COPY . . 
RUN make binary check-format -FROM registry.access.redhat.com/ubi8:latest +FROM nvidia/cuda:11.0-base-ubi8 LABEL io.k8s.display-name="NVIDIA DCGM Exporter" ARG DCGM_VERSION @@ -19,9 +19,8 @@ COPY --from=builder /go/src/github.com/NVIDIA/gpu-monitoring-tools/dcgm-exporter COPY etc/dcgm-exporter /etc/dcgm-exporter ENV NVIDIA_VISIBLE_DEVICES=all - -RUN useradd dcgm-exporter -USER dcgm-exporter +# Required for DCP metrics +ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility,compat32 ARG VERSION @@ -35,4 +34,8 @@ LABEL description="See summary" COPY ./LICENSE ./licenses/LICENSE -ENTRYPOINT ["/usr/bin/dcgm-exporter"] +ENV NO_SETCAP= +COPY docker/docker-entrypoint.sh /usr/local/dcgm/docker-entrypoint.sh +RUN chmod +x /usr/local/dcgm/docker-entrypoint.sh + +ENTRYPOINT ["/usr/local/dcgm/docker-entrypoint.sh"] diff --git a/docker/Dockerfile.ubuntu18.04 b/docker/Dockerfile.ubuntu18.04 index 0eb50a7..4c61bd2 100644 --- a/docker/Dockerfile.ubuntu18.04 +++ b/docker/Dockerfile.ubuntu18.04 @@ -6,7 +6,7 @@ COPY . . 
RUN make binary check-format -FROM ubuntu:18.04 +FROM nvidia/cuda:11.0-base-ubuntu18.04 LABEL io.k8s.display-name="NVIDIA DCGM Exporter" COPY --from=builder /go/src/github.com/NVIDIA/gpu-monitoring-tools/dcgm-exporter /usr/bin/ @@ -14,6 +14,7 @@ COPY etc/dcgm-exporter /etc/dcgm-exporter ARG DCGM_VERSION RUN apt-get update && apt-get install -y --no-install-recommends \ + libcap2-bin \ libgomp1 \ wget && \ rm -rf /var/lib/apt/lists/* && \ @@ -21,9 +22,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ dpkg -i datacenter-gpu-manager_*.deb && \ rm -f datacenter-gpu-manager_*.deb +# Required for DCP metrics +ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility,compat32 + ENV NVIDIA_VISIBLE_DEVICES=all -RUN useradd dcgm-exporter -USER dcgm-exporter +ENV NO_SETCAP= +COPY docker/docker-entrypoint.sh /usr/local/dcgm/docker-entrypoint.sh +RUN chmod +x /usr/local/dcgm/docker-entrypoint.sh -ENTRYPOINT ["/usr/bin/dcgm-exporter"] +ENTRYPOINT ["/usr/local/dcgm/docker-entrypoint.sh"]
diff --git a/docker/docker-entrypoint.sh b/docker/docker-entrypoint.sh
new file mode 100644
index 0000000..d6c8ea6
--- /dev/null
+++ b/docker/docker-entrypoint.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Resolve the real dcgm-exporter binary; setcap it only when NO_SETCAP is empty or unset
+DCGM_EXPORTER=$(readlink -f "$(command -v dcgm-exporter)")
+if [ -z "${NO_SETCAP:-}" ]; then
+    setcap 'cap_sys_admin=+ep' "$DCGM_EXPORTER"
+    # Probe run: if the binary fails to start with the capability set, remove it again
+    if ! "$DCGM_EXPORTER" -v 1>/dev/null 2>/dev/null; then
+        >&2 echo "dcgm-exporter doesn't have sufficient privileges to expose profiling metrics. To use dcgm-exporter for profiling metrics use --cap-add SYS_ADMIN"
+        setcap 'cap_sys_admin=-ep' "$DCGM_EXPORTER"
+    fi
+fi
+
+# Pass the command line arguments to dcgm-exporter
+set -- "$DCGM_EXPORTER" "$@"
+exec "$@"