From cb5a78fd92a0f3928fb063b1cd3033612831748d Mon Sep 17 00:00:00 2001 From: Kevin Klues Date: Fri, 17 Jul 2020 09:58:38 +0000 Subject: [PATCH] Add check for INVALID_ARGUMENT in NvLink checks Unfortunately, the check to see if a link is active throws an error if an invalid linkID is passed in (instead of simply saying that the link is inactive). This causes problems since the newest nvml.h is for CUDA 11 (which has an NVML_NVLINK_MAX_LINKS of 12) and older versions had an NVML_NVLINK_MAX_LINKS of 6. This patch adds a check to see if the various calls that take a linkID fail with INVALID_ARGUMENT, and if so, silently ignore the error. This should be OK since we are fairly confident all other arguments are valid. It would have been nice to avoid this (somewhat hacky) solution though. Signed-off-by: Kevin Klues --- bindings/go/nvml/bindings.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bindings/go/nvml/bindings.go b/bindings/go/nvml/bindings.go index 54a5bcf..e9f7f45 100644 --- a/bindings/go/nvml/bindings.go +++ b/bindings/go/nvml/bindings.go @@ -319,7 +319,7 @@ func (h handle) deviceGetNvLinkState(link uint) (*uint, error) { var isActive C.nvmlEnableState_t r := C.nvmlDeviceGetNvLinkState(h.dev, C.uint(link), &isActive) - if r == C.NVML_ERROR_NOT_SUPPORTED { + if r == C.NVML_ERROR_NOT_SUPPORTED || r == C.NVML_ERROR_INVALID_ARGUMENT { return nil, nil } @@ -330,7 +330,7 @@ func (h handle) deviceGetNvLinkRemotePciInfo(link uint) (*string, error) { var pci C.nvmlPciInfo_t r := C.nvmlDeviceGetNvLinkRemotePciInfo(h.dev, C.uint(link), &pci) - if r == C.NVML_ERROR_NOT_SUPPORTED { + if r == C.NVML_ERROR_NOT_SUPPORTED || r == C.NVML_ERROR_INVALID_ARGUMENT { return nil, nil }