diff --git a/src/utilities/_hypre_utilities.h b/src/utilities/_hypre_utilities.h index 1f98eb8d44..db784113de 100644 --- a/src/utilities/_hypre_utilities.h +++ b/src/utilities/_hypre_utilities.h @@ -201,6 +201,7 @@ typedef struct #define hypre_HandleCubDevAllocator(hypre_handle) hypre_DeviceDataCubDevAllocator(hypre_HandleDeviceData(hypre_handle)) #define hypre_HandleCubUvmAllocator(hypre_handle) hypre_DeviceDataCubUvmAllocator(hypre_HandleDeviceData(hypre_handle)) #define hypre_HandleDevice(hypre_handle) hypre_DeviceDataDevice(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleDeviceUVM(hypre_handle) hypre_DeviceDataDeviceUVM(hypre_HandleDeviceData(hypre_handle)) #define hypre_HandleDeviceMaxWorkGroupSize(hypre_handle) hypre_DeviceDataDeviceMaxWorkGroupSize(hypre_HandleDeviceData(hypre_handle)) #define hypre_HandleDeviceMaxShmemPerBlock(hypre_handle) hypre_DeviceDataDeviceMaxShmemPerBlock(hypre_HandleDeviceData(hypre_handle)) #define hypre_HandleDeviceMaxShmemPerBlockInited(hypre_handle) hypre_DeviceDataDeviceMaxShmemPerBlockInited(hypre_HandleDeviceData(hypre_handle)) diff --git a/src/utilities/_hypre_utilities.hpp b/src/utilities/_hypre_utilities.hpp index 3f413abce0..aa0f71c00d 100644 --- a/src/utilities/_hypre_utilities.hpp +++ b/src/utilities/_hypre_utilities.hpp @@ -335,25 +335,41 @@ using hypre_DeviceItem = void*; // Macro for device memory prefetching (CUDART 13.0+) #define HYPRE_MEM_PREFETCH_DEVICE(ptr, size, stream) \ do { \ - cudaMemLocation loc = {cudaMemLocationTypeDevice, hypre_HandleDevice(hypre_handle())}; \ - HYPRE_CUDA_CALL(cudaMemPrefetchAsync(ptr, size, loc, 0, stream)); \ + if (hypre_HandleDeviceUVM(hypre_handle())) \ + { \ + cudaMemLocation loc = {cudaMemLocationTypeDevice, hypre_HandleDevice(hypre_handle())}; \ + HYPRE_CUDA_CALL(cudaMemPrefetchAsync(ptr, size, loc, 0, stream)); \ + } \ } while (0) // Macro for host memory prefetching (CUDART 13.0+) #define HYPRE_MEM_PREFETCH_HOST(ptr, size, stream) \ do { \ - cudaMemLocation loc = {cudaMemLocationTypeHost, cudaCpuDeviceId}; \ - HYPRE_CUDA_CALL(cudaMemPrefetchAsync(ptr, size, loc, 0, stream)); \ + if (hypre_HandleDeviceUVM(hypre_handle())) \ + { \ + cudaMemLocation loc = {cudaMemLocationTypeHost, cudaCpuDeviceId}; \ + HYPRE_CUDA_CALL(cudaMemPrefetchAsync(ptr, size, loc, 0, stream)); \ + } \ } while (0) #else // Macro for device memory prefetching (< CUDART 13.0) #define HYPRE_MEM_PREFETCH_DEVICE(ptr, size, stream) \ - HYPRE_CUDA_CALL(cudaMemPrefetchAsync(ptr, size, hypre_HandleDevice(hypre_handle()), stream)) + do { \ + if (hypre_HandleDeviceUVM(hypre_handle())) \ + { \ + HYPRE_CUDA_CALL(cudaMemPrefetchAsync(ptr, size, hypre_HandleDevice(hypre_handle()), stream)); \ + } \ + } while (0) // Macro for host memory prefetching (< CUDART 13.0) #define HYPRE_MEM_PREFETCH_HOST(ptr, size, stream) \ - HYPRE_CUDA_CALL(cudaMemPrefetchAsync(ptr, size, cudaCpuDeviceId, stream)) + do { \ + if (hypre_HandleDeviceUVM(hypre_handle())) \ + { \ + HYPRE_CUDA_CALL(cudaMemPrefetchAsync(ptr, size, cudaCpuDeviceId, stream)); \ + } \ + } while (0) #endif #endif /* defined(HYPRE_USING_CUDA) */ @@ -986,6 +1002,7 @@ struct hypre_DeviceData HYPRE_Int device_max_work_group_size; #else HYPRE_Int device; + HYPRE_Int device_uvm; #endif hypre_int device_max_shmem_per_block[3]; /* by default, hypre puts GPU computations in this stream @@ -1015,6 +1032,7 @@ struct hypre_DeviceData }; #define hypre_DeviceDataDevice(data) ((data) -> device) +#define hypre_DeviceDataDeviceUVM(data) ((data) -> device_uvm) #define hypre_DeviceDataDeviceMaxWorkGroupSize(data) ((data) -> device_max_work_group_size) #define hypre_DeviceDataDeviceMaxShmemPerBlock(data) ((data) -> device_max_shmem_per_block) #define hypre_DeviceDataDeviceMaxShmemPerBlockInited(data) (((data) -> device_max_shmem_per_block)[2]) diff --git a/src/utilities/device_utils.h b/src/utilities/device_utils.h index 635574b450..3a73e34518 100644 --- a/src/utilities/device_utils.h +++ b/src/utilities/device_utils.h @@ -117,25 +117,41 @@ using hypre_DeviceItem = void*; // Macro for device memory prefetching (CUDART 13.0+) #define HYPRE_MEM_PREFETCH_DEVICE(ptr, size, stream) \ do { \ - cudaMemLocation loc = {cudaMemLocationTypeDevice, hypre_HandleDevice(hypre_handle())}; \ - HYPRE_CUDA_CALL(cudaMemPrefetchAsync(ptr, size, loc, 0, stream)); \ + if (hypre_HandleDeviceUVM(hypre_handle())) \ + { \ + cudaMemLocation loc = {cudaMemLocationTypeDevice, hypre_HandleDevice(hypre_handle())}; \ + HYPRE_CUDA_CALL(cudaMemPrefetchAsync(ptr, size, loc, 0, stream)); \ + } \ } while (0) // Macro for host memory prefetching (CUDART 13.0+) #define HYPRE_MEM_PREFETCH_HOST(ptr, size, stream) \ do { \ - cudaMemLocation loc = {cudaMemLocationTypeHost, cudaCpuDeviceId}; \ - HYPRE_CUDA_CALL(cudaMemPrefetchAsync(ptr, size, loc, 0, stream)); \ + if (hypre_HandleDeviceUVM(hypre_handle())) \ + { \ + cudaMemLocation loc = {cudaMemLocationTypeHost, cudaCpuDeviceId}; \ + HYPRE_CUDA_CALL(cudaMemPrefetchAsync(ptr, size, loc, 0, stream)); \ + } \ } while (0) #else // Macro for device memory prefetching (< CUDART 13.0) #define HYPRE_MEM_PREFETCH_DEVICE(ptr, size, stream) \ - HYPRE_CUDA_CALL(cudaMemPrefetchAsync(ptr, size, hypre_HandleDevice(hypre_handle()), stream)) + do { \ + if (hypre_HandleDeviceUVM(hypre_handle())) \ + { \ + HYPRE_CUDA_CALL(cudaMemPrefetchAsync(ptr, size, hypre_HandleDevice(hypre_handle()), stream)); \ + } \ + } while (0) // Macro for host memory prefetching (< CUDART 13.0) #define HYPRE_MEM_PREFETCH_HOST(ptr, size, stream) \ - HYPRE_CUDA_CALL(cudaMemPrefetchAsync(ptr, size, cudaCpuDeviceId, stream)) + do { \ + if (hypre_HandleDeviceUVM(hypre_handle())) \ + { \ + HYPRE_CUDA_CALL(cudaMemPrefetchAsync(ptr, size, cudaCpuDeviceId, stream)); \ + } \ + } while (0) #endif #endif /* defined(HYPRE_USING_CUDA) */ @@ -768,6 +784,7 @@ struct hypre_DeviceData HYPRE_Int device_max_work_group_size; #else HYPRE_Int device; + HYPRE_Int device_uvm; #endif hypre_int device_max_shmem_per_block[3]; /* by default, hypre puts GPU computations in this stream @@ -797,6 +814,7 @@ struct hypre_DeviceData }; #define hypre_DeviceDataDevice(data) ((data) -> device) +#define hypre_DeviceDataDeviceUVM(data) ((data) -> device_uvm) #define hypre_DeviceDataDeviceMaxWorkGroupSize(data) ((data) -> device_max_work_group_size) #define hypre_DeviceDataDeviceMaxShmemPerBlock(data) ((data) -> device_max_shmem_per_block) #define hypre_DeviceDataDeviceMaxShmemPerBlockInited(data) (((data) -> device_max_shmem_per_block)[2]) diff --git a/src/utilities/general.c b/src/utilities/general.c index 151a72f242..3ae6d5f989 100644 --- a/src/utilities/general.c +++ b/src/utilities/general.c @@ -363,18 +363,20 @@ HYPRE_DeviceInitialize(void) hypre_CudaCompileFlagCheck(); #endif -#if defined(HYPRE_USING_DEVICE_POOL) - /* Keep this check here at the end of HYPRE_Initialize() - * Make sure that device pool allocator has not been setup in HYPRE_Initialize, - * otherwise users are not able to set all the parameters - */ - if ( hypre_HandleCubDevAllocator(handle) || - hypre_HandleCubUvmAllocator(handle) ) + /* Check if OS supports UVM */ +#if defined(HYPRE_USING_CUDA) && defined(HYPRE_USING_UNIFIED_MEMORY) { - char msg[256]; - hypre_sprintf(msg, "%s %s", "ERROR: device pool allocators have been created in", __func__); - hypre_error_w_msg(-1, msg); - } + struct cudaDeviceProp deviceProp; + + HYPRE_CUDA_CALL( cudaGetDevice(&device_id) ); + HYPRE_CUDA_CALL( cudaGetDeviceProperties(&deviceProp, device_id) ); + + if (deviceProp.managedMemory == 1 && deviceProp.concurrentManagedAccess == 1) + { + hypre_HandleDeviceUVM(handle) = 1; + } + } + #endif #endif /* if defined(HYPRE_USING_GPU) || defined(HYPRE_USING_DEVICE_OPENMP) */ diff --git a/src/utilities/handle.h b/src/utilities/handle.h index 44ce6d382c..8005b12577 100644 --- a/src/utilities/handle.h +++ b/src/utilities/handle.h @@ -108,6 +108,7 @@ typedef struct #define hypre_HandleCubDevAllocator(hypre_handle) hypre_DeviceDataCubDevAllocator(hypre_HandleDeviceData(hypre_handle)) #define hypre_HandleCubUvmAllocator(hypre_handle) hypre_DeviceDataCubUvmAllocator(hypre_HandleDeviceData(hypre_handle)) #define hypre_HandleDevice(hypre_handle) hypre_DeviceDataDevice(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleDeviceUVM(hypre_handle) hypre_DeviceDataDeviceUVM(hypre_HandleDeviceData(hypre_handle)) #define hypre_HandleDeviceMaxWorkGroupSize(hypre_handle) hypre_DeviceDataDeviceMaxWorkGroupSize(hypre_HandleDeviceData(hypre_handle)) #define hypre_HandleDeviceMaxShmemPerBlock(hypre_handle) hypre_DeviceDataDeviceMaxShmemPerBlock(hypre_HandleDeviceData(hypre_handle)) #define hypre_HandleDeviceMaxShmemPerBlockInited(hypre_handle) hypre_DeviceDataDeviceMaxShmemPerBlockInited(hypre_HandleDeviceData(hypre_handle))