From 19343bda09087a0e5d3b3c60824e5af023a4e8e9 Mon Sep 17 00:00:00 2001 From: Willem Deconinck Date: Thu, 27 Feb 2025 14:32:17 +0000 Subject: [PATCH] Use pluto in atlas where possible instead of hic calls --- src/atlas/CMakeLists.txt | 1 + src/atlas/array/native/NativeDataStore.h | 165 ++++++-------------- src/atlas/library/Library.cc | 22 +-- src/atlas/parallel/acc/acc.cc | 20 +-- src/atlas/util/Allocate.cc | 41 +---- src/tests/mesh/test_connectivity_kernel.hic | 16 +- src/tests/parallel/test_haloexchange.cc | 37 +---- 7 files changed, 79 insertions(+), 223 deletions(-) diff --git a/src/atlas/CMakeLists.txt b/src/atlas/CMakeLists.txt index 3ca0b5a41..104a1a7b8 100644 --- a/src/atlas/CMakeLists.txt +++ b/src/atlas/CMakeLists.txt @@ -1017,6 +1017,7 @@ ecbuild_add_library( TARGET atlas atlas_io hic hicsparse + pluto $<${atlas_HAVE_EIGEN}:Eigen3::Eigen> $<${atlas_HAVE_OMP_CXX}:OpenMP::OpenMP_CXX> $<${atlas_HAVE_GRIDTOOLS_STORAGE}:GridTools::gridtools> diff --git a/src/atlas/array/native/NativeDataStore.h b/src/atlas/array/native/NativeDataStore.h index a63dc6077..a0377cd8e 100644 --- a/src/atlas/array/native/NativeDataStore.h +++ b/src/atlas/array/native/NativeDataStore.h @@ -16,6 +16,8 @@ #include // std::numeric_limits::signaling_NaN #include +#include "pluto/pluto.h" + #include "atlas/array/ArrayDataStore.h" #include "atlas/library/Library.h" #include "atlas/library/config.h" @@ -24,9 +26,6 @@ #include "atlas/runtime/Log.h" #include "eckit/log/Bytes.h" -#include "hic/hic.h" - - #define ATLAS_ACC_DEBUG 0 //------------------------------------------------------------------------------ @@ -94,26 +93,15 @@ template void initialise(Value[], size_t) {} #endif -static int devices() { - static int devices_ = [](){ - int n = 0; - auto err = hicGetDeviceCount(&n); - if (err != hicSuccess) { - n = 0; - static_cast(hicGetLastError()); - } - return n; - }(); - return devices_; -} - template class DataStore : public ArrayDataStore { public: - DataStore(size_t size): size_(size) { + DataStore(size_t size): size_(size), + host_allocator_{pluto::new_delete_resource()}, + device_allocator_{pluto::device_resource()} { allocateHost(); initialise(host_data_, size_); - if (ATLAS_HAVE_GPU && devices()) { + if (ATLAS_HAVE_GPU && pluto::devices()) { device_updated_ = false; } else { @@ -127,14 +115,11 @@ class DataStore : public ArrayDataStore { } void updateDevice() const override { - if (ATLAS_HAVE_GPU && devices()) { + if (ATLAS_HAVE_GPU && pluto::devices()) { if (not device_allocated_) { allocateDevice(); } - hicError_t err = hicMemcpy(device_data_, host_data_, size_*sizeof(Value), hicMemcpyHostToDevice); - if (err != hicSuccess) { - throw_AssertionFailed("Failed to updateDevice: "+std::string(hicGetErrorString(err)), Here()); - } + pluto::copy_host_to_device(device_data_, host_data_, size_); device_updated_ = true; } } @@ -142,10 +127,7 @@ class DataStore : public ArrayDataStore { void updateHost() const override { if constexpr (ATLAS_HAVE_GPU) { if (device_allocated_) { - hicError_t err = hicMemcpy(host_data_, device_data_, size_*sizeof(Value), hicMemcpyDeviceToHost); - if (err != hicSuccess) { - throw_AssertionFailed("Failed to updateHost: "+std::string(hicGetErrorString(err)), Here()); - } + pluto::copy_device_to_host(host_data_, device_data_, size_); host_updated_ = true; } } @@ -174,15 +156,12 @@ class DataStore : public ArrayDataStore { bool deviceAllocated() const override { return device_allocated_; } void allocateDevice() const override { - if (ATLAS_HAVE_GPU && devices()) { + if (ATLAS_HAVE_GPU && pluto::devices()) { if (device_allocated_) { return; } if (size_) { - hicError_t err = hicMalloc((void**)&device_data_, sizeof(Value)*size_); - if (err != hicSuccess) { - throw_AssertionFailed("Failed to allocate GPU memory: " + std::string(hicGetErrorString(err)), Here()); - } + device_data_ = device_allocator_.allocate(size_); device_allocated_ = true; accMap(); } @@ -190,16 +169,11 @@ class DataStore : public ArrayDataStore { } void deallocateDevice() const override { - if constexpr (ATLAS_HAVE_GPU) { - if (device_allocated_) { - accUnmap(); - hicError_t err = hicFree(device_data_); - if (err != hicSuccess) { - throw_AssertionFailed("Failed to deallocate GPU memory: " + std::string(hicGetErrorString(err)), Here()); - } - device_data_ = nullptr; - device_allocated_ = false; - } + if (device_allocated_) { + accUnmap(); + device_allocator_.deallocate(device_data_,size_); + device_data_ = nullptr; + device_allocated_ = false; } } @@ -259,36 +233,22 @@ class DataStore : public ArrayDataStore { throw_Exception(ss.str(), loc); } - void alloc_aligned(Value*& ptr, size_t n) { - if (n > 0) { - const size_t alignment = 64 * sizeof(Value); - size_t bytes = sizeof(Value) * n; - MemoryHighWatermark::instance() += bytes; - - int err = posix_memalign((void**)&ptr, alignment, bytes); - if (err) { - throw_AllocationFailed(bytes, Here()); - } - } - else { - ptr = nullptr; - } - } - - void free_aligned(Value*& ptr) { - if (ptr) { - free(ptr); - ptr = nullptr; - MemoryHighWatermark::instance() -= footprint(); - } - } - void allocateHost() { - alloc_aligned(host_data_, size_); + if (size_ > 0) { + MemoryHighWatermark::instance() += footprint(); + host_data_ = host_allocator_.allocate(size_); + } + else { + host_data_ = nullptr; + } } void deallocateHost() { - free_aligned(host_data_); + if (host_data_) { + host_allocator_.deallocate(host_data_, size_); + host_data_ = nullptr; + MemoryHighWatermark::instance() -= footprint(); + } } size_t footprint() const { return sizeof(Value) * size_; } @@ -302,6 +262,8 @@ class DataStore : public ArrayDataStore { mutable bool device_allocated_{false}; mutable bool acc_mapped_{false}; + pluto::allocator host_allocator_; + mutable pluto::allocator device_allocator_; }; //------------------------------------------------------------------------------ @@ -311,7 +273,7 @@ class WrappedDataStore : public ArrayDataStore { public: void init_device() { - if (ATLAS_HAVE_GPU && devices()) { + if (ATLAS_HAVE_GPU && pluto::devices()) { device_updated_ = false; } else { @@ -319,14 +281,15 @@ class WrappedDataStore : public ArrayDataStore { } } - WrappedDataStore(Value* host_data, size_t size): host_data_(host_data), size_(size) { + WrappedDataStore(Value* host_data, size_t size): host_data_(host_data), size_(size), + device_allocator_{pluto::device_resource()} { init_device(); } WrappedDataStore(Value* host_data, const ArraySpec& spec): host_data_(host_data), - size_(spec.size()) - { + size_(spec.size()), + device_allocator_{pluto::device_resource()} { init_device(); contiguous_ = spec.contiguous(); if (! contiguous_) { @@ -363,25 +326,17 @@ class WrappedDataStore : public ArrayDataStore { } void updateDevice() const override { - if (ATLAS_HAVE_GPU && devices()) { + if (ATLAS_HAVE_GPU && pluto::devices()) { if (not device_allocated_) { allocateDevice(); } if (contiguous_) { - hicError_t err = hicMemcpy(device_data_, host_data_, size_*sizeof(Value), hicMemcpyHostToDevice); - if (err != hicSuccess) { - throw_AssertionFailed("Failed to updateDevice: "+std::string(hicGetErrorString(err)), Here()); - } + pluto::copy_host_to_device(device_data_, host_data_, size_); } else { - hicError_t err = hicMemcpy2D( - device_data_, memcpy_h2d_pitch_ * sizeof(Value), + pluto::memcpy_host_to_device_2D(device_data_, memcpy_h2d_pitch_ * sizeof(Value), host_data_, memcpy_d2h_pitch_ * sizeof(Value), - memcpy_width_ * sizeof(Value), memcpy_height_, - hicMemcpyHostToDevice); - if (err != hicSuccess) { - throw_AssertionFailed("Failed to updateDevice: "+std::string(hicGetErrorString(err)), Here()); - } + memcpy_width_ * sizeof(Value), memcpy_height_); } device_updated_ = true; } @@ -391,20 +346,12 @@ class WrappedDataStore : public ArrayDataStore { if constexpr (ATLAS_HAVE_GPU) { if (device_allocated_) { if (contiguous_) { - hicError_t err = hicMemcpy(host_data_, device_data_, size_*sizeof(Value), hicMemcpyDeviceToHost); - if (err != hicSuccess) { - throw_AssertionFailed("Failed to updateHost: "+std::string(hicGetErrorString(err)), Here()); - } + pluto::copy_device_to_host(host_data_, device_data_, size_); } else { - hicError_t err = hicMemcpy2D( - host_data_, memcpy_d2h_pitch_ * sizeof(Value), + pluto::memcpy_device_to_host_2D(host_data_, memcpy_d2h_pitch_ * sizeof(Value), device_data_, memcpy_h2d_pitch_ * sizeof(Value), - memcpy_width_ * sizeof(Value), memcpy_height_, - hicMemcpyDeviceToHost); - if (err != hicSuccess) { - throw_AssertionFailed("Failed to updateHost: "+std::string(hicGetErrorString(err)), Here()); - } + memcpy_width_ * sizeof(Value), memcpy_height_); } host_updated_ = true; } @@ -435,15 +382,12 @@ class WrappedDataStore : public ArrayDataStore { bool deviceAllocated() const override { return device_allocated_; } void allocateDevice() const override { - if (ATLAS_HAVE_GPU && devices()) { + if (ATLAS_HAVE_GPU && pluto::devices()) { if (device_allocated_) { return; } if (size_) { - hicError_t err = hicMalloc((void**)&device_data_, sizeof(Value)*size_); - if (err != hicSuccess) { - throw_AssertionFailed("Failed to allocate GPU memory: " + std::string(hicGetErrorString(err)), Here()); - } + device_data_ = device_allocator_.allocate(size_); device_allocated_ = true; if (contiguous_) { accMap(); @@ -453,18 +397,13 @@ class WrappedDataStore : public ArrayDataStore { } void deallocateDevice() const override { - if constexpr (ATLAS_HAVE_GPU) { - if (device_allocated_) { - if (contiguous_) { - accUnmap(); - } - hicError_t err = hicFree(device_data_); - if (err != hicSuccess) { - throw_AssertionFailed("Failed to deallocate GPU memory: " + std::string(hicGetErrorString(err)), Here()); - } - device_data_ = nullptr; - device_allocated_ = false; + if (device_allocated_) { + if (contiguous_) { + accUnmap(); } + device_allocator_.deallocate(device_data_, size_); + device_data_ = nullptr; + device_allocated_ = false; } } @@ -505,7 +444,6 @@ class WrappedDataStore : public ArrayDataStore { } void accUnmap() const override { -#if ATLAS_HAVE_ACC if (acc_mapped_) { ATLAS_ASSERT(atlas::acc::is_present(host_data_, size_ * sizeof(Value))); if constexpr(ATLAS_ACC_DEBUG) { @@ -514,7 +452,6 @@ class WrappedDataStore : public ArrayDataStore { atlas::acc::unmap(host_data_); acc_mapped_ = false; } -#endif } private: @@ -532,6 +469,8 @@ class WrappedDataStore : public ArrayDataStore { mutable bool device_updated_{true}; mutable bool device_allocated_{false}; mutable bool acc_mapped_{false}; + + mutable pluto::allocator device_allocator_; }; } // namespace native diff --git a/src/atlas/library/Library.cc b/src/atlas/library/Library.cc index 084ea390f..f93d7241a 100644 --- a/src/atlas/library/Library.cc +++ b/src/atlas/library/Library.cc @@ -44,7 +44,7 @@ static bool feature_MKL() { } // namespace #endif -#include "hic/hic.h" +#include "pluto/pluto.h" #include "atlas_io/Trace.h" @@ -128,24 +128,6 @@ static void init_data_paths(std::vector& data_paths) { add_tokens(data_paths, "~atlas/share", ":"); } -static std::size_t devices() { - if constexpr (ATLAS_HAVE_GPU) { - static std::size_t _devices = []() -> std::size_t { - int num_devices = 0; - auto err = hicGetDeviceCount(&num_devices); - if (err) { - num_devices = 0; - } - return static_cast(num_devices); - }(); - return _devices; - } - else { - return 0; - } -} - - } // namespace //---------------------------------------------------------------------------------------------------------------------- @@ -363,7 +345,7 @@ void Library::initialise(const eckit::Parametrisation& config) { out << " OMP\n"; out << " max_threads [" << atlas_omp_get_max_threads() << "] \n"; out << " GPU\n"; - out << " devices [" << devices() << "] \n"; + out << " devices [" << pluto::devices() << "] \n"; out << " OpenACC [" << acc::devices() << "] \n"; out << " \n"; out << " log.info [" << str(info_) << "] \n"; diff --git a/src/atlas/parallel/acc/acc.cc b/src/atlas/parallel/acc/acc.cc index c1446f7a0..1f5c81323 100644 --- a/src/atlas/parallel/acc/acc.cc +++ b/src/atlas/parallel/acc/acc.cc @@ -13,20 +13,8 @@ #include "atlas/library/defines.h" #if ATLAS_HAVE_ACC -#include "hic/hic.h" +#include "pluto/pluto.h" #include "atlas_acc_support/atlas_acc.h" -static int hic_devices() { - static int devices_ = [](){ - int n = 0; - auto err = hicGetDeviceCount(&n); - if (err != hicSuccess) { - n = 0; - static_cast(hicGetLastError()); - } - return n; - }(); - return devices_; -} #endif namespace atlas::acc { @@ -34,15 +22,15 @@ namespace atlas::acc { int devices() { #if ATLAS_HAVE_ACC static int num_devices = [](){ - if (hic_devices() == 0) { + if (pluto::devices() == 0) { return 0; } auto devicetype = atlas_acc_get_device_type(); int _num_devices = atlas_acc_get_num_devices(); if (_num_devices == 1 && devicetype == atlas_acc_device_host) { --_num_devices; - } - return _num_devices; + } + return _num_devices; }(); return num_devices; #else diff --git a/src/atlas/util/Allocate.cc b/src/atlas/util/Allocate.cc index 58f2648d6..0f3cb5e48 100644 --- a/src/atlas/util/Allocate.cc +++ b/src/atlas/util/Allocate.cc @@ -13,11 +13,11 @@ #include "eckit/log/CodeLocation.h" +#include "pluto/pluto.h" + #include "atlas/library/config.h" #include "atlas/runtime/Exception.h" -#include "hic/hic.h" - namespace atlas { namespace util { @@ -25,59 +25,34 @@ namespace util { namespace detail { //------------------------------------------------------------------------------ -static int devices() { - static int devices_ = [](){ - int n = 0; - auto err = hicGetDeviceCount(&n); - if (err != hicSuccess) { - n = 0; - static_cast(hicGetLastError()); - } - return n; - }(); - return devices_; -} - void allocate_managed(void** ptr, size_t bytes) { if constexpr (not ATLAS_HAVE_GPU) { return allocate_host(ptr, bytes); } - if (devices() == 0) { - return allocate_host(ptr, bytes); - } - HIC_CALL(hicMallocManaged(ptr, bytes)); + *ptr = pluto::managed_resource()->allocate(bytes, pluto::default_alignment()); } void deallocate_managed(void* ptr, size_t bytes) { if constexpr (not ATLAS_HAVE_GPU) { return deallocate_host(ptr, bytes); } - if (devices() == 0) { - return deallocate_host(ptr, bytes); - } - HIC_CALL(hicDeviceSynchronize()); - HIC_CALL(hicFree(ptr)); + pluto::wait(); + pluto::managed_resource()->deallocate(ptr, bytes, pluto::default_alignment()); } void allocate_device(void** ptr, size_t bytes) { if constexpr (not ATLAS_HAVE_GPU) { return allocate_host(ptr, bytes); } - if (devices() == 0) { - return allocate_host(ptr, bytes); - } - HIC_CALL(hicMalloc(ptr, bytes)); + *ptr = pluto::device_resource()->allocate(bytes, pluto::default_alignment()); } void deallocate_device(void* ptr, size_t bytes) { if constexpr (not ATLAS_HAVE_GPU) { return deallocate_host(ptr, bytes); } - if (devices() == 0) { - return deallocate_host(ptr, bytes); - } - HIC_CALL(hicDeviceSynchronize()); - HIC_CALL(hicFree(ptr)); + pluto::wait(); + pluto::device_resource()->deallocate(ptr, bytes, pluto::default_alignment()); } void allocate_host(void** ptr, size_t bytes) { diff --git a/src/tests/mesh/test_connectivity_kernel.hic b/src/tests/mesh/test_connectivity_kernel.hic index 496fbb441..e4432f02f 100644 --- a/src/tests/mesh/test_connectivity_kernel.hic +++ b/src/tests/mesh/test_connectivity_kernel.hic @@ -8,7 +8,7 @@ * does it submit to any jurisdiction. */ -#include "hic/hic.h" +#include "pluto/pluto.h" #include "atlas/mesh/Connectivity.h" #include "tests/AtlasTestEnvironment.h" @@ -16,12 +16,13 @@ template class managed { public: - managed() { - hicMallocManaged(&data_, sizeof(T)); + managed() : + alloc_(pluto::managed_resource()) { + data_ = alloc_.allocate(1); } ~managed() { - hicFree(data_); + alloc_.deallocate(data_, 1); } const T* data() const { return data_; } @@ -32,6 +33,7 @@ public: private: T* data_; + pluto::allocator alloc_; }; using namespace atlas::mesh; @@ -122,7 +124,7 @@ CASE( "test_block_connectivity" ) kernel_block<<<1,1>>>(conn, result.data()); - hicDeviceSynchronize(); + pluto::wait(); EXPECT( result.value() == true ); @@ -148,7 +150,7 @@ CASE( "test_irregular_connectivity" ) kernel_irr<<<1,1>>>(conn, result.data()); - hicDeviceSynchronize(); + pluto::wait(); EXPECT( result.value() == true ); @@ -173,7 +175,7 @@ CASE( "test_multiblock_connectivity" ) kernel_multiblock<<<1,1>>>(conn, result.data()); - hicDeviceSynchronize(); + pluto::wait(); EXPECT( result.value() == true ); diff --git a/src/tests/parallel/test_haloexchange.cc b/src/tests/parallel/test_haloexchange.cc index 5009c4054..b7b542b59 100644 --- a/src/tests/parallel/test_haloexchange.cc +++ b/src/tests/parallel/test_haloexchange.cc @@ -13,7 +13,7 @@ #include #include -#include "hic/hic.h" +#include "pluto/pluto.h" #include "atlas/array.h" #include "atlas/array/ArrayView.h" @@ -709,49 +709,23 @@ CASE("test_haloexchange") { Fixture f; SECTION("test_rank0_arrview") { test_rank0_arrview(f); } - SECTION("test_rank1") { test_rank1(f); } - SECTION("test_rank1_strided_v1") { test_rank1_strided_v1(f); } - SECTION("test_rank1_strided_v2") { test_rank1_strided_v2(f); } - SECTION("test_rank2") { test_rank2(f); } - SECTION("test_rank2_l1") { test_rank2_l1(f); } - SECTION("test_rank2_l2_v2") { test_rank2_l2_v2(f); } - SECTION("test_rank2_v2") { test_rank2_v2(f); } - SECTION("test_rank0_wrap") { test_rank0_wrap(f); } - SECTION("test_rank1_paralleldim_1") { test_rank1_paralleldim1(f); } - SECTION("test_rank2_paralleldim_2") { test_rank2_paralleldim2(f); } - SECTION("test_rank1_cinterface") { test_rank1_cinterface(f); } } -#if ATLAS_HAVE_GPU - //----------------------------------------------------------------------------- - -static int devices() { - static int devices_ = [](){ - int n = 0; - auto err = hicGetDeviceCount(&n); - if (err != hicSuccess) { - n = 0; - static_cast(hicGetLastError()); - } - return n; - }(); - return devices_; -} - +#if ATLAS_HAVE_GPU CASE("test_haloexchange on device") { - if (devices() == 0) { + if (pluto::devices() == 0) { Log::warning() << "\"test_haloexchange on device skipped\": No devices available" << std::endl; return; } @@ -760,16 +734,11 @@ CASE("test_haloexchange on device") { Fixture f(on_device); SECTION("test_rank0_arrview") { test_rank0_arrview(f); } - SECTION("test_rank1") { test_rank1(f); } - SECTION("test_rank2") { test_rank2(f); } - SECTION("test_rank0_wrap") { test_rank0_wrap(f); } } #endif - - //----------------------------------------------------------------------------- } // namespace test