From 6147df293f95d951ce97e7e7987d1347f1f59c5f Mon Sep 17 00:00:00 2001 From: mdavis36 Date: Wed, 3 Apr 2024 23:08:49 -0700 Subject: [PATCH 01/44] Basic reference counting on host; host & device construction in make_shared; --- src/chai/ManagedSharedPtr.hpp | 324 ++++++++++++++++++++++++++++++++++ 1 file changed, 324 insertions(+) create mode 100644 src/chai/ManagedSharedPtr.hpp diff --git a/src/chai/ManagedSharedPtr.hpp b/src/chai/ManagedSharedPtr.hpp new file mode 100644 index 00000000..77652dd6 --- /dev/null +++ b/src/chai/ManagedSharedPtr.hpp @@ -0,0 +1,324 @@ +#ifndef CHAI_MANAGED_SHARED_PTR +#define CHAI_MANAGED_SHARED_PTR + +#include + +#include "chai/ExecutionSpaces.hpp" +#include "chai/ArrayManager.hpp" +#include "chai/managed_ptr.hpp" + +namespace chai { + +template +struct msp_pointer_record { + + // Using NUM_EXECUTION_SPACES for the time being, this will help with logical + // control since ExecutionSpaces are already defined. + // Only CPU and GPU spaces will be used. + // If other spaces are enabled they will not be used by ManagedSharedPtr. 
+ Tp* m_pointers[NUM_EXECUTION_SPACES]; + bool m_touched[NUM_EXECUTION_SPACES]; + bool m_owned[NUM_EXECUTION_SPACES]; + + ExecutionSpace m_last_space; + //UserCallback m_user_callback; + + int m_allocators[NUM_EXECUTION_SPACES]; + + msp_pointer_record(Tp* host_p = nullptr, Tp* device_p = nullptr) : m_last_space(NONE) { + for (int space = 0; space < NUM_EXECUTION_SPACES; ++space ) { + m_pointers[space] = nullptr; + m_touched[space] = false; + m_owned[space] = true; + m_allocators[space] = 0; + } + m_pointers[CPU] = host_p; + m_pointers[GPU] = device_p; + } + +}; + + +class msp_counted_base { +public: + msp_counted_base() noexcept : m_use_count(1) {} + + virtual ~msp_counted_base() noexcept {} + + virtual void m_dispose() noexcept = 0; + virtual void m_destroy() noexcept { delete this; } + + void m_add_ref_copy() noexcept { ++m_use_count; } + + void m_release() noexcept { + if(--m_use_count == 0) { + m_dispose(); + m_destroy(); + } + } + + long m_get_use_count() const noexcept { return m_use_count; } +private: + msp_counted_base(msp_counted_base const&) = delete; + msp_counted_base& operator=(msp_counted_base const&) = delete; + + long m_use_count = 0; +}; + +template +class msp_counted_ptr final : public msp_counted_base { +public: + msp_counted_ptr(Ptr p) noexcept : m_ptr(p) {} + virtual void m_dispose() noexcept { delete m_ptr.m_pointers[chai::CPU]; }// TODO : Other Exec spaces... 
+ virtual void m_destroy() noexcept { delete this; } + msp_counted_ptr(msp_counted_ptr const&) = delete; + msp_counted_ptr& operator=(msp_counted_ptr const&) = delete; +private: + Ptr m_ptr; +}; + +template +class msp_counted_deleter final : public msp_counted_base { + + class impl { + public: + impl(Ptr p, Deleter d) : m_ptr(p), m_deleter(std::move(d)) {} + Deleter& m_del() noexcept { return m_deleter; } + Ptr m_ptr; + Deleter m_deleter; + }; + +public: + msp_counted_deleter(Ptr p, Deleter d) noexcept : m_impl(p, std::move(d)) {} + virtual void m_dispose() noexcept { m_impl.m_del()(m_impl.m_ptr->m_pointers[chai::CPU]); } + virtual void m_destroy() noexcept { this->~msp_counted_deleter(); } + msp_counted_deleter(msp_counted_deleter const&) = delete; + msp_counted_deleter& operator=(msp_counted_deleter const&) = delete; +private: + impl m_impl; +}; + + +class msp_shared_count { +public: + constexpr msp_shared_count() noexcept : m_pi(0) {} + + template + explicit msp_shared_count(Ptr p) + : m_pi( new msp_counted_ptr(p) ) {} + + template + explicit msp_shared_count(Ptr p, Deleter d) + : m_pi( new msp_counted_deleter(p, d) ) {} + + ~msp_shared_count() noexcept + { if (m_pi) m_pi->m_release(); } + + msp_shared_count(msp_shared_count const& rhs) noexcept : m_pi(rhs.m_pi) + { if (m_pi) m_pi->m_add_ref_copy(); } + + msp_shared_count& operator=(msp_shared_count const& rhs) noexcept { + msp_counted_base* temp = rhs.m_pi; + if (temp != m_pi) + { + if (temp) temp->m_add_ref_copy(); + if (m_pi) m_pi->m_release(); + m_pi = temp; + } + return *this; + } + + void m_swap(msp_shared_count& rhs) noexcept { + msp_counted_base* temp = rhs.m_pi; + rhs.m_pi = m_pi; + m_pi = temp; + } + + long m_get_use_count() const noexcept + { return m_pi ? 
m_pi->m_get_use_count() : 0; } + + friend inline bool + operator==(msp_shared_count const& a, msp_shared_count const& b) noexcept + { return a.m_pi == b.m_pi; } + + msp_counted_base* m_pi; + +}; + + + + + + +// Type traits for SFINAE +template +struct msp_is_constructible : std::is_convertible::type {}; + +template +struct msp_compatible_with : std::false_type {}; + +template +struct msp_compatible_with : std::is_convertible::type {}; + + + + +template +class ManagedSharedPtr { + +public: + using element_type = Tp;//typename std::remove_extent::type; + +private: + template + using SafeConv = typename std::enable_if< + msp_is_constructible::value + >::type; + + template + using Compatible = typename std::enable_if< + msp_compatible_with::value, + Res + >::type; + + template + using Assignable = Compatible; + +public: + // Default Ctor with same type Tp + constexpr ManagedSharedPtr() noexcept : m_ref_count() {} + + // *Default* Ctor with convertible type Yp -> Tp + template> + explicit ManagedSharedPtr(Yp* host_p) : + m_pointer_record(new msp_pointer_record(host_p)), + m_ref_count(m_pointer_record) + {} + + template> + ManagedSharedPtr(Yp* host_p, Deleter d) : + m_pointer_record(new msp_pointer_record(host_p)), + m_ref_count(m_pointer_record, std::move(d)) + { + } + + template> + ManagedSharedPtr(Yp* host_p, Yp* device_p, Deleter d) : + m_pointer_record(new msp_pointer_record(host_p, device_p)), + m_ref_count(m_pointer_record, std::move(d)) + { + } + + + long use_count() const noexcept { return m_ref_count.m_get_use_count(); } + + + + +private: + + template + friend class ManagedSharedPtr; + + template + friend ManagedSharedPtr make_managed(Args... args); + + mutable msp_pointer_record* m_pointer_record = nullptr; + msp_shared_count m_ref_count; + + //mutable ArrayManager* m_resource_manager = nullptr; +}; + +//template +//T* make_on_host(Args&&... 
args) { +//#if !defined(CHAI_DISABLE_RM) +// // Get the ArrayManager and save the current execution space +// chai::ArrayManager* arrayManager = chai::ArrayManager::getInstance(); +// ExecutionSpace currentSpace = arrayManager->getExecutionSpace(); +// +// // Set the execution space so that ManagedArrays and managed_ptrs +// // are handled properly +// arrayManager->setExecutionSpace(CPU); +//#endif +// +// // Create on the host +// T* cpuPointer = new T(args...); +// +//#if !defined(CHAI_DISABLE_RM) +// // Set the execution space back to the previous value +// arrayManager->setExecutionSpace(currentSpace); +//#endif +// +// // Return the CPU pointer +// return cpuPointer; +//} +// +//namespace detail { +// +//template +//__global__ void make_on_device(T** gpuPointer, Args... args) +//{ +// *gpuPointer = new T(args...); +//} +// +//}// namespace detail +// +//template +//T* make_on_device(Args... args) { +//#if !defined(CHAI_DISABLE_RM) +// // Get the ArrayManager and save the current execution space +// chai::ArrayManager* arrayManager = chai::ArrayManager::getInstance(); +// ExecutionSpace currentSpace = arrayManager->getExecutionSpace(); +// +// // Set the execution space so that ManagedArrays and managed_ptrs +// // are handled properly +// arrayManager->setExecutionSpace(GPU); +//#endif +// +// // Allocate space on the GPU to hold the pointer to the new object +// T** gpuBuffer; +// gpuMalloc((void**)(&gpuBuffer), sizeof(T*)); +// +// // Create the object on the device +//#if defined(__CUDACC__) && defined(CHAI_ENABLE_MANAGED_PTR_ON_GPU) +// detail::make_on_device<<<1, 1>>>(gpuBuffer, args...); +//#elif defined(__HIPCC__) && defined(CHAI_ENABLE_MANAGED_PTR_ON_GPU) +// hipLaunchKernelGGL(detail::make_on_device, 1, 1, 0, 0, gpuBuffer, args...); +//#endif +// +// // Allocate space on the CPU for the pointer and copy the pointer to the CPU +// T** cpuBuffer = (T**) malloc(sizeof(T*)); +// gpuMemcpy(cpuBuffer, gpuBuffer, sizeof(T*), gpuMemcpyDeviceToHost); +// +// 
// Get the GPU pointer +// T* gpuPointer = cpuBuffer[0]; +// +// // Free the host and device buffers +// free(cpuBuffer); +// gpuFree(gpuBuffer); +// +//#if !defined(CHAI_DISABLE_RM) +// // Set the execution space back to the previous value +// arrayManager->setExecutionSpace(currentSpace); +//#endif +// +// // Return the GPU pointer +// return gpuPointer; +//} + +template +ManagedSharedPtr make_shared(Args... args) { + Tp* gpu_pointer = make_on_device(args...); + Tp* cpu_pointer = make_on_host(args...); + + return ManagedSharedPtr(cpu_pointer, gpu_pointer, [](Tp* p){delete p;}); +} + + +} // namespace chai + + +#endif // CHAI_MANAGED_SHARED_PTR From f489a15840ecc1e1959c22277a51af4e5ed9e6bd Mon Sep 17 00:00:00 2001 From: mdavis36 Date: Tue, 9 Apr 2024 09:37:11 -0700 Subject: [PATCH 02/44] reinterpret cast record pointer on copy; adding accessors; custom deleters working w/ record. --- src/chai/ManagedSharedPtr.hpp | 171 +++++++++++------------- src/chai/managed_ptr.hpp | 2 + tests/integration/managed_ptr_tests.cpp | 65 ++++++++- 3 files changed, 140 insertions(+), 98 deletions(-) diff --git a/src/chai/ManagedSharedPtr.hpp b/src/chai/ManagedSharedPtr.hpp index 77652dd6..4242b1b0 100644 --- a/src/chai/ManagedSharedPtr.hpp +++ b/src/chai/ManagedSharedPtr.hpp @@ -25,6 +25,19 @@ struct msp_pointer_record { int m_allocators[NUM_EXECUTION_SPACES]; + //template + //msp_pointer_record(Yp* host_p = nullptr, Yp* device_p = nullptr) : m_last_space(NONE) { + // for (int space = 0; space < NUM_EXECUTION_SPACES; ++space ) { + // m_pointers[space] = nullptr; + // m_touched[space] = false; + // m_owned[space] = true; + // m_allocators[space] = 0; + // } + // m_pointers[CPU] = host_p; + // m_pointers[GPU] = device_p; + //} + + msp_pointer_record(Tp* host_p = nullptr, Tp* device_p = nullptr) : m_last_space(NONE) { for (int space = 0; space < NUM_EXECUTION_SPACES; ++space ) { m_pointers[space] = nullptr; @@ -36,6 +49,18 @@ struct msp_pointer_record { m_pointers[GPU] = device_p; } + 
//Tp* get_pointer(ExecutionSpace space) noexcept { return m_pointers[space]; } + //template + //msp_pointer_record(msp_pointer_record const& rhs) : + // m_pointers(rhs.m_pointers), + // m_touched(rhs.m_touched), + // m_owned(rhs.m_owned), + // m_last_space(rhs.m_last_space), + // m_allocators(rhs.m_allocators) + //{} + + + }; @@ -69,7 +94,8 @@ template class msp_counted_ptr final : public msp_counted_base { public: msp_counted_ptr(Ptr p) noexcept : m_ptr(p) {} - virtual void m_dispose() noexcept { delete m_ptr.m_pointers[chai::CPU]; }// TODO : Other Exec spaces... + //virtual void m_dispose() noexcept { delete (m_ptr.get_pointer(chai::CPU)); }// TODO : Other Exec spaces... + virtual void m_dispose() noexcept { delete m_ptr->m_pointers[chai::CPU]; }// TODO : Other Exec spaces... virtual void m_destroy() noexcept { delete this; } msp_counted_ptr(msp_counted_ptr const&) = delete; msp_counted_ptr& operator=(msp_counted_ptr const&) = delete; @@ -185,130 +211,89 @@ class ManagedSharedPtr { using Assignable = Compatible; public: - // Default Ctor with same type Tp + + /* + * Constructors + */ constexpr ManagedSharedPtr() noexcept : m_ref_count() {} // *Default* Ctor with convertible type Yp -> Tp template> explicit ManagedSharedPtr(Yp* host_p) : m_pointer_record(new msp_pointer_record(host_p)), - m_ref_count(m_pointer_record) + m_ref_count(m_pointer_record), + m_active_pointer(m_pointer_record->m_pointers[chai::CPU]) + {} + + template> + explicit ManagedSharedPtr(Yp* host_p, Yp* device_p) : + m_pointer_record(new msp_pointer_record(host_p, device_p)), + m_ref_count(m_pointer_record), + m_active_pointer(m_pointer_record->m_pointers[chai::CPU]) {} template> ManagedSharedPtr(Yp* host_p, Deleter d) : - m_pointer_record(new msp_pointer_record(host_p)), - m_ref_count(m_pointer_record, std::move(d)) - { - } + m_pointer_record(new msp_pointer_record(host_p)), + m_ref_count(m_pointer_record, std::move(d)), + m_active_pointer(m_pointer_record->m_pointers[chai::CPU]) + {} template> 
ManagedSharedPtr(Yp* host_p, Yp* device_p, Deleter d) : - m_pointer_record(new msp_pointer_record(host_p, device_p)), - m_ref_count(m_pointer_record, std::move(d)) + m_pointer_record(new msp_pointer_record(host_p, device_p)), + m_ref_count(m_pointer_record, std::move(d)), + m_active_pointer(m_pointer_record->m_pointers[chai::CPU]) + {} + + /* + * Copy Constructors + */ + ManagedSharedPtr(ManagedSharedPtr const&) noexcept = default; // TODO: this is *NOT* going to be default + + template> + ManagedSharedPtr(ManagedSharedPtr const& rhs) noexcept : + m_ref_count(rhs.m_ref_count), + m_active_pointer(rhs.m_active_pointer) { + // TODO : Is this safe?? + m_pointer_record = reinterpret_cast*>(rhs.m_pointer_record); } + + /* + * Accessors + */ + element_type* get(ExecutionSpace space = chai::CPU) const noexcept { return m_active_pointer; } - long use_count() const noexcept { return m_ref_count.m_get_use_count(); } + element_type& operator*() const noexcept { assert(m_get() != nullptr); return *m_get(); } + element_type* operator->() const noexcept { assert(m_get() != nullptr); return m_get(); } +private: + element_type* m_get() const noexcept { return static_cast*>(this)->get(); } -private: +public: + long use_count() const noexcept { return m_ref_count.m_get_use_count(); } + /* + * Private Members + */ +private: template friend class ManagedSharedPtr; - template - friend ManagedSharedPtr make_managed(Args... args); + //template + //friend ManagedSharedPtr make_managed(Args... args); mutable msp_pointer_record* m_pointer_record = nullptr; msp_shared_count m_ref_count; + mutable element_type* m_active_pointer = nullptr; //mutable ArrayManager* m_resource_manager = nullptr; }; -//template -//T* make_on_host(Args&&... 
args) { -//#if !defined(CHAI_DISABLE_RM) -// // Get the ArrayManager and save the current execution space -// chai::ArrayManager* arrayManager = chai::ArrayManager::getInstance(); -// ExecutionSpace currentSpace = arrayManager->getExecutionSpace(); -// -// // Set the execution space so that ManagedArrays and managed_ptrs -// // are handled properly -// arrayManager->setExecutionSpace(CPU); -//#endif -// -// // Create on the host -// T* cpuPointer = new T(args...); -// -//#if !defined(CHAI_DISABLE_RM) -// // Set the execution space back to the previous value -// arrayManager->setExecutionSpace(currentSpace); -//#endif -// -// // Return the CPU pointer -// return cpuPointer; -//} -// -//namespace detail { -// -//template -//__global__ void make_on_device(T** gpuPointer, Args... args) -//{ -// *gpuPointer = new T(args...); -//} -// -//}// namespace detail -// -//template -//T* make_on_device(Args... args) { -//#if !defined(CHAI_DISABLE_RM) -// // Get the ArrayManager and save the current execution space -// chai::ArrayManager* arrayManager = chai::ArrayManager::getInstance(); -// ExecutionSpace currentSpace = arrayManager->getExecutionSpace(); -// -// // Set the execution space so that ManagedArrays and managed_ptrs -// // are handled properly -// arrayManager->setExecutionSpace(GPU); -//#endif -// -// // Allocate space on the GPU to hold the pointer to the new object -// T** gpuBuffer; -// gpuMalloc((void**)(&gpuBuffer), sizeof(T*)); -// -// // Create the object on the device -//#if defined(__CUDACC__) && defined(CHAI_ENABLE_MANAGED_PTR_ON_GPU) -// detail::make_on_device<<<1, 1>>>(gpuBuffer, args...); -//#elif defined(__HIPCC__) && defined(CHAI_ENABLE_MANAGED_PTR_ON_GPU) -// hipLaunchKernelGGL(detail::make_on_device, 1, 1, 0, 0, gpuBuffer, args...); -//#endif -// -// // Allocate space on the CPU for the pointer and copy the pointer to the CPU -// T** cpuBuffer = (T**) malloc(sizeof(T*)); -// gpuMemcpy(cpuBuffer, gpuBuffer, sizeof(T*), gpuMemcpyDeviceToHost); -// -// 
// Get the GPU pointer -// T* gpuPointer = cpuBuffer[0]; -// -// // Free the host and device buffers -// free(cpuBuffer); -// gpuFree(gpuBuffer); -// -//#if !defined(CHAI_DISABLE_RM) -// // Set the execution space back to the previous value -// arrayManager->setExecutionSpace(currentSpace); -//#endif -// -// // Return the GPU pointer -// return gpuPointer; -//} - template ManagedSharedPtr make_shared(Args... args) { Tp* gpu_pointer = make_on_device(args...); diff --git a/src/chai/managed_ptr.hpp b/src/chai/managed_ptr.hpp index 14988cb0..708261e1 100644 --- a/src/chai/managed_ptr.hpp +++ b/src/chai/managed_ptr.hpp @@ -759,6 +759,7 @@ namespace chai { typename... Args> __global__ void make_on_device(T** gpuPointer, Args... args) { + printf("On GPU\n"); *gpuPointer = new T(processArguments(args)...); } @@ -879,6 +880,7 @@ namespace chai { arrayManager->setExecutionSpace(CPU); #endif + printf("On Host\n"); // Create on the host T* cpuPointer = new T(detail::processArguments(args)...); diff --git a/tests/integration/managed_ptr_tests.cpp b/tests/integration/managed_ptr_tests.cpp index 579dbe35..259f9e61 100644 --- a/tests/integration/managed_ptr_tests.cpp +++ b/tests/integration/managed_ptr_tests.cpp @@ -4,6 +4,7 @@ // // SPDX-License-Identifier: BSD-3-Clause ////////////////////////////////////////////////////////////////////////////// +#include "chai/ManagedSharedPtr.hpp" #include "gtest/gtest.h" #define GPU_TEST(X, Y) \ @@ -15,6 +16,7 @@ #include "chai/ArrayManager.hpp" #include "chai/ManagedArray.hpp" #include "chai/managed_ptr.hpp" +#include "chai/ManagedSharedPtr.hpp" #include "../src/util/forall.hpp" @@ -71,22 +73,31 @@ class RawPointerClass { class TestBase { public: - CHAI_HOST_DEVICE TestBase() {} - CHAI_HOST_DEVICE virtual ~TestBase() {} + CHAI_HOST_DEVICE TestBase() {printf("TestBase Ctor\n");} + CHAI_HOST_DEVICE virtual ~TestBase() {printf("TestBase Dtor\n");} CHAI_HOST_DEVICE virtual int getValue(const int i) const = 0; + CHAI_HOST_DEVICE virtual int 
getMemberValue() const = 0; + CHAI_HOST_DEVICE virtual void setMemberValue(int v) = 0; + CHAI_HOST virtual void doSomething() const = 0; }; class TestDerived : public TestBase { public: - CHAI_HOST_DEVICE TestDerived() : TestBase(), m_values(nullptr) {} + CHAI_HOST_DEVICE TestDerived() : TestBase(), m_values(nullptr) {printf("TestDerived Ctor\n");} CHAI_HOST_DEVICE TestDerived(chai::ManagedArray values) : TestBase(), m_values(values) {} - CHAI_HOST_DEVICE virtual ~TestDerived() {} + CHAI_HOST_DEVICE virtual ~TestDerived() {printf("TestDerived Dtor\n");} CHAI_HOST_DEVICE virtual int getValue(const int i) const { return m_values[i]; } + CHAI_HOST_DEVICE virtual int getMemberValue() const {return m_member;} + + CHAI_HOST_DEVICE void setMemberValue(int v) { m_member = v; } + + CHAI_HOST virtual void doSomething() const {printf("TestDerived doSomething()\n");} private: chai::ManagedArray m_values; + int m_member = -1; }; class TestInnerBase { @@ -151,6 +162,27 @@ class MultipleRawArrayClass { int* m_values2; }; +TEST(managed_ptr, shared_ptr) +{ + + //chai::ManagedSharedPtr sptr(new TestDerived(), + // [](TestDerived*p){ printf("Deleter Call\n"); p->~TestDerived(); }); + chai::ManagedSharedPtr sptr = chai::ManagedSharedPtr(new TestDerived(), + [](TestDerived*p){ printf("Custom Deleter Call\n"); delete p; }); + //chai::ManagedSharedPtr sptr(new TestDerived()); + + //chai::ManagedSharedPtr sptr = chai::make_shared(); + std::cout << "use_count : " << sptr.use_count() << std::endl; + + auto sptr2 = sptr; + sptr2->doSomething(); + std::cout << "use_count : " << sptr.use_count() << std::endl; + + + + +} + TEST(managed_ptr, class_with_raw_array) { const int expectedValue = rand(); @@ -523,6 +555,7 @@ GPU_TEST(managed_ptr, gpu_class_with_raw_array_and_callback) GPU_TEST(managed_ptr, gpu_class_with_managed_array) { const int expectedValue = rand(); + const int expectedMemberValue = rand(); chai::ManagedArray array(1, chai::CPU); @@ -531,16 +564,38 @@ GPU_TEST(managed_ptr, 
gpu_class_with_managed_array) }); chai::managed_ptr derived = chai::make_managed(array); + derived->setMemberValue(expectedMemberValue); + + derived.set_callback([=] (chai::Action action, chai::ExecutionSpace space, void*) mutable { + if (action == chai::ACTION_MOVE) { + //printf("trigger move : "); + //if (space == chai::NONE) printf("NONE\n"); + //if (space == chai::CPU) printf("CPU\n"); + //if (space == chai::GPU) printf("GPU\n"); + auto temp = array; // Trigger copy constructor in order to move inner ManagedArray to correct memory space + (void) temp; // Get rid of unused variable warnings + return true; + } + else if (action == chai::ACTION_FREE && space == chai::NONE) { + array.free(); // If TestDerived does not take ownership of the ManagedArray, you can use the callback to clean it up + return true; + } + else { + return false; + } + }); - chai::ManagedArray results(1, chai::GPU); + chai::ManagedArray results(2, chai::GPU); forall(gpu(), 0, 1, [=] __device__ (int i) { results[i] = derived->getValue(i); + results[1] = derived->getMemberValue(); }); results.move(chai::CPU); ASSERT_EQ(results[0], expectedValue); + ASSERT_EQ(results[1], expectedMemberValue); results.free(); derived.free(); From c18723a83d856ac2b8e5ef1d4a7ad302db2e8d2a Mon Sep 17 00:00:00 2001 From: mdavis36 Date: Tue, 9 Apr 2024 13:47:26 -0700 Subject: [PATCH 03/44] SharedPointerRecord header; Generalizing Record w/ void* pointers; Focussing on make_shared as the only construction option for now. 
--- src/chai/CMakeLists.txt | 1 + src/chai/ManagedSharedPtr.hpp | 146 +++--- src/chai/PointerRecord.hpp | 2 + src/chai/SharedPointerRecord.hpp | 81 ++++ src/chai/SharedPtrManager.cpp | 616 ++++++++++++++++++++++++ src/chai/SharedPtrManager.hpp | 535 ++++++++++++++++++++ src/chai/SharedPtrManager.inl | 129 +++++ tests/integration/managed_ptr_tests.cpp | 9 +- 8 files changed, 1424 insertions(+), 95 deletions(-) create mode 100644 src/chai/SharedPointerRecord.hpp create mode 100644 src/chai/SharedPtrManager.cpp create mode 100644 src/chai/SharedPtrManager.hpp create mode 100644 src/chai/SharedPtrManager.inl diff --git a/src/chai/CMakeLists.txt b/src/chai/CMakeLists.txt index 2285c544..ed138282 100644 --- a/src/chai/CMakeLists.txt +++ b/src/chai/CMakeLists.txt @@ -29,6 +29,7 @@ if(CHAI_DISABLE_RM) endif () set (chai_sources + # SharedPtrManager.cpp ArrayManager.cpp) set (chai_depends diff --git a/src/chai/ManagedSharedPtr.hpp b/src/chai/ManagedSharedPtr.hpp index 4242b1b0..ad9b04ad 100644 --- a/src/chai/ManagedSharedPtr.hpp +++ b/src/chai/ManagedSharedPtr.hpp @@ -4,64 +4,12 @@ #include #include "chai/ExecutionSpaces.hpp" -#include "chai/ArrayManager.hpp" +//#include "chai/SharedPtrManager.hpp" +#include "chai/SharedPointerRecord.hpp" #include "chai/managed_ptr.hpp" namespace chai { -template -struct msp_pointer_record { - - // Using NUM_EXECUTION_SPACES for the time being, this will help with logical - // control since ExecutionSpaces are already defined. - // Only CPU and GPU spaces will be used. - // If other spaces are enabled they will not be used by ManagedSharedPtr. 
- Tp* m_pointers[NUM_EXECUTION_SPACES]; - bool m_touched[NUM_EXECUTION_SPACES]; - bool m_owned[NUM_EXECUTION_SPACES]; - - ExecutionSpace m_last_space; - //UserCallback m_user_callback; - - int m_allocators[NUM_EXECUTION_SPACES]; - - //template - //msp_pointer_record(Yp* host_p = nullptr, Yp* device_p = nullptr) : m_last_space(NONE) { - // for (int space = 0; space < NUM_EXECUTION_SPACES; ++space ) { - // m_pointers[space] = nullptr; - // m_touched[space] = false; - // m_owned[space] = true; - // m_allocators[space] = 0; - // } - // m_pointers[CPU] = host_p; - // m_pointers[GPU] = device_p; - //} - - - msp_pointer_record(Tp* host_p = nullptr, Tp* device_p = nullptr) : m_last_space(NONE) { - for (int space = 0; space < NUM_EXECUTION_SPACES; ++space ) { - m_pointers[space] = nullptr; - m_touched[space] = false; - m_owned[space] = true; - m_allocators[space] = 0; - } - m_pointers[CPU] = host_p; - m_pointers[GPU] = device_p; - } - - //Tp* get_pointer(ExecutionSpace space) noexcept { return m_pointers[space]; } - //template - //msp_pointer_record(msp_pointer_record const& rhs) : - // m_pointers(rhs.m_pointers), - // m_touched(rhs.m_touched), - // m_owned(rhs.m_owned), - // m_last_space(rhs.m_last_space), - // m_allocators(rhs.m_allocators) - //{} - - - -}; class msp_counted_base { @@ -90,33 +38,36 @@ class msp_counted_base { long m_use_count = 0; }; -template +template class msp_counted_ptr final : public msp_counted_base { public: - msp_counted_ptr(Ptr p) noexcept : m_ptr(p) {} - //virtual void m_dispose() noexcept { delete (m_ptr.get_pointer(chai::CPU)); }// TODO : Other Exec spaces... - virtual void m_dispose() noexcept { delete m_ptr->m_pointers[chai::CPU]; }// TODO : Other Exec spaces... + msp_counted_ptr(Record p) noexcept : m_record(p) {} + //virtual void m_dispose() noexcept { delete (m_record.get_pointer(chai::CPU)); }// TODO : Other Exec spaces... + virtual void m_dispose() noexcept { delete (Ptr)m_record->m_pointers[chai::CPU]; }// TODO : Other Exec spaces... 
virtual void m_destroy() noexcept { delete this; } msp_counted_ptr(msp_counted_ptr const&) = delete; msp_counted_ptr& operator=(msp_counted_ptr const&) = delete; private: - Ptr m_ptr; + Record m_record; }; -template +template class msp_counted_deleter final : public msp_counted_base { class impl { public: - impl(Ptr p, Deleter d) : m_ptr(p), m_deleter(std::move(d)) {} + impl(Record p, Deleter d) : m_record(p), m_deleter(std::move(d)) {} Deleter& m_del() noexcept { return m_deleter; } - Ptr m_ptr; + Record m_record; Deleter m_deleter; }; public: - msp_counted_deleter(Ptr p, Deleter d) noexcept : m_impl(p, std::move(d)) {} - virtual void m_dispose() noexcept { m_impl.m_del()(m_impl.m_ptr->m_pointers[chai::CPU]); } + msp_counted_deleter(Record p, Deleter d) noexcept : m_impl(p, std::move(d)) {} + virtual void m_dispose() noexcept { + printf("Delete GPU Memory Here...\n"); + m_impl.m_del()((Ptr)m_impl.m_record->m_pointers[chai::CPU]); + } virtual void m_destroy() noexcept { this->~msp_counted_deleter(); } msp_counted_deleter(msp_counted_deleter const&) = delete; msp_counted_deleter& operator=(msp_counted_deleter const&) = delete; @@ -129,13 +80,13 @@ class msp_shared_count { public: constexpr msp_shared_count() noexcept : m_pi(0) {} - template - explicit msp_shared_count(Ptr p) - : m_pi( new msp_counted_ptr(p) ) {} + template + explicit msp_shared_count(Ptr, Record p) + : m_pi( new msp_counted_ptr(p) ) {} - template - explicit msp_shared_count(Ptr p, Deleter d) - : m_pi( new msp_counted_deleter(p, d) ) {} + template + explicit msp_shared_count(Ptr, Record p, Deleter d) + : m_pi( new msp_counted_deleter(p, d) ) {} ~msp_shared_count() noexcept { if (m_pi) m_pi->m_release(); } @@ -217,33 +168,37 @@ class ManagedSharedPtr { */ constexpr ManagedSharedPtr() noexcept : m_ref_count() {} - // *Default* Ctor with convertible type Yp -> Tp - template> - explicit ManagedSharedPtr(Yp* host_p) : - m_pointer_record(new msp_pointer_record(host_p)), - m_ref_count(m_pointer_record), - 
m_active_pointer(m_pointer_record->m_pointers[chai::CPU]) - {} + //// *Default* Ctor with convertible type Yp -> Tp + //template> + //explicit ManagedSharedPtr(Yp* host_p) : + // m_pointer_record(new msp_pointer_record(host_p)), + // m_ref_count(host_p, m_pointer_record), + // m_active_pointer(static_cast(m_pointer_record->m_pointers[chai::CPU])) + // //m_resource_manager(SharedPtrManager::getInstance()) + //{} - template> - explicit ManagedSharedPtr(Yp* host_p, Yp* device_p) : - m_pointer_record(new msp_pointer_record(host_p, device_p)), - m_ref_count(m_pointer_record), - m_active_pointer(m_pointer_record->m_pointers[chai::CPU]) - {} + //template> + //explicit ManagedSharedPtr(Yp* host_p, Yp* device_p) : + // m_pointer_record(new msp_pointer_record(host_p, device_p)), + // m_ref_count(host_p, m_pointer_record), + // m_active_pointer(static_cast(m_pointer_record->m_pointers[chai::CPU])) + // //m_resource_manager(SharedPtrManager::getInstance()) + //{} - template> - ManagedSharedPtr(Yp* host_p, Deleter d) : - m_pointer_record(new msp_pointer_record(host_p)), - m_ref_count(m_pointer_record, std::move(d)), - m_active_pointer(m_pointer_record->m_pointers[chai::CPU]) - {} + //template> + //ManagedSharedPtr(Yp* host_p, Deleter d) : + // m_pointer_record(new msp_pointer_record(host_p)), + // m_ref_count(host_p, m_pointer_record, std::move(d)), + // m_active_pointer(static_cast(m_pointer_record->m_pointers[chai::CPU])) + // //m_resource_manager(SharedPtrManager::getInstance()) + //{} template> ManagedSharedPtr(Yp* host_p, Yp* device_p, Deleter d) : m_pointer_record(new msp_pointer_record(host_p, device_p)), - m_ref_count(m_pointer_record, std::move(d)), - m_active_pointer(m_pointer_record->m_pointers[chai::CPU]) + m_ref_count(host_p, m_pointer_record, std::move(d)), + m_active_pointer(static_cast(m_pointer_record->m_pointers[chai::CPU])) + //m_resource_manager(SharedPtrManager::getInstance()) {} /* @@ -291,7 +246,7 @@ class ManagedSharedPtr { msp_shared_count m_ref_count; 
mutable element_type* m_active_pointer = nullptr; - //mutable ArrayManager* m_resource_manager = nullptr; + //mutable SharedPtrManager* m_resource_manager = nullptr; }; template @@ -302,6 +257,13 @@ ManagedSharedPtr make_shared(Args... args) { return ManagedSharedPtr(cpu_pointer, gpu_pointer, [](Tp* p){delete p;}); } +template +ManagedSharedPtr make_shared_deleter(Args... args, Deleter d) { + Tp* gpu_pointer = make_on_device(args...); + Tp* cpu_pointer = make_on_host(args...); + + return ManagedSharedPtr(cpu_pointer, gpu_pointer, std::move(d)); +} } // namespace chai diff --git a/src/chai/PointerRecord.hpp b/src/chai/PointerRecord.hpp index e46ea899..ef28a185 100644 --- a/src/chai/PointerRecord.hpp +++ b/src/chai/PointerRecord.hpp @@ -71,6 +71,8 @@ struct PointerRecord { } }; +struct MyPointerRecord final : public PointerRecord {}; + } // end of namespace chai #endif // CHAI_PointerRecord_HPP diff --git a/src/chai/SharedPointerRecord.hpp b/src/chai/SharedPointerRecord.hpp new file mode 100644 index 00000000..3dad340d --- /dev/null +++ b/src/chai/SharedPointerRecord.hpp @@ -0,0 +1,81 @@ +////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC and CHAI +// project contributors. See the CHAI LICENSE file for details. +// +// SPDX-License-Identifier: BSD-3-Clause +////////////////////////////////////////////////////////////////////////////// +#ifndef CHAI_SharedPointerRecord_HPP +#define CHAI_SharedPointerRecord_HPP + +#include "chai/ExecutionSpaces.hpp" +#include "chai/Types.hpp" + +#include +#include + +namespace chai +{ + +/*! + * \brief Struct holding details about each pointer. + */ +template +struct msp_pointer_record { + + // Using NUM_EXECUTION_SPACES for the time being, this will help with logical + // control since ExecutionSpaces are already defined. + // Only CPU and GPU spaces will be used. + // If other spaces are enabled they will not be used by ManagedSharedPtr. 
+ void* m_pointers[NUM_EXECUTION_SPACES]; + bool m_touched[NUM_EXECUTION_SPACES]; + bool m_owned[NUM_EXECUTION_SPACES]; + + ExecutionSpace m_last_space; + //UserCallback m_user_callback; + + int m_allocators[NUM_EXECUTION_SPACES]; + + //template + //msp_pointer_record(Yp* host_p = nullptr, Yp* device_p = nullptr) : m_last_space(NONE) { + // for (int space = 0; space < NUM_EXECUTION_SPACES; ++space ) { + // m_pointers[space] = nullptr; + // m_touched[space] = false; + // m_owned[space] = true; + // m_allocators[space] = 0; + // } + // m_pointers[CPU] = host_p; + // m_pointers[GPU] = device_p; + //} + + + msp_pointer_record(void* host_p = nullptr, void* device_p = nullptr) : m_last_space(NONE) { + for (int space = 0; space < NUM_EXECUTION_SPACES; ++space ) { + m_pointers[space] = nullptr; + m_touched[space] = false; + m_owned[space] = true; + m_allocators[space] = 0; + } + m_pointers[CPU] = host_p; + m_pointers[GPU] = device_p; + } + + //Tp* get_pointer(ExecutionSpace space) noexcept { return m_pointers[space]; } + //template + //msp_pointer_record(msp_pointer_record const& rhs) : + // m_pointers(rhs.m_pointers), + // m_touched(rhs.m_touched), + // m_owned(rhs.m_owned), + // m_last_space(rhs.m_last_space), + // m_allocators(rhs.m_allocators) + //{} + + + +}; + + + + +} // end of namespace chai + +#endif // CHAI_SharedPointerRecord_HPP diff --git a/src/chai/SharedPtrManager.cpp b/src/chai/SharedPtrManager.cpp new file mode 100644 index 00000000..1845e383 --- /dev/null +++ b/src/chai/SharedPtrManager.cpp @@ -0,0 +1,616 @@ +////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC and CHAI +// project contributors. See the CHAI LICENSE file for details. 
+// +// SPDX-License-Identifier: BSD-3-Clause +////////////////////////////////////////////////////////////////////////////// +#include "chai/SharedPtrManager.hpp" + +#include "chai/config.hpp" + +#if defined(CHAI_ENABLE_CUDA) +#if !defined(CHAI_THIN_GPU_ALLOCATE) +#include "cuda_runtime_api.h" +#endif +#endif + +#include "umpire/ResourceManager.hpp" + +namespace chai +{ +thread_local ExecutionSpace SharedPtrManager::m_current_execution_space; +thread_local bool SharedPtrManager::m_synced_since_last_kernel = false; + +msp_pointer_record SharedPtrManager::s_null_record = msp_pointer_record(); + +SharedPtrManager* SharedPtrManager::getInstance() +{ + static SharedPtrManager s_resource_manager_instance; + return &s_resource_manager_instance; +} + +SharedPtrManager::SharedPtrManager() : + m_pointer_map{}, + m_allocators{}, + m_resource_manager{umpire::ResourceManager::getInstance()}, + m_callbacks_active{true} +{ + m_pointer_map.clear(); + m_current_execution_space = NONE; + m_default_allocation_space = CPU; + + m_allocators[CPU] = + new umpire::Allocator(m_resource_manager.getAllocator("HOST")); + +#if defined(CHAI_ENABLE_CUDA) || defined(CHAI_ENABLE_HIP) || defined(CHAI_ENABLE_GPU_SIMULATION_MODE) +#if defined(CHAI_ENABLE_GPU_SIMULATION_MODE) + m_allocators[GPU] = + new umpire::Allocator(m_resource_manager.getAllocator("HOST")); +#else + m_allocators[GPU] = + new umpire::Allocator(m_resource_manager.getAllocator("DEVICE")); +#endif +#endif + +#if defined(CHAI_ENABLE_UM) + m_allocators[UM] = + new umpire::Allocator(m_resource_manager.getAllocator("UM")); +#endif + +#if defined(CHAI_ENABLE_PINNED) +#if (defined(CHAI_ENABLE_CUDA) || defined(CHAI_ENABLE_HIP)) && !defined(CHAI_ENABLE_GPU_SIMULATION_MODE) + m_allocators[PINNED] = + new umpire::Allocator(m_resource_manager.getAllocator("PINNED")); +#else + m_allocators[PINNED] = + new umpire::Allocator(m_resource_manager.getAllocator("HOST")); +#endif +#endif +} + +void SharedPtrManager::registerPointer( + 
msp_pointer_record* record, + ExecutionSpace space, + bool owned) +{ + std::lock_guard lock(m_mutex); + auto pointer = record->m_pointers[space]; + + // if we are registering a new pointer record for a pointer where there is already + // a pointer record, we assume the old record was somehow abandoned by the host + // application and trigger an ACTION_FOUND_ABANDONED callback + auto found_pointer_record_pair = m_pointer_map.find(pointer); + if (found_pointer_record_pair != m_pointer_map.end()) { + msp_pointer_record ** found_pointer_record_addr = found_pointer_record_pair->second; + if (found_pointer_record_addr != nullptr) { + + msp_pointer_record *foundRecord = *found_pointer_record_addr; + // if it's actually the same pointer record, then we're OK. If it's a different + // one, delete the old one. + if (foundRecord != record) { + CHAI_LOG(Warning, "SharedPtrManager::registerPointer found a record for " << + pointer << " already there. Deleting abandoned pointer record."); + + callback(foundRecord, ACTION_FOUND_ABANDONED, space); + + for (int fspace = CPU; fspace < NUM_EXECUTION_SPACES; ++fspace) { + foundRecord->m_pointers[fspace] = nullptr; + } + + delete foundRecord; + } + } + } + + CHAI_LOG(Debug, "Registering " << pointer << " in space " << space); + + m_pointer_map.insert(pointer, record); + + for (int i = 0; i < NUM_EXECUTION_SPACES; i++) { + if (!record->m_pointers[i]) record->m_owned[i] = true; + } + record->m_owned[space] = owned; + + if (pointer) { + // if umpire already knows about this pointer, we want to make sure its records and ours + // are consistent + if (m_resource_manager.hasAllocator(pointer)) { + umpire::util::AllocationRecord *allocation_record = const_cast(m_resource_manager.findAllocationRecord(pointer)); + allocation_record->size = record->m_size; + } + // register with umpire if it's not there so that umpire can perform data migrations + else { + umpire::util::AllocationRecord new_allocation_record; + new_allocation_record.ptr = 
pointer; + new_allocation_record.size = record->m_size; + new_allocation_record.strategy = m_resource_manager.getAllocator(record->m_allocators[space]).getAllocationStrategy(); + + m_resource_manager.registerAllocation(pointer, new_allocation_record); + } + } +} + +void SharedPtrManager::deregisterPointer(msp_pointer_record* record, bool deregisterFromUmpire) +{ + std::lock_guard lock(m_mutex); + for (int i = 0; i < NUM_EXECUTION_SPACES; i++) { + void * pointer = record->m_pointers[i]; + if (pointer) { + if (deregisterFromUmpire) { + m_resource_manager.deregisterAllocation(pointer); + } + CHAI_LOG(Debug, "De-registering " << pointer); + m_pointer_map.erase(pointer); + } + } + if (record != &s_null_record) { + delete record; + } +} + +void * SharedPtrManager::frontOfAllocation(void * pointer) { + if (pointer) { + if (m_resource_manager.hasAllocator(pointer)) { + auto allocation_record = m_resource_manager.findAllocationRecord(pointer); + if (allocation_record) { + return allocation_record->ptr; + } + } + } + return nullptr; +} + +void SharedPtrManager::setExecutionSpace(ExecutionSpace space) +{ +#if defined(CHAI_ENABLE_GPU_SIMULATION_MODE) + if (isGPUSimMode()) { + space = chai::GPU; + } +#endif + + CHAI_LOG(Debug, "Setting execution space to " << space); + + if (chai::GPU == space) { + m_synced_since_last_kernel = false; + } + +#if defined(CHAI_THIN_GPU_ALLOCATE) + if (chai::CPU == space) { + syncIfNeeded(); + } +#endif + + m_current_execution_space = space; +} + +void* SharedPtrManager::move(void* pointer, + msp_pointer_record* pointer_record, + ExecutionSpace space) +{ + // Check for default arg (NONE) + if (space == NONE) { + space = m_current_execution_space; + } + + if (space == NONE) { + return pointer; + } + + move(pointer_record, space); + + return pointer_record->m_pointers[space]; +} + +ExecutionSpace SharedPtrManager::getExecutionSpace() +{ + return m_current_execution_space; +} + +void SharedPtrManager::registerTouch(msp_pointer_record* pointer_record) 
+{ + registerTouch(pointer_record, m_current_execution_space); +} + +void SharedPtrManager::registerTouch(msp_pointer_record* pointer_record, + ExecutionSpace space) +{ + if (pointer_record && pointer_record != &s_null_record) { + + if (space != NONE) { + CHAI_LOG(Debug, pointer_record->m_pointers[space] << " touched in space " << space); + pointer_record->m_touched[space] = true; + pointer_record->m_last_space = space; + } + } +} + + +void SharedPtrManager::resetTouch(msp_pointer_record* pointer_record) +{ + if (pointer_record && pointer_record!= &s_null_record) { + for (int space = CPU; space < NUM_EXECUTION_SPACES; ++space) { + pointer_record->m_touched[space] = false; + } + } +} + + +/* Not all GPU platform runtimes (notably HIP), will give you asynchronous copies to the device by default, so we leverage + * umpire's API for asynchronous copies using camp resources in this method, based off of the CHAI destination space + * */ +static void copy(void * dst_pointer, void * src_pointer, umpire::ResourceManager & manager, ExecutionSpace dst_space, ExecutionSpace src_space) { + +#ifdef CHAI_ENABLE_CUDA + camp::resources::Resource device_resource(camp::resources::Cuda::get_default()); +#elif defined(CHAI_ENABLE_HIP) + camp::resources::Resource device_resource(camp::resources::Hip::get_default()); +#else + camp::resources::Resource device_resource(camp::resources::Host::get_default()); +#endif + + camp::resources::Resource host_resource(camp::resources::Host::get_default()); + if (dst_space == GPU || src_space == GPU) { + // Do the copy using the device resource + manager.copy(dst_pointer, src_pointer, device_resource); + } else { + // Do the copy using the host resource + manager.copy(dst_pointer, src_pointer, host_resource); + } + // Ensure device to host copies are synchronous + if (dst_space == CPU && src_space == GPU) { + device_resource.wait(); + } +} + +void SharedPtrManager::move(msp_pointer_record* record, ExecutionSpace space) +{ + if (space == NONE) { + 
return; + } + + callback(record, ACTION_CAPTURED, space); + + if (space == record->m_last_space) { + return; + } + +#if defined(CHAI_ENABLE_UM) + if (record->m_last_space == UM) { + return; + } +#endif + +#if defined(CHAI_ENABLE_PINNED) + if (record->m_last_space == PINNED) { + if (space == CPU) { + syncIfNeeded(); + } + return; + } +#endif + + ExecutionSpace prev_space = record->m_last_space; + + void* src_pointer = record->m_pointers[prev_space]; + void* dst_pointer = record->m_pointers[space]; + + if (!dst_pointer) { + allocate(record, space); + dst_pointer = record->m_pointers[space]; + } + + + if ( (!record->m_touched[record->m_last_space]) || (! src_pointer )) { + return; + } else if (dst_pointer != src_pointer) { + // Exclude the copy if src and dst are the same (can happen for PINNED memory) + { + chai::copy(dst_pointer, src_pointer, m_resource_manager, space, prev_space); + } + + callback(record, ACTION_MOVE, space); + } + + resetTouch(record); +} + +void SharedPtrManager::allocate( + msp_pointer_record* pointer_record, + ExecutionSpace space) +{ + auto size = pointer_record->m_size; + auto alloc = m_resource_manager.getAllocator(pointer_record->m_allocators[space]); + + pointer_record->m_pointers[space] = alloc.allocate(size); + callback(pointer_record, ACTION_ALLOC, space); + registerPointer(pointer_record, space); + + CHAI_LOG(Debug, "Allocated array at: " << pointer_record->m_pointers[space]); +} + +void SharedPtrManager::free(msp_pointer_record* pointer_record, ExecutionSpace spaceToFree) +{ + if (!pointer_record) return; + + for (int space = CPU; space < NUM_EXECUTION_SPACES; ++space) { + if (space == spaceToFree || spaceToFree == NONE) { + if (pointer_record->m_pointers[space]) { + void* space_ptr = pointer_record->m_pointers[space]; + if (pointer_record->m_owned[space]) { +#if defined(CHAI_ENABLE_UM) + if (space_ptr == pointer_record->m_pointers[UM]) { + callback(pointer_record, + ACTION_FREE, + ExecutionSpace(UM)); + + auto alloc = 
m_resource_manager.getAllocator(pointer_record->m_allocators[UM]); + alloc.deallocate(space_ptr); + + for (int space_t = CPU; space_t < NUM_EXECUTION_SPACES; ++space_t) { + if (space_ptr == pointer_record->m_pointers[space_t]) { + pointer_record->m_pointers[space_t] = nullptr; + } + } + } else +#endif +#if defined(CHAI_ENABLE_PINNED) + if (space_ptr == pointer_record->m_pointers[PINNED]) { + callback(pointer_record, + ACTION_FREE, + ExecutionSpace(PINNED)); + + auto alloc = m_resource_manager.getAllocator( + pointer_record->m_allocators[PINNED]); + alloc.deallocate(space_ptr); + + for (int space_t = CPU; space_t < NUM_EXECUTION_SPACES; ++space_t) { + if (space_ptr == pointer_record->m_pointers[space_t]) { + pointer_record->m_pointers[space_t] = nullptr; + } + } + } else +#endif + { + callback(pointer_record, + ACTION_FREE, + ExecutionSpace(space)); + + auto alloc = m_resource_manager.getAllocator( + pointer_record->m_allocators[space]); + alloc.deallocate(space_ptr); + + pointer_record->m_pointers[space] = nullptr; + } + } + else + { + m_resource_manager.deregisterAllocation(space_ptr); + } + { + CHAI_LOG(Debug, "DeRegistering " << space_ptr); + std::lock_guard lock(m_mutex); + m_pointer_map.erase(space_ptr); + } + } + } + } + + if (pointer_record != &s_null_record && spaceToFree == NONE) { + delete pointer_record; + } +} + +size_t SharedPtrManager::getSize(void* ptr) +{ + // TODO + auto pointer_record = getPointerRecord(ptr); + return pointer_record->m_size; +} + +void SharedPtrManager::setDefaultAllocationSpace(ExecutionSpace space) +{ + m_default_allocation_space = space; +} + +ExecutionSpace SharedPtrManager::getDefaultAllocationSpace() +{ + return m_default_allocation_space; +} + + +void SharedPtrManager::setUserCallback(void* pointer, UserCallback const& f) +{ + // TODO ?? 
+ auto pointer_record = getPointerRecord(pointer); + pointer_record->m_user_callback = f; +} + +void SharedPtrManager::setGlobalUserCallback(UserCallback const& f) +{ + m_user_callback = f; +} + +msp_pointer_record* SharedPtrManager::getPointerRecord(void* pointer) +{ + std::lock_guard lock(m_mutex); + auto record = m_pointer_map.find(pointer); + return record->second ? *record->second : &s_null_record; +} + +msp_pointer_record* SharedPtrManager::makeManaged(void* pointer, + size_t size, + ExecutionSpace space, + bool owned) +{ + if (pointer == nullptr) { + return &s_null_record ; + } + + if (space == NONE) { + space = getDefaultAllocationSpace(); + } + + m_resource_manager.registerAllocation( + pointer, + {pointer, size, m_allocators[space]->getAllocationStrategy()}); + + auto pointer_record = getPointerRecord(pointer); + + if (pointer_record == &s_null_record) { + if (pointer) { + pointer_record = new msp_pointer_record(); + } else { + return pointer_record; + } + } + else { + CHAI_LOG(Warning, "SharedPtrManager::makeManaged found abandoned pointer record!!!"); + callback(pointer_record, ACTION_FOUND_ABANDONED, space); + } + + pointer_record->m_pointers[space] = pointer; + pointer_record->m_owned[space] = owned; + pointer_record->m_size = size; + pointer_record->m_user_callback = [] (const msp_pointer_record*, Action, ExecutionSpace) {}; + + for (int space = CPU; space < NUM_EXECUTION_SPACES; ++space) { + pointer_record->m_allocators[space] = getAllocatorId(ExecutionSpace(space)); + } + + if (pointer && size > 0) { + registerPointer(pointer_record, space, owned); + } + + return pointer_record; +} + +msp_pointer_record* SharedPtrManager::deepCopyRecord(msp_pointer_record const* record) +{ + msp_pointer_record* new_record = new msp_pointer_record{}; + const size_t size = record->m_size; + new_record->m_size = size; + new_record->m_user_callback = [] (const msp_pointer_record*, Action, ExecutionSpace) {}; + + const ExecutionSpace last_space = record->m_last_space; + 
new_record->m_last_space = last_space; + for (int space = CPU; space < NUM_EXECUTION_SPACES; ++space) { + new_record->m_allocators[space] = record->m_allocators[space]; + } + + allocate(new_record, last_space); + + for (int space = CPU; space < NUM_EXECUTION_SPACES; ++space) { + new_record->m_owned[space] = true; + new_record->m_touched[space] = false; + } + + new_record->m_touched[last_space] = true; + + void* dst_pointer = new_record->m_pointers[last_space]; + void* src_pointer = record->m_pointers[last_space]; + + chai::copy(dst_pointer, src_pointer, m_resource_manager, last_space, last_space); + + return new_record; +} + +std::unordered_map +SharedPtrManager::getPointerMap() const +{ + std::lock_guard lock(m_mutex); + std::unordered_map mapCopy; + + for (const auto& entry : m_pointer_map) { + mapCopy[entry.first] = *entry.second; + } + + return mapCopy; +} + +size_t SharedPtrManager::getTotalNumArrays() const { return m_pointer_map.size(); } + +// TODO: Investigate counting memory allocated in each execution space if +// possible +size_t SharedPtrManager::getTotalSize() const +{ + std::lock_guard lock(m_mutex); + size_t total = 0; + + for (const auto& entry : m_pointer_map) { + total += (*entry.second)->m_size; + } + + return total; +} + +void SharedPtrManager::reportLeaks() const +{ + std::lock_guard lock(m_mutex); + for (const auto& entry : m_pointer_map) { + const void* pointer = entry.first; + const msp_pointer_record* record = *entry.second; + + for (int s = CPU; s < NUM_EXECUTION_SPACES; ++s) { + if (pointer == record->m_pointers[s]) { + callback(record, ACTION_LEAKED, ExecutionSpace(s)); + } + } + } +} + +int +SharedPtrManager::getAllocatorId(ExecutionSpace space) const +{ + return m_allocators[space]->getId(); +} + +void SharedPtrManager::evict(ExecutionSpace space, ExecutionSpace destinationSpace) { + // Check arguments + if (space == NONE) { + // Nothing to be done + return; + } + + if (destinationSpace == NONE) { + // If the destination space is 
NONE, evicting invalidates all data and + // leaves us in a bad state (if the last touch was in the eviction space). + CHAI_LOG(Warning, "evict does nothing with destinationSpace == NONE!"); + return; + } + + if (space == destinationSpace) { + // It doesn't make sense to evict to the same space, so do nothing + CHAI_LOG(Warning, "evict does nothing with space == destinationSpace!"); + return; + } + + // Now move and evict + std::vector pointersToEvict; + { + std::lock_guard lock(m_mutex); + for (const auto& entry : m_pointer_map) { + // Get the pointer record + auto record = *entry.second; + + // Move the data and register the touches + move(record, destinationSpace); + registerTouch(record, destinationSpace); + + // If the destinationSpace is ever allowed to be NONE, then we will need to + // update the touch in the eviction space and make sure the last space is not + // the eviction space. + + // Mark record for eviction later in this routine + pointersToEvict.push_back(record); + } + } + + // This must be done in a second pass because free erases from m_pointer_map, + // which would invalidate the iterator in the above loop + for (const auto& entry : pointersToEvict) { + free(entry, space); + } +} + + +} // end of namespace chai diff --git a/src/chai/SharedPtrManager.hpp b/src/chai/SharedPtrManager.hpp new file mode 100644 index 00000000..c4a88097 --- /dev/null +++ b/src/chai/SharedPtrManager.hpp @@ -0,0 +1,535 @@ +////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC and CHAI +// project contributors. See the CHAI LICENSE file for details. 
+// +// SPDX-License-Identifier: BSD-3-Clause +////////////////////////////////////////////////////////////////////////////// +#ifndef CHAI_SharedPtrManager_HPP +#define CHAI_SharedPtrManager_HPP + +#include "chai/config.hpp" +#include "chai/ChaiMacros.hpp" +#include "chai/ExecutionSpaces.hpp" +//#include "chai/PointerRecord.hpp" +#include "chai/SharedPointerRecord.hpp" +#include "chai/Types.hpp" +#include "chai/ArrayManager.hpp" + +#if defined(CHAI_ENABLE_RAJA_PLUGIN) +#include "chai/pluginLinker.hpp" +#endif + +#include + +#include "umpire/Allocator.hpp" +#include "umpire/util/MemoryMap.hpp" + +#if defined(CHAI_ENABLE_CUDA) +#include +#endif +#if defined(CHAI_ENABLE_HIP) +#include "hip/hip_runtime_api.h" +#endif + +namespace chai +{ + +// CHAI_GPU_ERROR_CHECK macro +//#if defined(CHAI_ENABLE_CUDA) || defined(CHAI_ENABLE_HIP) +// +//#ifdef CHAI_ENABLE_GPU_ERROR_CHECKING +// +//#ifdef CHAI_ENABLE_CUDA +//inline void gpuErrorCheck(cudaError_t code, const char *file, int line, bool abort=true) +//{ +// if (code != cudaSuccess) { +// fprintf(stderr, "[CHAI] GPU Error: %s %s %d\n", cudaGetErrorString(code), file, line); +// if (abort) { +// exit(code); +// } +// } +//} +//#elif defined(CHAI_ENABLE_HIP) +//inline void gpuErrorCheck(hipError_t code, const char *file, int line, bool abort=true) +//{ +// if (code != hipSuccess) { +// fprintf(stderr, "[CHAI] GPU Error: %s %s %d\n", hipGetErrorString(code), file, line); +// if (abort) { +// exit(code); +// } +// } +//} +//#endif +// +// +//#define CHAI_GPU_ERROR_CHECK(code) { gpuErrorCheck((code), __FILE__, __LINE__); } +//#else // CHAI_ENABLE_GPU_ERROR_CHECKING +//#define CHAI_GPU_ERROR_CHECK(code) code +//#endif // CHAI_ENABLE_GPU_ERROR_CHECKING +// +//#endif + +//// wrapper for hip/cuda synchronize +//inline void synchronize() { +//#if defined (CHAI_ENABLE_HIP) &&!defined(__HIP_DEVICE_COMPILE__) +// CHAI_GPU_ERROR_CHECK(hipDeviceSynchronize()); +//#elif defined (CHAI_ENABLE_CUDA) &&!defined(__CUDA_ARCH__) +// 
CHAI_GPU_ERROR_CHECK(cudaDeviceSynchronize()); +//#endif +//} +// +//#if defined(CHAI_GPUCC) +// +//// wrapper for hip/cuda free +//CHAI_HOST inline void gpuFree(void* buffer) { +//#if defined (CHAI_ENABLE_HIP) +// CHAI_GPU_ERROR_CHECK(hipFree(buffer)); +//#elif defined (CHAI_ENABLE_CUDA) +// CHAI_GPU_ERROR_CHECK(cudaFree(buffer)); +//#endif +//} +// +//// wrapper for hip/cuda malloc +//CHAI_HOST inline void gpuMalloc(void** devPtr, size_t size) { +//#if defined (CHAI_ENABLE_HIP) +// CHAI_GPU_ERROR_CHECK(hipMalloc(devPtr, size)); +//#elif defined (CHAI_ENABLE_CUDA) +// CHAI_GPU_ERROR_CHECK(cudaMalloc(devPtr, size)); +//#endif +//} +// +//// wrapper for hip/cuda managed malloc +//CHAI_HOST inline void gpuMallocManaged(void** devPtr, size_t size) { +//#if defined (CHAI_ENABLE_HIP) +// CHAI_GPU_ERROR_CHECK(hipMallocManaged(devPtr, size)); +//#elif defined (CHAI_ENABLE_CUDA) +// CHAI_GPU_ERROR_CHECK(cudaMallocManaged(devPtr, size)); +//#endif +//} +// +//// wrapper for hip/cuda mem copy +//CHAI_HOST inline void gpuMemcpy(void* dst, const void* src, size_t count, gpuMemcpyKind kind) { +//#if defined (CHAI_ENABLE_HIP) +// CHAI_GPU_ERROR_CHECK(hipMemcpy(dst, src, count, kind)); +//#elif defined (CHAI_ENABLE_CUDA) +// CHAI_GPU_ERROR_CHECK(cudaMemcpy(dst, src, count, kind)); +//#endif +//} +// +//#endif //#if defined(CHAI_GPUCC) + +/*! + * \brief Singleton that manages caching and movement of ManagedArray objects. + * + * The SharedPtrManager class co-ordinates the allocation and movement of + * ManagedArray objects. These objects are cached, and data is only copied + * between ExecutionSpaces when necessary. This functionality is typically + * hidden behind a programming model layer, such as RAJA, or the example + * included in util/forall.hpp + * + * The SharedPtrManager is a singleton, so must always be accessed through the + * static getInstance method. 
Here is an example using the SharedPtrManager: + * + * \code + * chai::SharedPtrManager* rm = chai::SharedPtrManager::getInstance(); + * rm->setExecutionSpace(chai::CPU); + * // Do something with ManagedArrays on the CPU... but they must be copied! + * rm->setExecutionSpace(chai::NONE); + * \endcode + */ +class SharedPtrManager +{ +public: + template + using T_non_const = typename std::remove_const::type; + + using PointerMap = umpire::util::MemoryMap; + + CHAISHAREDDLL_API static msp_pointer_record s_null_record; + + /*! + * \brief Get the singleton instance. + * + * \return Pointer to the SharedPtrManager instance. + * + */ + CHAISHAREDDLL_API + static SharedPtrManager* getInstance(); + + /*! + * \brief Set the current execution space. + * + * \param space The space to set as current. + */ + CHAISHAREDDLL_API void setExecutionSpace(ExecutionSpace space); + + /*! + * \brief Get the current execution space. + * + * \return The current execution space. + */ + CHAISHAREDDLL_API ExecutionSpace getExecutionSpace(); + + /*! + * \brief Move data in pointer to the current execution space. + * + * \param pointer Pointer to data in any execution space. + * \return Pointer to data in the current execution space. + */ + CHAISHAREDDLL_API void* move(void* pointer, + msp_pointer_record* pointer_record, + ExecutionSpace = NONE); + + /*! + * \brief Register a touch of the pointer in the current execution space. + * + * \param pointer_record Record of the pointer to register a touch of. + */ + CHAISHAREDDLL_API void registerTouch(msp_pointer_record* pointer_record); + + /*! + * \brief Register a touch of the pointer in the given execution space. + * + * The pointer doesn't need to exist in the space being touched. + * + * \param pointer_record Record of the pointer to register a touch of. + * \param space Space to register touch. + */ + CHAISHAREDDLL_API void registerTouch(msp_pointer_record* pointer_record, ExecutionSpace space); + + /*! 
+ * \brief Make a new allocation of the data described by the msp_pointer_record in + * the given space. + * + * \param pointer_record + * \param space Space in which to make the allocation. + */ + CHAISHAREDDLL_API void allocate(msp_pointer_record* pointer_record, ExecutionSpace space = CPU); + + /*! + * \brief Reallocate data. + * + * Data is reallocated in all spaces this pointer is associated with. + * + * \param ptr Pointer to address to reallocate + * \param elems The number of elements to allocate. + * \tparam T The type of data to allocate. + * + * \return Pointer to the allocated memory. + */ + template + void* reallocate(void* pointer, + size_t elems, + msp_pointer_record* record); + + /*! + * \brief Set the default space for new ManagedArray allocations. + * + * ManagedArrays allocated without an explicit ExecutionSpace argument will + * be allocated in space after this routine is called. + * + * \param space New space for default allocations. + */ + CHAISHAREDDLL_API void setDefaultAllocationSpace(ExecutionSpace space); + + /*! + * \brief Get the currently set default allocation space. + * + * See also setDefaultAllocationSpace. + * + * \return Current default space for allocations. + */ + CHAISHAREDDLL_API ExecutionSpace getDefaultAllocationSpace(); + + /*! + * \brief Free allocation(s) associated with the given msp_pointer_record. + * Default (space == NONE) will free all allocations and delete + * the pointer record. + */ + CHAISHAREDDLL_API void free(msp_pointer_record* pointer, ExecutionSpace space = NONE); + +#if defined(CHAI_ENABLE_PICK) + template + T_non_const pick(T* src_ptr, size_t index); + + template + void set(T* dst_ptr, size_t index, const T& val); +#endif + + /*! + * \brief Get the size of the given pointer. + * + * \param pointer Pointer to find the size of. + * \return Size of pointer. 
+ */ + CHAISHAREDDLL_API size_t getSize(void* pointer); + + CHAISHAREDDLL_API msp_pointer_record* makeManaged(void* pointer, + size_t size, + ExecutionSpace space, + bool owned); + + /*! + * \brief Assign a user-defined callback triggered upon memory operations. + * This callback applies to a single ManagedArray. + */ + CHAISHAREDDLL_API void setUserCallback(void* pointer, UserCallback const& f); + + /*! + * \brief Assign a user-defined callback triggered upon memory operations. + * This callback applies to all ManagedArrays. + */ + CHAISHAREDDLL_API void setGlobalUserCallback(UserCallback const& f); + + /*! + * \brief Set touched to false in all spaces for the given msp_pointer_record. + * + * \param pointer_record msp_pointer_record to reset. + */ + CHAISHAREDDLL_API void resetTouch(msp_pointer_record* pointer_record); + + /*! + * \brief Find the msp_pointer_record corresponding to the raw pointer. + * + * \param pointer Raw pointer to find the msp_pointer_record for. + * + * \return msp_pointer_record containing the raw pointer, or an empty + * msp_pointer_record if none found. + */ + CHAISHAREDDLL_API msp_pointer_record* getPointerRecord(void* pointer); + + /*! + * \brief Create a copy of the given msp_pointer_record with a new allocation + * in the active space. + * + * \param record The msp_pointer_record to copy. + * + * \return A copy of the given msp_pointer_record, must be free'd with delete. + */ + CHAISHAREDDLL_API msp_pointer_record* deepCopyRecord(msp_pointer_record const* record); + + /*! + * \brief Create a copy of the pointer map. + * + * \return A copy of the pointer map. Can be used to find memory leaks. + */ + CHAISHAREDDLL_API std::unordered_map getPointerMap() const; + + /*! + * \brief Get the total number of arrays registered with the array manager. + * + * \return The total number of arrays registered with the array manager. + */ + CHAISHAREDDLL_API size_t getTotalNumArrays() const; + + /*! 
+ * \brief Get the total amount of memory allocated. + * + * \return The total amount of memory allocated. + */ + CHAISHAREDDLL_API size_t getTotalSize() const; + + /*! + * \brief Calls callbacks of pointers still in the map with ACTION_LEAKED. + */ + CHAISHAREDDLL_API void reportLeaks() const; + + /*! + * \brief Get the allocator ID + * + * \return The allocator ID. + */ + CHAISHAREDDLL_API int getAllocatorId(ExecutionSpace space) const; + + /*! + * \brief Wraps our resource manager's copy. + */ + CHAISHAREDDLL_API void copy(void * dst, void * src, size_t size); + + /*! + * \brief Registering an allocation with the SharedPtrManager + * + * \param record msp_pointer_record of this allocation. + * \param space Space in which the pointer was allocated. + * \param owned Should the allocation be free'd by CHAI? + */ + CHAISHAREDDLL_API void registerPointer(msp_pointer_record* record, + ExecutionSpace space, + bool owned = true); + + /*! + * \brief Deregister a msp_pointer_record from the SharedPtrManager. + * + * \param record msp_pointer_record of allocation to deregister. + * \param deregisterFromUmpire If true, deregister from umpire as well. + */ + CHAISHAREDDLL_API void deregisterPointer(msp_pointer_record* record, bool deregisterFromUmpire=false); + + /*! + * \brief Returns the front of the allocation associated with this pointer, nullptr if allocation not found. + * + * \param pointer Pointer to address of that we want the front of the allocation for. + */ + CHAISHAREDDLL_API void * frontOfAllocation(void * pointer); + + /*! + * \brief set the allocator for an execution space. + * + * \param space Execution space to set the default allocator for. + * \param allocator The allocator to use for this space. Will be copied into chai. + */ + void setAllocator(ExecutionSpace space, umpire::Allocator &allocator); + + /*! + * \brief Get the allocator for an execution space. + * + * \param space Execution space of the allocator to get. 
+ * + * \return The allocator for the given space. + */ + umpire::Allocator getAllocator(ExecutionSpace space); + + /*! + * \brief Turn callbacks on. + */ + void enableCallbacks() { m_callbacks_active = true; } + + /*! + * \brief Turn callbacks off. + */ + void disableCallbacks() { m_callbacks_active = false; } + + /*! + * \brief synchronize the device if there hasn't been a synchronize since the last kernel + */ + CHAISHAREDDLL_API bool syncIfNeeded(); + +#if defined(CHAI_ENABLE_GPU_SIMULATION_MODE) + /*! + * \brief Turn the GPU simulation mode on or off. + */ + void setGPUSimMode(bool gpuSimMode) { m_gpu_sim_mode = gpuSimMode; } + + /*! + * \brief Return true if GPU simulation mode is on, false otherwise. + */ + bool isGPUSimMode() { return m_gpu_sim_mode; } +#endif + + /*! + * \brief Evicts the data in the given space. + * + * \param space Execution space to evict. + * \param destinationSpace The execution space to move the data to. + * Must not equal space or NONE. + */ + CHAISHAREDDLL_API void evict(ExecutionSpace space, ExecutionSpace destinationSpace); + + +protected: + /*! + * \brief Construct a new SharedPtrManager. + * + * The constructor is a protected member, ensuring that it can + * only be called by the singleton getInstance method. + */ + SharedPtrManager(); + + + +private: + + + /*! + * \brief Move data in msp_pointer_record to the corresponding ExecutionSpace. + * + * \param record + * \param space + */ + void move(msp_pointer_record* record, ExecutionSpace space); + + /*! 
+ * \brief Execute a user callback if callbacks are active + * + * \param record The pointer record containing the callback + * \param action The event that occurred + * \param space The space in which the event occurred + * \param size The number of bytes in the array associated with this pointer record + */ + inline void callback(const msp_pointer_record* record, + Action action, + ExecutionSpace space) const { + if (m_callbacks_active) { + // Callback for this ManagedArray only + if (record && record->m_user_callback) { + record->m_user_callback(record, action, space); + } + + // Callback for all ManagedArrays + if (m_user_callback) { + m_user_callback(record, action, space); + } + } + } + + /*! + * Current execution space. + */ + static thread_local ExecutionSpace m_current_execution_space; + + /** + * Default space for new allocations. + */ + ExecutionSpace m_default_allocation_space; + + /*! + * Map of active ManagedArray pointers to their corresponding msp_pointer_record. + */ + PointerMap m_pointer_map; + + /*! + * + * \brief Array of umpire::Allocators, indexed by ExecutionSpace. + */ + umpire::Allocator* m_allocators[NUM_EXECUTION_SPACES]; + + /*! + * \brief The umpire resource manager. + */ + umpire::ResourceManager& m_resource_manager; + + /*! + * \brief Used for thread-safe operations. + */ + mutable std::mutex m_mutex; + + /*! + * \brief A callback triggered upon memory operations on all ManagedArrays. + */ + UserCallback m_user_callback; + + /*! + * \brief Controls whether or not callbacks are called. + */ + bool m_callbacks_active; + + /*! + * Whether or not a synchronize has been performed since the launch of the last + * GPU context + */ + static thread_local bool m_synced_since_last_kernel; + +#if defined(CHAI_ENABLE_GPU_SIMULATION_MODE) + /*! + * Used by the RAJA plugin to determine whether the execution space should be + * CPU or GPU. 
+ */ + bool m_gpu_sim_mode = false; +#endif +}; + +} // end of namespace chai + +#include "chai/SharedPtrManager.inl" + +#endif // CHAI_SharedPtrManager_HPP diff --git a/src/chai/SharedPtrManager.inl b/src/chai/SharedPtrManager.inl new file mode 100644 index 00000000..af7b7a4b --- /dev/null +++ b/src/chai/SharedPtrManager.inl @@ -0,0 +1,129 @@ +////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC and CHAI +// project contributors. See the CHAI LICENSE file for details. +// +// SPDX-License-Identifier: BSD-3-Clause +////////////////////////////////////////////////////////////////////////////// +#ifndef CHAI_SharedPtrManager_INL +#define CHAI_SharedPtrManager_INL + +#include "chai/config.hpp" + +#include "chai/SharedPtrManager.hpp" +#include "chai/ChaiMacros.hpp" + +#include + +#include "umpire/ResourceManager.hpp" + +#if defined(CHAI_ENABLE_UM) +#if !defined(CHAI_THIN_GPU_ALLOCATE) +#include +#endif +#endif + +namespace chai { + +template +CHAI_INLINE +void* SharedPtrManager::reallocate(void* pointer, size_t elems, msp_pointer_record* pointer_record) +{ + ExecutionSpace my_space = CPU; + + for (int space = CPU; space < NUM_EXECUTION_SPACES; ++space) { + if (pointer_record->m_pointers[space] == pointer) { + my_space = static_cast(space); + break; + } + } + + for (int space = CPU; space < NUM_EXECUTION_SPACES; ++space) { + if (!pointer_record->m_owned[space]) { + CHAI_LOG(Debug, "Cannot reallocate unowned pointer"); + return pointer_record->m_pointers[my_space]; + } + } + + // Call callback with ACTION_FREE before changing the size + for (int space = CPU; space < NUM_EXECUTION_SPACES; ++space) { + if (pointer_record->m_pointers[space]) { + callback(pointer_record, ACTION_FREE, ExecutionSpace(space)); + } + } + + // Update the pointer record size + size_t old_size = pointer_record->m_size; + size_t new_size = sizeof(T) * elems; + pointer_record->m_size = new_size; + + // only 
copy however many bytes overlap + size_t num_bytes_to_copy = std::min(old_size, new_size); + + for (int space = CPU; space < NUM_EXECUTION_SPACES; ++space) { + void* old_ptr = pointer_record->m_pointers[space]; + + if (old_ptr) { + void* new_ptr = m_allocators[space]->allocate(new_size); + m_resource_manager.copy(new_ptr, old_ptr, num_bytes_to_copy); + m_allocators[space]->deallocate(old_ptr); + + pointer_record->m_pointers[space] = new_ptr; + callback(pointer_record, ACTION_ALLOC, ExecutionSpace(space)); + + m_pointer_map.erase(old_ptr); + m_pointer_map.insert(new_ptr, pointer_record); + } + } + + return pointer_record->m_pointers[my_space]; +} + +#if defined(CHAI_ENABLE_PICK) +template +CHAI_INLINE +typename SharedPtrManager::T_non_const SharedPtrManager::pick(T* src_ptr, size_t index) +{ + T_non_const val; + m_resource_manager.registerAllocation(const_cast*>(&val), umpire::util::AllocationRecord{const_cast*>(&val), sizeof(T), m_resource_manager.getAllocator("HOST").getAllocationStrategy()}); + m_resource_manager.copy(const_cast*>(&val), const_cast*>(src_ptr+index), sizeof(T)); + m_resource_manager.deregisterAllocation(&val); + return val; +} + +template +CHAI_INLINE +void SharedPtrManager::set(T* dst_ptr, size_t index, const T& val) +{ + m_resource_manager.registerAllocation(const_cast*>(&val), umpire::util::AllocationRecord{const_cast*>(&val), sizeof(T), m_resource_manager.getAllocator("HOST").getAllocationStrategy()}); + m_resource_manager.copy(const_cast*>(dst_ptr+index), const_cast*>(&val), sizeof(T)); + m_resource_manager.deregisterAllocation(const_cast*>(&val)); +} +#endif + +CHAI_INLINE +void SharedPtrManager::copy(void * dst, void * src, size_t size) { + m_resource_manager.copy(dst,src,size); +} + +CHAI_INLINE +umpire::Allocator SharedPtrManager::getAllocator(ExecutionSpace space) { + return *m_allocators[space]; +} + +CHAI_INLINE +void SharedPtrManager::setAllocator(ExecutionSpace space, umpire::Allocator &allocator) { + *m_allocators[space] = 
allocator; +} + +CHAI_INLINE +bool SharedPtrManager::syncIfNeeded() { + if (!m_synced_since_last_kernel) { + synchronize(); + m_synced_since_last_kernel = true; + return true; + } + return false; +} +} // end of namespace chai + +#endif // CHAI_SharedPtrManager_INL diff --git a/tests/integration/managed_ptr_tests.cpp b/tests/integration/managed_ptr_tests.cpp index 259f9e61..de7f348a 100644 --- a/tests/integration/managed_ptr_tests.cpp +++ b/tests/integration/managed_ptr_tests.cpp @@ -167,11 +167,14 @@ TEST(managed_ptr, shared_ptr) //chai::ManagedSharedPtr sptr(new TestDerived(), // [](TestDerived*p){ printf("Deleter Call\n"); p->~TestDerived(); }); - chai::ManagedSharedPtr sptr = chai::ManagedSharedPtr(new TestDerived(), - [](TestDerived*p){ printf("Custom Deleter Call\n"); delete p; }); + //chai::ManagedSharedPtr sptr = chai::ManagedSharedPtr(new TestDerived(), + // [](TestDerived*p){ printf("Custom Deleter Call\n"); delete p; }); //chai::ManagedSharedPtr sptr(new TestDerived()); - + //chai::ManagedSharedPtr sptr = chai::make_shared(); + chai::ManagedSharedPtr sptr = chai::make_shared_deleter( + [](TestDerived* p){ printf("Custom Deleter Call\n"); p->~TestDerived(); }); + std::cout << "use_count : " << sptr.use_count() << std::endl; auto sptr2 = sptr; From 75749a2f5584e400a1d136802aea7abd69eff122 Mon Sep 17 00:00:00 2001 From: mdavis36 Date: Tue, 9 Apr 2024 14:26:44 -0700 Subject: [PATCH 04/44] MSPtr counter takes on ownership of the pointer record creation and lifetime. 
--- src/chai/ManagedSharedPtr.hpp | 156 +----------------------- src/chai/SharedPointerRecord.hpp | 144 ++++++++++++++++++---- tests/integration/managed_ptr_tests.cpp | 4 +- 3 files changed, 129 insertions(+), 175 deletions(-) diff --git a/src/chai/ManagedSharedPtr.hpp b/src/chai/ManagedSharedPtr.hpp index ad9b04ad..cabb691e 100644 --- a/src/chai/ManagedSharedPtr.hpp +++ b/src/chai/ManagedSharedPtr.hpp @@ -12,117 +12,6 @@ namespace chai { -class msp_counted_base { -public: - msp_counted_base() noexcept : m_use_count(1) {} - - virtual ~msp_counted_base() noexcept {} - - virtual void m_dispose() noexcept = 0; - virtual void m_destroy() noexcept { delete this; } - - void m_add_ref_copy() noexcept { ++m_use_count; } - - void m_release() noexcept { - if(--m_use_count == 0) { - m_dispose(); - m_destroy(); - } - } - - long m_get_use_count() const noexcept { return m_use_count; } -private: - msp_counted_base(msp_counted_base const&) = delete; - msp_counted_base& operator=(msp_counted_base const&) = delete; - - long m_use_count = 0; -}; - -template -class msp_counted_ptr final : public msp_counted_base { -public: - msp_counted_ptr(Record p) noexcept : m_record(p) {} - //virtual void m_dispose() noexcept { delete (m_record.get_pointer(chai::CPU)); }// TODO : Other Exec spaces... - virtual void m_dispose() noexcept { delete (Ptr)m_record->m_pointers[chai::CPU]; }// TODO : Other Exec spaces... 
- virtual void m_destroy() noexcept { delete this; } - msp_counted_ptr(msp_counted_ptr const&) = delete; - msp_counted_ptr& operator=(msp_counted_ptr const&) = delete; -private: - Record m_record; -}; - -template -class msp_counted_deleter final : public msp_counted_base { - - class impl { - public: - impl(Record p, Deleter d) : m_record(p), m_deleter(std::move(d)) {} - Deleter& m_del() noexcept { return m_deleter; } - Record m_record; - Deleter m_deleter; - }; - -public: - msp_counted_deleter(Record p, Deleter d) noexcept : m_impl(p, std::move(d)) {} - virtual void m_dispose() noexcept { - printf("Delete GPU Memory Here...\n"); - m_impl.m_del()((Ptr)m_impl.m_record->m_pointers[chai::CPU]); - } - virtual void m_destroy() noexcept { this->~msp_counted_deleter(); } - msp_counted_deleter(msp_counted_deleter const&) = delete; - msp_counted_deleter& operator=(msp_counted_deleter const&) = delete; -private: - impl m_impl; -}; - - -class msp_shared_count { -public: - constexpr msp_shared_count() noexcept : m_pi(0) {} - - template - explicit msp_shared_count(Ptr, Record p) - : m_pi( new msp_counted_ptr(p) ) {} - - template - explicit msp_shared_count(Ptr, Record p, Deleter d) - : m_pi( new msp_counted_deleter(p, d) ) {} - - ~msp_shared_count() noexcept - { if (m_pi) m_pi->m_release(); } - - msp_shared_count(msp_shared_count const& rhs) noexcept : m_pi(rhs.m_pi) - { if (m_pi) m_pi->m_add_ref_copy(); } - - msp_shared_count& operator=(msp_shared_count const& rhs) noexcept { - msp_counted_base* temp = rhs.m_pi; - if (temp != m_pi) - { - if (temp) temp->m_add_ref_copy(); - if (m_pi) m_pi->m_release(); - m_pi = temp; - } - return *this; - } - - void m_swap(msp_shared_count& rhs) noexcept { - msp_counted_base* temp = rhs.m_pi; - rhs.m_pi = m_pi; - m_pi = temp; - } - - long m_get_use_count() const noexcept - { return m_pi ? 
m_pi->m_get_use_count() : 0; } - - friend inline bool - operator==(msp_shared_count const& a, msp_shared_count const& b) noexcept - { return a.m_pi == b.m_pi; } - - msp_counted_base* m_pi; - -}; - - @@ -166,39 +55,13 @@ class ManagedSharedPtr { /* * Constructors */ - constexpr ManagedSharedPtr() noexcept : m_ref_count() {} + constexpr ManagedSharedPtr() noexcept : m_record_count() {} //// *Default* Ctor with convertible type Yp -> Tp - //template> - //explicit ManagedSharedPtr(Yp* host_p) : - // m_pointer_record(new msp_pointer_record(host_p)), - // m_ref_count(host_p, m_pointer_record), - // m_active_pointer(static_cast(m_pointer_record->m_pointers[chai::CPU])) - // //m_resource_manager(SharedPtrManager::getInstance()) - //{} - - //template> - //explicit ManagedSharedPtr(Yp* host_p, Yp* device_p) : - // m_pointer_record(new msp_pointer_record(host_p, device_p)), - // m_ref_count(host_p, m_pointer_record), - // m_active_pointer(static_cast(m_pointer_record->m_pointers[chai::CPU])) - // //m_resource_manager(SharedPtrManager::getInstance()) - //{} - - //template> - //ManagedSharedPtr(Yp* host_p, Deleter d) : - // m_pointer_record(new msp_pointer_record(host_p)), - // m_ref_count(host_p, m_pointer_record, std::move(d)), - // m_active_pointer(static_cast(m_pointer_record->m_pointers[chai::CPU])) - // //m_resource_manager(SharedPtrManager::getInstance()) - //{} - template> ManagedSharedPtr(Yp* host_p, Yp* device_p, Deleter d) : - m_pointer_record(new msp_pointer_record(host_p, device_p)), - m_ref_count(host_p, m_pointer_record, std::move(d)), - m_active_pointer(static_cast(m_pointer_record->m_pointers[chai::CPU])) - //m_resource_manager(SharedPtrManager::getInstance()) + m_record_count(host_p, device_p, std::move(d)), + m_active_pointer(m_record_count.getPointer(chai::CPU)) {} /* @@ -208,11 +71,9 @@ class ManagedSharedPtr { template> ManagedSharedPtr(ManagedSharedPtr const& rhs) noexcept : - m_ref_count(rhs.m_ref_count), + m_record_count(rhs.m_record_count), 
m_active_pointer(rhs.m_active_pointer) { - // TODO : Is this safe?? - m_pointer_record = reinterpret_cast*>(rhs.m_pointer_record); } @@ -230,7 +91,7 @@ class ManagedSharedPtr { public: - long use_count() const noexcept { return m_ref_count.m_get_use_count(); } + long use_count() const noexcept { return m_record_count.m_get_use_count(); } /* * Private Members @@ -239,13 +100,8 @@ class ManagedSharedPtr { template friend class ManagedSharedPtr; - //template - //friend ManagedSharedPtr make_managed(Args... args); - - mutable msp_pointer_record* m_pointer_record = nullptr; - msp_shared_count m_ref_count; + msp_record_count m_record_count; mutable element_type* m_active_pointer = nullptr; - //mutable SharedPtrManager* m_resource_manager = nullptr; }; diff --git a/src/chai/SharedPointerRecord.hpp b/src/chai/SharedPointerRecord.hpp index 3dad340d..a12c181f 100644 --- a/src/chai/SharedPointerRecord.hpp +++ b/src/chai/SharedPointerRecord.hpp @@ -19,7 +19,7 @@ namespace chai /*! * \brief Struct holding details about each pointer. 
*/ -template +//template struct msp_pointer_record { // Using NUM_EXECUTION_SPACES for the time being, this will help with logical @@ -35,18 +35,6 @@ struct msp_pointer_record { int m_allocators[NUM_EXECUTION_SPACES]; - //template - //msp_pointer_record(Yp* host_p = nullptr, Yp* device_p = nullptr) : m_last_space(NONE) { - // for (int space = 0; space < NUM_EXECUTION_SPACES; ++space ) { - // m_pointers[space] = nullptr; - // m_touched[space] = false; - // m_owned[space] = true; - // m_allocators[space] = 0; - // } - // m_pointers[CPU] = host_p; - // m_pointers[GPU] = device_p; - //} - msp_pointer_record(void* host_p = nullptr, void* device_p = nullptr) : m_last_space(NONE) { for (int space = 0; space < NUM_EXECUTION_SPACES; ++space ) { @@ -59,23 +47,133 @@ struct msp_pointer_record { m_pointers[GPU] = device_p; } - //Tp* get_pointer(ExecutionSpace space) noexcept { return m_pointers[space]; } - //template - //msp_pointer_record(msp_pointer_record const& rhs) : - // m_pointers(rhs.m_pointers), - // m_touched(rhs.m_touched), - // m_owned(rhs.m_owned), - // m_last_space(rhs.m_last_space), - // m_allocators(rhs.m_allocators) - //{} +}; + + +class msp_counted_base { +public: + msp_counted_base() noexcept : m_use_count(1) {} + + virtual ~msp_counted_base() noexcept {} + + virtual void m_dispose() noexcept = 0; + virtual void m_destroy() noexcept { delete this; } + + void m_add_ref_copy() noexcept { ++m_use_count; } + + void m_release() noexcept { + if(--m_use_count == 0) { + m_dispose(); + m_destroy(); + } + } + + long m_get_use_count() const noexcept { return m_use_count; } + virtual msp_pointer_record& getPointerRecord() noexcept = 0; +private: + msp_counted_base(msp_counted_base const&) = delete; + msp_counted_base& operator=(msp_counted_base const&) = delete; + long m_use_count = 0; }; +template +class msp_counted_ptr final : public msp_counted_base { +public: + msp_counted_ptr(Ptr h_p, Ptr d_p) noexcept : m_record(h_p, d_p) {} + virtual void m_dispose() noexcept { 
delete (Ptr)m_record.m_pointers[chai::CPU]; }// TODO : Other Exec spaces... + virtual void m_destroy() noexcept { delete this; } + msp_counted_ptr(msp_counted_ptr const&) = delete; + msp_counted_ptr& operator=(msp_counted_ptr const&) = delete; + + msp_pointer_record& getPointerRecord() noexcept { return m_record; } +private: + msp_pointer_record m_record; +}; +template +class msp_counted_deleter final : public msp_counted_base { + + class impl { + public: + impl(Ptr h_p, Ptr d_p, Deleter d) : m_record(h_p, d_p), m_deleter(std::move(d)) {} + Deleter& m_del() noexcept { return m_deleter; } + msp_pointer_record m_record; + Deleter m_deleter; + }; + +public: + msp_counted_deleter(Ptr h_p, Ptr d_p, Deleter d) noexcept : m_impl(h_p, d_p, std::move(d)) {} + virtual void m_dispose() noexcept { + printf("Delete GPU Memory Here...\n"); + m_impl.m_del()((Ptr)m_impl.m_record.m_pointers[chai::CPU]); + } + virtual void m_destroy() noexcept { this->~msp_counted_deleter(); } + msp_counted_deleter(msp_counted_deleter const&) = delete; + msp_counted_deleter& operator=(msp_counted_deleter const&) = delete; + msp_pointer_record& getPointerRecord() noexcept { return m_impl.m_record; } +private: + impl m_impl; +}; -} // end of namespace chai +class msp_record_count { +public: + constexpr msp_record_count() noexcept : m_pi(0) {} + + template + explicit msp_record_count(Ptr h_p, Ptr d_p) + : m_pi( new msp_counted_ptr(h_p, d_p) ) {} + + template + explicit msp_record_count(Ptr h_p, Ptr d_p, Deleter d) + : m_pi( new msp_counted_deleter(h_p, d_p, d) ) {} + + ~msp_record_count() noexcept + { if (m_pi) m_pi->m_release(); } + + msp_record_count(msp_record_count const& rhs) noexcept : m_pi(rhs.m_pi) + { if (m_pi) m_pi->m_add_ref_copy(); } + + msp_record_count& operator=(msp_record_count const& rhs) noexcept { + msp_counted_base* temp = rhs.m_pi; + if (temp != m_pi) + { + if (temp) temp->m_add_ref_copy(); + if (m_pi) m_pi->m_release(); + m_pi = temp; + } + return *this; + } + + void 
m_swap(msp_record_count& rhs) noexcept { + msp_counted_base* temp = rhs.m_pi; + rhs.m_pi = m_pi; + m_pi = temp; + } + + long m_get_use_count() const noexcept + { return m_pi ? m_pi->m_get_use_count() : 0; } + + friend inline bool + operator==(msp_record_count const& a, msp_record_count const& b) noexcept + { return a.m_pi == b.m_pi; } + + msp_pointer_record& getPointerRecord() noexcept { return m_pi->getPointerRecord(); } + + template + Ptr* getPointer(chai::ExecutionSpace space) noexcept { return static_cast(getPointerRecord().m_pointers[space]); } + + msp_counted_base* m_pi; + +}; + + + + + +} // end of namespace chai #endif // CHAI_SharedPointerRecord_HPP diff --git a/tests/integration/managed_ptr_tests.cpp b/tests/integration/managed_ptr_tests.cpp index de7f348a..5aa869ec 100644 --- a/tests/integration/managed_ptr_tests.cpp +++ b/tests/integration/managed_ptr_tests.cpp @@ -172,12 +172,12 @@ TEST(managed_ptr, shared_ptr) //chai::ManagedSharedPtr sptr(new TestDerived()); //chai::ManagedSharedPtr sptr = chai::make_shared(); - chai::ManagedSharedPtr sptr = chai::make_shared_deleter( + chai::ManagedSharedPtr sptr = chai::make_shared_deleter( [](TestDerived* p){ printf("Custom Deleter Call\n"); p->~TestDerived(); }); std::cout << "use_count : " << sptr.use_count() << std::endl; - auto sptr2 = sptr; + chai::ManagedSharedPtr sptr2 = sptr; sptr2->doSomething(); std::cout << "use_count : " << sptr.use_count() << std::endl; From 067b66a52261cab2f4fc40bf97799b5517a6fab7 Mon Sep 17 00:00:00 2001 From: mdavis36 Date: Mon, 22 Apr 2024 12:28:24 -0700 Subject: [PATCH 05/44] polymorphic object host->device copy testing; Working on getting SharedPtrManager copies working correctly with umpire/chai. 
--- src/chai/CMakeLists.txt | 2 +- src/chai/ManagedSharedPtr.hpp | 99 +++++++++-- src/chai/SharedPointerRecord.hpp | 142 ++------------- src/chai/SharedPtrCounter.hpp | 159 +++++++++++++++++ src/chai/SharedPtrManager.cpp | 195 +++++++++++---------- src/chai/SharedPtrManager.hpp | 56 +++--- src/chai/SharedPtrManager.inl | 116 +++++++------ src/chai/managed_ptr.hpp | 3 +- tests/integration/CMakeLists.txt | 13 ++ tests/integration/managed_ptr_tests.cpp | 79 ++++++++- tests/integration/polymorphism_tests.cpp | 211 +++++++++++++++++++++++ 11 files changed, 743 insertions(+), 332 deletions(-) create mode 100644 src/chai/SharedPtrCounter.hpp create mode 100644 tests/integration/polymorphism_tests.cpp diff --git a/src/chai/CMakeLists.txt b/src/chai/CMakeLists.txt index ed138282..ca86753f 100644 --- a/src/chai/CMakeLists.txt +++ b/src/chai/CMakeLists.txt @@ -29,7 +29,7 @@ if(CHAI_DISABLE_RM) endif () set (chai_sources - # SharedPtrManager.cpp + SharedPtrManager.cpp ArrayManager.cpp) set (chai_depends diff --git a/src/chai/ManagedSharedPtr.hpp b/src/chai/ManagedSharedPtr.hpp index cabb691e..90967f87 100644 --- a/src/chai/ManagedSharedPtr.hpp +++ b/src/chai/ManagedSharedPtr.hpp @@ -3,19 +3,16 @@ #include +#include "chai/ArrayManager.hpp" +#include "chai/ChaiMacros.hpp" #include "chai/ExecutionSpaces.hpp" //#include "chai/SharedPtrManager.hpp" -#include "chai/SharedPointerRecord.hpp" +//#include "chai/SharedPointerRecord.hpp" +#include "chai/SharedPtrCounter.hpp" #include "chai/managed_ptr.hpp" namespace chai { - - - - - - // Type traits for SFINAE template struct msp_is_constructible : std::is_convertible::type {}; @@ -27,8 +24,6 @@ template struct msp_compatible_with : std::is_convertible::type {}; - - template class ManagedSharedPtr { @@ -55,44 +50,87 @@ class ManagedSharedPtr { /* * Constructors */ + CHAI_HOST_DEVICE constexpr ManagedSharedPtr() noexcept : m_record_count() {} //// *Default* Ctor with convertible type Yp -> Tp template> ManagedSharedPtr(Yp* host_p, Yp* 
device_p, Deleter d) : m_record_count(host_p, device_p, std::move(d)), - m_active_pointer(m_record_count.getPointer(chai::CPU)) + m_active_pointer(m_record_count.m_get_pointer(chai::CPU)), + m_resource_manager(SharedPtrManager::getInstance()) {} /* * Copy Constructors */ - ManagedSharedPtr(ManagedSharedPtr const&) noexcept = default; // TODO: this is *NOT* going to be default + CHAI_HOST_DEVICE + ManagedSharedPtr(ManagedSharedPtr const& rhs) noexcept : + m_record_count(rhs.m_record_count), + m_active_pointer(rhs.m_active_pointer), + m_resource_manager(rhs.m_resource_manager) + { +#if !defined(CHAI_DEVICE_COMPILE) + if (m_active_pointer) move(ArrayManager::getInstance()->getExecutionSpace()); + //if (m_active_pointer) move(m_resource_manager->getExecutionSpace()); +#endif + } template> + CHAI_HOST_DEVICE ManagedSharedPtr(ManagedSharedPtr const& rhs) noexcept : m_record_count(rhs.m_record_count), - m_active_pointer(rhs.m_active_pointer) + m_active_pointer(rhs.m_active_pointer), + m_resource_manager(rhs.m_resource_manager) { +#if !defined(CHAI_DEVICE_COMPILE) + if (m_active_pointer) move(ArrayManager::getInstance()->getExecutionSpace()); + //if (m_active_pointer) move(m_resource_manager->getExecutionSpace()); +#endif } - /* * Accessors */ + CHAI_HOST_DEVICE element_type* get(ExecutionSpace space = chai::CPU) const noexcept { return m_active_pointer; } - element_type& operator*() const noexcept { assert(m_get() != nullptr); return *m_get(); } + CHAI_HOST_DEVICE + element_type& operator*() const noexcept { assert(get() != nullptr); return *get(); } - element_type* operator->() const noexcept { assert(m_get() != nullptr); return m_get(); } + CHAI_HOST_DEVICE + element_type* operator->() const noexcept { assert(get() != nullptr); return get(); } private: - element_type* m_get() const noexcept { return static_cast*>(this)->get(); } + + //CHAI_HOST_DEVICE + //element_type* m_get() const noexcept { return static_cast*>(this)->get(); } public: long use_count() const noexcept 
{ return m_record_count.m_get_use_count(); } + CHAI_HOST + void move(ExecutionSpace space, bool registerTouch = true) noexcept { + printf("Calling move\n"); + ExecutionSpace prev_space = m_record_count.m_get_record()->m_last_space; + if (prev_space == CPU || prev_space == NONE) { + /// Move nested ManagedArrays first, so they are working with a valid m_active_pointer for the host, + // and so the meta data associated with them are updated before we move the other array down. + //moveInnerImpl(); + } + m_active_pointer = static_cast(m_resource_manager->move((void *)m_active_pointer, m_record_count.m_get_record(), space)); + + if (registerTouch) { + m_resource_manager->registerTouch(m_record_count.m_get_record(), space); + } + if (space != GPU && prev_space == GPU) { + /// Move nested ManagedArrays after the move, so they are working with a valid m_active_pointer for the host, + // and so the meta data associated with them are updated with live GPU data + //moveInnerImpl(); + } + + } /* * Private Members */ @@ -102,13 +140,38 @@ class ManagedSharedPtr { msp_record_count m_record_count; mutable element_type* m_active_pointer = nullptr; - //mutable SharedPtrManager* m_resource_manager = nullptr; + + mutable SharedPtrManager* m_resource_manager = nullptr; }; + +template +__global__ void msp_make_on_device(T* gpuPointer, Args... args) +{ + new(gpuPointer) T(processArguments(args)...); + //printf("On GPU @ : %p\n", gpuPointer); +} + + +template +CHAI_HOST Tp* msp_make_on_device(Args... args) { + chai::SharedPtrManager* sptr_manager = chai::SharedPtrManager::getInstance(); + + auto gpu_allocator = sptr_manager->getAllocator(chai::GPU); + Tp* gpu_ptr = static_cast( gpu_allocator.allocate(1*sizeof(Tp)) ); + + msp_make_on_device<<<1,1>>>(gpu_ptr, args...); + + return gpu_ptr; +} + template ManagedSharedPtr make_shared(Args... 
args) { Tp* gpu_pointer = make_on_device(args...); Tp* cpu_pointer = make_on_host(args...); + std::cout << "CPU @ " << cpu_pointer << std::endl; + std::cout << "GPU @ " << gpu_pointer << std::endl; return ManagedSharedPtr(cpu_pointer, gpu_pointer, [](Tp* p){delete p;}); } @@ -117,6 +180,8 @@ template ManagedSharedPtr make_shared_deleter(Args... args, Deleter d) { Tp* gpu_pointer = make_on_device(args...); Tp* cpu_pointer = make_on_host(args...); + std::cout << "CPU @ " << cpu_pointer << std::endl; + std::cout << "GPU @ " << gpu_pointer << std::endl; return ManagedSharedPtr(cpu_pointer, gpu_pointer, std::move(d)); } diff --git a/src/chai/SharedPointerRecord.hpp b/src/chai/SharedPointerRecord.hpp index a12c181f..1daa1baf 100644 --- a/src/chai/SharedPointerRecord.hpp +++ b/src/chai/SharedPointerRecord.hpp @@ -8,6 +8,7 @@ #define CHAI_SharedPointerRecord_HPP #include "chai/ExecutionSpaces.hpp" +#include "chai/SharedPtrManager.hpp" #include "chai/Types.hpp" #include @@ -35,140 +36,19 @@ struct msp_pointer_record { int m_allocators[NUM_EXECUTION_SPACES]; - - msp_pointer_record(void* host_p = nullptr, void* device_p = nullptr) : m_last_space(NONE) { - for (int space = 0; space < NUM_EXECUTION_SPACES; ++space ) { - m_pointers[space] = nullptr; - m_touched[space] = false; - m_owned[space] = true; - m_allocators[space] = 0; - } - m_pointers[CPU] = host_p; - m_pointers[GPU] = device_p; - } - -}; - - -class msp_counted_base { -public: - msp_counted_base() noexcept : m_use_count(1) {} - - virtual ~msp_counted_base() noexcept {} - - virtual void m_dispose() noexcept = 0; - virtual void m_destroy() noexcept { delete this; } - - void m_add_ref_copy() noexcept { ++m_use_count; } - - void m_release() noexcept { - if(--m_use_count == 0) { - m_dispose(); - m_destroy(); - } - } - - long m_get_use_count() const noexcept { return m_use_count; } - - virtual msp_pointer_record& getPointerRecord() noexcept = 0; - -private: - msp_counted_base(msp_counted_base const&) = delete; - 
msp_counted_base& operator=(msp_counted_base const&) = delete; - - long m_use_count = 0; -}; - -template -class msp_counted_ptr final : public msp_counted_base { -public: - msp_counted_ptr(Ptr h_p, Ptr d_p) noexcept : m_record(h_p, d_p) {} - virtual void m_dispose() noexcept { delete (Ptr)m_record.m_pointers[chai::CPU]; }// TODO : Other Exec spaces... - virtual void m_destroy() noexcept { delete this; } - msp_counted_ptr(msp_counted_ptr const&) = delete; - msp_counted_ptr& operator=(msp_counted_ptr const&) = delete; - - msp_pointer_record& getPointerRecord() noexcept { return m_record; } -private: - msp_pointer_record m_record; -}; - -template -class msp_counted_deleter final : public msp_counted_base { - - class impl { - public: - impl(Ptr h_p, Ptr d_p, Deleter d) : m_record(h_p, d_p), m_deleter(std::move(d)) {} - Deleter& m_del() noexcept { return m_deleter; } - msp_pointer_record m_record; - Deleter m_deleter; - }; - -public: - msp_counted_deleter(Ptr h_p, Ptr d_p, Deleter d) noexcept : m_impl(h_p, d_p, std::move(d)) {} - virtual void m_dispose() noexcept { - printf("Delete GPU Memory Here...\n"); - m_impl.m_del()((Ptr)m_impl.m_record.m_pointers[chai::CPU]); - } - virtual void m_destroy() noexcept { this->~msp_counted_deleter(); } - msp_counted_deleter(msp_counted_deleter const&) = delete; - msp_counted_deleter& operator=(msp_counted_deleter const&) = delete; - - msp_pointer_record& getPointerRecord() noexcept { return m_impl.m_record; } -private: - impl m_impl; -}; - - -class msp_record_count { -public: - constexpr msp_record_count() noexcept : m_pi(0) {} - - template - explicit msp_record_count(Ptr h_p, Ptr d_p) - : m_pi( new msp_counted_ptr(h_p, d_p) ) {} - - template - explicit msp_record_count(Ptr h_p, Ptr d_p, Deleter d) - : m_pi( new msp_counted_deleter(h_p, d_p, d) ) {} - - ~msp_record_count() noexcept - { if (m_pi) m_pi->m_release(); } - - msp_record_count(msp_record_count const& rhs) noexcept : m_pi(rhs.m_pi) - { if (m_pi) m_pi->m_add_ref_copy(); } - - 
msp_record_count& operator=(msp_record_count const& rhs) noexcept { - msp_counted_base* temp = rhs.m_pi; - if (temp != m_pi) - { - if (temp) temp->m_add_ref_copy(); - if (m_pi) m_pi->m_release(); - m_pi = temp; + msp_pointer_record(void* host_p = nullptr, void* device_p = nullptr) : + m_last_space(CPU) { + for (int space = 0; space < NUM_EXECUTION_SPACES; ++space ) { + m_pointers[space] = nullptr; + m_touched[space] = false; + m_owned[space] = true; + //m_allocators[space] = 0; } - return *this; - } - - void m_swap(msp_record_count& rhs) noexcept { - msp_counted_base* temp = rhs.m_pi; - rhs.m_pi = m_pi; - m_pi = temp; + m_pointers[CPU] = host_p; + m_touched[CPU] = true; + m_pointers[GPU] = device_p; } - long m_get_use_count() const noexcept - { return m_pi ? m_pi->m_get_use_count() : 0; } - - friend inline bool - operator==(msp_record_count const& a, msp_record_count const& b) noexcept - { return a.m_pi == b.m_pi; } - - msp_pointer_record& getPointerRecord() noexcept { return m_pi->getPointerRecord(); } - - template - Ptr* getPointer(chai::ExecutionSpace space) noexcept { return static_cast(getPointerRecord().m_pointers[space]); } - - msp_counted_base* m_pi; - }; diff --git a/src/chai/SharedPtrCounter.hpp b/src/chai/SharedPtrCounter.hpp new file mode 100644 index 00000000..00d4a81b --- /dev/null +++ b/src/chai/SharedPtrCounter.hpp @@ -0,0 +1,159 @@ + +////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC and CHAI +// project contributors. See the CHAI LICENSE file for details. 
+// +// SPDX-License-Identifier: BSD-3-Clause +////////////////////////////////////////////////////////////////////////////// +#ifndef CHAI_SharedPointerCounter_HPP +#define CHAI_SharedPointerCounter_HPP + +#include "chai/ChaiMacros.hpp" +#include "chai/SharedPtrManager.hpp" + +namespace chai +{ + +class msp_counted_base { +public: + msp_counted_base() noexcept : m_use_count(1) {} + + virtual ~msp_counted_base() noexcept {} + + virtual void m_dispose() noexcept = 0; + virtual void m_destroy() noexcept { delete this; } + + void m_add_ref_copy() noexcept { ++m_use_count; } + + void m_release() noexcept { + if(--m_use_count == 0) { + m_dispose(); + m_destroy(); + } + } + + long m_get_use_count() const noexcept { return m_use_count; } + + virtual msp_pointer_record* m_get_record() noexcept = 0; + +private: + msp_counted_base(msp_counted_base const&) = delete; + msp_counted_base& operator=(msp_counted_base const&) = delete; + + long m_use_count = 0; +}; + +template +class msp_counted_ptr final : public msp_counted_base { +public: + msp_counted_ptr(Ptr h_p, Ptr d_p) noexcept : + m_record(SharedPtrManager::getInstance()->makeSharedPtrRecord(h_p, d_p, true)) {} + //msp_counted_ptr(Ptr h_p, Ptr d_p) noexcept : m_record(new msp_pointer_record(h_p, d_p)) {} + virtual void m_dispose() noexcept { delete (Ptr)m_record->m_pointers[chai::CPU]; }// TODO : Other Exec spaces... 
+ virtual void m_destroy() noexcept { delete this; } + msp_counted_ptr(msp_counted_ptr const&) = delete; + msp_counted_ptr& operator=(msp_counted_ptr const&) = delete; + + msp_pointer_record* m_get_record() noexcept { return m_record; } +private: + msp_pointer_record* m_record; +}; + +template +class msp_counted_deleter final : public msp_counted_base { + + class impl { + public: + impl(Ptr h_p, Ptr d_p, Deleter d) : + m_record(SharedPtrManager::getInstance()->makeSharedPtrRecord(h_p, d_p, true)), m_deleter(std::move(d)) {} + //impl(Ptr h_p, Ptr d_p, Deleter d) : m_record(new msp_pointer_record(h_p, d_p)), m_deleter(std::move(d)) {} + Deleter& m_del() noexcept { return m_deleter; } + msp_pointer_record* m_record; + Deleter m_deleter; + }; + +public: + msp_counted_deleter(Ptr h_p, Ptr d_p, Deleter d) noexcept : m_impl(h_p, d_p, std::move(d)) {} + virtual void m_dispose() noexcept { + printf("Delete GPU Memory Here...\n"); + m_impl.m_del()((Ptr)m_impl.m_record->m_pointers[chai::CPU]); + } + virtual void m_destroy() noexcept { delete this; } /* control block is created with plain new in msp_record_count, so it must be deleted; a bare destructor call leaked its storage */ + msp_counted_deleter(msp_counted_deleter const&) = delete; + msp_counted_deleter& operator=(msp_counted_deleter const&) = delete; + + msp_pointer_record* m_get_record() noexcept { return m_impl.m_record; } +private: + impl m_impl; +}; + + +class msp_record_count { +public: + CHAI_HOST_DEVICE + constexpr msp_record_count() noexcept : m_pi(0) {} + + template + explicit msp_record_count(Ptr h_p, Ptr d_p) + : m_pi( new msp_counted_ptr(h_p, d_p) ) {} + + template + explicit msp_record_count(Ptr h_p, Ptr d_p, Deleter d) + : m_pi( new msp_counted_deleter(h_p, d_p, d) ) {} + + CHAI_HOST_DEVICE + ~msp_record_count() noexcept + { +#if !defined(CHAI_DEVICE_COMPILE) + if (m_pi) m_pi->m_release(); +#endif // !defined(CHAI_DEVICE_COMPILE) + } + + CHAI_HOST_DEVICE + msp_record_count(msp_record_count const& rhs) noexcept : m_pi(rhs.m_pi) + { +#if !defined(CHAI_DEVICE_COMPILE) + if (m_pi) m_pi->m_add_ref_copy(); +#endif // 
!defined(CHAI_DEVICE_COMPILE) + } + + CHAI_HOST_DEVICE + msp_record_count& operator=(msp_record_count const& rhs) noexcept { +#if !defined(CHAI_DEVICE_COMPILE) + msp_counted_base* temp = rhs.m_pi; + if (temp != m_pi) + { + if (temp) temp->m_add_ref_copy(); + if (m_pi) m_pi->m_release(); + m_pi = temp; + } +#endif // !defined(CHAI_DEVICE_COMPILE) + return *this; + } + + void m_swap(msp_record_count& rhs) noexcept { + msp_counted_base* temp = rhs.m_pi; + rhs.m_pi = m_pi; + m_pi = temp; + } + + long m_get_use_count() const noexcept + { return m_pi ? m_pi->m_get_use_count() : 0; } + + friend inline bool + operator==(msp_record_count const& a, msp_record_count const& b) noexcept + { return a.m_pi == b.m_pi; } + + msp_pointer_record* m_get_record() noexcept { return m_pi->m_get_record(); } + + template + Ptr* m_get_pointer(chai::ExecutionSpace space) noexcept { return static_cast(m_get_record()->m_pointers[space]); } + + msp_counted_base* m_pi; + +}; + + + +} // end of namespace chai +#endif // CHAI_SharedPointerRecord_HPP diff --git a/src/chai/SharedPtrManager.cpp b/src/chai/SharedPtrManager.cpp index 1845e383..4593cbd0 100644 --- a/src/chai/SharedPtrManager.cpp +++ b/src/chai/SharedPtrManager.cpp @@ -6,6 +6,7 @@ ////////////////////////////////////////////////////////////////////////////// #include "chai/SharedPtrManager.hpp" +#include "chai/ExecutionSpaces.hpp" #include "chai/config.hpp" #if defined(CHAI_ENABLE_CUDA) @@ -91,7 +92,7 @@ void SharedPtrManager::registerPointer( CHAI_LOG(Warning, "SharedPtrManager::registerPointer found a record for " << pointer << " already there. 
Deleting abandoned pointer record."); - callback(foundRecord, ACTION_FOUND_ABANDONED, space); + //callback(foundRecord, ACTION_FOUND_ABANDONED, space); for (int fspace = CPU; fspace < NUM_EXECUTION_SPACES; ++fspace) { foundRecord->m_pointers[fspace] = nullptr; @@ -116,13 +117,13 @@ void SharedPtrManager::registerPointer( // are consistent if (m_resource_manager.hasAllocator(pointer)) { umpire::util::AllocationRecord *allocation_record = const_cast(m_resource_manager.findAllocationRecord(pointer)); - allocation_record->size = record->m_size; + //allocation_record->size = record->m_size; } // register with umpire if it's not there so that umpire can perform data migrations else { umpire::util::AllocationRecord new_allocation_record; new_allocation_record.ptr = pointer; - new_allocation_record.size = record->m_size; + //new_allocation_record.size = record->m_size; new_allocation_record.strategy = m_resource_manager.getAllocator(record->m_allocators[space]).getAllocationStrategy(); m_resource_manager.registerAllocation(pointer, new_allocation_record); @@ -251,7 +252,11 @@ static void copy(void * dst_pointer, void * src_pointer, umpire::ResourceManager camp::resources::Resource host_resource(camp::resources::Host::get_default()); if (dst_space == GPU || src_space == GPU) { // Do the copy using the device resource - manager.copy(dst_pointer, src_pointer, device_resource); + //manager.copy(dst_pointer, src_pointer, device_resource); + { + std::cout << "Do Fake Copy to GPU.....\n"; + //CHAI_GPU_ERROR_CHECK(cudaMemcpyAsync(dst_pointer, src_pointer, 1, cudaMemcpyHostToDevice)); + } } else { // Do the copy using the host resource manager.copy(dst_pointer, src_pointer, host_resource); @@ -268,47 +273,45 @@ void SharedPtrManager::move(msp_pointer_record* record, ExecutionSpace space) return; } - callback(record, ACTION_CAPTURED, space); + //callback(record, ACTION_CAPTURED, space); if (space == record->m_last_space) { return; } -#if defined(CHAI_ENABLE_UM) - if 
(record->m_last_space == UM) { - return; - } -#endif - -#if defined(CHAI_ENABLE_PINNED) - if (record->m_last_space == PINNED) { - if (space == CPU) { - syncIfNeeded(); - } - return; - } -#endif - ExecutionSpace prev_space = record->m_last_space; void* src_pointer = record->m_pointers[prev_space]; void* dst_pointer = record->m_pointers[space]; - if (!dst_pointer) { - allocate(record, space); - dst_pointer = record->m_pointers[space]; - } + //if (!dst_pointer) { + // allocate(record, space); + // dst_pointer = record->m_pointers[space]; + //} if ( (!record->m_touched[record->m_last_space]) || (! src_pointer )) { + printf("failed move conditions\n"); + for (int i = chai::CPU; i < NUM_EXECUTION_SPACES; i++) std::cout << i << " : " <m_touched[i] << std::endl; + std::cout << record->m_last_space << std::endl; + std::cout << record->m_touched[record->m_last_space] << std::endl; + std::cout << (src_pointer) << std::endl; return; } else if (dst_pointer != src_pointer) { // Exclude the copy if src and dst are the same (can happen for PINNED memory) { + printf("Performing Copy\n"); + std::cout << "dst_pointer : " << dst_pointer << std::endl; + std::cout << "src_pointer : " << src_pointer << std::endl; + std::cout << "space : " << space << std::endl; + std::cout << "prev_space : " << prev_space << std::endl; + std::cout << m_resource_manager.findAllocatorForPointer(dst_pointer)->getName() << std::endl; + std::cout << m_resource_manager.findAllocatorForPointer(src_pointer)->getName() << std::endl; chai::copy(dst_pointer, src_pointer, m_resource_manager, space, prev_space); + } - callback(record, ACTION_MOVE, space); + //callback(record, ACTION_MOVE, space); } resetTouch(record); @@ -318,11 +321,11 @@ void SharedPtrManager::allocate( msp_pointer_record* pointer_record, ExecutionSpace space) { - auto size = pointer_record->m_size; + //auto size = pointer_record->m_size; auto alloc = m_resource_manager.getAllocator(pointer_record->m_allocators[space]); - 
pointer_record->m_pointers[space] = alloc.allocate(size); - callback(pointer_record, ACTION_ALLOC, space); + pointer_record->m_pointers[space] = alloc.allocate(1); + //callback(pointer_record, ACTION_ALLOC, space); registerPointer(pointer_record, space); CHAI_LOG(Debug, "Allocated array at: " << pointer_record->m_pointers[space]); @@ -371,9 +374,9 @@ void SharedPtrManager::free(msp_pointer_record* pointer_record, ExecutionSpace s } else #endif { - callback(pointer_record, - ACTION_FREE, - ExecutionSpace(space)); + // callback(pointer_record, + // ACTION_FREE, + // ExecutionSpace(space)); auto alloc = m_resource_manager.getAllocator( pointer_record->m_allocators[space]); @@ -400,12 +403,12 @@ void SharedPtrManager::free(msp_pointer_record* pointer_record, ExecutionSpace s } } -size_t SharedPtrManager::getSize(void* ptr) -{ - // TODO - auto pointer_record = getPointerRecord(ptr); - return pointer_record->m_size; -} +//size_t SharedPtrManager::getSize(void* ptr) +//{ +// // TODO +// auto pointer_record = getPointerRecord(ptr); +// return pointer_record->m_size; +//} void SharedPtrManager::setDefaultAllocationSpace(ExecutionSpace space) { @@ -418,17 +421,17 @@ ExecutionSpace SharedPtrManager::getDefaultAllocationSpace() } -void SharedPtrManager::setUserCallback(void* pointer, UserCallback const& f) -{ - // TODO ?? - auto pointer_record = getPointerRecord(pointer); - pointer_record->m_user_callback = f; -} - -void SharedPtrManager::setGlobalUserCallback(UserCallback const& f) -{ - m_user_callback = f; -} +//void SharedPtrManager::setUserCallback(void* pointer, UserCallback const& f) +//{ +// // TODO ?? 
+// auto pointer_record = getPointerRecord(pointer); +// pointer_record->m_user_callback = f; +//} +// +//void SharedPtrManager::setGlobalUserCallback(UserCallback const& f) +//{ +// m_user_callback = f; +//} msp_pointer_record* SharedPtrManager::getPointerRecord(void* pointer) { @@ -437,22 +440,28 @@ msp_pointer_record* SharedPtrManager::getPointerRecord(void* pointer) return record->second ? *record->second : &s_null_record; } -msp_pointer_record* SharedPtrManager::makeManaged(void* pointer, - size_t size, - ExecutionSpace space, - bool owned) +msp_pointer_record* SharedPtrManager::makeSharedPtrRecord(void* pointer, void* d_pointer, + //size_t size, + //ExecutionSpace space, + bool owned) { if (pointer == nullptr) { return &s_null_record ; } - if (space == NONE) { - space = getDefaultAllocationSpace(); - } + //if (space == NONE) { + // space = getDefaultAllocationSpace(); + //} m_resource_manager.registerAllocation( pointer, - {pointer, size, m_allocators[space]->getAllocationStrategy()}); + {pointer, 1, m_allocators[chai::CPU]->getAllocationStrategy()}); + std::cout << "m_allocators[chai::CPU] : " << m_allocators[chai::CPU]->getName() << std::endl; + + m_resource_manager.registerAllocation( + d_pointer, + {d_pointer, 1, m_allocators[chai::GPU]->getAllocationStrategy()}); + std::cout << "m_allocators[chai::GPU] : " << m_allocators[chai::GPU]->getName() << std::endl; auto pointer_record = getPointerRecord(pointer); @@ -465,20 +474,22 @@ msp_pointer_record* SharedPtrManager::makeManaged(void* pointer, } else { CHAI_LOG(Warning, "SharedPtrManager::makeManaged found abandoned pointer record!!!"); - callback(pointer_record, ACTION_FOUND_ABANDONED, space); + //callback(pointer_record, ACTION_FOUND_ABANDONED, space); } - pointer_record->m_pointers[space] = pointer; - pointer_record->m_owned[space] = owned; - pointer_record->m_size = size; - pointer_record->m_user_callback = [] (const msp_pointer_record*, Action, ExecutionSpace) {}; + 
pointer_record->m_pointers[chai::CPU] = pointer; + pointer_record->m_owned[chai::CPU] = owned; + pointer_record->m_pointers[chai::GPU] = d_pointer; + pointer_record->m_owned[chai::GPU] = owned; + //pointer_record->m_size = size; + //pointer_record->m_user_callback = [] (const msp_pointer_record*, Action, ExecutionSpace) {}; for (int space = CPU; space < NUM_EXECUTION_SPACES; ++space) { pointer_record->m_allocators[space] = getAllocatorId(ExecutionSpace(space)); } - if (pointer && size > 0) { - registerPointer(pointer_record, space, owned); + if (pointer) { + registerPointer(pointer_record, chai::CPU, owned); } return pointer_record; @@ -487,9 +498,9 @@ msp_pointer_record* SharedPtrManager::makeManaged(void* pointer, msp_pointer_record* SharedPtrManager::deepCopyRecord(msp_pointer_record const* record) { msp_pointer_record* new_record = new msp_pointer_record{}; - const size_t size = record->m_size; - new_record->m_size = size; - new_record->m_user_callback = [] (const msp_pointer_record*, Action, ExecutionSpace) {}; + //const size_t size = record->m_size; + //new_record->m_size = size; + //new_record->m_user_callback = [] (const msp_pointer_record*, Action, ExecutionSpace) {}; const ExecutionSpace last_space = record->m_last_space; new_record->m_last_space = last_space; @@ -531,32 +542,32 @@ size_t SharedPtrManager::getTotalNumArrays() const { return m_pointer_map.size() // TODO: Investigate counting memory allocated in each execution space if // possible -size_t SharedPtrManager::getTotalSize() const -{ - std::lock_guard lock(m_mutex); - size_t total = 0; - - for (const auto& entry : m_pointer_map) { - total += (*entry.second)->m_size; - } - - return total; -} - -void SharedPtrManager::reportLeaks() const -{ - std::lock_guard lock(m_mutex); - for (const auto& entry : m_pointer_map) { - const void* pointer = entry.first; - const msp_pointer_record* record = *entry.second; - - for (int s = CPU; s < NUM_EXECUTION_SPACES; ++s) { - if (pointer == record->m_pointers[s]) 
{ - callback(record, ACTION_LEAKED, ExecutionSpace(s)); - } - } - } -} +//size_t SharedPtrManager::getTotalSize() const +//{ +// std::lock_guard lock(m_mutex); +// size_t total = 0; +// +// for (const auto& entry : m_pointer_map) { +// total += (*entry.second)->m_size; +// } +// +// return total; +//} + +//void SharedPtrManager::reportLeaks() const +//{ +// std::lock_guard lock(m_mutex); +// for (const auto& entry : m_pointer_map) { +// const void* pointer = entry.first; +// const msp_pointer_record* record = *entry.second; +// +// for (int s = CPU; s < NUM_EXECUTION_SPACES; ++s) { +// if (pointer == record->m_pointers[s]) { +// callback(record, ACTION_LEAKED, ExecutionSpace(s)); +// } +// } +// } +//} int SharedPtrManager::getAllocatorId(ExecutionSpace space) const diff --git a/src/chai/SharedPtrManager.hpp b/src/chai/SharedPtrManager.hpp index c4a88097..3c9609ed 100644 --- a/src/chai/SharedPtrManager.hpp +++ b/src/chai/SharedPtrManager.hpp @@ -217,10 +217,10 @@ class SharedPtrManager * * \return Pointer to the allocated memory. */ - template - void* reallocate(void* pointer, - size_t elems, - msp_pointer_record* record); +// template +// void* reallocate(void* pointer, +// size_t elems, +// msp_pointer_record* record); /*! * \brief Set the default space for new ManagedArray allocations. @@ -262,24 +262,24 @@ class SharedPtrManager * \param pointer Pointer to find the size of. * \return Size of pointer. */ - CHAISHAREDDLL_API size_t getSize(void* pointer); + //CHAISHAREDDLL_API size_t getSize(void* pointer); - CHAISHAREDDLL_API msp_pointer_record* makeManaged(void* pointer, - size_t size, - ExecutionSpace space, - bool owned); + CHAISHAREDDLL_API msp_pointer_record* makeSharedPtrRecord(void* pointer, void* d_pointer, + //size_t size, + //ExecutionSpace space, + bool owned); /*! * \brief Assign a user-defined callback triggered upon memory operations. * This callback applies to a single ManagedArray. 
*/ - CHAISHAREDDLL_API void setUserCallback(void* pointer, UserCallback const& f); + //CHAISHAREDDLL_API void setUserCallback(void* pointer, UserCallback const& f); /*! * \brief Assign a user-defined callback triggered upon memory operations. * This callback applies to all ManagedArrays. */ - CHAISHAREDDLL_API void setGlobalUserCallback(UserCallback const& f); + //CHAISHAREDDLL_API void setGlobalUserCallback(UserCallback const& f); /*! * \brief Set touched to false in all spaces for the given msp_pointer_record. @@ -327,12 +327,12 @@ class SharedPtrManager * * \return The total amount of memory allocated. */ - CHAISHAREDDLL_API size_t getTotalSize() const; + //CHAISHAREDDLL_API size_t getTotalSize() const; /*! * \brief Calls callbacks of pointers still in the map with ACTION_LEAKED. */ - CHAISHAREDDLL_API void reportLeaks() const; + //CHAISHAREDDLL_API void reportLeaks() const; /*! * \brief Get the allocator ID @@ -456,21 +456,21 @@ class SharedPtrManager * \param space The space in which the event occurred * \param size The number of bytes in the array associated with this pointer record */ - inline void callback(const msp_pointer_record* record, - Action action, - ExecutionSpace space) const { - if (m_callbacks_active) { - // Callback for this ManagedArray only - if (record && record->m_user_callback) { - record->m_user_callback(record, action, space); - } - - // Callback for all ManagedArrays - if (m_user_callback) { - m_user_callback(record, action, space); - } - } - } +// inline void callback(const msp_pointer_record* record, +// Action action, +// ExecutionSpace space) const { +// if (m_callbacks_active) { +// // Callback for this ManagedArray only +// if (record && record->m_user_callback) { +// record->m_user_callback(record, action, space); +// } +// +// // Callback for all ManagedArrays +// if (m_user_callback) { +// m_user_callback(record, action, space); +// } +// } +// } /*! * Current execution space. 
diff --git a/src/chai/SharedPtrManager.inl b/src/chai/SharedPtrManager.inl index af7b7a4b..bff622d9 100644 --- a/src/chai/SharedPtrManager.inl +++ b/src/chai/SharedPtrManager.inl @@ -16,67 +16,69 @@ #include "umpire/ResourceManager.hpp" -#if defined(CHAI_ENABLE_UM) -#if !defined(CHAI_THIN_GPU_ALLOCATE) #include -#endif -#endif - -namespace chai { - -template -CHAI_INLINE -void* SharedPtrManager::reallocate(void* pointer, size_t elems, msp_pointer_record* pointer_record) -{ - ExecutionSpace my_space = CPU; - - for (int space = CPU; space < NUM_EXECUTION_SPACES; ++space) { - if (pointer_record->m_pointers[space] == pointer) { - my_space = static_cast(space); - break; - } - } - - for (int space = CPU; space < NUM_EXECUTION_SPACES; ++space) { - if (!pointer_record->m_owned[space]) { - CHAI_LOG(Debug, "Cannot reallocate unowned pointer"); - return pointer_record->m_pointers[my_space]; - } - } - - // Call callback with ACTION_FREE before changing the size - for (int space = CPU; space < NUM_EXECUTION_SPACES; ++space) { - if (pointer_record->m_pointers[space]) { - callback(pointer_record, ACTION_FREE, ExecutionSpace(space)); - } - } - - // Update the pointer record size - size_t old_size = pointer_record->m_size; - size_t new_size = sizeof(T) * elems; - pointer_record->m_size = new_size; - - // only copy however many bytes overlap - size_t num_bytes_to_copy = std::min(old_size, new_size); - for (int space = CPU; space < NUM_EXECUTION_SPACES; ++space) { - void* old_ptr = pointer_record->m_pointers[space]; +//#if defined(CHAI_ENABLE_UM) +//#if !defined(CHAI_THIN_GPU_ALLOCATE) +//#include +//#endif +//#endif - if (old_ptr) { - void* new_ptr = m_allocators[space]->allocate(new_size); - m_resource_manager.copy(new_ptr, old_ptr, num_bytes_to_copy); - m_allocators[space]->deallocate(old_ptr); - - pointer_record->m_pointers[space] = new_ptr; - callback(pointer_record, ACTION_ALLOC, ExecutionSpace(space)); - - m_pointer_map.erase(old_ptr); - m_pointer_map.insert(new_ptr, 
pointer_record); - } - } +namespace chai { - return pointer_record->m_pointers[my_space]; -} +//template +//CHAI_INLINE +//void* SharedPtrManager::reallocate(void* pointer, size_t elems, msp_pointer_record* pointer_record) +//{ +// ExecutionSpace my_space = CPU; +// +// for (int space = CPU; space < NUM_EXECUTION_SPACES; ++space) { +// if (pointer_record->m_pointers[space] == pointer) { +// my_space = static_cast(space); +// break; +// } +// } +// +// for (int space = CPU; space < NUM_EXECUTION_SPACES; ++space) { +// if (!pointer_record->m_owned[space]) { +// CHAI_LOG(Debug, "Cannot reallocate unowned pointer"); +// return pointer_record->m_pointers[my_space]; +// } +// } +// +// // Call callback with ACTION_FREE before changing the size +// for (int space = CPU; space < NUM_EXECUTION_SPACES; ++space) { +// if (pointer_record->m_pointers[space]) { +// callback(pointer_record, ACTION_FREE, ExecutionSpace(space)); +// } +// } +// +// // Update the pointer record size +// size_t old_size = pointer_record->m_size; +// size_t new_size = sizeof(T) * elems; +// pointer_record->m_size = new_size; +// +// // only copy however many bytes overlap +// size_t num_bytes_to_copy = std::min(old_size, new_size); +// +// for (int space = CPU; space < NUM_EXECUTION_SPACES; ++space) { +// void* old_ptr = pointer_record->m_pointers[space]; +// +// if (old_ptr) { +// void* new_ptr = m_allocators[space]->allocate(new_size); +// m_resource_manager.copy(new_ptr, old_ptr, num_bytes_to_copy); +// m_allocators[space]->deallocate(old_ptr); +// +// pointer_record->m_pointers[space] = new_ptr; +// callback(pointer_record, ACTION_ALLOC, ExecutionSpace(space)); +// +// m_pointer_map.erase(old_ptr); +// m_pointer_map.insert(new_ptr, pointer_record); +// } +// } +// +// return pointer_record->m_pointers[my_space]; +//} #if defined(CHAI_ENABLE_PICK) template diff --git a/src/chai/managed_ptr.hpp b/src/chai/managed_ptr.hpp index 708261e1..da7bbf85 100644 --- a/src/chai/managed_ptr.hpp +++ 
b/src/chai/managed_ptr.hpp @@ -759,8 +759,9 @@ namespace chai { typename... Args> __global__ void make_on_device(T** gpuPointer, Args... args) { - printf("On GPU\n"); *gpuPointer = new T(processArguments(args)...); + printf("On GPU @ : %p\n", gpuPointer); + printf("On GPU @ : %p\n", &gpuPointer); } /// diff --git a/tests/integration/CMakeLists.txt b/tests/integration/CMakeLists.txt index c8338b0c..78a1600d 100644 --- a/tests/integration/CMakeLists.txt +++ b/tests/integration/CMakeLists.txt @@ -25,6 +25,19 @@ blt_add_test( NAME managed_array_test COMMAND managed_array_tests) +blt_add_executable( + NAME polymorphism_tests + SOURCES polymorphism_tests.cpp + DEPENDS_ON ${chai_integration_test_depends}) + +target_include_directories( + polymorphism_tests + PUBLIC ${PROJECT_BINARY_DIR}/include) + +blt_add_test( + NAME polymorphism_test + COMMAND polymorphism_tests) + if (CHAI_ENABLE_MANAGED_PTR) blt_add_executable( NAME managed_ptr_tests diff --git a/tests/integration/managed_ptr_tests.cpp b/tests/integration/managed_ptr_tests.cpp index 5aa869ec..9acbace4 100644 --- a/tests/integration/managed_ptr_tests.cpp +++ b/tests/integration/managed_ptr_tests.cpp @@ -4,8 +4,11 @@ // // SPDX-License-Identifier: BSD-3-Clause ////////////////////////////////////////////////////////////////////////////// +#include "chai/ChaiMacros.hpp" #include "chai/ManagedSharedPtr.hpp" +#include "chai/SharedPtrManager.hpp" #include "gtest/gtest.h" +#include "umpire/ResourceManager.hpp" #define GPU_TEST(X, Y) \ static void gpu_test_##X##Y(); \ @@ -79,7 +82,7 @@ class TestBase { CHAI_HOST_DEVICE virtual int getValue(const int i) const = 0; CHAI_HOST_DEVICE virtual int getMemberValue() const = 0; CHAI_HOST_DEVICE virtual void setMemberValue(int v) = 0; - CHAI_HOST virtual void doSomething() const = 0; + CHAI_HOST_DEVICE virtual void doSomething() const = 0; }; class TestDerived : public TestBase { @@ -93,7 +96,7 @@ class TestDerived : public TestBase { CHAI_HOST_DEVICE void setMemberValue(int v) { 
m_member = v; } - CHAI_HOST virtual void doSomething() const {printf("TestDerived doSomething()\n");} + CHAI_HOST_DEVICE virtual void doSomething() const {printf("TestDerived doSomething()\n");} private: chai::ManagedArray m_values; @@ -162,9 +165,65 @@ class MultipleRawArrayClass { int* m_values2; }; -TEST(managed_ptr, shared_ptr) +#define assert_empty_map(IGNORED) ASSERT_EQ(chai::SharedPtrManager::getInstance()->getPointerMap().size(),0) + +inline void gpuErrorCheck(cudaError_t code, const char *file, int line, bool abort=true) +{ + if (code != cudaSuccess) { + fprintf(stderr, "[CHAI] GPU Error: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) { + exit(code); + } + } +} +#define GPU_ERROR_CHECK(code) { gpuErrorCheck((code), __FILE__, __LINE__); } + +GPU_TEST(managed_ptr, shared_ptralloc) +{ + + { + + chai::SharedPtrManager* sptr_manager = chai::SharedPtrManager::getInstance(); + umpire::ResourceManager& res_manager = umpire::ResourceManager::getInstance(); + + auto cpu_allocator = sptr_manager->getAllocator(chai::CPU); + TestBase* cpu_ptr = static_cast( cpu_allocator.allocate(1*sizeof(TestDerived)) ); + + new(cpu_ptr) TestDerived(); + + + TestBase* gpu_ptr = chai::msp_make_on_device(); + + forall(gpu(), 0, 1, [=] __device__ (int i) { + printf("GPU Body\n"); + gpu_ptr->doSomething(); + printf("Mem val : %d\n", gpu_ptr->getMemberValue()); + }); + + std::cout << "Ump alloc cpu : " << cpu_ptr << std::endl; + std::cout << "Ump alloc gpu : " << gpu_ptr << std::endl; + + cpu_ptr->setMemberValue(5); + + unsigned int offset = sizeof(void*); + GPU_ERROR_CHECK(cudaMemcpy((char*)gpu_ptr+offset, (char*)cpu_ptr+offset, sizeof(TestDerived)-offset, cudaMemcpyHostToDevice)); + + forall(gpu(), 0, 1, [=] __device__ (int i) { + printf("GPU Body\n"); + gpu_ptr->doSomething(); + printf("Mem val : %d\n", gpu_ptr->getMemberValue()); + }); + GPU_ERROR_CHECK( cudaPeekAtLastError() ); + GPU_ERROR_CHECK( cudaDeviceSynchronize() ); + + } + //assert_empty_map(); +} + 
+GPU_TEST(managed_ptr, shared_ptr) { + { //chai::ManagedSharedPtr sptr(new TestDerived(), // [](TestDerived*p){ printf("Deleter Call\n"); p->~TestDerived(); }); //chai::ManagedSharedPtr sptr = chai::ManagedSharedPtr(new TestDerived(), @@ -172,16 +231,26 @@ TEST(managed_ptr, shared_ptr) //chai::ManagedSharedPtr sptr(new TestDerived()); //chai::ManagedSharedPtr sptr = chai::make_shared(); - chai::ManagedSharedPtr sptr = chai::make_shared_deleter( + + chai::ManagedSharedPtr sptr = chai::make_shared_deleter( [](TestDerived* p){ printf("Custom Deleter Call\n"); p->~TestDerived(); }); std::cout << "use_count : " << sptr.use_count() << std::endl; + std::cout << "Map Sz : " << chai::SharedPtrManager::getInstance()->getPointerMap().size() << std::endl; chai::ManagedSharedPtr sptr2 = sptr; - sptr2->doSomething(); + //sptr2->doSomething(); std::cout << "use_count : " << sptr.use_count() << std::endl; + std::cout << "GPU CALL...\n"; + forall(gpu(), 0, 3, [=] __device__ (int i) { + printf("GPU Body\n"); + sptr->doSomething(); + //results[i] = rawArrayClass->getValue(i); + }); + } + //assert_empty_map(); } diff --git a/tests/integration/polymorphism_tests.cpp b/tests/integration/polymorphism_tests.cpp new file mode 100644 index 00000000..36d7b457 --- /dev/null +++ b/tests/integration/polymorphism_tests.cpp @@ -0,0 +1,211 @@ +////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC and CHAI +// project contributors. See the CHAI LICENSE file for details. 
+// +// SPDX-License-Identifier: BSD-3-Clause +////////////////////////////////////////////////////////////////////////////// +#include "chai/ChaiMacros.hpp" +#include "chai/ManagedSharedPtr.hpp" +#include "chai/SharedPtrManager.hpp" +#include "gtest/gtest.h" +#include "umpire/ResourceManager.hpp" + +#define GPU_TEST(X, Y) \ + static void gpu_test_##X##Y(); \ + TEST(X, Y) { gpu_test_##X##Y(); } \ + static void gpu_test_##X##Y() + +#include "chai/config.hpp" +#include "chai/ArrayManager.hpp" +#include "chai/ManagedArray.hpp" +#include "chai/managed_ptr.hpp" +#include "chai/ManagedSharedPtr.hpp" + +#include "../src/util/forall.hpp" + +// Standard library headers +#include + +inline void gpuErrorCheck(cudaError_t code, const char *file, int line, bool abort=true) +{ + if (code != cudaSuccess) { + fprintf(stderr, "[CHAI] GPU Error: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) { + exit(code); + } + } +} + +#define GPU_ERROR_CHECK(code) { gpuErrorCheck((code), __FILE__, __LINE__); } + +void PrintMemory(const unsigned char* memory, + const char label[] = "contents") +{ + std::cout << "Memory " << label << ": \n"; + for (size_t i = 0; i < 4; i++) + { + for (size_t j = 0; j < 8; j++) + printf("%02X ", static_cast (memory[i * 8 + j])); + printf("\n"); + } +} + +#define M_PRINT_MEMORY(memory) \ + for (size_t i = 0; i < 7; i++) \ + { \ + for (size_t j = 0; j < 8; j++) \ + printf("%02X ", static_cast (memory[i * 8 + j])); \ + printf("\n"); \ + } + +#define CPU_PRINT_MEMORY(memory, label)\ + printf("HOST Memory "); printf(label); printf("\n"); \ + M_PRINT_MEMORY(memory) \ + +#define GPU_PRINT_MEMORY(memory, label)\ + forall(gpu(), 0, 1, [=] __device__ (int i) { \ + printf("DEVICE Memory "); printf(label); printf("\n"); \ + M_PRINT_MEMORY(memory) \ + }); + + +class C +{ +public: + CHAI_HOST_DEVICE C(void) { printf("++ C has been constructed\n"); } + CHAI_HOST_DEVICE ~C(void) { printf("-- C has been destructed\n"); } + CHAI_HOST_DEVICE virtual void 
function(void) = 0; +}; + +class D : public C +{ +public: + unsigned long long content_D; + CHAI_HOST_DEVICE D(void) : content_D(0xDDDDDDDDDDDDDDDDull) { printf("++ D has been constructed\n"); } + CHAI_HOST_DEVICE ~D(void) { printf("-- D has been destructed\n"); } + CHAI_HOST_DEVICE virtual void function(void) { printf("%lX\n", content_D); } +}; + + +class A +{ +public: + unsigned long long content_A; + D d; + CHAI_HOST_DEVICE A(void) : content_A(0xAAAAAAAAAAAAAAAAull) { printf("++ A has been constructed\n"); } + CHAI_HOST_DEVICE ~A(void) { printf("-- A has been destructed\n"); } + CHAI_HOST_DEVICE virtual void function(void) = 0; + CHAI_HOST_DEVICE virtual void d_function(void) = 0; +}; + +class A2 +{ +public: + CHAI_HOST_DEVICE A2(void) { printf("++ A2 has been constructed\n"); } + CHAI_HOST_DEVICE ~A2(void) { printf("-- A2 has been destructed\n"); } +}; + +class B : public A, public A2 +{ +public: + unsigned long long content_B; + CHAI_HOST_DEVICE B(void) : content_B(0xBBBBBBBBBBBBBBBBull) { printf("++ B has been constructed\n"); } + CHAI_HOST_DEVICE ~B(void) { printf("-- B has been destructed\n"); } + CHAI_HOST_DEVICE virtual void function(void) { printf("%lX\n", content_B); } + CHAI_HOST_DEVICE virtual void d_function(void) { d.function(); } +}; + + + +GPU_TEST(managed_ptr, polycpytest) +{ + + // Assign 32 byte block of memory to 0x11 on the Host + unsigned char* memory1 = (unsigned char*)malloc(56*sizeof(unsigned char)); + memset(memory1, 0x11, 56 * sizeof(unsigned char)); + CPU_PRINT_MEMORY(memory1, "1 : before placement new") + + + // Assign 32 byte block of memory to 0x22 on the Device + unsigned char* memory2; cudaMalloc((void**)&memory2, 56*sizeof(unsigned char)); + forall(gpu(), 0, 56, [=] __device__ (int i) { memory2[i] = 0x22; }); + GPU_PRINT_MEMORY(memory2, "2 : before placement new") + + + // Placement New Polymorphic object on the Host. 
+ B* b_ptr1 = new (memory1) B; + CPU_PRINT_MEMORY(memory1, "1 : after placement new"); + + + // Placement New Polymorphic object on the Device. + B* b_ptr2 = reinterpret_cast(memory2); + A* base2 = b_ptr2; + forall(gpu(), 0, 1, [=] __device__ (int i) { new(b_ptr2) B();}); + GPU_PRINT_MEMORY(memory2, "2 : after placement new"); + + + // B was constructed on the Device so we can call virtual + // function on the GPU from a host pointer. + printf("Calling virtual function from Base pointer on GPU.\n"); + forall(gpu(), 0, 1, [=] __device__ (int i) { base2->function(); }); + GPU_ERROR_CHECK( cudaPeekAtLastError() ); + GPU_ERROR_CHECK( cudaDeviceSynchronize() ); + + + + // Lets edit the Data on the Host... + b_ptr1->content_B = 0xCBCBCBCBCBCBCBCBull; + CPU_PRINT_MEMORY(memory1, "1 : after content change"); + + // Copying Data from Host to Device +#define OFFSET_CPY +#if !defined(OFFSET_CPY) + GPU_ERROR_CHECK(cudaMemcpy(b_ptr2, b_ptr1, sizeof(B), cudaMemcpyHostToDevice)); +#else + // We nee to skip over the Vtable and try to only copy the contents of the + // object itself. + unsigned int offset = sizeof(void*); + char* off_b_ptr2 = (char*)b_ptr2 + offset; + char* off_b_ptr1 = (char*)b_ptr1 + offset; + int off_size = sizeof(B) - offset; + + GPU_ERROR_CHECK(cudaMemcpy(off_b_ptr2, off_b_ptr1, off_size, cudaMemcpyHostToDevice)); + //// This will not work as we need to do pointer arithmatic at the byte level... + //GPU_ERROR_CHECK(cudaMemcpy(b_ptr2 + offset, b_ptr1 + offset, sizeof(B) - offset, cudaMemcpyHostToDevice)); +#endif + GPU_PRINT_MEMORY(memory2, "2 : after copy from host"); + + // Try to call virtual funciton on GPU like we did before. + printf("Calling virtual function from Base pointer on GPU.\n"); + forall(gpu(), 0, 1, [=] __device__ (int i) { base2->function(); }); + GPU_ERROR_CHECK( cudaPeekAtLastError() ); + GPU_ERROR_CHECK( cudaDeviceSynchronize() ); + + + + // Lets edit the Data on the Device... 
+ forall(gpu(), 0, 1, [=] __device__ (int i) { + b_ptr2->content_B = 0xDBDBDBDBDBDBDBDBull; + b_ptr2->content_A = 0xDADADADADADADADAull; }); + GPU_PRINT_MEMORY(memory2, "2 : after content change"); + + +#if !defined(OFFSET_CPY) + GPU_ERROR_CHECK(cudaMemcpy(b_ptr1, b_ptr2, sizeof(B), cudaMemcpyDeviceToHost)); +#else + GPU_ERROR_CHECK(cudaMemcpy((char*)b_ptr1 + offset, (char*)b_ptr2 + offset, sizeof(B) - offset, cudaMemcpyDeviceToHost)); +#endif + CPU_PRINT_MEMORY(memory1, "1 : after copy from host"); + + + + + + // Free up memory, we useed placement new so we need to call the destructor first... + reinterpret_cast(memory1)->~B(); + forall(gpu(), 0, 1, [=] __device__ (int i) { reinterpret_cast(memory2)->~B(); }); + cudaFree(memory2); + +} + + From c4db6a1e2d690bfc097ee7dfb5405407bbdb24da Mon Sep 17 00:00:00 2001 From: mdavis36 Date: Tue, 30 Apr 2024 14:07:57 -0700 Subject: [PATCH 06/44] Pushing latest changes to ManagedSharedPtr interface --- src/chai/ManagedSharedPtr.hpp | 29 +- src/chai/SharedPtrCounter.hpp | 19 +- src/chai/SharedPtrManager.cpp | 23 +- src/chai/SharedPtrManager.hpp | 2 +- src/tpl/umpire | 2 +- tests/integration/managed_ptr_tests.cpp | 5 +- tests/integration/polymorphism_hana_tests.cpp | 230 +++++++++++++++ tests/integration/polymorphism_tests.cpp | 263 +++++++++++++++++- 8 files changed, 528 insertions(+), 45 deletions(-) create mode 100644 tests/integration/polymorphism_hana_tests.cpp diff --git a/src/chai/ManagedSharedPtr.hpp b/src/chai/ManagedSharedPtr.hpp index 90967f87..3b10252b 100644 --- a/src/chai/ManagedSharedPtr.hpp +++ b/src/chai/ManagedSharedPtr.hpp @@ -6,8 +6,6 @@ #include "chai/ArrayManager.hpp" #include "chai/ChaiMacros.hpp" #include "chai/ExecutionSpaces.hpp" -//#include "chai/SharedPtrManager.hpp" -//#include "chai/SharedPointerRecord.hpp" #include "chai/SharedPtrCounter.hpp" #include "chai/managed_ptr.hpp" @@ -55,33 +53,33 @@ class ManagedSharedPtr { //// *Default* Ctor with convertible type Yp -> Tp template> - 
ManagedSharedPtr(Yp* host_p, Yp* device_p, Deleter d) : - m_record_count(host_p, device_p, std::move(d)), - m_active_pointer(m_record_count.m_get_pointer(chai::CPU)), - m_resource_manager(SharedPtrManager::getInstance()) + ManagedSharedPtr(Yp* host_p, Yp* device_p, Deleter d) + : m_record_count(host_p, device_p, std::move(d)) + , m_active_pointer(m_record_count.m_get_pointer(chai::CPU)) + , m_resource_manager(SharedPtrManager::getInstance()) {} /* * Copy Constructors */ CHAI_HOST_DEVICE - ManagedSharedPtr(ManagedSharedPtr const& rhs) noexcept : - m_record_count(rhs.m_record_count), - m_active_pointer(rhs.m_active_pointer), - m_resource_manager(rhs.m_resource_manager) + ManagedSharedPtr(ManagedSharedPtr const& rhs) noexcept + : m_record_count(rhs.m_record_count) + , m_active_pointer(rhs.m_active_pointer) + , m_resource_manager(rhs.m_resource_manager) { #if !defined(CHAI_DEVICE_COMPILE) - if (m_active_pointer) move(ArrayManager::getInstance()->getExecutionSpace()); + if (m_active_pointer) move(ArrayManager::getInstance()->getExecutionSpace()); // TODO: Use a generic interface for RAJA queries. 
//if (m_active_pointer) move(m_resource_manager->getExecutionSpace()); #endif } template> CHAI_HOST_DEVICE - ManagedSharedPtr(ManagedSharedPtr const& rhs) noexcept : - m_record_count(rhs.m_record_count), - m_active_pointer(rhs.m_active_pointer), - m_resource_manager(rhs.m_resource_manager) + ManagedSharedPtr(ManagedSharedPtr const& rhs) noexcept + : m_record_count(rhs.m_record_count) + , m_active_pointer(rhs.m_active_pointer) + , m_resource_manager(rhs.m_resource_manager) { #if !defined(CHAI_DEVICE_COMPILE) if (m_active_pointer) move(ArrayManager::getInstance()->getExecutionSpace()); @@ -150,7 +148,6 @@ template #include "chai/ChaiMacros.hpp" #include "chai/SharedPtrManager.hpp" @@ -46,27 +47,31 @@ class msp_counted_base { template class msp_counted_ptr final : public msp_counted_base { public: - msp_counted_ptr(Ptr h_p, Ptr d_p) noexcept : - m_record(SharedPtrManager::getInstance()->makeSharedPtrRecord(h_p, d_p, true)) {} - //msp_counted_ptr(Ptr h_p, Ptr d_p) noexcept : m_record(new msp_pointer_record(h_p, d_p)) {} + msp_counted_ptr(Ptr h_p, Ptr d_p) noexcept + : m_record(SharedPtrManager::getInstance()->makeSharedPtrRecord(h_p, d_p, sizeof(std::remove_pointer), true)) + {} + virtual void m_dispose() noexcept { delete (Ptr)m_record->m_pointers[chai::CPU]; }// TODO : Other Exec spaces... 
virtual void m_destroy() noexcept { delete this; } msp_counted_ptr(msp_counted_ptr const&) = delete; msp_counted_ptr& operator=(msp_counted_ptr const&) = delete; - msp_pointer_record* m_get_record() noexcept { return m_record; } private: msp_pointer_record* m_record; }; +#include + template class msp_counted_deleter final : public msp_counted_base { class impl { public: - impl(Ptr h_p, Ptr d_p, Deleter d) : - m_record(SharedPtrManager::getInstance()->makeSharedPtrRecord(h_p, d_p, true)), m_deleter(std::move(d)) {} - //impl(Ptr h_p, Ptr d_p, Deleter d) : m_record(new msp_pointer_record(h_p, d_p)), m_deleter(std::move(d)) {} + impl(Ptr h_p, Ptr d_p, Deleter d) + : m_record(SharedPtrManager::getInstance()->makeSharedPtrRecord(h_p, d_p, sizeof(std::remove_pointer_t), true)) + , m_deleter(std::move(d)) + {} + Deleter& m_del() noexcept { return m_deleter; } msp_pointer_record* m_record; Deleter m_deleter; diff --git a/src/chai/SharedPtrManager.cpp b/src/chai/SharedPtrManager.cpp index 4593cbd0..ceeb5e7e 100644 --- a/src/chai/SharedPtrManager.cpp +++ b/src/chai/SharedPtrManager.cpp @@ -252,11 +252,9 @@ static void copy(void * dst_pointer, void * src_pointer, umpire::ResourceManager camp::resources::Resource host_resource(camp::resources::Host::get_default()); if (dst_space == GPU || src_space == GPU) { // Do the copy using the device resource - //manager.copy(dst_pointer, src_pointer, device_resource); - { - std::cout << "Do Fake Copy to GPU.....\n"; - //CHAI_GPU_ERROR_CHECK(cudaMemcpyAsync(dst_pointer, src_pointer, 1, cudaMemcpyHostToDevice)); - } + manager.copy_poly(dst_pointer, src_pointer, device_resource); + //manager.copy(dst_pointer, src_pointer); + //CHAI_GPU_ERROR_CHECK(cudaMemcpyAsync(dst_pointer, src_pointer, 1, cudaMemcpyHostToDevice)); } else { // Do the copy using the host resource manager.copy(dst_pointer, src_pointer, host_resource); @@ -289,7 +287,6 @@ void SharedPtrManager::move(msp_pointer_record* record, ExecutionSpace space) // dst_pointer = 
record->m_pointers[space]; //} - if ( (!record->m_touched[record->m_last_space]) || (! src_pointer )) { printf("failed move conditions\n"); for (int i = chai::CPU; i < NUM_EXECUTION_SPACES; i++) std::cout << i << " : " <m_touched[i] << std::endl; @@ -342,9 +339,9 @@ void SharedPtrManager::free(msp_pointer_record* pointer_record, ExecutionSpace s if (pointer_record->m_owned[space]) { #if defined(CHAI_ENABLE_UM) if (space_ptr == pointer_record->m_pointers[UM]) { - callback(pointer_record, - ACTION_FREE, - ExecutionSpace(UM)); + //callback(pointer_record, + // ACTION_FREE, + // ExecutionSpace(UM)); auto alloc = m_resource_manager.getAllocator(pointer_record->m_allocators[UM]); alloc.deallocate(space_ptr); @@ -441,7 +438,7 @@ msp_pointer_record* SharedPtrManager::getPointerRecord(void* pointer) } msp_pointer_record* SharedPtrManager::makeSharedPtrRecord(void* pointer, void* d_pointer, - //size_t size, + size_t size, //ExecutionSpace space, bool owned) { @@ -455,12 +452,12 @@ msp_pointer_record* SharedPtrManager::makeSharedPtrRecord(void* pointer, void* d m_resource_manager.registerAllocation( pointer, - {pointer, 1, m_allocators[chai::CPU]->getAllocationStrategy()}); + {pointer, size, m_allocators[chai::CPU]->getAllocationStrategy()}); std::cout << "m_allocators[chai::CPU] : " << m_allocators[chai::CPU]->getName() << std::endl; m_resource_manager.registerAllocation( d_pointer, - {d_pointer, 1, m_allocators[chai::GPU]->getAllocationStrategy()}); + {d_pointer, size, m_allocators[chai::GPU]->getAllocationStrategy()}); std::cout << "m_allocators[chai::GPU] : " << m_allocators[chai::GPU]->getName() << std::endl; auto pointer_record = getPointerRecord(pointer); @@ -489,7 +486,7 @@ msp_pointer_record* SharedPtrManager::makeSharedPtrRecord(void* pointer, void* d } if (pointer) { - registerPointer(pointer_record, chai::CPU, owned); + //registerPointer(pointer_record, chai::CPU, owned); } return pointer_record; diff --git a/src/chai/SharedPtrManager.hpp 
b/src/chai/SharedPtrManager.hpp index 3c9609ed..ee0c5c92 100644 --- a/src/chai/SharedPtrManager.hpp +++ b/src/chai/SharedPtrManager.hpp @@ -265,7 +265,7 @@ class SharedPtrManager //CHAISHAREDDLL_API size_t getSize(void* pointer); CHAISHAREDDLL_API msp_pointer_record* makeSharedPtrRecord(void* pointer, void* d_pointer, - //size_t size, + size_t size, //ExecutionSpace space, bool owned); diff --git a/src/tpl/umpire b/src/tpl/umpire index 1db3fef9..974ef8c1 160000 --- a/src/tpl/umpire +++ b/src/tpl/umpire @@ -1 +1 @@ -Subproject commit 1db3fef913a70d8882ca510a4830c77c388873e0 +Subproject commit 974ef8c18f2728e75005696f6ef27dacce491b88 diff --git a/tests/integration/managed_ptr_tests.cpp b/tests/integration/managed_ptr_tests.cpp index 9acbace4..b0fb7481 100644 --- a/tests/integration/managed_ptr_tests.cpp +++ b/tests/integration/managed_ptr_tests.cpp @@ -186,11 +186,13 @@ GPU_TEST(managed_ptr, shared_ptralloc) chai::SharedPtrManager* sptr_manager = chai::SharedPtrManager::getInstance(); umpire::ResourceManager& res_manager = umpire::ResourceManager::getInstance(); - auto cpu_allocator = sptr_manager->getAllocator(chai::CPU); + auto cpu_allocator = sptr_manager->getAllocator(chai::UM); TestBase* cpu_ptr = static_cast( cpu_allocator.allocate(1*sizeof(TestDerived)) ); new(cpu_ptr) TestDerived(); + std::cout << "check\n"; + TestBase* gpu_ptr = chai::msp_make_on_device(); @@ -207,6 +209,7 @@ GPU_TEST(managed_ptr, shared_ptralloc) unsigned int offset = sizeof(void*); GPU_ERROR_CHECK(cudaMemcpy((char*)gpu_ptr+offset, (char*)cpu_ptr+offset, sizeof(TestDerived)-offset, cudaMemcpyHostToDevice)); + //GPU_ERROR_CHECK(cudaMemcpy((char*)gpu_ptr, (char*)cpu_ptr, sizeof(TestDerived), cudaMemcpyHostToDevice)); forall(gpu(), 0, 1, [=] __device__ (int i) { printf("GPU Body\n"); diff --git a/tests/integration/polymorphism_hana_tests.cpp b/tests/integration/polymorphism_hana_tests.cpp new file mode 100644 index 00000000..8ad93bea --- /dev/null +++ 
b/tests/integration/polymorphism_hana_tests.cpp @@ -0,0 +1,230 @@ +////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC and CHAI +// project contributors. See the CHAI LICENSE file for details. +// +// SPDX-License-Identifier: BSD-3-Clause +////////////////////////////////////////////////////////////////////////////// +#include "camp/defines.hpp" +#include "chai/ChaiMacros.hpp" +#include "chai/ExecutionSpaces.hpp" +#include "chai/ManagedSharedPtr.hpp" +#include "chai/SharedPtrManager.hpp" +#include "gtest/gtest.h" +#include "umpire/ResourceManager.hpp" + +#define GPU_TEST(X, Y) \ + static void gpu_test_##X##Y(); \ + TEST(X, Y) { gpu_test_##X##Y(); } \ + static void gpu_test_##X##Y() + +#include "chai/config.hpp" +#include "chai/ArrayManager.hpp" +#include "chai/ManagedArray.hpp" +#include "chai/managed_ptr.hpp" +#include "chai/ManagedSharedPtr.hpp" + +#include "../src/util/forall.hpp" + +// Standard library headers +#include + +#define BEGIN_EXEC_ON_DEVICE() \ + forall(gpu(), 0, 1, [=] __device__ (int i) { + +#define END_EXEC()\ + }); \ + GPU_ERROR_CHECK( cudaPeekAtLastError() );\ + GPU_ERROR_CHECK( cudaDeviceSynchronize() );\ + + +inline void gpuErrorCheck(cudaError_t code, const char *file, int line, bool abort=true) +{ + if (code != cudaSuccess) { + fprintf(stderr, "[CHAI] GPU Error: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) { + exit(code); + } + } +} + +#define GPU_ERROR_CHECK(code) { gpuErrorCheck((code), __FILE__, __LINE__); } + +void PrintMemory(const unsigned char* memory, + const char label[] = "contents") +{ + std::cout << "Memory " << label << ": \n"; + for (size_t i = 0; i < 4; i++) + { + for (size_t j = 0; j < 8; j++) + printf("%02X ", static_cast (memory[i * 8 + j])); + printf("\n"); + } +} + +#define M_PRINT_MEMORY(memory) \ + for (size_t i = 0; i < 7; i++) \ + { \ + for (size_t j = 0; j < 8; j++) \ + printf("%02X ", static_cast 
(memory[i * 8 + j])); \ + printf("\n"); \ + } + +#define CPU_PRINT_MEMORY(memory, label)\ + printf("HOST Memory "); printf(label); printf("\n"); \ + M_PRINT_MEMORY(memory) \ + +#define GPU_PRINT_MEMORY(memory, label)\ + forall(gpu(), 0, 1, [=] __device__ (int i) { \ + printf("DEVICE Memory "); printf(label); printf("\n"); \ + M_PRINT_MEMORY(memory) \ + }); + + + + + + + + +template +Base_vtable const Base_vtable_for_host = { + "doSomething"_s = [] __host__ __device__ (T& base){ base.doSomehting(); } + ,"setContents"_s = [] __host__ __device__ (T& base, ull val){ base.setContents(val); } +}; + +template +Base_vtable const Base_vtable_for_host = { + [] __host__ __device__ (void* this_){ static_cast(this_)->doSomething(); } + ,[] __host__ __device__ (void* this_, unsigned long long val){ static_cast(this_)->setContents(val); } +}; + +template +__global__ +void Base_vtable_for_device(Base_vtable* vptr_) { + new(vptr_) Base_vtable{[] __host__ __device__ (void* this_){ static_cast(this_)->doSomething(); } + ,[] __host__ __device__ (void* this_, unsigned long long val){ static_cast(this_)->setContents(val); } + }; +}; + + + +//----------------------------------------------------------------------------- + +template +Base_vtable* make_Base_vtable_on_device() { + Base_vtable* vptr_; + cudaMalloc((void**)&vptr_, sizeof(Base_vtable)); + Base_vtable_for_device <<<1,1>>>(vptr_); + return vptr_; +} + +struct CHAIPolyInterface { + + template + CHAIPolyInterface(Any base) + { + vtbl_host_ = &Base_vtable_for_host; + ptr_host_ = new Any{base}; + + vtbl_device_ = make_Base_vtable_on_device(); + cudaMalloc(&ptr_device_, sizeof(Any)); + + obj_size_ = sizeof(Any); + } + + void move(chai::ExecutionSpace space) + { + if (space == chai::CPU) cudaMemcpy(ptr_host_, ptr_device_, obj_size_, cudaMemcpyDeviceToHost); + if (space == chai::GPU) cudaMemcpy(ptr_device_, ptr_host_, obj_size_, cudaMemcpyHostToDevice); + } + +protected: + Base_vtable const* vtbl_host_; + Base_vtable* vtbl_device_; + 
//void* ptr_; + void* ptr_host_; + void* ptr_device_; + + long obj_size_; + +}; + +//----------------------------------------------------------------------------- + +#include +#include + +struct IBase : decltype(camp::requires( + "doSomething"_s = camp::function + "setContents"_s = camp::function +)) {}; + +struct Base { + template + Base(Any base) : poly_(base) {}; + + CHAI_HOST_DEVICE void doSomething() const { poly_.virtual("doSomething"_s)(poly_); } + CHAI_HOST_DEVICE void setContents(unsigned long long val) const { poly.virtual("setContents")(poly_, val); } + +private: + ChaiPolyInterface poly_; +}; + + +struct DerivedA { + CHAI_HOST_DEVICE void doSomething() { printf("DerivedA: doSomething\n"); } + CHAI_HOST_DEVICE void setContents(unsigned long long) {} +}; + +struct DerivedB { + DerivedB() : content(0xDDDDDDDDDDDDDDDDull) {}; + + void doBthing() { printf("concrete B thing"); } + + CHAI_HOST_DEVICE void doSomething() { printf("DerivedB: doSomething : %lX\n", content); } + CHAI_HOST_DEVICE void setContents(unsigned long long val) { content = val; } + +private: + unsigned long long content; +}; + + +GPU_TEST(managed_ptr, customvtabletest) { + + Base b = Base(DerivedA{}); + Base b2 = Base(DerivedB{}); + + b.doSomething(); + b2.doSomething(); + + b.move(chai::GPU); + b2.move(chai::GPU); + + BEGIN_EXEC_ON_DEVICE() + printf("-- GPU Kernel begin\n"); + b.doSomething(); + b2.doSomething(); + printf("-- GPU Kernel end\n"); + END_EXEC() + + + b2.setContents(0xCCCCCCCCCCCCCCCCull); + b2.move(chai::GPU); + + BEGIN_EXEC_ON_DEVICE() + printf("-- GPU Kernel begin\n"); + b.doSomething(); + b2.doSomething(); + b2.setContents(0xBBBBBBBBBBBBBBBBull); + printf("-- GPU Kernel end\n"); + END_EXEC() + + b2.move(chai::CPU); + b2.doSomething(); + + + + + +} + diff --git a/tests/integration/polymorphism_tests.cpp b/tests/integration/polymorphism_tests.cpp index 36d7b457..217e96a0 100644 --- a/tests/integration/polymorphism_tests.cpp +++ b/tests/integration/polymorphism_tests.cpp @@ 
-4,7 +4,9 @@ // // SPDX-License-Identifier: BSD-3-Clause ////////////////////////////////////////////////////////////////////////////// +#include "camp/defines.hpp" #include "chai/ChaiMacros.hpp" +#include "chai/ExecutionSpaces.hpp" #include "chai/ManagedSharedPtr.hpp" #include "chai/SharedPtrManager.hpp" #include "gtest/gtest.h" @@ -26,6 +28,15 @@ // Standard library headers #include +#define BEGIN_EXEC_ON_DEVICE() \ + forall(gpu(), 0, 1, [=] __device__ (int i) { + +#define END_EXEC()\ + }); \ + GPU_ERROR_CHECK( cudaPeekAtLastError() );\ + GPU_ERROR_CHECK( cudaDeviceSynchronize() );\ + + inline void gpuErrorCheck(cudaError_t code, const char *file, int line, bool abort=true) { if (code != cudaSuccess) { @@ -96,6 +107,7 @@ class A CHAI_HOST_DEVICE ~A(void) { printf("-- A has been destructed\n"); } CHAI_HOST_DEVICE virtual void function(void) = 0; CHAI_HOST_DEVICE virtual void d_function(void) = 0; + CHAI_HOST_DEVICE virtual void set_content(unsigned long long) = 0; }; class A2 @@ -111,11 +123,97 @@ class B : public A, public A2 unsigned long long content_B; CHAI_HOST_DEVICE B(void) : content_B(0xBBBBBBBBBBBBBBBBull) { printf("++ B has been constructed\n"); } CHAI_HOST_DEVICE ~B(void) { printf("-- B has been destructed\n"); } - CHAI_HOST_DEVICE virtual void function(void) { printf("%lX\n", content_B); } - CHAI_HOST_DEVICE virtual void d_function(void) { d.function(); } + CHAI_HOST_DEVICE virtual void function(void) override { printf("%lX\n", content_B); } + CHAI_HOST_DEVICE virtual void d_function(void) override { d.function(); } + CHAI_HOST_DEVICE virtual void set_content(unsigned long long val) override { content_B = val; content_A = val; } }; +GPU_TEST(managed_ptr, shared_ptr) +{ + + { + using DerivedT = B; + using BaseT = A; + + std::cout << "size of (DerivedT) : " << sizeof(DerivedT) << std::endl; + std::cout << "size of (BaseT) : " << sizeof(BaseT) << std::endl; + + chai::ManagedSharedPtr sptr = chai::make_shared_deleter( + [](DerivedT* p){ printf("Custom 
Deleter Call\n"); p->~DerivedT(); }); + + std::cout << "use_count : " << sptr.use_count() << std::endl; + + std::cout << "Map Sz : " << chai::SharedPtrManager::getInstance()->getPointerMap().size() << std::endl; + + chai::ManagedSharedPtr sptr2 = sptr; + std::cout << "use_count : " << sptr.use_count() << std::endl; + + sptr->set_content(0xFFFFFFFFFFFFFFFFull); + sptr->function(); + + std::cout << "GPU CALL...\n"; + forall(gpu(), 0, 1, [=] __device__ (int i) { + printf("GPU Body\n"); + sptr->function(); + //results[i] = rawArrayClass->getValue(i); + }); + + } + +} + + + +GPU_TEST(managed_ptr, shared_ptralloc) +{ + + { + + using DerivedT = B; + using BaseT = A; + + chai::SharedPtrManager* sptr_manager = chai::SharedPtrManager::getInstance(); + umpire::ResourceManager& res_manager = umpire::ResourceManager::getInstance(); + + auto cpu_allocator = sptr_manager->getAllocator(chai::CPU); + BaseT* cpu_ptr = static_cast( cpu_allocator.allocate(1*sizeof(DerivedT)) ); + new(cpu_ptr) DerivedT(); + + std::cout << "check\n"; + + + BaseT* gpu_ptr = chai::msp_make_on_device(); + + forall(gpu(), 0, 1, [=] __device__ (int i) { + printf("GPU Body\n"); + gpu_ptr->function(); + gpu_ptr->d_function(); + }); + + std::cout << "Ump alloc cpu : " << cpu_ptr << std::endl; + std::cout << "Ump alloc gpu : " << gpu_ptr << std::endl; + + cpu_ptr->set_content(0xFFFFFFFFFFFFFFFFull); + + camp::resources::Resource device_resource(camp::resources::Cuda::get_default()); + res_manager.copy_poly(gpu_ptr, cpu_ptr, device_resource); + + //unsigned int offset = sizeof(void*); + //GPU_ERROR_CHECK(cudaMemcpy((char*)gpu_ptr+offset, (char*)cpu_ptr+offset, sizeof(DerivedT)-offset, cudaMemcpyHostToDevice)); + //GPU_ERROR_CHECK(cudaMemcpy((char*)gpu_ptr, (char*)cpu_ptr, sizeof(DerivedT), cudaMemcpyHostToDevice)); + + forall(gpu(), 0, 1, [=] __device__ (int i) { + printf("GPU Body\n"); + gpu_ptr->function(); + gpu_ptr->d_function(); + }); + GPU_ERROR_CHECK( cudaPeekAtLastError() ); + GPU_ERROR_CHECK( 
cudaDeviceSynchronize() ); + + } + //assert_empty_map(); +} GPU_TEST(managed_ptr, polycpytest) { @@ -197,10 +295,6 @@ GPU_TEST(managed_ptr, polycpytest) #endif CPU_PRINT_MEMORY(memory1, "1 : after copy from host"); - - - - // Free up memory, we useed placement new so we need to call the destructor first... reinterpret_cast(memory1)->~B(); forall(gpu(), 0, 1, [=] __device__ (int i) { reinterpret_cast(memory2)->~B(); }); @@ -209,3 +303,160 @@ GPU_TEST(managed_ptr, polycpytest) } + + + + + + + +struct Base_vtable { + void (*doSomething)(void* this_); + void (*setContents)(void* this_, unsigned long long val); +}; + +template +Base_vtable const Base_vtable_for_host = { + [] __host__ __device__ (void* this_){ static_cast(this_)->doSomething(); } + ,[] __host__ __device__ (void* this_, unsigned long long val){ static_cast(this_)->setContents(val); } +}; + +template +__global__ +void Base_vtable_for_device(Base_vtable* vptr_) { + new(vptr_) Base_vtable{[] __host__ __device__ (void* this_){ static_cast(this_)->doSomething(); } + ,[] __host__ __device__ (void* this_, unsigned long long val){ static_cast(this_)->setContents(val); } + }; +}; + + + +//----------------------------------------------------------------------------- + +#if !defined(CHAI_DEVICE_COMPILE) +#define CHAI_POLY_VIRTUAL_CALL(name) \ + return vtbl_host_->name((void*) ptr_host_); +#define CHAI_POLY_VIRTUAL_CALL_ARGS(name, ...) \ + return vtbl_host_->name((void*) ptr_host_, __VA_ARGS__); +#else +#define CHAI_POLY_VIRTUAL_CALL(name) \ + return vtbl_device_->name((void*) ptr_device_); +#define CHAI_POLY_VIRTUAL_CALL_ARGS(name, ...) 
\ + return vtbl_device_->name((void*) ptr_device_, __VA_ARGS__); +#endif + +template +Base_vtable* make_Base_vtable_on_device() { + Base_vtable* vptr_; + cudaMalloc((void**)&vptr_, sizeof(Base_vtable)); + Base_vtable_for_device <<<1,1>>>(vptr_); + return vptr_; +} + +struct CHAIPolyInterface { + + template + CHAIPolyInterface(Any base) + { + vtbl_host_ = &Base_vtable_for_host; + ptr_host_ = new Any{base}; + + vtbl_device_ = make_Base_vtable_on_device(); + cudaMalloc(&ptr_device_, sizeof(Any)); + + obj_size_ = sizeof(Any); + } + + void move(chai::ExecutionSpace space) + { + if (space == chai::CPU) cudaMemcpy(ptr_host_, ptr_device_, obj_size_, cudaMemcpyDeviceToHost); + if (space == chai::GPU) cudaMemcpy(ptr_device_, ptr_host_, obj_size_, cudaMemcpyHostToDevice); + } + +protected: + Base_vtable const* vtbl_host_; + Base_vtable* vtbl_device_; + //void* ptr_; + void* ptr_host_; + void* ptr_device_; + + long obj_size_; + +}; + +//----------------------------------------------------------------------------- + +#include +#include + + +struct Base: CHAIPolyInterface { + using Poly = CHAIPolyInterface; + + template + Base(Any base) : Poly(base) {}; + + CHAI_HOST_DEVICE void doSomething() const { CHAI_POLY_VIRTUAL_CALL(doSomething) } + CHAI_HOST_DEVICE void setContents(unsigned long long val) const { CHAI_POLY_VIRTUAL_CALL_ARGS(setContents, val) } +}; + + +struct DerivedA { + CHAI_HOST_DEVICE void doSomething() { printf("DerivedA: doSomething\n"); } + CHAI_HOST_DEVICE void setContents(unsigned long long) {} +}; + +struct DerivedB { + DerivedB() : content(0xDDDDDDDDDDDDDDDDull) {}; + + void doBthing() { printf("concrete B thing"); } + + CHAI_HOST_DEVICE void doSomething() + { + printf("DerivedB: doSomething\n"); + printf("%lX\n", content); + } + CHAI_HOST_DEVICE void setContents(unsigned long long val) { content = val; } + unsigned long long content; +}; + + +GPU_TEST(managed_ptr, customvtabletest) { + + Base b = Base(DerivedA{}); + Base b2 = Base(DerivedB{}); + + 
b.doSomething(); + b2.doSomething(); + + b.move(chai::GPU); + b2.move(chai::GPU); + + BEGIN_EXEC_ON_DEVICE() + printf("-- GPU Kernel begin\n"); + b.doSomething(); + b2.doSomething(); + printf("-- GPU Kernel end\n"); + END_EXEC() + + + b2.setContents(0xCCCCCCCCCCCCCCCCull); + b2.move(chai::GPU); + + BEGIN_EXEC_ON_DEVICE() + printf("-- GPU Kernel begin\n"); + b.doSomething(); + b2.doSomething(); + b2.setContents(0xBBBBBBBBBBBBBBBBull); + printf("-- GPU Kernel end\n"); + END_EXEC() + + b2.move(chai::CPU); + b2.doSomething(); + + + + + +} + From 5cabef2ac5f7dde312ccc8b3b1d9cb320104bdb9 Mon Sep 17 00:00:00 2001 From: mdavis36 Date: Wed, 1 May 2024 15:01:40 -0700 Subject: [PATCH 07/44] Fixing Umpire copies; Using umpire allocators in makeSharedPtr. --- src/chai/ManagedSharedPtr.hpp | 28 ++++++++++------- src/chai/SharedPtrManager.cpp | 38 +++++++++++++----------- tests/integration/polymorphism_tests.cpp | 15 +++++----- 3 files changed, 45 insertions(+), 36 deletions(-) diff --git a/src/chai/ManagedSharedPtr.hpp b/src/chai/ManagedSharedPtr.hpp index 3b10252b..bc0e5f03 100644 --- a/src/chai/ManagedSharedPtr.hpp +++ b/src/chai/ManagedSharedPtr.hpp @@ -110,7 +110,7 @@ class ManagedSharedPtr { CHAI_HOST void move(ExecutionSpace space, bool registerTouch = true) noexcept { - printf("Calling move\n"); + //printf("Calling move\n"); ExecutionSpace prev_space = m_record_count.m_get_record()->m_last_space; if (prev_space == CPU || prev_space == NONE) { /// Move nested ManagedArrays first, so they are working with a valid m_active_pointer for the host, @@ -164,22 +164,28 @@ CHAI_HOST Tp* msp_make_on_device(Args... args) { } template -ManagedSharedPtr make_shared(Args... args) { - Tp* gpu_pointer = make_on_device(args...); - Tp* cpu_pointer = make_on_host(args...); - std::cout << "CPU @ " << cpu_pointer << std::endl; - std::cout << "GPU @ " << gpu_pointer << std::endl; +CHAI_HOST Tp* msp_make_on_host(Args... 
args) { + chai::SharedPtrManager* sptr_manager = chai::SharedPtrManager::getInstance(); + + auto cpu_allocator = sptr_manager->getAllocator(chai::CPU); + Tp* cpu_ptr = static_cast( cpu_allocator.allocate(1*sizeof(Tp)) ); + new (cpu_ptr) Tp{args...}; + + return cpu_ptr; +} + +template +ManagedSharedPtr make_shared(Args... args) { + Tp* gpu_pointer = msp_make_on_device(args...); + Tp* cpu_pointer = msp_make_on_host(args...); return ManagedSharedPtr(cpu_pointer, gpu_pointer, [](Tp* p){delete p;}); } template ManagedSharedPtr make_shared_deleter(Args... args, Deleter d) { - Tp* gpu_pointer = make_on_device(args...); - Tp* cpu_pointer = make_on_host(args...); - std::cout << "CPU @ " << cpu_pointer << std::endl; - std::cout << "GPU @ " << gpu_pointer << std::endl; - + Tp* gpu_pointer = msp_make_on_device(args...); + Tp* cpu_pointer = msp_make_on_host(args...); return ManagedSharedPtr(cpu_pointer, gpu_pointer, std::move(d)); } diff --git a/src/chai/SharedPtrManager.cpp b/src/chai/SharedPtrManager.cpp index ceeb5e7e..8774fac5 100644 --- a/src/chai/SharedPtrManager.cpp +++ b/src/chai/SharedPtrManager.cpp @@ -252,9 +252,11 @@ static void copy(void * dst_pointer, void * src_pointer, umpire::ResourceManager camp::resources::Resource host_resource(camp::resources::Host::get_default()); if (dst_space == GPU || src_space == GPU) { // Do the copy using the device resource - manager.copy_poly(dst_pointer, src_pointer, device_resource); - //manager.copy(dst_pointer, src_pointer); - //CHAI_GPU_ERROR_CHECK(cudaMemcpyAsync(dst_pointer, src_pointer, 1, cudaMemcpyHostToDevice)); + std::size_t vtable_size = sizeof(void*); + void* poly_src_ptr = ((char*)src_pointer + vtable_size); + void* poly_dst_ptr = ((char*)dst_pointer + vtable_size); + + manager.copy(poly_dst_ptr, poly_src_ptr); } else { // Do the copy using the host resource manager.copy(dst_pointer, src_pointer, host_resource); @@ -288,22 +290,22 @@ void SharedPtrManager::move(msp_pointer_record* record, ExecutionSpace space) //} if 
( (!record->m_touched[record->m_last_space]) || (! src_pointer )) { - printf("failed move conditions\n"); - for (int i = chai::CPU; i < NUM_EXECUTION_SPACES; i++) std::cout << i << " : " <m_touched[i] << std::endl; - std::cout << record->m_last_space << std::endl; - std::cout << record->m_touched[record->m_last_space] << std::endl; - std::cout << (src_pointer) << std::endl; + //printf("failed move conditions\n"); + //for (int i = chai::CPU; i < NUM_EXECUTION_SPACES; i++) std::cout << i << " : " <m_touched[i] << std::endl; + //std::cout << record->m_last_space << std::endl; + //std::cout << record->m_touched[record->m_last_space] << std::endl; + //std::cout << (src_pointer) << std::endl; return; } else if (dst_pointer != src_pointer) { // Exclude the copy if src and dst are the same (can happen for PINNED memory) { - printf("Performing Copy\n"); - std::cout << "dst_pointer : " << dst_pointer << std::endl; - std::cout << "src_pointer : " << src_pointer << std::endl; - std::cout << "space : " << space << std::endl; - std::cout << "prev_space : " << prev_space << std::endl; - std::cout << m_resource_manager.findAllocatorForPointer(dst_pointer)->getName() << std::endl; - std::cout << m_resource_manager.findAllocatorForPointer(src_pointer)->getName() << std::endl; + //printf("Performing Copy\n"); + //std::cout << "dst_pointer : " << dst_pointer << std::endl; + //std::cout << "src_pointer : " << src_pointer << std::endl; + //std::cout << "space : " << space << std::endl; + //std::cout << "prev_space : " << prev_space << std::endl; + //std::cout << m_resource_manager.findAllocatorForPointer(dst_pointer)->getName() << std::endl; + //std::cout << m_resource_manager.findAllocatorForPointer(src_pointer)->getName() << std::endl; chai::copy(dst_pointer, src_pointer, m_resource_manager, space, prev_space); } @@ -453,12 +455,12 @@ msp_pointer_record* SharedPtrManager::makeSharedPtrRecord(void* pointer, void* d m_resource_manager.registerAllocation( pointer, {pointer, size, 
m_allocators[chai::CPU]->getAllocationStrategy()}); - std::cout << "m_allocators[chai::CPU] : " << m_allocators[chai::CPU]->getName() << std::endl; + //std::cout << "m_allocators[chai::CPU] : " << m_allocators[chai::CPU]->getName() << std::endl; m_resource_manager.registerAllocation( d_pointer, {d_pointer, size, m_allocators[chai::GPU]->getAllocationStrategy()}); - std::cout << "m_allocators[chai::GPU] : " << m_allocators[chai::GPU]->getName() << std::endl; + //std::cout << "m_allocators[chai::GPU] : " << m_allocators[chai::GPU]->getName() << std::endl; auto pointer_record = getPointerRecord(pointer); @@ -486,7 +488,7 @@ msp_pointer_record* SharedPtrManager::makeSharedPtrRecord(void* pointer, void* d } if (pointer) { - //registerPointer(pointer_record, chai::CPU, owned); + registerPointer(pointer_record, chai::CPU, owned); } return pointer_record; diff --git a/tests/integration/polymorphism_tests.cpp b/tests/integration/polymorphism_tests.cpp index 217e96a0..ecc2f0ff 100644 --- a/tests/integration/polymorphism_tests.cpp +++ b/tests/integration/polymorphism_tests.cpp @@ -141,22 +141,23 @@ GPU_TEST(managed_ptr, shared_ptr) chai::ManagedSharedPtr sptr = chai::make_shared_deleter( [](DerivedT* p){ printf("Custom Deleter Call\n"); p->~DerivedT(); }); - std::cout << "use_count : " << sptr.use_count() << std::endl; - std::cout << "Map Sz : " << chai::SharedPtrManager::getInstance()->getPointerMap().size() << std::endl; chai::ManagedSharedPtr sptr2 = sptr; std::cout << "use_count : " << sptr.use_count() << std::endl; sptr->set_content(0xFFFFFFFFFFFFFFFFull); - sptr->function(); std::cout << "GPU CALL...\n"; forall(gpu(), 0, 1, [=] __device__ (int i) { printf("GPU Body\n"); - sptr->function(); + sptr2->function(); + sptr2->d_function(); + //results[i] = rawArrayClass->getValue(i); }); + GPU_ERROR_CHECK( cudaPeekAtLastError() ); + GPU_ERROR_CHECK( cudaDeviceSynchronize() ); } @@ -179,11 +180,10 @@ GPU_TEST(managed_ptr, shared_ptralloc) BaseT* cpu_ptr = static_cast( 
cpu_allocator.allocate(1*sizeof(DerivedT)) ); new(cpu_ptr) DerivedT(); - - std::cout << "check\n"; + BaseT* gpu_ptr = chai::msp_make_on_device(); - BaseT* gpu_ptr = chai::msp_make_on_device(); + auto record = sptr_manager->makeSharedPtrRecord(cpu_ptr, gpu_ptr, sizeof(DerivedT), true); forall(gpu(), 0, 1, [=] __device__ (int i) { printf("GPU Body\n"); @@ -193,6 +193,7 @@ GPU_TEST(managed_ptr, shared_ptralloc) std::cout << "Ump alloc cpu : " << cpu_ptr << std::endl; std::cout << "Ump alloc gpu : " << gpu_ptr << std::endl; + std::cout << "Map Sz : " << chai::SharedPtrManager::getInstance()->getPointerMap().size() << std::endl; cpu_ptr->set_content(0xFFFFFFFFFFFFFFFFull); From 7b0176e76f6a0ece25df5b698740b7e83a7e3324 Mon Sep 17 00:00:00 2001 From: mdavis36 Date: Tue, 7 May 2024 11:15:57 -0700 Subject: [PATCH 08/44] Recursive polymorphic object copies. --- src/chai/ManagedSharedPtr.hpp | 52 +++++++++-- src/chai/SharedPtrCounter.hpp | 26 ++++++ src/chai/SharedPtrManager.cpp | 39 ++++---- tests/integration/polymorphism_tests.cpp | 110 ++++++++++++++++++++++- 4 files changed, 201 insertions(+), 26 deletions(-) diff --git a/src/chai/ManagedSharedPtr.hpp b/src/chai/ManagedSharedPtr.hpp index bc0e5f03..1131942e 100644 --- a/src/chai/ManagedSharedPtr.hpp +++ b/src/chai/ManagedSharedPtr.hpp @@ -69,6 +69,7 @@ class ManagedSharedPtr { , m_resource_manager(rhs.m_resource_manager) { #if !defined(CHAI_DEVICE_COMPILE) + std::cout << "ManagedSharedPtr Copy Ctor\n"; if (m_active_pointer) move(ArrayManager::getInstance()->getExecutionSpace()); // TODO: Use a generic interface for RAJA queries. 
//if (m_active_pointer) move(m_resource_manager->getExecutionSpace()); #endif @@ -91,7 +92,9 @@ class ManagedSharedPtr { * Accessors */ CHAI_HOST_DEVICE - element_type* get(ExecutionSpace space = chai::CPU) const noexcept { return m_active_pointer; } + element_type* get(ExecutionSpace space = chai::CPU) const noexcept { + return m_active_pointer; + } CHAI_HOST_DEVICE element_type& operator*() const noexcept { assert(get() != nullptr); return *get(); } @@ -110,14 +113,17 @@ class ManagedSharedPtr { CHAI_HOST void move(ExecutionSpace space, bool registerTouch = true) noexcept { - //printf("Calling move\n"); ExecutionSpace prev_space = m_record_count.m_get_record()->m_last_space; - if (prev_space == CPU || prev_space == NONE) { + if (prev_space == CPU && space == GPU) { + //if (prev_space == CPU || prev_space == NONE) { /// Move nested ManagedArrays first, so they are working with a valid m_active_pointer for the host, // and so the meta data associated with them are updated before we move the other array down. 
- //moveInnerImpl(); + moveInnerImpl(); } m_active_pointer = static_cast(m_resource_manager->move((void *)m_active_pointer, m_record_count.m_get_record(), space)); + if (prev_space == CPU && space == GPU) { + std::cout << "m_active_pointer @ " << m_active_pointer << std::endl; + } if (registerTouch) { m_resource_manager->registerTouch(m_record_count.m_get_record(), space); @@ -125,7 +131,7 @@ class ManagedSharedPtr { if (space != GPU && prev_space == GPU) { /// Move nested ManagedArrays after the move, so they are working with a valid m_active_pointer for the host, // and so the meta data associated with them are updated with live GPU data - //moveInnerImpl(); + moveInnerImpl(); } } @@ -140,6 +146,30 @@ class ManagedSharedPtr { mutable element_type* m_active_pointer = nullptr; mutable SharedPtrManager* m_resource_manager = nullptr; + + template ::value, + typename std::enable_if::type = 0> + CHAI_HOST + void + moveInnerImpl() + { + std::cout << "moveInnerImpl\n"; + m_record_count.moveInnerImpl(); + //Tp * host_ptr = (Tp *) m_record_count.m_get_record()->m_pointers[CPU]; + //// trigger the copy constructor + //Tp inner = Tp(*host_ptr); + // ensure the inner type gets the state of the result of the copy + // host_ptr[i].shallowCopy(inner); + } + + template ::value, + typename std::enable_if::type = 0> + CHAI_HOST + void + moveInnerImpl() + { + } + }; @@ -147,7 +177,7 @@ template __global__ void msp_make_on_device(T* gpuPointer, Args... args) { - new(gpuPointer) T(processArguments(args)...); + new(gpuPointer) T((args)...); } @@ -177,15 +207,21 @@ CHAI_HOST Tp* msp_make_on_host(Args... args) { template ManagedSharedPtr make_shared(Args... 
args) { - Tp* gpu_pointer = msp_make_on_device(args...); + Tp* gpu_pointer = msp_make_on_device(); + //Tp* gpu_pointer = msp_make_on_device(args...); Tp* cpu_pointer = msp_make_on_host(args...); + std::cout << "CPU pointer @ " << cpu_pointer << std::endl; + std::cout << "GPU pointer @ " << gpu_pointer << std::endl; return ManagedSharedPtr(cpu_pointer, gpu_pointer, [](Tp* p){delete p;}); } template ManagedSharedPtr make_shared_deleter(Args... args, Deleter d) { - Tp* gpu_pointer = msp_make_on_device(args...); + Tp* gpu_pointer = msp_make_on_device(); + //Tp* gpu_pointer = msp_make_on_device(args...); Tp* cpu_pointer = msp_make_on_host(args...); + std::cout << "CPU pointer @ " << cpu_pointer << std::endl; + std::cout << "GPU pointer @ " << gpu_pointer << std::endl; return ManagedSharedPtr(cpu_pointer, gpu_pointer, std::move(d)); } diff --git a/src/chai/SharedPtrCounter.hpp b/src/chai/SharedPtrCounter.hpp index 5d929afc..065ff53d 100644 --- a/src/chai/SharedPtrCounter.hpp +++ b/src/chai/SharedPtrCounter.hpp @@ -24,6 +24,8 @@ class msp_counted_base { virtual void m_dispose() noexcept = 0; virtual void m_destroy() noexcept { delete this; } + virtual void moveInnerImpl() = 0; + void m_add_ref_copy() noexcept { ++m_use_count; } void m_release() noexcept { @@ -53,6 +55,17 @@ class msp_counted_ptr final : public msp_counted_base { virtual void m_dispose() noexcept { delete (Ptr)m_record->m_pointers[chai::CPU]; }// TODO : Other Exec spaces... 
virtual void m_destroy() noexcept { delete this; } + + virtual void moveInnerImpl() { + using T = std::remove_pointer_t; + Ptr host_ptr = (Ptr) m_record->m_pointers[CPU]; + // trigger the copy constructor + std::cout << "Trigger Inner Copy Ctor\n"; + T inner = T(*host_ptr); + // ensure the inner type gets the state of the result of the copy + host_ptr->operator=(inner); + } + msp_counted_ptr(msp_counted_ptr const&) = delete; msp_counted_ptr& operator=(msp_counted_ptr const&) = delete; msp_pointer_record* m_get_record() noexcept { return m_record; } @@ -84,6 +97,17 @@ class msp_counted_deleter final : public msp_counted_base { m_impl.m_del()((Ptr)m_impl.m_record->m_pointers[chai::CPU]); } virtual void m_destroy() noexcept { this->~msp_counted_deleter(); } + + virtual void moveInnerImpl() { + using T = std::remove_pointer_t; + Ptr host_ptr = (Ptr) m_impl.m_record->m_pointers[CPU]; + // trigger the copy constructor + std::cout << "Trigger Inner Copy Ctor\n"; + T inner = T(*host_ptr); + // ensure the inner type gets the state of the result of the copy + host_ptr->operator=(inner); + } + msp_counted_deleter(msp_counted_deleter const&) = delete; msp_counted_deleter& operator=(msp_counted_deleter const&) = delete; @@ -154,6 +178,8 @@ class msp_record_count { template Ptr* m_get_pointer(chai::ExecutionSpace space) noexcept { return static_cast(m_get_record()->m_pointers[space]); } + void moveInnerImpl() { m_pi->moveInnerImpl(); } + msp_counted_base* m_pi; }; diff --git a/src/chai/SharedPtrManager.cpp b/src/chai/SharedPtrManager.cpp index 8774fac5..a7685d9b 100644 --- a/src/chai/SharedPtrManager.cpp +++ b/src/chai/SharedPtrManager.cpp @@ -249,22 +249,29 @@ static void copy(void * dst_pointer, void * src_pointer, umpire::ResourceManager camp::resources::Resource device_resource(camp::resources::Host::get_default()); #endif - camp::resources::Resource host_resource(camp::resources::Host::get_default()); - if (dst_space == GPU || src_space == GPU) { - // Do the copy using the 
device resource - std::size_t vtable_size = sizeof(void*); - void* poly_src_ptr = ((char*)src_pointer + vtable_size); - void* poly_dst_ptr = ((char*)dst_pointer + vtable_size); - - manager.copy(poly_dst_ptr, poly_src_ptr); - } else { - // Do the copy using the host resource - manager.copy(dst_pointer, src_pointer, host_resource); - } - // Ensure device to host copies are synchronous - if (dst_space == CPU && src_space == GPU) { - device_resource.wait(); - } + + std::cout << "SPtr Manager Copy Call\n"; + std::cout << "dst_ptr @ " << dst_pointer << std::endl; + std::cout << "src_ptr @ " << src_pointer << std::endl; + camp::resources::Resource host_resource(camp::resources::Host::get_default()); + if (dst_space == GPU || src_space == GPU) { + // Do the copy using the device resource + std::size_t vtable_size = sizeof(void*); + void* poly_src_ptr = ((char*)src_pointer + vtable_size); + void* poly_dst_ptr = ((char*)dst_pointer + vtable_size); + + std::cout << "---- Sptr Manager Device Copy\n"; + std::cout << "---- dst_ptr @ " << dst_pointer << std::endl; + std::cout << "---- src_ptr @ " << src_pointer << std::endl; + manager.copy(poly_dst_ptr, poly_src_ptr); + } else { + // Do the copy using the host resource + manager.copy(dst_pointer, src_pointer, host_resource); + } + // Ensure device to host copies are synchronous + if (dst_space == CPU && src_space == GPU) { + device_resource.wait(); + } } void SharedPtrManager::move(msp_pointer_record* record, ExecutionSpace space) diff --git a/tests/integration/polymorphism_tests.cpp b/tests/integration/polymorphism_tests.cpp index ecc2f0ff..5abbeed9 100644 --- a/tests/integration/polymorphism_tests.cpp +++ b/tests/integration/polymorphism_tests.cpp @@ -128,9 +128,116 @@ class B : public A, public A2 CHAI_HOST_DEVICE virtual void set_content(unsigned long long val) override { content_B = val; content_A = val; } }; -GPU_TEST(managed_ptr, shared_ptr) + +class AAbsMem : public chai::CHAICopyable { +public: + unsigned long long 
content_A; + chai::ManagedSharedPtr base_member; + CHAI_HOST_DEVICE AAbsMem(void) : content_A(0xAAAAAAAAAAAAAAAAull) { printf("++ A has been constructed\n"); } + template + CHAI_HOST_DEVICE AAbsMem(chai::ManagedSharedPtr const& base_val) + : base_member(base_val) + , content_A(0xAAAAAAAAAAAAAAAAull) + { printf("++ A has been constructed\n"); } + //CHAI_HOST_DEVICE AAbsMem(AAbsMem const& rhs) + // : base_member(rhs.base_member) + // , content_A(rhs.content_A) + //{ printf("AAbsMem CopyCtor\n"); } + CHAI_HOST_DEVICE ~AAbsMem(void) { printf("-- A has been destructed\n"); } + CHAI_HOST_DEVICE virtual void function(void) = 0; + CHAI_HOST_DEVICE virtual void d_function(void) = 0; + CHAI_HOST_DEVICE virtual void set_content(unsigned long long) = 0; +}; + +class BAbsMem : public AAbsMem +{ +public: + unsigned long long content_B; + + CHAI_HOST_DEVICE BAbsMem() : AAbsMem() {}; + + template + CHAI_HOST BAbsMem(chai::ManagedSharedPtr const& base_val) + : AAbsMem(base_val) + , content_B(0xBBBBBBBBBBBBBBBBull) + { + printf("++ B has been constructed\n"); + } + + //CHAI_HOST_DEVICE BAbsMem(BAbsMem const& rhs) + // : AAbsMem(rhs) + // , content_B(rhs.content_B) + //{ printf("BAbsMem CopyCtor\n"); } + CHAI_HOST_DEVICE ~BAbsMem(void) { printf("-- B has been destructed\n"); } + CHAI_HOST_DEVICE virtual void function(void) override { printf("%lX\n", content_B); } + CHAI_HOST_DEVICE virtual void d_function(void) override { + printf("base_member @ %p\n", &base_member); + printf("base_member.get() @ %p\n", base_member.get()); + base_member->function(); + } + CHAI_HOST_DEVICE virtual void set_content(unsigned long long val) override { content_B = val; content_A = val; } +}; + + +GPU_TEST(managed_ptr, shared_ptr_absmem) +{ + { + using DerivedT = BAbsMem; + using BaseT = AAbsMem; + + std::cout << "size of (DerivedT) : " << sizeof(DerivedT) << std::endl; + std::cout << "size of (BaseT) : " << sizeof(BaseT) << std::endl; + + auto d = chai::make_shared(); + GPU_ERROR_CHECK( 
cudaPeekAtLastError() ); + GPU_ERROR_CHECK( cudaDeviceSynchronize() ); + + chai::ManagedSharedPtr sptr = chai::make_shared(d); + GPU_ERROR_CHECK( cudaPeekAtLastError() ); + GPU_ERROR_CHECK( cudaDeviceSynchronize() ); + + std::cout << "Map Sz : " << chai::SharedPtrManager::getInstance()->getPointerMap().size() << std::endl; + + chai::ManagedSharedPtr sptr2 = sptr; + std::cout << "use_count : " << sptr.use_count() << std::endl; + sptr2->function(); + sptr2->d_function(); + + auto mem_ptr = sptr2->base_member; + mem_ptr->function(); + + std::cout << "GPU CALL...\n"; + forall(gpu(), 0, 1, [=] __device__ (int i) { + printf("GPU Body\n"); + sptr2->function(); + sptr2->d_function(); + }); + GPU_ERROR_CHECK( cudaPeekAtLastError() ); + GPU_ERROR_CHECK( cudaDeviceSynchronize() ); + + forall(sequential(), 0, 1, [=] (int i) { + printf("CPU Body\n"); + sptr->set_content(0xFFFFFFFFFFFFFFFFull); + sptr->function(); + sptr->d_function(); + }); + + std::cout << "GPU CALL...\n"; + forall(gpu(), 0, 1, [=] __device__ (int i) { + printf("GPU Body\n"); + sptr->function(); + sptr->d_function(); + }); + GPU_ERROR_CHECK( cudaPeekAtLastError() ); + GPU_ERROR_CHECK( cudaDeviceSynchronize() ); + + } +} + +GPU_TEST(managed_ptr, shared_ptr) +{ { using DerivedT = B; using BaseT = A; @@ -160,7 +267,6 @@ GPU_TEST(managed_ptr, shared_ptr) GPU_ERROR_CHECK( cudaDeviceSynchronize() ); } - } From 435172ae52bd1b279e46744ca67959552e1e4461 Mon Sep 17 00:00:00 2001 From: mdavis36 Date: Wed, 8 May 2024 13:27:18 -0700 Subject: [PATCH 09/44] Better copies between host + device; Register Touch considers const and CHAICopyable status of type. 
--- src/chai/ManagedSharedPtr.hpp | 120 ++-- src/chai/SharedPtrCounter.hpp | 17 +- src/chai/SharedPtrManager.cpp | 5 +- src/chai/SharedPtrManager.hpp | 2 +- tests/integration/polymorphism_tests.cpp | 731 +++++++++++------------ 5 files changed, 466 insertions(+), 409 deletions(-) diff --git a/src/chai/ManagedSharedPtr.hpp b/src/chai/ManagedSharedPtr.hpp index 1131942e..0067c985 100644 --- a/src/chai/ManagedSharedPtr.hpp +++ b/src/chai/ManagedSharedPtr.hpp @@ -6,6 +6,7 @@ #include "chai/ArrayManager.hpp" #include "chai/ChaiMacros.hpp" #include "chai/ExecutionSpaces.hpp" +#include "chai/ManagedArray.hpp" #include "chai/SharedPtrCounter.hpp" #include "chai/managed_ptr.hpp" @@ -21,6 +22,9 @@ struct msp_compatible_with : std::false_type {}; template struct msp_compatible_with : std::is_convertible::type {}; +template +struct is_CHAICopyable : std::is_base_of::type {}; + template class ManagedSharedPtr { @@ -69,7 +73,7 @@ class ManagedSharedPtr { , m_resource_manager(rhs.m_resource_manager) { #if !defined(CHAI_DEVICE_COMPILE) - std::cout << "ManagedSharedPtr Copy Ctor\n"; + std::cout << "ManagedSharedPtr Copy Ctor: m_active_pointer @ " << m_active_pointer << std::endl; if (m_active_pointer) move(ArrayManager::getInstance()->getExecutionSpace()); // TODO: Use a generic interface for RAJA queries. 
//if (m_active_pointer) move(m_resource_manager->getExecutionSpace()); #endif @@ -92,7 +96,21 @@ class ManagedSharedPtr { * Accessors */ CHAI_HOST_DEVICE + const element_type* cget(ExecutionSpace space = chai::CPU) const noexcept { +#if !defined(CHAI_DEVICE_COMPILE) + if (m_active_pointer) { + //move(CPU, false); + } +#endif + return m_active_pointer; + } + CHAI_HOST_DEVICE element_type* get(ExecutionSpace space = chai::CPU) const noexcept { +#if !defined(CHAI_DEVICE_COMPILE) + if (m_active_pointer) { + //move(CPU); + } +#endif return m_active_pointer; } @@ -111,27 +129,37 @@ class ManagedSharedPtr { public: long use_count() const noexcept { return m_record_count.m_get_use_count(); } + CHAI_INLINE + CHAI_HOST void registerTouch(ExecutionSpace space) { + m_resource_manager->registerTouch(m_record_count.m_get_record(), space); + } + CHAI_HOST - void move(ExecutionSpace space, bool registerTouch = true) noexcept { + void move(ExecutionSpace space,// bool registerTouch = true) noexcept { + bool registerTouch=(!std::is_const::value || is_CHAICopyable::value)) { + //bool registerTouch=false) { ExecutionSpace prev_space = m_record_count.m_get_record()->m_last_space; - if (prev_space == CPU && space == GPU) { - //if (prev_space == CPU || prev_space == NONE) { + ExecutionSpace oldContext = m_resource_manager->getExecutionSpace(); + if (prev_space != GPU && space == GPU) { /// Move nested ManagedArrays first, so they are working with a valid m_active_pointer for the host, // and so the meta data associated with them are updated before we move the other array down. 
- moveInnerImpl(); + std::cout << "Pre-move InnerImpl\n"; + moveInnerImpl(); } + auto old_pointer = m_active_pointer; m_active_pointer = static_cast(m_resource_manager->move((void *)m_active_pointer, m_record_count.m_get_record(), space)); - if (prev_space == CPU && space == GPU) { - std::cout << "m_active_pointer @ " << m_active_pointer << std::endl; + if (old_pointer != m_active_pointer) { + std::cout << "m_active_pointer @ " << m_active_pointer << " : def touch behaviour : " << (!std::is_const::value || is_CHAICopyable::value) << std::endl; } if (registerTouch) { m_resource_manager->registerTouch(m_record_count.m_get_record(), space); } if (space != GPU && prev_space == GPU) { - /// Move nested ManagedArrays after the move, so they are working with a valid m_active_pointer for the host, - // and so the meta data associated with them are updated with live GPU data - moveInnerImpl(); + /// Move nested ManagedArrays after the move, so they are working with a valid m_active_pointer for the host, + // and so the meta data associated with them are updated with live GPU data + std::cout << "Post-move InnerImpl\n"; + moveInnerImpl(); } } @@ -147,22 +175,18 @@ class ManagedSharedPtr { mutable SharedPtrManager* m_resource_manager = nullptr; - template ::value, + template ::value, + //template ::value, typename std::enable_if::type = 0> CHAI_HOST void moveInnerImpl() { - std::cout << "moveInnerImpl\n"; m_record_count.moveInnerImpl(); - //Tp * host_ptr = (Tp *) m_record_count.m_get_record()->m_pointers[CPU]; - //// trigger the copy constructor - //Tp inner = Tp(*host_ptr); - // ensure the inner type gets the state of the result of the copy - // host_ptr[i].shallowCopy(inner); } - template ::value, + template ::value, + //template ::value, typename std::enable_if::type = 0> CHAI_HOST void @@ -172,54 +196,82 @@ class ManagedSharedPtr { }; +namespace detail { +namespace impl { template -__global__ void msp_make_on_device(T* gpuPointer, Args... 
args) +__global__ void msp_make_on_device(T* gpuPointer, Args&&... args) { - new(gpuPointer) T((args)...); + new(gpuPointer) T(std::forward(args)...); } +} // namespace impl - +//template template -CHAI_HOST Tp* msp_make_on_device(Args... args) { +CHAI_INLINE +CHAI_HOST Tp* msp_make_on_device(Args&&... args) { + std::cout << "msp_make_on_device\n"; + Tp* gpu_ptr = nullptr; chai::SharedPtrManager* sptr_manager = chai::SharedPtrManager::getInstance(); auto gpu_allocator = sptr_manager->getAllocator(chai::GPU); - Tp* gpu_ptr = static_cast( gpu_allocator.allocate(1*sizeof(Tp)) ); + gpu_ptr = static_cast( gpu_allocator.allocate(1*sizeof(Tp)) ); - msp_make_on_device<<<1,1>>>(gpu_ptr, args...); + impl::msp_make_on_device<<<1,1>>>(gpu_ptr, std::forward(args)...); return gpu_ptr; } template -CHAI_HOST Tp* msp_make_on_host(Args... args) { +CHAI_INLINE +CHAI_HOST Tp* msp_make_on_host(Args&&... args) { + std::cout << "msp_make_on_host\n"; chai::SharedPtrManager* sptr_manager = chai::SharedPtrManager::getInstance(); auto cpu_allocator = sptr_manager->getAllocator(chai::CPU); Tp* cpu_ptr = static_cast( cpu_allocator.allocate(1*sizeof(Tp)) ); - new (cpu_ptr) Tp{args...}; + new (cpu_ptr) Tp{std::forward(args)...}; return cpu_ptr; } +} // namespace detail + template -ManagedSharedPtr make_shared(Args... args) { - Tp* gpu_pointer = msp_make_on_device(); - //Tp* gpu_pointer = msp_make_on_device(args...); - Tp* cpu_pointer = msp_make_on_host(args...); +CHAI_INLINE +CHAI_HOST +ManagedSharedPtr make_shared(Args&&... 
args) { + using Tp_non_const = std::remove_const_t; + std::cout << "make_shared\n"; + + Tp* cpu_pointer = detail::msp_make_on_host(std::forward(args)...); + //Tp* gpu_pointer = detail::msp_make_on_device(std::forward(args)...); std::cout << "CPU pointer @ " << cpu_pointer << std::endl; + + Tp* gpu_pointer = detail::msp_make_on_device(); std::cout << "GPU pointer @ " << gpu_pointer << std::endl; - return ManagedSharedPtr(cpu_pointer, gpu_pointer, [](Tp* p){delete p;}); + cudaDeviceSynchronize(); + + auto result = ManagedSharedPtr(cpu_pointer, gpu_pointer, [](Tp* p){delete p;}); + + if (!is_CHAICopyable::value) { + result.move(chai::GPU, false); + result.move(chai::CPU, false); + } + + std::cout << "End of make_shared\n"; + return result; } template +CHAI_INLINE +CHAI_HOST ManagedSharedPtr make_shared_deleter(Args... args, Deleter d) { - Tp* gpu_pointer = msp_make_on_device(); - //Tp* gpu_pointer = msp_make_on_device(args...); - Tp* cpu_pointer = msp_make_on_host(args...); + Tp* gpu_pointer = detail::msp_make_on_device(); + Tp* cpu_pointer = detail::msp_make_on_host(std::forward(args)...); + //Tp* gpu_pointer = detail::msp_make_on_device(std::forward(args)...); std::cout << "CPU pointer @ " << cpu_pointer << std::endl; std::cout << "GPU pointer @ " << gpu_pointer << std::endl; return ManagedSharedPtr(cpu_pointer, gpu_pointer, std::move(d)); diff --git a/src/chai/SharedPtrCounter.hpp b/src/chai/SharedPtrCounter.hpp index 065ff53d..82ee809f 100644 --- a/src/chai/SharedPtrCounter.hpp +++ b/src/chai/SharedPtrCounter.hpp @@ -60,7 +60,7 @@ class msp_counted_ptr final : public msp_counted_base { using T = std::remove_pointer_t; Ptr host_ptr = (Ptr) m_record->m_pointers[CPU]; // trigger the copy constructor - std::cout << "Trigger Inner Copy Ctor\n"; + std::cout << "Trigger Inner Copy Ctor @ " << host_ptr << std::endl; T inner = T(*host_ptr); // ensure the inner type gets the state of the result of the copy host_ptr->operator=(inner); @@ -75,6 +75,9 @@ class msp_counted_ptr 
final : public msp_counted_base { #include +//template +//void err_func(T arg){ static_assert(false); } + template class msp_counted_deleter final : public msp_counted_base { @@ -99,12 +102,16 @@ class msp_counted_deleter final : public msp_counted_base { virtual void m_destroy() noexcept { this->~msp_counted_deleter(); } virtual void moveInnerImpl() { - using T = std::remove_pointer_t; - Ptr host_ptr = (Ptr) m_impl.m_record->m_pointers[CPU]; + //using T = std::remove_cv_t; + using T_non_const = std::remove_const_t>; + + T_non_const* host_ptr = const_cast((Ptr)m_impl.m_record->m_pointers[CPU]); // trigger the copy constructor - std::cout << "Trigger Inner Copy Ctor\n"; - T inner = T(*host_ptr); + std::cout << "Trigger Inner Copy Ctor @ " << host_ptr << std::endl; + T_non_const inner = T_non_const(*host_ptr); + // ensure the inner type gets the state of the result of the copy + //err_func(host_ptr); host_ptr->operator=(inner); } diff --git a/src/chai/SharedPtrManager.cpp b/src/chai/SharedPtrManager.cpp index a7685d9b..9b02fe18 100644 --- a/src/chai/SharedPtrManager.cpp +++ b/src/chai/SharedPtrManager.cpp @@ -446,11 +446,14 @@ msp_pointer_record* SharedPtrManager::getPointerRecord(void* pointer) return record->second ? 
*record->second : &s_null_record; } -msp_pointer_record* SharedPtrManager::makeSharedPtrRecord(void* pointer, void* d_pointer, +msp_pointer_record* SharedPtrManager::makeSharedPtrRecord(void const* c_pointer, void const* c_d_pointer, size_t size, //ExecutionSpace space, bool owned) { + void* pointer = const_cast(c_pointer); + void* d_pointer = const_cast(c_d_pointer); + if (pointer == nullptr) { return &s_null_record ; } diff --git a/src/chai/SharedPtrManager.hpp b/src/chai/SharedPtrManager.hpp index ee0c5c92..2e8bdb9c 100644 --- a/src/chai/SharedPtrManager.hpp +++ b/src/chai/SharedPtrManager.hpp @@ -264,7 +264,7 @@ class SharedPtrManager */ //CHAISHAREDDLL_API size_t getSize(void* pointer); - CHAISHAREDDLL_API msp_pointer_record* makeSharedPtrRecord(void* pointer, void* d_pointer, + CHAISHAREDDLL_API msp_pointer_record* makeSharedPtrRecord(void const* c_pointer, void const* c_d_pointer, size_t size, //ExecutionSpace space, bool owned); diff --git a/tests/integration/polymorphism_tests.cpp b/tests/integration/polymorphism_tests.cpp index 5abbeed9..f1f3ea1c 100644 --- a/tests/integration/polymorphism_tests.cpp +++ b/tests/integration/polymorphism_tests.cpp @@ -4,6 +4,7 @@ // // SPDX-License-Identifier: BSD-3-Clause ////////////////////////////////////////////////////////////////////////////// +#include #include "camp/defines.hpp" #include "chai/ChaiMacros.hpp" #include "chai/ExecutionSpaces.hpp" @@ -85,7 +86,7 @@ class C public: CHAI_HOST_DEVICE C(void) { printf("++ C has been constructed\n"); } CHAI_HOST_DEVICE ~C(void) { printf("-- C has been destructed\n"); } - CHAI_HOST_DEVICE virtual void function(void) = 0; + CHAI_HOST_DEVICE virtual void function(void) const = 0; }; class D : public C @@ -94,7 +95,7 @@ class D : public C unsigned long long content_D; CHAI_HOST_DEVICE D(void) : content_D(0xDDDDDDDDDDDDDDDDull) { printf("++ D has been constructed\n"); } CHAI_HOST_DEVICE ~D(void) { printf("-- D has been destructed\n"); } - CHAI_HOST_DEVICE virtual void 
function(void) { printf("%lX\n", content_D); } + CHAI_HOST_DEVICE virtual void function(void) const { printf("%lX\n", content_D); } }; @@ -133,20 +134,20 @@ class AAbsMem : public chai::CHAICopyable { public: unsigned long long content_A; - chai::ManagedSharedPtr base_member; + chai::ManagedSharedPtr base_member; + CHAI_HOST_DEVICE AAbsMem(void) : content_A(0xAAAAAAAAAAAAAAAAull) { printf("++ A has been constructed\n"); } + + //template::value>::type > template - CHAI_HOST_DEVICE AAbsMem(chai::ManagedSharedPtr const& base_val) - : base_member(base_val) + CHAI_HOST AAbsMem(Derived const& base_val) + : base_member(chai::make_shared(base_val)) , content_A(0xAAAAAAAAAAAAAAAAull) { printf("++ A has been constructed\n"); } - //CHAI_HOST_DEVICE AAbsMem(AAbsMem const& rhs) - // : base_member(rhs.base_member) - // , content_A(rhs.content_A) - //{ printf("AAbsMem CopyCtor\n"); } + CHAI_HOST_DEVICE ~AAbsMem(void) { printf("-- A has been destructed\n"); } - CHAI_HOST_DEVICE virtual void function(void) = 0; - CHAI_HOST_DEVICE virtual void d_function(void) = 0; + CHAI_HOST_DEVICE virtual void function(void) const = 0; + CHAI_HOST_DEVICE virtual void d_function(void) const = 0; CHAI_HOST_DEVICE virtual void set_content(unsigned long long) = 0; }; @@ -155,28 +156,22 @@ class BAbsMem : public AAbsMem public: unsigned long long content_B; - CHAI_HOST_DEVICE BAbsMem() : AAbsMem() {}; + CHAI_HOST_DEVICE BAbsMem() : AAbsMem() + { + printf("++ B has been constructed\n"); + } template - CHAI_HOST BAbsMem(chai::ManagedSharedPtr const& base_val) + CHAI_HOST BAbsMem(Derived const& base_val) : AAbsMem(base_val) , content_B(0xBBBBBBBBBBBBBBBBull) { printf("++ B has been constructed\n"); } - //CHAI_HOST_DEVICE BAbsMem(BAbsMem const& rhs) - // : AAbsMem(rhs) - // , content_B(rhs.content_B) - //{ printf("BAbsMem CopyCtor\n"); } - CHAI_HOST_DEVICE ~BAbsMem(void) { printf("-- B has been destructed\n"); } - CHAI_HOST_DEVICE virtual void function(void) override { printf("%lX\n", content_B); } - 
CHAI_HOST_DEVICE virtual void d_function(void) override { - printf("base_member @ %p\n", &base_member); - printf("base_member.get() @ %p\n", base_member.get()); - base_member->function(); - } + CHAI_HOST_DEVICE virtual void function(void) const override { printf("%lX\n", content_B); } + CHAI_HOST_DEVICE virtual void d_function(void) const override { base_member->function(); } CHAI_HOST_DEVICE virtual void set_content(unsigned long long val) override { content_B = val; content_A = val; } }; @@ -190,78 +185,45 @@ GPU_TEST(managed_ptr, shared_ptr_absmem) std::cout << "size of (DerivedT) : " << sizeof(DerivedT) << std::endl; std::cout << "size of (BaseT) : " << sizeof(BaseT) << std::endl; - auto d = chai::make_shared(); - GPU_ERROR_CHECK( cudaPeekAtLastError() ); - GPU_ERROR_CHECK( cudaDeviceSynchronize() ); + //chai::ManagedSharedPtr sptr = chai::make_shared(D{}); + D d; + //DerivedT der(d); + chai::ManagedSharedPtr sptr = chai::make_shared(d); + //chai::ManagedSharedPtr sptr = chai::make_shared(d); - chai::ManagedSharedPtr sptr = chai::make_shared(d); GPU_ERROR_CHECK( cudaPeekAtLastError() ); GPU_ERROR_CHECK( cudaDeviceSynchronize() ); - std::cout << "Map Sz : " << chai::SharedPtrManager::getInstance()->getPointerMap().size() << std::endl; - - chai::ManagedSharedPtr sptr2 = sptr; - std::cout << "use_count : " << sptr.use_count() << std::endl; + chai::ManagedSharedPtr sptr2 = sptr; sptr2->function(); sptr2->d_function(); - auto mem_ptr = sptr2->base_member; - mem_ptr->function(); - std::cout << "GPU CALL...\n"; forall(gpu(), 0, 1, [=] __device__ (int i) { printf("GPU Body\n"); sptr2->function(); sptr2->d_function(); }); + + //sptr2.registerTouch(chai::GPU); + + GPU_ERROR_CHECK( cudaPeekAtLastError() ); GPU_ERROR_CHECK( cudaDeviceSynchronize() ); + std::cout << "CPU CALL...\n"; forall(sequential(), 0, 1, [=] (int i) { printf("CPU Body\n"); - sptr->set_content(0xFFFFFFFFFFFFFFFFull); - sptr->function(); - sptr->d_function(); + //sptr->set_content(0xFFFFFFFFFFFFFFFFull); 
+ sptr2->function(); + sptr2->d_function(); }); - std::cout << "GPU CALL...\n"; - forall(gpu(), 0, 1, [=] __device__ (int i) { - printf("GPU Body\n"); - sptr->function(); - sptr->d_function(); - }); - GPU_ERROR_CHECK( cudaPeekAtLastError() ); - GPU_ERROR_CHECK( cudaDeviceSynchronize() ); - - } -} - -GPU_TEST(managed_ptr, shared_ptr) -{ - { - using DerivedT = B; - using BaseT = A; - - std::cout << "size of (DerivedT) : " << sizeof(DerivedT) << std::endl; - std::cout << "size of (BaseT) : " << sizeof(BaseT) << std::endl; - - chai::ManagedSharedPtr sptr = chai::make_shared_deleter( - [](DerivedT* p){ printf("Custom Deleter Call\n"); p->~DerivedT(); }); - - std::cout << "Map Sz : " << chai::SharedPtrManager::getInstance()->getPointerMap().size() << std::endl; - - chai::ManagedSharedPtr sptr2 = sptr; - std::cout << "use_count : " << sptr.use_count() << std::endl; - - sptr->set_content(0xFFFFFFFFFFFFFFFFull); - std::cout << "GPU CALL...\n"; forall(gpu(), 0, 1, [=] __device__ (int i) { printf("GPU Body\n"); sptr2->function(); sptr2->d_function(); - - //results[i] = rawArrayClass->getValue(i); }); GPU_ERROR_CHECK( cudaPeekAtLastError() ); GPU_ERROR_CHECK( cudaDeviceSynchronize() ); @@ -269,301 +231,334 @@ GPU_TEST(managed_ptr, shared_ptr) } } - - -GPU_TEST(managed_ptr, shared_ptralloc) -{ - - { - - using DerivedT = B; - using BaseT = A; - - chai::SharedPtrManager* sptr_manager = chai::SharedPtrManager::getInstance(); - umpire::ResourceManager& res_manager = umpire::ResourceManager::getInstance(); - - auto cpu_allocator = sptr_manager->getAllocator(chai::CPU); - BaseT* cpu_ptr = static_cast( cpu_allocator.allocate(1*sizeof(DerivedT)) ); - - new(cpu_ptr) DerivedT(); - BaseT* gpu_ptr = chai::msp_make_on_device(); - - - auto record = sptr_manager->makeSharedPtrRecord(cpu_ptr, gpu_ptr, sizeof(DerivedT), true); - - forall(gpu(), 0, 1, [=] __device__ (int i) { - printf("GPU Body\n"); - gpu_ptr->function(); - gpu_ptr->d_function(); - }); - - std::cout << "Ump alloc cpu : " << 
cpu_ptr << std::endl; - std::cout << "Ump alloc gpu : " << gpu_ptr << std::endl; - std::cout << "Map Sz : " << chai::SharedPtrManager::getInstance()->getPointerMap().size() << std::endl; - - cpu_ptr->set_content(0xFFFFFFFFFFFFFFFFull); - - camp::resources::Resource device_resource(camp::resources::Cuda::get_default()); - res_manager.copy_poly(gpu_ptr, cpu_ptr, device_resource); - - //unsigned int offset = sizeof(void*); - //GPU_ERROR_CHECK(cudaMemcpy((char*)gpu_ptr+offset, (char*)cpu_ptr+offset, sizeof(DerivedT)-offset, cudaMemcpyHostToDevice)); - //GPU_ERROR_CHECK(cudaMemcpy((char*)gpu_ptr, (char*)cpu_ptr, sizeof(DerivedT), cudaMemcpyHostToDevice)); - - forall(gpu(), 0, 1, [=] __device__ (int i) { - printf("GPU Body\n"); - gpu_ptr->function(); - gpu_ptr->d_function(); - }); - GPU_ERROR_CHECK( cudaPeekAtLastError() ); - GPU_ERROR_CHECK( cudaDeviceSynchronize() ); - - } - //assert_empty_map(); -} - -GPU_TEST(managed_ptr, polycpytest) -{ - - // Assign 32 byte block of memory to 0x11 on the Host - unsigned char* memory1 = (unsigned char*)malloc(56*sizeof(unsigned char)); - memset(memory1, 0x11, 56 * sizeof(unsigned char)); - CPU_PRINT_MEMORY(memory1, "1 : before placement new") - - - // Assign 32 byte block of memory to 0x22 on the Device - unsigned char* memory2; cudaMalloc((void**)&memory2, 56*sizeof(unsigned char)); - forall(gpu(), 0, 56, [=] __device__ (int i) { memory2[i] = 0x22; }); - GPU_PRINT_MEMORY(memory2, "2 : before placement new") - - - // Placement New Polymorphic object on the Host. - B* b_ptr1 = new (memory1) B; - CPU_PRINT_MEMORY(memory1, "1 : after placement new"); - - - // Placement New Polymorphic object on the Device. - B* b_ptr2 = reinterpret_cast(memory2); - A* base2 = b_ptr2; - forall(gpu(), 0, 1, [=] __device__ (int i) { new(b_ptr2) B();}); - GPU_PRINT_MEMORY(memory2, "2 : after placement new"); - - - // B was constructed on the Device so we can call virtual - // function on the GPU from a host pointer. 
- printf("Calling virtual function from Base pointer on GPU.\n"); - forall(gpu(), 0, 1, [=] __device__ (int i) { base2->function(); }); - GPU_ERROR_CHECK( cudaPeekAtLastError() ); - GPU_ERROR_CHECK( cudaDeviceSynchronize() ); - - - - // Lets edit the Data on the Host... - b_ptr1->content_B = 0xCBCBCBCBCBCBCBCBull; - CPU_PRINT_MEMORY(memory1, "1 : after content change"); - - // Copying Data from Host to Device -#define OFFSET_CPY -#if !defined(OFFSET_CPY) - GPU_ERROR_CHECK(cudaMemcpy(b_ptr2, b_ptr1, sizeof(B), cudaMemcpyHostToDevice)); -#else - // We nee to skip over the Vtable and try to only copy the contents of the - // object itself. - unsigned int offset = sizeof(void*); - char* off_b_ptr2 = (char*)b_ptr2 + offset; - char* off_b_ptr1 = (char*)b_ptr1 + offset; - int off_size = sizeof(B) - offset; - - GPU_ERROR_CHECK(cudaMemcpy(off_b_ptr2, off_b_ptr1, off_size, cudaMemcpyHostToDevice)); - //// This will not work as we need to do pointer arithmatic at the byte level... - //GPU_ERROR_CHECK(cudaMemcpy(b_ptr2 + offset, b_ptr1 + offset, sizeof(B) - offset, cudaMemcpyHostToDevice)); -#endif - GPU_PRINT_MEMORY(memory2, "2 : after copy from host"); - - // Try to call virtual funciton on GPU like we did before. - printf("Calling virtual function from Base pointer on GPU.\n"); - forall(gpu(), 0, 1, [=] __device__ (int i) { base2->function(); }); - GPU_ERROR_CHECK( cudaPeekAtLastError() ); - GPU_ERROR_CHECK( cudaDeviceSynchronize() ); - - - - // Lets edit the Data on the Device... 
- forall(gpu(), 0, 1, [=] __device__ (int i) { - b_ptr2->content_B = 0xDBDBDBDBDBDBDBDBull; - b_ptr2->content_A = 0xDADADADADADADADAull; }); - GPU_PRINT_MEMORY(memory2, "2 : after content change"); - - -#if !defined(OFFSET_CPY) - GPU_ERROR_CHECK(cudaMemcpy(b_ptr1, b_ptr2, sizeof(B), cudaMemcpyDeviceToHost)); -#else - GPU_ERROR_CHECK(cudaMemcpy((char*)b_ptr1 + offset, (char*)b_ptr2 + offset, sizeof(B) - offset, cudaMemcpyDeviceToHost)); -#endif - CPU_PRINT_MEMORY(memory1, "1 : after copy from host"); - - // Free up memory, we useed placement new so we need to call the destructor first... - reinterpret_cast(memory1)->~B(); - forall(gpu(), 0, 1, [=] __device__ (int i) { reinterpret_cast(memory2)->~B(); }); - cudaFree(memory2); - -} - - - - - - - - - -struct Base_vtable { - void (*doSomething)(void* this_); - void (*setContents)(void* this_, unsigned long long val); -}; - -template -Base_vtable const Base_vtable_for_host = { - [] __host__ __device__ (void* this_){ static_cast(this_)->doSomething(); } - ,[] __host__ __device__ (void* this_, unsigned long long val){ static_cast(this_)->setContents(val); } -}; - -template -__global__ -void Base_vtable_for_device(Base_vtable* vptr_) { - new(vptr_) Base_vtable{[] __host__ __device__ (void* this_){ static_cast(this_)->doSomething(); } - ,[] __host__ __device__ (void* this_, unsigned long long val){ static_cast(this_)->setContents(val); } - }; -}; - - - -//----------------------------------------------------------------------------- - -#if !defined(CHAI_DEVICE_COMPILE) -#define CHAI_POLY_VIRTUAL_CALL(name) \ - return vtbl_host_->name((void*) ptr_host_); -#define CHAI_POLY_VIRTUAL_CALL_ARGS(name, ...) \ - return vtbl_host_->name((void*) ptr_host_, __VA_ARGS__); -#else -#define CHAI_POLY_VIRTUAL_CALL(name) \ - return vtbl_device_->name((void*) ptr_device_); -#define CHAI_POLY_VIRTUAL_CALL_ARGS(name, ...) 
\ - return vtbl_device_->name((void*) ptr_device_, __VA_ARGS__); -#endif - -template -Base_vtable* make_Base_vtable_on_device() { - Base_vtable* vptr_; - cudaMalloc((void**)&vptr_, sizeof(Base_vtable)); - Base_vtable_for_device <<<1,1>>>(vptr_); - return vptr_; -} - -struct CHAIPolyInterface { - - template - CHAIPolyInterface(Any base) - { - vtbl_host_ = &Base_vtable_for_host; - ptr_host_ = new Any{base}; - - vtbl_device_ = make_Base_vtable_on_device(); - cudaMalloc(&ptr_device_, sizeof(Any)); - - obj_size_ = sizeof(Any); - } - - void move(chai::ExecutionSpace space) - { - if (space == chai::CPU) cudaMemcpy(ptr_host_, ptr_device_, obj_size_, cudaMemcpyDeviceToHost); - if (space == chai::GPU) cudaMemcpy(ptr_device_, ptr_host_, obj_size_, cudaMemcpyHostToDevice); - } - -protected: - Base_vtable const* vtbl_host_; - Base_vtable* vtbl_device_; - //void* ptr_; - void* ptr_host_; - void* ptr_device_; - - long obj_size_; - -}; - -//----------------------------------------------------------------------------- - -#include -#include - - -struct Base: CHAIPolyInterface { - using Poly = CHAIPolyInterface; - - template - Base(Any base) : Poly(base) {}; - - CHAI_HOST_DEVICE void doSomething() const { CHAI_POLY_VIRTUAL_CALL(doSomething) } - CHAI_HOST_DEVICE void setContents(unsigned long long val) const { CHAI_POLY_VIRTUAL_CALL_ARGS(setContents, val) } -}; - - -struct DerivedA { - CHAI_HOST_DEVICE void doSomething() { printf("DerivedA: doSomething\n"); } - CHAI_HOST_DEVICE void setContents(unsigned long long) {} -}; - -struct DerivedB { - DerivedB() : content(0xDDDDDDDDDDDDDDDDull) {}; - - void doBthing() { printf("concrete B thing"); } - - CHAI_HOST_DEVICE void doSomething() - { - printf("DerivedB: doSomething\n"); - printf("%lX\n", content); - } - CHAI_HOST_DEVICE void setContents(unsigned long long val) { content = val; } - unsigned long long content; -}; - - -GPU_TEST(managed_ptr, customvtabletest) { - - Base b = Base(DerivedA{}); - Base b2 = Base(DerivedB{}); - - 
b.doSomething(); - b2.doSomething(); - - b.move(chai::GPU); - b2.move(chai::GPU); - - BEGIN_EXEC_ON_DEVICE() - printf("-- GPU Kernel begin\n"); - b.doSomething(); - b2.doSomething(); - printf("-- GPU Kernel end\n"); - END_EXEC() - - - b2.setContents(0xCCCCCCCCCCCCCCCCull); - b2.move(chai::GPU); - - BEGIN_EXEC_ON_DEVICE() - printf("-- GPU Kernel begin\n"); - b.doSomething(); - b2.doSomething(); - b2.setContents(0xBBBBBBBBBBBBBBBBull); - printf("-- GPU Kernel end\n"); - END_EXEC() - - b2.move(chai::CPU); - b2.doSomething(); - - - - - -} +//GPU_TEST(managed_ptr, shared_ptr) +//{ +// { +// using DerivedT = B; +// using BaseT = A; +// +// std::cout << "size of (DerivedT) : " << sizeof(DerivedT) << std::endl; +// std::cout << "size of (BaseT) : " << sizeof(BaseT) << std::endl; +// +// chai::ManagedSharedPtr sptr = chai::make_shared_deleter( +// [](DerivedT* p){ printf("Custom Deleter Call\n"); p->~DerivedT(); }); +// +// std::cout << "Map Sz : " << chai::SharedPtrManager::getInstance()->getPointerMap().size() << std::endl; +// +// chai::ManagedSharedPtr sptr2 = sptr; +// std::cout << "use_count : " << sptr.use_count() << std::endl; +// +// sptr->set_content(0xFFFFFFFFFFFFFFFFull); +// +// std::cout << "GPU CALL...\n"; +// forall(gpu(), 0, 1, [=] __device__ (int i) { +// printf("GPU Body\n"); +// sptr2->function(); +// sptr2->d_function(); +// +// //results[i] = rawArrayClass->getValue(i); +// }); +// GPU_ERROR_CHECK( cudaPeekAtLastError() ); +// GPU_ERROR_CHECK( cudaDeviceSynchronize() ); +// +// } +//} +// +// +// +//GPU_TEST(managed_ptr, shared_ptralloc) +//{ +// +// { +// +// using DerivedT = B; +// using BaseT = A; +// +// chai::SharedPtrManager* sptr_manager = chai::SharedPtrManager::getInstance(); +// umpire::ResourceManager& res_manager = umpire::ResourceManager::getInstance(); +// +// auto cpu_allocator = sptr_manager->getAllocator(chai::CPU); +// BaseT* cpu_ptr = static_cast( cpu_allocator.allocate(1*sizeof(DerivedT)) ); +// +// new(cpu_ptr) DerivedT(); +// 
BaseT* gpu_ptr = chai::msp_make_on_device(); +// +// +// auto record = sptr_manager->makeSharedPtrRecord(cpu_ptr, gpu_ptr, sizeof(DerivedT), true); +// +// forall(gpu(), 0, 1, [=] __device__ (int i) { +// printf("GPU Body\n"); +// gpu_ptr->function(); +// gpu_ptr->d_function(); +// }); +// +// std::cout << "Ump alloc cpu : " << cpu_ptr << std::endl; +// std::cout << "Ump alloc gpu : " << gpu_ptr << std::endl; +// std::cout << "Map Sz : " << chai::SharedPtrManager::getInstance()->getPointerMap().size() << std::endl; +// +// cpu_ptr->set_content(0xFFFFFFFFFFFFFFFFull); +// +// camp::resources::Resource device_resource(camp::resources::Cuda::get_default()); +// res_manager.copy_poly(gpu_ptr, cpu_ptr, device_resource); +// +// //unsigned int offset = sizeof(void*); +// //GPU_ERROR_CHECK(cudaMemcpy((char*)gpu_ptr+offset, (char*)cpu_ptr+offset, sizeof(DerivedT)-offset, cudaMemcpyHostToDevice)); +// //GPU_ERROR_CHECK(cudaMemcpy((char*)gpu_ptr, (char*)cpu_ptr, sizeof(DerivedT), cudaMemcpyHostToDevice)); +// +// forall(gpu(), 0, 1, [=] __device__ (int i) { +// printf("GPU Body\n"); +// gpu_ptr->function(); +// gpu_ptr->d_function(); +// }); +// GPU_ERROR_CHECK( cudaPeekAtLastError() ); +// GPU_ERROR_CHECK( cudaDeviceSynchronize() ); +// +// } +// //assert_empty_map(); +//} +// +//GPU_TEST(managed_ptr, polycpytest) +//{ +// +// // Assign 32 byte block of memory to 0x11 on the Host +// unsigned char* memory1 = (unsigned char*)malloc(56*sizeof(unsigned char)); +// memset(memory1, 0x11, 56 * sizeof(unsigned char)); +// CPU_PRINT_MEMORY(memory1, "1 : before placement new") +// +// +// // Assign 32 byte block of memory to 0x22 on the Device +// unsigned char* memory2; cudaMalloc((void**)&memory2, 56*sizeof(unsigned char)); +// forall(gpu(), 0, 56, [=] __device__ (int i) { memory2[i] = 0x22; }); +// GPU_PRINT_MEMORY(memory2, "2 : before placement new") +// +// +// // Placement New Polymorphic object on the Host. 
+// B* b_ptr1 = new (memory1) B; +// CPU_PRINT_MEMORY(memory1, "1 : after placement new"); +// +// +// // Placement New Polymorphic object on the Device. +// B* b_ptr2 = reinterpret_cast(memory2); +// A* base2 = b_ptr2; +// forall(gpu(), 0, 1, [=] __device__ (int i) { new(b_ptr2) B();}); +// GPU_PRINT_MEMORY(memory2, "2 : after placement new"); +// +// +// // B was constructed on the Device so we can call virtual +// // function on the GPU from a host pointer. +// printf("Calling virtual function from Base pointer on GPU.\n"); +// forall(gpu(), 0, 1, [=] __device__ (int i) { base2->function(); }); +// GPU_ERROR_CHECK( cudaPeekAtLastError() ); +// GPU_ERROR_CHECK( cudaDeviceSynchronize() ); +// +// +// +// // Lets edit the Data on the Host... +// b_ptr1->content_B = 0xCBCBCBCBCBCBCBCBull; +// CPU_PRINT_MEMORY(memory1, "1 : after content change"); +// +// // Copying Data from Host to Device +//#define OFFSET_CPY +//#if !defined(OFFSET_CPY) +// GPU_ERROR_CHECK(cudaMemcpy(b_ptr2, b_ptr1, sizeof(B), cudaMemcpyHostToDevice)); +//#else +// // We nee to skip over the Vtable and try to only copy the contents of the +// // object itself. +// unsigned int offset = sizeof(void*); +// char* off_b_ptr2 = (char*)b_ptr2 + offset; +// char* off_b_ptr1 = (char*)b_ptr1 + offset; +// int off_size = sizeof(B) - offset; +// +// GPU_ERROR_CHECK(cudaMemcpy(off_b_ptr2, off_b_ptr1, off_size, cudaMemcpyHostToDevice)); +// //// This will not work as we need to do pointer arithmatic at the byte level... +// //GPU_ERROR_CHECK(cudaMemcpy(b_ptr2 + offset, b_ptr1 + offset, sizeof(B) - offset, cudaMemcpyHostToDevice)); +//#endif +// GPU_PRINT_MEMORY(memory2, "2 : after copy from host"); +// +// // Try to call virtual funciton on GPU like we did before. 
+// printf("Calling virtual function from Base pointer on GPU.\n"); +// forall(gpu(), 0, 1, [=] __device__ (int i) { base2->function(); }); +// GPU_ERROR_CHECK( cudaPeekAtLastError() ); +// GPU_ERROR_CHECK( cudaDeviceSynchronize() ); +// +// +// +// // Lets edit the Data on the Device... +// forall(gpu(), 0, 1, [=] __device__ (int i) { +// b_ptr2->content_B = 0xDBDBDBDBDBDBDBDBull; +// b_ptr2->content_A = 0xDADADADADADADADAull; }); +// GPU_PRINT_MEMORY(memory2, "2 : after content change"); +// +// +//#if !defined(OFFSET_CPY) +// GPU_ERROR_CHECK(cudaMemcpy(b_ptr1, b_ptr2, sizeof(B), cudaMemcpyDeviceToHost)); +//#else +// GPU_ERROR_CHECK(cudaMemcpy((char*)b_ptr1 + offset, (char*)b_ptr2 + offset, sizeof(B) - offset, cudaMemcpyDeviceToHost)); +//#endif +// CPU_PRINT_MEMORY(memory1, "1 : after copy from host"); +// +// // Free up memory, we useed placement new so we need to call the destructor first... +// reinterpret_cast(memory1)->~B(); +// forall(gpu(), 0, 1, [=] __device__ (int i) { reinterpret_cast(memory2)->~B(); }); +// cudaFree(memory2); +// +//} +// +// +// +// +// +// +// +// +// +//struct Base_vtable { +// void (*doSomething)(void* this_); +// void (*setContents)(void* this_, unsigned long long val); +//}; +// +//template +//Base_vtable const Base_vtable_for_host = { +// [] __host__ __device__ (void* this_){ static_cast(this_)->doSomething(); } +// ,[] __host__ __device__ (void* this_, unsigned long long val){ static_cast(this_)->setContents(val); } +//}; +// +//template +//__global__ +//void Base_vtable_for_device(Base_vtable* vptr_) { +// new(vptr_) Base_vtable{[] __host__ __device__ (void* this_){ static_cast(this_)->doSomething(); } +// ,[] __host__ __device__ (void* this_, unsigned long long val){ static_cast(this_)->setContents(val); } +// }; +//}; +// +// +// +////----------------------------------------------------------------------------- +// +//#if !defined(CHAI_DEVICE_COMPILE) +//#define CHAI_POLY_VIRTUAL_CALL(name) \ +// return 
vtbl_host_->name((void*) ptr_host_); +//#define CHAI_POLY_VIRTUAL_CALL_ARGS(name, ...) \ +// return vtbl_host_->name((void*) ptr_host_, __VA_ARGS__); +//#else +//#define CHAI_POLY_VIRTUAL_CALL(name) \ +// return vtbl_device_->name((void*) ptr_device_); +//#define CHAI_POLY_VIRTUAL_CALL_ARGS(name, ...) \ +// return vtbl_device_->name((void*) ptr_device_, __VA_ARGS__); +//#endif +// +//template +//Base_vtable* make_Base_vtable_on_device() { +// Base_vtable* vptr_; +// cudaMalloc((void**)&vptr_, sizeof(Base_vtable)); +// Base_vtable_for_device <<<1,1>>>(vptr_); +// return vptr_; +//} +// +//struct CHAIPolyInterface { +// +// template +// CHAIPolyInterface(Any base) +// { +// vtbl_host_ = &Base_vtable_for_host; +// ptr_host_ = new Any{base}; +// +// vtbl_device_ = make_Base_vtable_on_device(); +// cudaMalloc(&ptr_device_, sizeof(Any)); +// +// obj_size_ = sizeof(Any); +// } +// +// void move(chai::ExecutionSpace space) +// { +// if (space == chai::CPU) cudaMemcpy(ptr_host_, ptr_device_, obj_size_, cudaMemcpyDeviceToHost); +// if (space == chai::GPU) cudaMemcpy(ptr_device_, ptr_host_, obj_size_, cudaMemcpyHostToDevice); +// } +// +//protected: +// Base_vtable const* vtbl_host_; +// Base_vtable* vtbl_device_; +// //void* ptr_; +// void* ptr_host_; +// void* ptr_device_; +// +// long obj_size_; +// +//}; +// +////----------------------------------------------------------------------------- +// +//#include +//#include +// +// +//struct Base: CHAIPolyInterface { +// using Poly = CHAIPolyInterface; +// +// template +// Base(Any base) : Poly(base) {}; +// +// CHAI_HOST_DEVICE void doSomething() const { CHAI_POLY_VIRTUAL_CALL(doSomething) } +// CHAI_HOST_DEVICE void setContents(unsigned long long val) const { CHAI_POLY_VIRTUAL_CALL_ARGS(setContents, val) } +//}; +// +// +//struct DerivedA { +// CHAI_HOST_DEVICE void doSomething() { printf("DerivedA: doSomething\n"); } +// CHAI_HOST_DEVICE void setContents(unsigned long long) {} +//}; +// +//struct DerivedB { +// DerivedB() : 
content(0xDDDDDDDDDDDDDDDDull) {}; +// +// void doBthing() { printf("concrete B thing"); } +// +// CHAI_HOST_DEVICE void doSomething() +// { +// printf("DerivedB: doSomething\n"); +// printf("%lX\n", content); +// } +// CHAI_HOST_DEVICE void setContents(unsigned long long val) { content = val; } +// unsigned long long content; +//}; +// +// +//GPU_TEST(managed_ptr, customvtabletest) { +// +// Base b = Base(DerivedA{}); +// Base b2 = Base(DerivedB{}); +// +// b.doSomething(); +// b2.doSomething(); +// +// b.move(chai::GPU); +// b2.move(chai::GPU); +// +// BEGIN_EXEC_ON_DEVICE() +// printf("-- GPU Kernel begin\n"); +// b.doSomething(); +// b2.doSomething(); +// printf("-- GPU Kernel end\n"); +// END_EXEC() +// +// +// b2.setContents(0xCCCCCCCCCCCCCCCCull); +// b2.move(chai::GPU); +// +// BEGIN_EXEC_ON_DEVICE() +// printf("-- GPU Kernel begin\n"); +// b.doSomething(); +// b2.doSomething(); +// b2.setContents(0xBBBBBBBBBBBBBBBBull); +// printf("-- GPU Kernel end\n"); +// END_EXEC() +// +// b2.move(chai::CPU); +// b2.doSomething(); +// +// +// +// +// +//} From acf00adc46bcdc4d8fa203dcf4ebcce61a4b34c4 Mon Sep 17 00:00:00 2001 From: mdavis36 Date: Wed, 8 May 2024 20:21:18 -0700 Subject: [PATCH 10/44] Proper destruction on host & device; Correct deallocation w/ Umpire; Non CHAICopyable const type test. --- src/chai/ManagedSharedPtr.hpp | 18 +++- src/chai/SharedPtrCounter.hpp | 21 ++++- src/chai/SharedPtrManager.cpp | 1 + tests/integration/polymorphism_tests.cpp | 102 +++++++++++++---------- 4 files changed, 88 insertions(+), 54 deletions(-) diff --git a/src/chai/ManagedSharedPtr.hpp b/src/chai/ManagedSharedPtr.hpp index 0067c985..4d9cffbe 100644 --- a/src/chai/ManagedSharedPtr.hpp +++ b/src/chai/ManagedSharedPtr.hpp @@ -199,12 +199,21 @@ class ManagedSharedPtr { namespace detail { namespace impl { + +template +__global__ void msp_dispose_on_device(T* gpuPointer, Deleter d) +{ + d(gpuPointer); +} + template __global__ void msp_make_on_device(T* gpuPointer, Args&&... 
args) { new(gpuPointer) T(std::forward(args)...); } + } // namespace impl //template @@ -247,14 +256,15 @@ ManagedSharedPtr make_shared(Args&&... args) { std::cout << "make_shared\n"; Tp* cpu_pointer = detail::msp_make_on_host(std::forward(args)...); - //Tp* gpu_pointer = detail::msp_make_on_device(std::forward(args)...); std::cout << "CPU pointer @ " << cpu_pointer << std::endl; Tp* gpu_pointer = detail::msp_make_on_device(); - std::cout << "GPU pointer @ " << gpu_pointer << std::endl; - cudaDeviceSynchronize(); + std::cout << "GPU pointer @ " << gpu_pointer << std::endl; cudaDeviceSynchronize(); - auto result = ManagedSharedPtr(cpu_pointer, gpu_pointer, [](Tp* p){delete p;}); + auto result = ManagedSharedPtr(cpu_pointer, gpu_pointer, + [] CHAI_HOST_DEVICE (Tp* p){p->~Tp();} + ); + //auto result = ManagedSharedPtr(cpu_pointer, gpu_pointer, [](Tp* p){delete p;}); if (!is_CHAICopyable::value) { result.move(chai::GPU, false); diff --git a/src/chai/SharedPtrCounter.hpp b/src/chai/SharedPtrCounter.hpp index 82ee809f..705d77b8 100644 --- a/src/chai/SharedPtrCounter.hpp +++ b/src/chai/SharedPtrCounter.hpp @@ -75,8 +75,16 @@ class msp_counted_ptr final : public msp_counted_base { #include -//template -//void err_func(T arg){ static_assert(false); } +namespace impl { + +template +__global__ void msp_dispose_on_device(T* gpuPointer, Deleter d) +{ + d(gpuPointer); +} + +} // namespace impl template class msp_counted_deleter final : public msp_counted_base { @@ -97,15 +105,20 @@ class msp_counted_deleter final : public msp_counted_base { msp_counted_deleter(Ptr h_p, Ptr d_p, Deleter d) noexcept : m_impl(h_p, d_p, std::move(d)) {} virtual void m_dispose() noexcept { printf("Delete GPU Memory Here...\n"); + ::chai::impl::msp_dispose_on_device<<<1,1>>>((Ptr)m_impl.m_record->m_pointers[chai::GPU], m_impl.m_del()); + SharedPtrManager::getInstance()->free(m_impl.m_record, chai::GPU); + + printf("Delete CPU Memory Here...\n"); 
m_impl.m_del()((Ptr)m_impl.m_record->m_pointers[chai::CPU]); + SharedPtrManager::getInstance()->free(m_impl.m_record, chai::CPU); } virtual void m_destroy() noexcept { this->~msp_counted_deleter(); } virtual void moveInnerImpl() { - //using T = std::remove_cv_t; - using T_non_const = std::remove_const_t>; + using T_non_const = std::remove_const_t>; T_non_const* host_ptr = const_cast((Ptr)m_impl.m_record->m_pointers[CPU]); + // trigger the copy constructor std::cout << "Trigger Inner Copy Ctor @ " << host_ptr << std::endl; T_non_const inner = T_non_const(*host_ptr); diff --git a/src/chai/SharedPtrManager.cpp b/src/chai/SharedPtrManager.cpp index 9b02fe18..c75f9aff 100644 --- a/src/chai/SharedPtrManager.cpp +++ b/src/chai/SharedPtrManager.cpp @@ -499,6 +499,7 @@ msp_pointer_record* SharedPtrManager::makeSharedPtrRecord(void const* c_pointer, if (pointer) { registerPointer(pointer_record, chai::CPU, owned); + registerPointer(pointer_record, chai::GPU, owned); } return pointer_record; diff --git a/tests/integration/polymorphism_tests.cpp b/tests/integration/polymorphism_tests.cpp index f1f3ea1c..c5b1c3bf 100644 --- a/tests/integration/polymorphism_tests.cpp +++ b/tests/integration/polymorphism_tests.cpp @@ -106,8 +106,8 @@ class A D d; CHAI_HOST_DEVICE A(void) : content_A(0xAAAAAAAAAAAAAAAAull) { printf("++ A has been constructed\n"); } CHAI_HOST_DEVICE ~A(void) { printf("-- A has been destructed\n"); } - CHAI_HOST_DEVICE virtual void function(void) = 0; - CHAI_HOST_DEVICE virtual void d_function(void) = 0; + CHAI_HOST_DEVICE virtual void function(void) const = 0; + CHAI_HOST_DEVICE virtual void d_function(void) const = 0; CHAI_HOST_DEVICE virtual void set_content(unsigned long long) = 0; }; @@ -124,8 +124,8 @@ class B : public A, public A2 unsigned long long content_B; CHAI_HOST_DEVICE B(void) : content_B(0xBBBBBBBBBBBBBBBBull) { printf("++ B has been constructed\n"); } CHAI_HOST_DEVICE ~B(void) { printf("-- B has been destructed\n"); } - CHAI_HOST_DEVICE virtual void 
function(void) override { printf("%lX\n", content_B); } - CHAI_HOST_DEVICE virtual void d_function(void) override { d.function(); } + CHAI_HOST_DEVICE virtual void function(void) const override { printf("%lX\n", content_B); } + CHAI_HOST_DEVICE virtual void d_function(void) const override { d.function(); } CHAI_HOST_DEVICE virtual void set_content(unsigned long long val) override { content_B = val; content_A = val; } }; @@ -134,6 +134,7 @@ class AAbsMem : public chai::CHAICopyable { public: unsigned long long content_A; + //chai::ManagedSharedPtr base_member; chai::ManagedSharedPtr base_member; CHAI_HOST_DEVICE AAbsMem(void) : content_A(0xAAAAAAAAAAAAAAAAull) { printf("++ A has been constructed\n"); } @@ -141,7 +142,8 @@ class AAbsMem : public chai::CHAICopyable //template::value>::type > template CHAI_HOST AAbsMem(Derived const& base_val) - : base_member(chai::make_shared(base_val)) + //: base_member(chai::make_shared(base_val)) + : base_member(chai::make_shared(base_val)) , content_A(0xAAAAAAAAAAAAAAAAull) { printf("++ A has been constructed\n"); } @@ -185,11 +187,8 @@ GPU_TEST(managed_ptr, shared_ptr_absmem) std::cout << "size of (DerivedT) : " << sizeof(DerivedT) << std::endl; std::cout << "size of (BaseT) : " << sizeof(BaseT) << std::endl; - //chai::ManagedSharedPtr sptr = chai::make_shared(D{}); D d; - //DerivedT der(d); - chai::ManagedSharedPtr sptr = chai::make_shared(d); - //chai::ManagedSharedPtr sptr = chai::make_shared(d); + chai::ManagedSharedPtr sptr = chai::make_shared(d); GPU_ERROR_CHECK( cudaPeekAtLastError() ); GPU_ERROR_CHECK( cudaDeviceSynchronize() ); @@ -198,23 +197,66 @@ GPU_TEST(managed_ptr, shared_ptr_absmem) sptr2->function(); sptr2->d_function(); + std::cout << "Map Sz : " << chai::SharedPtrManager::getInstance()->getPointerMap().size() << std::endl; + + std::cout << "GPU CALL...\n"; + forall(gpu(), 0, 1, [=] __device__ (int i) { + printf("GPU Body\n"); + sptr2->function(); + sptr2->d_function(); + }); + GPU_ERROR_CHECK( 
cudaPeekAtLastError() ); + GPU_ERROR_CHECK( cudaDeviceSynchronize() ); + + std::cout << "CPU CALL...\n"; + forall(sequential(), 0, 1, [=] (int i) { + printf("CPU Body\n"); + sptr->set_content(0xFFFFFFFFFFFFFFFFull); + sptr2->function(); + sptr2->d_function(); + }); + std::cout << "GPU CALL...\n"; forall(gpu(), 0, 1, [=] __device__ (int i) { printf("GPU Body\n"); sptr2->function(); sptr2->d_function(); }); + GPU_ERROR_CHECK( cudaPeekAtLastError() ); + GPU_ERROR_CHECK( cudaDeviceSynchronize() ); + + } + std::cout << "Map Sz : " << chai::SharedPtrManager::getInstance()->getPointerMap().size() << std::endl; +} + +GPU_TEST(managed_ptr, shared_ptr_const) +{ + { + using DerivedT = B; + using BaseT = A; + + std::cout << "size of (DerivedT) : " << sizeof(DerivedT) << std::endl; + std::cout << "size of (BaseT) : " << sizeof(BaseT) << std::endl; + + chai::ManagedSharedPtr sptr = chai::make_shared(); - //sptr2.registerTouch(chai::GPU); + chai::ManagedSharedPtr sptr2 = sptr; + std::cout << "Map Sz : " << chai::SharedPtrManager::getInstance()->getPointerMap().size() << std::endl; + std::cout << "GPU CALL...\n"; + forall(gpu(), 0, 1, [=] __device__ (int i) { + printf("GPU Body\n"); + sptr2->function(); + sptr2->d_function(); + }); GPU_ERROR_CHECK( cudaPeekAtLastError() ); GPU_ERROR_CHECK( cudaDeviceSynchronize() ); std::cout << "CPU CALL...\n"; forall(sequential(), 0, 1, [=] (int i) { printf("CPU Body\n"); - //sptr->set_content(0xFFFFFFFFFFFFFFFFull); + sptr->set_content(0xFFFFFFFFFFFFFFFFull); sptr2->function(); sptr2->d_function(); }); @@ -229,43 +271,11 @@ GPU_TEST(managed_ptr, shared_ptr_absmem) GPU_ERROR_CHECK( cudaDeviceSynchronize() ); } + std::cout << "Map Sz : " << chai::SharedPtrManager::getInstance()->getPointerMap().size() << std::endl; } -//GPU_TEST(managed_ptr, shared_ptr) -//{ -// { -// using DerivedT = B; -// using BaseT = A; -// -// std::cout << "size of (DerivedT) : " << sizeof(DerivedT) << std::endl; -// std::cout << "size of (BaseT) : " << sizeof(BaseT) << 
std::endl; -// -// chai::ManagedSharedPtr sptr = chai::make_shared_deleter( -// [](DerivedT* p){ printf("Custom Deleter Call\n"); p->~DerivedT(); }); -// -// std::cout << "Map Sz : " << chai::SharedPtrManager::getInstance()->getPointerMap().size() << std::endl; -// -// chai::ManagedSharedPtr sptr2 = sptr; -// std::cout << "use_count : " << sptr.use_count() << std::endl; -// -// sptr->set_content(0xFFFFFFFFFFFFFFFFull); -// -// std::cout << "GPU CALL...\n"; -// forall(gpu(), 0, 1, [=] __device__ (int i) { -// printf("GPU Body\n"); -// sptr2->function(); -// sptr2->d_function(); -// -// //results[i] = rawArrayClass->getValue(i); -// }); -// GPU_ERROR_CHECK( cudaPeekAtLastError() ); -// GPU_ERROR_CHECK( cudaDeviceSynchronize() ); -// -// } -//} -// -// -// + + //GPU_TEST(managed_ptr, shared_ptralloc) //{ // From c4bcb9b05236b991e780c9f91a77ed267aeb5e4d Mon Sep 17 00:00:00 2001 From: mdavis36 Date: Wed, 8 May 2024 21:14:04 -0700 Subject: [PATCH 11/44] CHAIPoly type tag for polymorphic types and defining their h/d copy behavior from non-poly types in ManagedSharedPtr. 
--- src/chai/ManagedSharedPtr.hpp | 22 +++++---- src/chai/SharedPtrCounter.hpp | 10 ++--- src/chai/SharedPtrManager.cpp | 45 +++++++------------ src/chai/SharedPtrManager.hpp | 4 +- tests/integration/polymorphism_tests.cpp | 57 ++++++++++++++++++++++-- 5 files changed, 92 insertions(+), 46 deletions(-) diff --git a/src/chai/ManagedSharedPtr.hpp b/src/chai/ManagedSharedPtr.hpp index 4d9cffbe..37841036 100644 --- a/src/chai/ManagedSharedPtr.hpp +++ b/src/chai/ManagedSharedPtr.hpp @@ -12,6 +12,9 @@ namespace chai { + +struct CHAIPoly {}; + // Type traits for SFINAE template struct msp_is_constructible : std::is_convertible::type {}; @@ -25,6 +28,9 @@ struct msp_compatible_with : std::is_convertible::type {}; template struct is_CHAICopyable : std::is_base_of::type {}; +template +struct is_CHAIPoly : std::is_base_of::type {}; + template class ManagedSharedPtr { @@ -99,7 +105,7 @@ class ManagedSharedPtr { const element_type* cget(ExecutionSpace space = chai::CPU) const noexcept { #if !defined(CHAI_DEVICE_COMPILE) if (m_active_pointer) { - //move(CPU, false); + move(CPU, false); } #endif return m_active_pointer; @@ -108,7 +114,7 @@ class ManagedSharedPtr { element_type* get(ExecutionSpace space = chai::CPU) const noexcept { #if !defined(CHAI_DEVICE_COMPILE) if (m_active_pointer) { - //move(CPU); + move(CPU); } #endif return m_active_pointer; @@ -135,9 +141,8 @@ class ManagedSharedPtr { } CHAI_HOST - void move(ExecutionSpace space,// bool registerTouch = true) noexcept { - bool registerTouch=(!std::is_const::value || is_CHAICopyable::value)) { - //bool registerTouch=false) { + void move(ExecutionSpace space, + bool registerTouch=(!std::is_const::value || is_CHAICopyable::value)) const { ExecutionSpace prev_space = m_record_count.m_get_record()->m_last_space; ExecutionSpace oldContext = m_resource_manager->getExecutionSpace(); if (prev_space != GPU && space == GPU) { @@ -147,7 +152,8 @@ class ManagedSharedPtr { moveInnerImpl(); } auto old_pointer = m_active_pointer; - 
m_active_pointer = static_cast(m_resource_manager->move((void *)m_active_pointer, m_record_count.m_get_record(), space)); + m_active_pointer = static_cast(m_resource_manager->move( + (void *)m_active_pointer, m_record_count.m_get_record(), space, is_CHAIPoly::value)); if (old_pointer != m_active_pointer) { std::cout << "m_active_pointer @ " << m_active_pointer << " : def touch behaviour : " << (!std::is_const::value || is_CHAICopyable::value) << std::endl; } @@ -180,7 +186,7 @@ class ManagedSharedPtr { typename std::enable_if::type = 0> CHAI_HOST void - moveInnerImpl() + moveInnerImpl() const { m_record_count.moveInnerImpl(); } @@ -190,7 +196,7 @@ class ManagedSharedPtr { typename std::enable_if::type = 0> CHAI_HOST void - moveInnerImpl() + moveInnerImpl() const { } diff --git a/src/chai/SharedPtrCounter.hpp b/src/chai/SharedPtrCounter.hpp index 705d77b8..e0f3c534 100644 --- a/src/chai/SharedPtrCounter.hpp +++ b/src/chai/SharedPtrCounter.hpp @@ -24,7 +24,7 @@ class msp_counted_base { virtual void m_dispose() noexcept = 0; virtual void m_destroy() noexcept { delete this; } - virtual void moveInnerImpl() = 0; + virtual void moveInnerImpl() const = 0; void m_add_ref_copy() noexcept { ++m_use_count; } @@ -56,7 +56,7 @@ class msp_counted_ptr final : public msp_counted_base { virtual void m_dispose() noexcept { delete (Ptr)m_record->m_pointers[chai::CPU]; }// TODO : Other Exec spaces... 
virtual void m_destroy() noexcept { delete this; } - virtual void moveInnerImpl() { + virtual void moveInnerImpl() const { using T = std::remove_pointer_t; Ptr host_ptr = (Ptr) m_record->m_pointers[CPU]; // trigger the copy constructor @@ -114,7 +114,7 @@ class msp_counted_deleter final : public msp_counted_base { } virtual void m_destroy() noexcept { this->~msp_counted_deleter(); } - virtual void moveInnerImpl() { + virtual void moveInnerImpl() const { using T_non_const = std::remove_const_t>; T_non_const* host_ptr = const_cast((Ptr)m_impl.m_record->m_pointers[CPU]); @@ -193,12 +193,12 @@ class msp_record_count { operator==(msp_record_count const& a, msp_record_count const& b) noexcept { return a.m_pi == b.m_pi; } - msp_pointer_record* m_get_record() noexcept { return m_pi->m_get_record(); } + msp_pointer_record* m_get_record() const noexcept { return m_pi->m_get_record(); } template Ptr* m_get_pointer(chai::ExecutionSpace space) noexcept { return static_cast(m_get_record()->m_pointers[space]); } - void moveInnerImpl() { m_pi->moveInnerImpl(); } + void moveInnerImpl() const { m_pi->moveInnerImpl(); } msp_counted_base* m_pi; diff --git a/src/chai/SharedPtrManager.cpp b/src/chai/SharedPtrManager.cpp index c75f9aff..70fa68d3 100644 --- a/src/chai/SharedPtrManager.cpp +++ b/src/chai/SharedPtrManager.cpp @@ -186,7 +186,7 @@ void SharedPtrManager::setExecutionSpace(ExecutionSpace space) void* SharedPtrManager::move(void* pointer, msp_pointer_record* pointer_record, - ExecutionSpace space) + ExecutionSpace space, bool poly) { // Check for default arg (NONE) if (space == NONE) { @@ -197,7 +197,7 @@ void* SharedPtrManager::move(void* pointer, return pointer; } - move(pointer_record, space); + move(pointer_record, space, poly); return pointer_record->m_pointers[space]; } @@ -239,7 +239,7 @@ void SharedPtrManager::resetTouch(msp_pointer_record* pointer_record) /* Not all GPU platform runtimes (notably HIP), will give you asynchronous copies to the device by default, so we 
leverage * umpire's API for asynchronous copies using camp resources in this method, based off of the CHAI destination space * */ -static void copy(void * dst_pointer, void * src_pointer, umpire::ResourceManager & manager, ExecutionSpace dst_space, ExecutionSpace src_space) { +static void copy(void * dst_pointer, void * src_pointer, umpire::ResourceManager & manager, ExecutionSpace dst_space, ExecutionSpace src_space, bool poly=false) { #ifdef CHAI_ENABLE_CUDA camp::resources::Resource device_resource(camp::resources::Cuda::get_default()); @@ -256,14 +256,21 @@ static void copy(void * dst_pointer, void * src_pointer, umpire::ResourceManager camp::resources::Resource host_resource(camp::resources::Host::get_default()); if (dst_space == GPU || src_space == GPU) { // Do the copy using the device resource - std::size_t vtable_size = sizeof(void*); - void* poly_src_ptr = ((char*)src_pointer + vtable_size); - void* poly_dst_ptr = ((char*)dst_pointer + vtable_size); - std::cout << "---- Sptr Manager Device Copy\n"; std::cout << "---- dst_ptr @ " << dst_pointer << std::endl; std::cout << "---- src_ptr @ " << src_pointer << std::endl; - manager.copy(poly_dst_ptr, poly_src_ptr); + + if (poly) { + std::cout << "---- POLY COPY\n"; + std::size_t vtable_size = sizeof(void*); + void* poly_src_ptr = ((char*)src_pointer + vtable_size); + void* poly_dst_ptr = ((char*)dst_pointer + vtable_size); + manager.copy(poly_dst_ptr, poly_src_ptr, device_resource); + } else { + std::cout << "---- STD COPY\n"; + manager.copy(dst_pointer, src_pointer, device_resource); + } + } else { // Do the copy using the host resource manager.copy(dst_pointer, src_pointer, host_resource); @@ -274,7 +281,7 @@ static void copy(void * dst_pointer, void * src_pointer, umpire::ResourceManager } } -void SharedPtrManager::move(msp_pointer_record* record, ExecutionSpace space) +void SharedPtrManager::move(msp_pointer_record* record, ExecutionSpace space, bool poly) { if (space == NONE) { return; @@ -291,30 +298,12 
@@ void SharedPtrManager::move(msp_pointer_record* record, ExecutionSpace space) void* src_pointer = record->m_pointers[prev_space]; void* dst_pointer = record->m_pointers[space]; - //if (!dst_pointer) { - // allocate(record, space); - // dst_pointer = record->m_pointers[space]; - //} - if ( (!record->m_touched[record->m_last_space]) || (! src_pointer )) { - //printf("failed move conditions\n"); - //for (int i = chai::CPU; i < NUM_EXECUTION_SPACES; i++) std::cout << i << " : " <m_touched[i] << std::endl; - //std::cout << record->m_last_space << std::endl; - //std::cout << record->m_touched[record->m_last_space] << std::endl; - //std::cout << (src_pointer) << std::endl; return; } else if (dst_pointer != src_pointer) { // Exclude the copy if src and dst are the same (can happen for PINNED memory) { - //printf("Performing Copy\n"); - //std::cout << "dst_pointer : " << dst_pointer << std::endl; - //std::cout << "src_pointer : " << src_pointer << std::endl; - //std::cout << "space : " << space << std::endl; - //std::cout << "prev_space : " << prev_space << std::endl; - //std::cout << m_resource_manager.findAllocatorForPointer(dst_pointer)->getName() << std::endl; - //std::cout << m_resource_manager.findAllocatorForPointer(src_pointer)->getName() << std::endl; - chai::copy(dst_pointer, src_pointer, m_resource_manager, space, prev_space); - + chai::copy(dst_pointer, src_pointer, m_resource_manager, space, prev_space, poly); } //callback(record, ACTION_MOVE, space); diff --git a/src/chai/SharedPtrManager.hpp b/src/chai/SharedPtrManager.hpp index 2e8bdb9c..25552000 100644 --- a/src/chai/SharedPtrManager.hpp +++ b/src/chai/SharedPtrManager.hpp @@ -178,7 +178,7 @@ class SharedPtrManager */ CHAISHAREDDLL_API void* move(void* pointer, msp_pointer_record* pointer_record, - ExecutionSpace = NONE); + ExecutionSpace = NONE, bool = false); /*! * \brief Register a touch of the pointer in the current execution space. 
@@ -446,7 +446,7 @@ class SharedPtrManager * \param record * \param space */ - void move(msp_pointer_record* record, ExecutionSpace space); + void move(msp_pointer_record* record, ExecutionSpace space, bool = false); /*! * \brief Execute a user callback if callbacks are active diff --git a/tests/integration/polymorphism_tests.cpp b/tests/integration/polymorphism_tests.cpp index c5b1c3bf..adadb10b 100644 --- a/tests/integration/polymorphism_tests.cpp +++ b/tests/integration/polymorphism_tests.cpp @@ -81,7 +81,7 @@ void PrintMemory(const unsigned char* memory, }); -class C +class C : chai::CHAIPoly { public: CHAI_HOST_DEVICE C(void) { printf("++ C has been constructed\n"); } @@ -99,7 +99,7 @@ class D : public C }; -class A +class A : chai::CHAIPoly { public: unsigned long long content_A; @@ -130,7 +130,7 @@ class B : public A, public A2 }; -class AAbsMem : public chai::CHAICopyable +class AAbsMem : public chai::CHAICopyable , public chai::CHAIPoly { public: unsigned long long content_A; @@ -274,6 +274,57 @@ GPU_TEST(managed_ptr, shared_ptr_const) std::cout << "Map Sz : " << chai::SharedPtrManager::getInstance()->getPointerMap().size() << std::endl; } +class NV +{ +public: + unsigned long long content_NV; + CHAI_HOST_DEVICE NV(void) : content_NV(0xFFFFFFFFFFFFFFFFull) { printf("++ NV has been constructed\n"); } + CHAI_HOST_DEVICE ~NV(void) { printf("-- NV has been destructed\n"); } + CHAI_HOST_DEVICE void function(void) const { printf("%lX\n", content_NV); } +}; + +GPU_TEST(managed_ptr, shared_ptr_nv) +{ + { + using DerivedT = NV; + using BaseT = A; + + std::cout << "size of (DerivedT) : " << sizeof(DerivedT) << std::endl; + + chai::ManagedSharedPtr sptr = chai::make_shared(); + + chai::ManagedSharedPtr sptr2 = sptr; + //chai::ManagedSharedPtr sptr2 = sptr; + + std::cout << "Map Sz : " << chai::SharedPtrManager::getInstance()->getPointerMap().size() << std::endl; + + std::cout << "GPU CALL...\n"; + forall(gpu(), 0, 1, [=] __device__ (int i) { + printf("GPU Body\n"); + 
sptr2->function(); + }); + GPU_ERROR_CHECK( cudaPeekAtLastError() ); + GPU_ERROR_CHECK( cudaDeviceSynchronize() ); + + std::cout << "CPU CALL...\n"; + forall(sequential(), 0, 1, [=] (int i) { + printf("CPU Body\n"); + //sptr->set_content(0xFFFFFFFFFFFFFFFFull); + sptr2->function(); + }); + + std::cout << "GPU CALL...\n"; + forall(gpu(), 0, 1, [=] __device__ (int i) { + printf("GPU Body\n"); + sptr2->function(); + }); + GPU_ERROR_CHECK( cudaPeekAtLastError() ); + GPU_ERROR_CHECK( cudaDeviceSynchronize() ); + + } + std::cout << "Map Sz : " << chai::SharedPtrManager::getInstance()->getPointerMap().size() << std::endl; +} + //GPU_TEST(managed_ptr, shared_ptralloc) From c5b92554480b30b8ce5085695e857847f25927ed Mon Sep 17 00:00:00 2001 From: mdavis36 Date: Mon, 3 Jun 2024 09:35:21 -0700 Subject: [PATCH 12/44] Making ManagedSharedPtr CHAICopyable (ISSUE: umpire throws error upon memory map dtor); Rename shared ptr tests to a more appropriate filename. --- src/chai/ManagedArray.hpp | 20 + src/chai/ManagedArray.inl | 1 + src/chai/ManagedSharedPtr.hpp | 35 +- src/chai/SharedPtrCounter.hpp | 12 +- tests/integration/CMakeLists.txt | 10 +- ...tests.cpp => managed_shared_ptr_tests.cpp} | 361 +++--------------- tests/integration/polymorphism_hana_tests.cpp | 230 ----------- 7 files changed, 134 insertions(+), 535 deletions(-) rename tests/integration/{polymorphism_tests.cpp => managed_shared_ptr_tests.cpp} (51%) delete mode 100644 tests/integration/polymorphism_hana_tests.cpp diff --git a/src/chai/ManagedArray.hpp b/src/chai/ManagedArray.hpp index 6cfd5063..15961925 100644 --- a/src/chai/ManagedArray.hpp +++ b/src/chai/ManagedArray.hpp @@ -456,6 +456,26 @@ class ManagedArray : public CHAICopyable { return false; } + // if T is a CHAICopyable, then it is important to initialize all the + // ManagedArrays to nullptr at allocation, since it is extremely easy to + // trigger a moveInnerImpl, which expects inner values to be initialized. 
+ template ::value, + typename std::enable_if::type = 0> + CHAI_HOST bool freeInner(size_t start = 0) + { + for (size_t i = start; i < m_size/sizeof(T); ++i) { + m_active_base_pointer[i] = nullptr; + } + return true; + } + + // Do not deep initialize if T is not a CHAICopyable. + template ::value, + typename std::enable_if::type = 0> + CHAI_HOST bool freeInner(size_t = 0) + { + return false; + } #endif protected: /*! diff --git a/src/chai/ManagedArray.inl b/src/chai/ManagedArray.inl index 69b9bf63..7b04966d 100644 --- a/src/chai/ManagedArray.inl +++ b/src/chai/ManagedArray.inl @@ -260,6 +260,7 @@ CHAI_HOST void ManagedArray::free(ExecutionSpace space) if (m_pointer_record == &ArrayManager::s_null_record) { m_pointer_record = m_resource_manager->makeManaged((void *)m_active_base_pointer,m_size,space,true); } + freeInner(); m_resource_manager->free(m_pointer_record, space); m_active_pointer = nullptr; m_active_base_pointer = nullptr; diff --git a/src/chai/ManagedSharedPtr.hpp b/src/chai/ManagedSharedPtr.hpp index 37841036..6fc04d5f 100644 --- a/src/chai/ManagedSharedPtr.hpp +++ b/src/chai/ManagedSharedPtr.hpp @@ -33,7 +33,7 @@ struct is_CHAIPoly : std::is_base_of::type {}; template -class ManagedSharedPtr { +class ManagedSharedPtr : public CHAICopyable{ public: using element_type = Tp;//typename std::remove_extent::type; @@ -97,6 +97,39 @@ class ManagedSharedPtr { //if (m_active_pointer) move(m_resource_manager->getExecutionSpace()); #endif } + + CHAI_HOST_DEVICE ManagedSharedPtr& operator=(ManagedSharedPtr const& rhs){ + m_record_count=rhs.m_record_count; + m_active_pointer=rhs.m_active_pointer; + m_resource_manager=rhs.m_resource_manager; + + return *this; + + } + + CHAI_HOST void swap(ManagedSharedPtr& rhs) noexcept { + std::swap(m_active_pointer, rhs.m_active_pointer); + std::swap(m_resource_manager, rhs.m_resource_manager); + m_record_count.swap(rhs.m_record_count); + + } + + CHAI_HOST void reset() noexcept { + ManagedSharedPtr().swap(*this); + } + + CHAI_HOST 
ManagedSharedPtr& operator=(std::nullptr_t) { + reset(); + return *this; + } + + CHAI_HOST_DEVICE void shallowCopy(ManagedSharedPtr const& rhs) { + m_active_pointer = rhs.m_active_pointer; + m_active_pointer=rhs.m_active_pointer; + m_resource_manager=rhs.m_resource_manager; + } + + /* * Accessors diff --git a/src/chai/SharedPtrCounter.hpp b/src/chai/SharedPtrCounter.hpp index e0f3c534..272325a6 100644 --- a/src/chai/SharedPtrCounter.hpp +++ b/src/chai/SharedPtrCounter.hpp @@ -180,6 +180,16 @@ class msp_record_count { return *this; } + CHAI_HOST_DEVICE + msp_record_count& operator=(std::nullptr_t) { +#if !defined(CHAI_DEVICE_COMPILE) + std::cout << "msp_record_count = nullptr\n"; + if(m_pi) m_pi->m_release(); + //m_pi = nullptr; +#endif // !defined(CHAI_DEVICE_COMPILE) + return *this; + } + void m_swap(msp_record_count& rhs) noexcept { msp_counted_base* temp = rhs.m_pi; rhs.m_pi = m_pi; @@ -200,7 +210,7 @@ class msp_record_count { void moveInnerImpl() const { m_pi->moveInnerImpl(); } - msp_counted_base* m_pi; + mutable msp_counted_base* m_pi = nullptr; }; diff --git a/tests/integration/CMakeLists.txt b/tests/integration/CMakeLists.txt index 78a1600d..c6d11008 100644 --- a/tests/integration/CMakeLists.txt +++ b/tests/integration/CMakeLists.txt @@ -26,17 +26,17 @@ blt_add_test( COMMAND managed_array_tests) blt_add_executable( - NAME polymorphism_tests - SOURCES polymorphism_tests.cpp + NAME managed_shared_ptr_tests + SOURCES managed_shared_ptr_tests.cpp DEPENDS_ON ${chai_integration_test_depends}) target_include_directories( - polymorphism_tests + managed_shared_ptr_tests PUBLIC ${PROJECT_BINARY_DIR}/include) blt_add_test( - NAME polymorphism_test - COMMAND polymorphism_tests) + NAME managed_shared_ptr_test + COMMAND managed_shared_ptr_tests) if (CHAI_ENABLE_MANAGED_PTR) blt_add_executable( diff --git a/tests/integration/polymorphism_tests.cpp b/tests/integration/managed_shared_ptr_tests.cpp similarity index 51% rename from tests/integration/polymorphism_tests.cpp 
rename to tests/integration/managed_shared_ptr_tests.cpp index adadb10b..ed6cd254 100644 --- a/tests/integration/polymorphism_tests.cpp +++ b/tests/integration/managed_shared_ptr_tests.cpp @@ -178,7 +178,7 @@ class BAbsMem : public AAbsMem }; -GPU_TEST(managed_ptr, shared_ptr_absmem) +GPU_TEST(managed_shared_ptr, shared_ptr_absmem) { { using DerivedT = BAbsMem; @@ -229,7 +229,7 @@ GPU_TEST(managed_ptr, shared_ptr_absmem) std::cout << "Map Sz : " << chai::SharedPtrManager::getInstance()->getPointerMap().size() << std::endl; } -GPU_TEST(managed_ptr, shared_ptr_const) +GPU_TEST(managed_shared_ptr, shared_ptr_const) { { using DerivedT = B; @@ -283,7 +283,7 @@ class NV CHAI_HOST_DEVICE void function(void) const { printf("%lX\n", content_NV); } }; -GPU_TEST(managed_ptr, shared_ptr_nv) +GPU_TEST(managed_shared_ptr, shared_ptr_nv) { { using DerivedT = NV; @@ -326,300 +326,65 @@ GPU_TEST(managed_ptr, shared_ptr_nv) } +GPU_TEST(managed_shared_ptr, shared_arr_shared_ptr_absmem) +{ + { -//GPU_TEST(managed_ptr, shared_ptralloc) -//{ -// -// { -// -// using DerivedT = B; -// using BaseT = A; -// -// chai::SharedPtrManager* sptr_manager = chai::SharedPtrManager::getInstance(); -// umpire::ResourceManager& res_manager = umpire::ResourceManager::getInstance(); -// -// auto cpu_allocator = sptr_manager->getAllocator(chai::CPU); -// BaseT* cpu_ptr = static_cast( cpu_allocator.allocate(1*sizeof(DerivedT)) ); -// -// new(cpu_ptr) DerivedT(); -// BaseT* gpu_ptr = chai::msp_make_on_device(); -// -// -// auto record = sptr_manager->makeSharedPtrRecord(cpu_ptr, gpu_ptr, sizeof(DerivedT), true); -// -// forall(gpu(), 0, 1, [=] __device__ (int i) { -// printf("GPU Body\n"); -// gpu_ptr->function(); -// gpu_ptr->d_function(); -// }); -// -// std::cout << "Ump alloc cpu : " << cpu_ptr << std::endl; -// std::cout << "Ump alloc gpu : " << gpu_ptr << std::endl; -// std::cout << "Map Sz : " << chai::SharedPtrManager::getInstance()->getPointerMap().size() << std::endl; -// -// 
cpu_ptr->set_content(0xFFFFFFFFFFFFFFFFull); -// -// camp::resources::Resource device_resource(camp::resources::Cuda::get_default()); -// res_manager.copy_poly(gpu_ptr, cpu_ptr, device_resource); -// -// //unsigned int offset = sizeof(void*); -// //GPU_ERROR_CHECK(cudaMemcpy((char*)gpu_ptr+offset, (char*)cpu_ptr+offset, sizeof(DerivedT)-offset, cudaMemcpyHostToDevice)); -// //GPU_ERROR_CHECK(cudaMemcpy((char*)gpu_ptr, (char*)cpu_ptr, sizeof(DerivedT), cudaMemcpyHostToDevice)); -// -// forall(gpu(), 0, 1, [=] __device__ (int i) { -// printf("GPU Body\n"); -// gpu_ptr->function(); -// gpu_ptr->d_function(); -// }); -// GPU_ERROR_CHECK( cudaPeekAtLastError() ); -// GPU_ERROR_CHECK( cudaDeviceSynchronize() ); -// -// } -// //assert_empty_map(); -//} -// -//GPU_TEST(managed_ptr, polycpytest) -//{ -// -// // Assign 32 byte block of memory to 0x11 on the Host -// unsigned char* memory1 = (unsigned char*)malloc(56*sizeof(unsigned char)); -// memset(memory1, 0x11, 56 * sizeof(unsigned char)); -// CPU_PRINT_MEMORY(memory1, "1 : before placement new") -// -// -// // Assign 32 byte block of memory to 0x22 on the Device -// unsigned char* memory2; cudaMalloc((void**)&memory2, 56*sizeof(unsigned char)); -// forall(gpu(), 0, 56, [=] __device__ (int i) { memory2[i] = 0x22; }); -// GPU_PRINT_MEMORY(memory2, "2 : before placement new") -// -// -// // Placement New Polymorphic object on the Host. -// B* b_ptr1 = new (memory1) B; -// CPU_PRINT_MEMORY(memory1, "1 : after placement new"); -// -// -// // Placement New Polymorphic object on the Device. -// B* b_ptr2 = reinterpret_cast(memory2); -// A* base2 = b_ptr2; -// forall(gpu(), 0, 1, [=] __device__ (int i) { new(b_ptr2) B();}); -// GPU_PRINT_MEMORY(memory2, "2 : after placement new"); -// -// -// // B was constructed on the Device so we can call virtual -// // function on the GPU from a host pointer. 
-// printf("Calling virtual function from Base pointer on GPU.\n"); -// forall(gpu(), 0, 1, [=] __device__ (int i) { base2->function(); }); -// GPU_ERROR_CHECK( cudaPeekAtLastError() ); -// GPU_ERROR_CHECK( cudaDeviceSynchronize() ); -// -// -// -// // Lets edit the Data on the Host... -// b_ptr1->content_B = 0xCBCBCBCBCBCBCBCBull; -// CPU_PRINT_MEMORY(memory1, "1 : after content change"); -// -// // Copying Data from Host to Device -//#define OFFSET_CPY -//#if !defined(OFFSET_CPY) -// GPU_ERROR_CHECK(cudaMemcpy(b_ptr2, b_ptr1, sizeof(B), cudaMemcpyHostToDevice)); -//#else -// // We nee to skip over the Vtable and try to only copy the contents of the -// // object itself. -// unsigned int offset = sizeof(void*); -// char* off_b_ptr2 = (char*)b_ptr2 + offset; -// char* off_b_ptr1 = (char*)b_ptr1 + offset; -// int off_size = sizeof(B) - offset; -// -// GPU_ERROR_CHECK(cudaMemcpy(off_b_ptr2, off_b_ptr1, off_size, cudaMemcpyHostToDevice)); -// //// This will not work as we need to do pointer arithmatic at the byte level... -// //GPU_ERROR_CHECK(cudaMemcpy(b_ptr2 + offset, b_ptr1 + offset, sizeof(B) - offset, cudaMemcpyHostToDevice)); -//#endif -// GPU_PRINT_MEMORY(memory2, "2 : after copy from host"); -// -// // Try to call virtual funciton on GPU like we did before. -// printf("Calling virtual function from Base pointer on GPU.\n"); -// forall(gpu(), 0, 1, [=] __device__ (int i) { base2->function(); }); -// GPU_ERROR_CHECK( cudaPeekAtLastError() ); -// GPU_ERROR_CHECK( cudaDeviceSynchronize() ); -// -// -// -// // Lets edit the Data on the Device... 
-// forall(gpu(), 0, 1, [=] __device__ (int i) { -// b_ptr2->content_B = 0xDBDBDBDBDBDBDBDBull; -// b_ptr2->content_A = 0xDADADADADADADADAull; }); -// GPU_PRINT_MEMORY(memory2, "2 : after content change"); -// -// -//#if !defined(OFFSET_CPY) -// GPU_ERROR_CHECK(cudaMemcpy(b_ptr1, b_ptr2, sizeof(B), cudaMemcpyDeviceToHost)); -//#else -// GPU_ERROR_CHECK(cudaMemcpy((char*)b_ptr1 + offset, (char*)b_ptr2 + offset, sizeof(B) - offset, cudaMemcpyDeviceToHost)); -//#endif -// CPU_PRINT_MEMORY(memory1, "1 : after copy from host"); -// -// // Free up memory, we useed placement new so we need to call the destructor first... -// reinterpret_cast(memory1)->~B(); -// forall(gpu(), 0, 1, [=] __device__ (int i) { reinterpret_cast(memory2)->~B(); }); -// cudaFree(memory2); -// -//} -// -// -// -// -// -// -// -// -// -//struct Base_vtable { -// void (*doSomething)(void* this_); -// void (*setContents)(void* this_, unsigned long long val); -//}; -// -//template -//Base_vtable const Base_vtable_for_host = { -// [] __host__ __device__ (void* this_){ static_cast(this_)->doSomething(); } -// ,[] __host__ __device__ (void* this_, unsigned long long val){ static_cast(this_)->setContents(val); } -//}; -// -//template -//__global__ -//void Base_vtable_for_device(Base_vtable* vptr_) { -// new(vptr_) Base_vtable{[] __host__ __device__ (void* this_){ static_cast(this_)->doSomething(); } -// ,[] __host__ __device__ (void* this_, unsigned long long val){ static_cast(this_)->setContents(val); } -// }; -//}; -// -// -// -////----------------------------------------------------------------------------- -// -//#if !defined(CHAI_DEVICE_COMPILE) -//#define CHAI_POLY_VIRTUAL_CALL(name) \ -// return vtbl_host_->name((void*) ptr_host_); -//#define CHAI_POLY_VIRTUAL_CALL_ARGS(name, ...) \ -// return vtbl_host_->name((void*) ptr_host_, __VA_ARGS__); -//#else -//#define CHAI_POLY_VIRTUAL_CALL(name) \ -// return vtbl_device_->name((void*) ptr_device_); -//#define CHAI_POLY_VIRTUAL_CALL_ARGS(name, ...) 
\ -// return vtbl_device_->name((void*) ptr_device_, __VA_ARGS__); -//#endif -// -//template -//Base_vtable* make_Base_vtable_on_device() { -// Base_vtable* vptr_; -// cudaMalloc((void**)&vptr_, sizeof(Base_vtable)); -// Base_vtable_for_device <<<1,1>>>(vptr_); -// return vptr_; -//} -// -//struct CHAIPolyInterface { -// -// template -// CHAIPolyInterface(Any base) -// { -// vtbl_host_ = &Base_vtable_for_host; -// ptr_host_ = new Any{base}; -// -// vtbl_device_ = make_Base_vtable_on_device(); -// cudaMalloc(&ptr_device_, sizeof(Any)); -// -// obj_size_ = sizeof(Any); -// } -// -// void move(chai::ExecutionSpace space) -// { -// if (space == chai::CPU) cudaMemcpy(ptr_host_, ptr_device_, obj_size_, cudaMemcpyDeviceToHost); -// if (space == chai::GPU) cudaMemcpy(ptr_device_, ptr_host_, obj_size_, cudaMemcpyHostToDevice); -// } -// -//protected: -// Base_vtable const* vtbl_host_; -// Base_vtable* vtbl_device_; -// //void* ptr_; -// void* ptr_host_; -// void* ptr_device_; -// -// long obj_size_; -// -//}; -// -////----------------------------------------------------------------------------- -// -//#include -//#include -// -// -//struct Base: CHAIPolyInterface { -// using Poly = CHAIPolyInterface; -// -// template -// Base(Any base) : Poly(base) {}; -// -// CHAI_HOST_DEVICE void doSomething() const { CHAI_POLY_VIRTUAL_CALL(doSomething) } -// CHAI_HOST_DEVICE void setContents(unsigned long long val) const { CHAI_POLY_VIRTUAL_CALL_ARGS(setContents, val) } -//}; -// -// -//struct DerivedA { -// CHAI_HOST_DEVICE void doSomething() { printf("DerivedA: doSomething\n"); } -// CHAI_HOST_DEVICE void setContents(unsigned long long) {} -//}; -// -//struct DerivedB { -// DerivedB() : content(0xDDDDDDDDDDDDDDDDull) {}; -// -// void doBthing() { printf("concrete B thing"); } -// -// CHAI_HOST_DEVICE void doSomething() -// { -// printf("DerivedB: doSomething\n"); -// printf("%lX\n", content); -// } -// CHAI_HOST_DEVICE void setContents(unsigned long long val) { content = val; } -// 
unsigned long long content; -//}; -// -// -//GPU_TEST(managed_ptr, customvtabletest) { -// -// Base b = Base(DerivedA{}); -// Base b2 = Base(DerivedB{}); -// -// b.doSomething(); -// b2.doSomething(); -// -// b.move(chai::GPU); -// b2.move(chai::GPU); -// -// BEGIN_EXEC_ON_DEVICE() -// printf("-- GPU Kernel begin\n"); -// b.doSomething(); -// b2.doSomething(); -// printf("-- GPU Kernel end\n"); -// END_EXEC() -// -// -// b2.setContents(0xCCCCCCCCCCCCCCCCull); -// b2.move(chai::GPU); -// -// BEGIN_EXEC_ON_DEVICE() -// printf("-- GPU Kernel begin\n"); -// b.doSomething(); -// b2.doSomething(); -// b2.setContents(0xBBBBBBBBBBBBBBBBull); -// printf("-- GPU Kernel end\n"); -// END_EXEC() -// -// b2.move(chai::CPU); -// b2.doSomething(); -// -// -// -// -// -//} + //using DerivedT = NV; + //using BaseT = NV; + + using DerivedT = BAbsMem; + using BaseT = AAbsMem; + + using ElemT = chai::ManagedSharedPtr; + using Container = chai::ManagedArray; + + std::cout << "size of (DerivedT) : " << sizeof(DerivedT) << std::endl; + std::cout << "size of (BaseT) : " << sizeof(BaseT) << std::endl; + + std::cout << "Sptr Map Sz : " << chai::SharedPtrManager::getInstance()->getPointerMap().size() << std::endl; + std::cout << "Arr Map Sz : " << chai::ArrayManager::getInstance()->getPointerMap().size() << std::endl; + + Container arr(1); + D d; + arr[0] = chai::make_shared(d); + arr.registerTouch(chai::CPU); + //chai::ManagedSharedPtr sptr = chai::make_shared(d); + //arr[0] = sptr; + ////std::cout << "Use count : " << sptr.use_count() << std::endl; + std::cout << "GPU CALL...\n"; + forall(gpu(), 0, 1, [=] __device__ (int i) { + printf("GPU Body\n"); + arr[0]->function(); + arr[0]->d_function(); + }); + GPU_ERROR_CHECK( cudaPeekAtLastError() ); + GPU_ERROR_CHECK( cudaDeviceSynchronize() ); + + std::cout << "CPU CALL...\n"; + forall(sequential(), 0, 1, [=] (int i) { + printf("CPU Body\n"); + arr[0]->set_content(0xFFFFFFFFFFFFFFFFull); + arr[0]->function(); + arr[0]->d_function(); + }); + + 
std::cout << "GPU CALL...\n"; + forall(gpu(), 0, 1, [=] __device__ (int i) { + printf("GPU Body\n"); + arr[0]->function(); + arr[0]->d_function(); + }); + GPU_ERROR_CHECK( cudaPeekAtLastError() ); + GPU_ERROR_CHECK( cudaDeviceSynchronize() ); + + std::cout << "Sptr Map Sz : " << chai::SharedPtrManager::getInstance()->getPointerMap().size() << std::endl; + std::cout << "Arr Map Sz : " << chai::ArrayManager::getInstance()->getPointerMap().size() << std::endl; + + arr.free(); + std::cout << "arr.free()\n"; + } + std::cout << "Sptr Map Sz : " << chai::SharedPtrManager::getInstance()->getPointerMap().size() << std::endl; + std::cout << "Arr Map Sz : " << chai::ArrayManager::getInstance()->getPointerMap().size() << std::endl; +} diff --git a/tests/integration/polymorphism_hana_tests.cpp b/tests/integration/polymorphism_hana_tests.cpp deleted file mode 100644 index 8ad93bea..00000000 --- a/tests/integration/polymorphism_hana_tests.cpp +++ /dev/null @@ -1,230 +0,0 @@ -////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC and CHAI -// project contributors. See the CHAI LICENSE file for details. 
-// -// SPDX-License-Identifier: BSD-3-Clause -////////////////////////////////////////////////////////////////////////////// -#include "camp/defines.hpp" -#include "chai/ChaiMacros.hpp" -#include "chai/ExecutionSpaces.hpp" -#include "chai/ManagedSharedPtr.hpp" -#include "chai/SharedPtrManager.hpp" -#include "gtest/gtest.h" -#include "umpire/ResourceManager.hpp" - -#define GPU_TEST(X, Y) \ - static void gpu_test_##X##Y(); \ - TEST(X, Y) { gpu_test_##X##Y(); } \ - static void gpu_test_##X##Y() - -#include "chai/config.hpp" -#include "chai/ArrayManager.hpp" -#include "chai/ManagedArray.hpp" -#include "chai/managed_ptr.hpp" -#include "chai/ManagedSharedPtr.hpp" - -#include "../src/util/forall.hpp" - -// Standard library headers -#include - -#define BEGIN_EXEC_ON_DEVICE() \ - forall(gpu(), 0, 1, [=] __device__ (int i) { - -#define END_EXEC()\ - }); \ - GPU_ERROR_CHECK( cudaPeekAtLastError() );\ - GPU_ERROR_CHECK( cudaDeviceSynchronize() );\ - - -inline void gpuErrorCheck(cudaError_t code, const char *file, int line, bool abort=true) -{ - if (code != cudaSuccess) { - fprintf(stderr, "[CHAI] GPU Error: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) { - exit(code); - } - } -} - -#define GPU_ERROR_CHECK(code) { gpuErrorCheck((code), __FILE__, __LINE__); } - -void PrintMemory(const unsigned char* memory, - const char label[] = "contents") -{ - std::cout << "Memory " << label << ": \n"; - for (size_t i = 0; i < 4; i++) - { - for (size_t j = 0; j < 8; j++) - printf("%02X ", static_cast (memory[i * 8 + j])); - printf("\n"); - } -} - -#define M_PRINT_MEMORY(memory) \ - for (size_t i = 0; i < 7; i++) \ - { \ - for (size_t j = 0; j < 8; j++) \ - printf("%02X ", static_cast (memory[i * 8 + j])); \ - printf("\n"); \ - } - -#define CPU_PRINT_MEMORY(memory, label)\ - printf("HOST Memory "); printf(label); printf("\n"); \ - M_PRINT_MEMORY(memory) \ - -#define GPU_PRINT_MEMORY(memory, label)\ - forall(gpu(), 0, 1, [=] __device__ (int i) { \ - printf("DEVICE Memory "); 
printf(label); printf("\n"); \ - M_PRINT_MEMORY(memory) \ - }); - - - - - - - - -template -Base_vtable const Base_vtable_for_host = { - "doSomething"_s = [] __host__ __device__ (T& base){ base.doSomehting(); } - ,"setContents"_s = [] __host__ __device__ (T& base, ull val){ base.setContents(val); } -}; - -template -Base_vtable const Base_vtable_for_host = { - [] __host__ __device__ (void* this_){ static_cast(this_)->doSomething(); } - ,[] __host__ __device__ (void* this_, unsigned long long val){ static_cast(this_)->setContents(val); } -}; - -template -__global__ -void Base_vtable_for_device(Base_vtable* vptr_) { - new(vptr_) Base_vtable{[] __host__ __device__ (void* this_){ static_cast(this_)->doSomething(); } - ,[] __host__ __device__ (void* this_, unsigned long long val){ static_cast(this_)->setContents(val); } - }; -}; - - - -//----------------------------------------------------------------------------- - -template -Base_vtable* make_Base_vtable_on_device() { - Base_vtable* vptr_; - cudaMalloc((void**)&vptr_, sizeof(Base_vtable)); - Base_vtable_for_device <<<1,1>>>(vptr_); - return vptr_; -} - -struct CHAIPolyInterface { - - template - CHAIPolyInterface(Any base) - { - vtbl_host_ = &Base_vtable_for_host; - ptr_host_ = new Any{base}; - - vtbl_device_ = make_Base_vtable_on_device(); - cudaMalloc(&ptr_device_, sizeof(Any)); - - obj_size_ = sizeof(Any); - } - - void move(chai::ExecutionSpace space) - { - if (space == chai::CPU) cudaMemcpy(ptr_host_, ptr_device_, obj_size_, cudaMemcpyDeviceToHost); - if (space == chai::GPU) cudaMemcpy(ptr_device_, ptr_host_, obj_size_, cudaMemcpyHostToDevice); - } - -protected: - Base_vtable const* vtbl_host_; - Base_vtable* vtbl_device_; - //void* ptr_; - void* ptr_host_; - void* ptr_device_; - - long obj_size_; - -}; - -//----------------------------------------------------------------------------- - -#include -#include - -struct IBase : decltype(camp::requires( - "doSomething"_s = camp::function - "setContents"_s = camp::function 
-)) {}; - -struct Base { - template - Base(Any base) : poly_(base) {}; - - CHAI_HOST_DEVICE void doSomething() const { poly_.virtual("doSomething"_s)(poly_); } - CHAI_HOST_DEVICE void setContents(unsigned long long val) const { poly.virtual("setContents")(poly_, val); } - -private: - ChaiPolyInterface poly_; -}; - - -struct DerivedA { - CHAI_HOST_DEVICE void doSomething() { printf("DerivedA: doSomething\n"); } - CHAI_HOST_DEVICE void setContents(unsigned long long) {} -}; - -struct DerivedB { - DerivedB() : content(0xDDDDDDDDDDDDDDDDull) {}; - - void doBthing() { printf("concrete B thing"); } - - CHAI_HOST_DEVICE void doSomething() { printf("DerivedB: doSomething : %lX\n", content); } - CHAI_HOST_DEVICE void setContents(unsigned long long val) { content = val; } - -private: - unsigned long long content; -}; - - -GPU_TEST(managed_ptr, customvtabletest) { - - Base b = Base(DerivedA{}); - Base b2 = Base(DerivedB{}); - - b.doSomething(); - b2.doSomething(); - - b.move(chai::GPU); - b2.move(chai::GPU); - - BEGIN_EXEC_ON_DEVICE() - printf("-- GPU Kernel begin\n"); - b.doSomething(); - b2.doSomething(); - printf("-- GPU Kernel end\n"); - END_EXEC() - - - b2.setContents(0xCCCCCCCCCCCCCCCCull); - b2.move(chai::GPU); - - BEGIN_EXEC_ON_DEVICE() - printf("-- GPU Kernel begin\n"); - b.doSomething(); - b2.doSomething(); - b2.setContents(0xBBBBBBBBBBBBBBBBull); - printf("-- GPU Kernel end\n"); - END_EXEC() - - b2.move(chai::CPU); - b2.doSomething(); - - - - - -} - From de62d4e0bb55b7d581248c6d9a0ce4a4cfc2d4ca Mon Sep 17 00:00:00 2001 From: mdavis36 Date: Mon, 3 Jun 2024 14:22:38 -0700 Subject: [PATCH 13/44] makeSharedPtrRecord takes pointers & spaces as initializer list. 
--- src/chai/ManagedSharedPtr.hpp | 2 +- src/chai/SharedPtrCounter.hpp | 6 +++- src/chai/SharedPtrManager.cpp | 52 +++++++++++++++++++++++++++++++++++ src/chai/SharedPtrManager.hpp | 6 ++++ 4 files changed, 64 insertions(+), 2 deletions(-) diff --git a/src/chai/ManagedSharedPtr.hpp b/src/chai/ManagedSharedPtr.hpp index 6fc04d5f..98cd115e 100644 --- a/src/chai/ManagedSharedPtr.hpp +++ b/src/chai/ManagedSharedPtr.hpp @@ -110,7 +110,7 @@ class ManagedSharedPtr : public CHAICopyable{ CHAI_HOST void swap(ManagedSharedPtr& rhs) noexcept { std::swap(m_active_pointer, rhs.m_active_pointer); std::swap(m_resource_manager, rhs.m_resource_manager); - m_record_count.swap(rhs.m_record_count); + m_record_count.m_swap(rhs.m_record_count); } diff --git a/src/chai/SharedPtrCounter.hpp b/src/chai/SharedPtrCounter.hpp index 272325a6..d1b619e7 100644 --- a/src/chai/SharedPtrCounter.hpp +++ b/src/chai/SharedPtrCounter.hpp @@ -92,7 +92,8 @@ class msp_counted_deleter final : public msp_counted_base { class impl { public: impl(Ptr h_p, Ptr d_p, Deleter d) - : m_record(SharedPtrManager::getInstance()->makeSharedPtrRecord(h_p, d_p, sizeof(std::remove_pointer_t), true)) + : m_record(SharedPtrManager::getInstance()->makeSharedPtrRecord({h_p, d_p},{chai::CPU, chai::GPU}, sizeof(std::remove_pointer_t), true)) + //: m_record(SharedPtrManager::getInstance()->makeSharedPtrRecord(h_p, d_p, sizeof(std::remove_pointer_t), true)) , m_deleter(std::move(d)) {} @@ -104,9 +105,12 @@ class msp_counted_deleter final : public msp_counted_base { public: msp_counted_deleter(Ptr h_p, Ptr d_p, Deleter d) noexcept : m_impl(h_p, d_p, std::move(d)) {} virtual void m_dispose() noexcept { + +#if defined(CHAI_GPUCC) printf("Delete GPU Memory Here...\n"); ::chai::impl::msp_dispose_on_device<<<1,1>>>((Ptr)m_impl.m_record->m_pointers[chai::GPU], m_impl.m_del()); SharedPtrManager::getInstance()->free(m_impl.m_record, chai::GPU); +#endif printf("Delete CPU Memory Here...\n"); 
m_impl.m_del()((Ptr)m_impl.m_record->m_pointers[chai::CPU]); diff --git a/src/chai/SharedPtrManager.cpp b/src/chai/SharedPtrManager.cpp index 70fa68d3..f982ddb1 100644 --- a/src/chai/SharedPtrManager.cpp +++ b/src/chai/SharedPtrManager.cpp @@ -5,6 +5,7 @@ // SPDX-License-Identifier: BSD-3-Clause ////////////////////////////////////////////////////////////////////////////// #include "chai/SharedPtrManager.hpp" +#include #include "chai/ExecutionSpaces.hpp" #include "chai/config.hpp" @@ -435,6 +436,57 @@ msp_pointer_record* SharedPtrManager::getPointerRecord(void* pointer) return record->second ? *record->second : &s_null_record; } +//msp_pointer_record* SharedPtrManager::makeSharedPtrRecord(void const* c_pointer, void const* c_d_pointer, +msp_pointer_record* SharedPtrManager::makeSharedPtrRecord(std::initializer_list pointers, + std::initializer_list spaces, + size_t size, + //ExecutionSpace space, + bool owned) +{ + int i = 0; + for (void const* c_ptr : pointers) { + void* ptr = const_cast(c_ptr); + + if (ptr == nullptr) return &s_null_record; + + m_resource_manager.registerAllocation(ptr, {ptr, size, m_allocators[spaces.begin()[i++]]->getAllocationStrategy()} ); + } + + void* lookup_pointer = const_cast(pointers.begin()[0]); + + auto pointer_record = getPointerRecord(lookup_pointer); + + if (pointer_record == &s_null_record) { + if (lookup_pointer) { + pointer_record = new msp_pointer_record(); + } else { + return pointer_record; + } + } + else { + CHAI_LOG(Warning, "SharedPtrManager::makeManaged found abandoned pointer record!!!"); + //callback(pointer_record, ACTION_FOUND_ABANDONED, space); + } + + i=0; + for (void const* c_ptr : pointers) { + void* ptr = const_cast(c_ptr); + chai::ExecutionSpace space = spaces.begin()[i]; + + pointer_record->m_pointers[space] = ptr; + pointer_record->m_owned[space] = owned; + registerPointer(pointer_record, space, owned); + + i++; + } + + for (int space = CPU; space < NUM_EXECUTION_SPACES; ++space) { + 
pointer_record->m_allocators[space] = getAllocatorId(ExecutionSpace(space)); + } + + return pointer_record; +} + msp_pointer_record* SharedPtrManager::makeSharedPtrRecord(void const* c_pointer, void const* c_d_pointer, size_t size, //ExecutionSpace space, diff --git a/src/chai/SharedPtrManager.hpp b/src/chai/SharedPtrManager.hpp index 25552000..6760d478 100644 --- a/src/chai/SharedPtrManager.hpp +++ b/src/chai/SharedPtrManager.hpp @@ -264,6 +264,12 @@ class SharedPtrManager */ //CHAISHAREDDLL_API size_t getSize(void* pointer); + msp_pointer_record* makeSharedPtrRecord(std::initializer_list pointers, + std::initializer_list spaces, + size_t size, + //ExecutionSpace space, + bool owned); + CHAISHAREDDLL_API msp_pointer_record* makeSharedPtrRecord(void const* c_pointer, void const* c_d_pointer, size_t size, //ExecutionSpace space, From f237611231a8098edc2e3175064ce06d4b4448e4 Mon Sep 17 00:00:00 2001 From: mdavis36 Date: Tue, 4 Jun 2024 13:50:48 -0700 Subject: [PATCH 14/44] Cleaning up unnecessary logic in SharedPointerRecord. 
--- src/chai/SharedPointerRecord.hpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/chai/SharedPointerRecord.hpp b/src/chai/SharedPointerRecord.hpp index 1daa1baf..890578ed 100644 --- a/src/chai/SharedPointerRecord.hpp +++ b/src/chai/SharedPointerRecord.hpp @@ -36,17 +36,15 @@ struct msp_pointer_record { int m_allocators[NUM_EXECUTION_SPACES]; - msp_pointer_record(void* host_p = nullptr, void* device_p = nullptr) : + //msp_pointer_record(void* host_p = nullptr, void* device_p = nullptr) : + msp_pointer_record() : m_last_space(CPU) { for (int space = 0; space < NUM_EXECUTION_SPACES; ++space ) { m_pointers[space] = nullptr; m_touched[space] = false; m_owned[space] = true; - //m_allocators[space] = 0; + m_allocators[space] = 0; } - m_pointers[CPU] = host_p; - m_touched[CPU] = true; - m_pointers[GPU] = device_p; } }; From 6e19aa802a7cad226177c8a661b95a3e5d03252e Mon Sep 17 00:00:00 2001 From: mdavis36 Date: Tue, 4 Jun 2024 13:53:35 -0700 Subject: [PATCH 15/44] Using Def Ctor & Dtor to init and free elements of a ManagedArray that are CHAICopyable. 
--- src/chai/ManagedArray.hpp | 5 +- src/chai/ManagedArray.inl | 1 + src/chai/ManagedSharedPtr.hpp | 13 ++-- src/chai/SharedPtrCounter.hpp | 68 +++++++++++++------ src/chai/SharedPtrManager.cpp | 51 -------------- src/chai/SharedPtrManager.hpp | 4 +- src/chai/SharedPtrManager.inl | 49 +++++++++++++ .../integration/managed_shared_ptr_tests.cpp | 13 +--- 8 files changed, 110 insertions(+), 94 deletions(-) diff --git a/src/chai/ManagedArray.hpp b/src/chai/ManagedArray.hpp index 15961925..2f4666d8 100644 --- a/src/chai/ManagedArray.hpp +++ b/src/chai/ManagedArray.hpp @@ -444,7 +444,8 @@ class ManagedArray : public CHAICopyable CHAI_HOST bool initInner(size_t start = 0) { for (size_t i = start; i < m_size/sizeof(T); ++i) { - m_active_base_pointer[i] = nullptr; + //m_active_base_pointer[i] = nullptr; + new (&m_active_base_pointer[i]) T(); } return true; } @@ -464,7 +465,7 @@ class ManagedArray : public CHAICopyable CHAI_HOST bool freeInner(size_t start = 0) { for (size_t i = start; i < m_size/sizeof(T); ++i) { - m_active_base_pointer[i] = nullptr; + m_active_base_pointer[i].~T(); } return true; } diff --git a/src/chai/ManagedArray.inl b/src/chai/ManagedArray.inl index 7b04966d..6a035384 100644 --- a/src/chai/ManagedArray.inl +++ b/src/chai/ManagedArray.inl @@ -261,6 +261,7 @@ CHAI_HOST void ManagedArray::free(ExecutionSpace space) m_pointer_record = m_resource_manager->makeManaged((void *)m_active_base_pointer,m_size,space,true); } freeInner(); + m_resource_manager->free(m_pointer_record, space); m_active_pointer = nullptr; m_active_base_pointer = nullptr; diff --git a/src/chai/ManagedSharedPtr.hpp b/src/chai/ManagedSharedPtr.hpp index 98cd115e..0820bedd 100644 --- a/src/chai/ManagedSharedPtr.hpp +++ b/src/chai/ManagedSharedPtr.hpp @@ -111,17 +111,16 @@ class ManagedSharedPtr : public CHAICopyable{ std::swap(m_active_pointer, rhs.m_active_pointer); std::swap(m_resource_manager, rhs.m_resource_manager); m_record_count.m_swap(rhs.m_record_count); - } CHAI_HOST void reset() 
noexcept { ManagedSharedPtr().swap(*this); } - CHAI_HOST ManagedSharedPtr& operator=(std::nullptr_t) { - reset(); - return *this; - } + //CHAI_HOST ManagedSharedPtr& operator=(std::nullptr_t) { + // reset(); + // return *this; + //} CHAI_HOST_DEVICE void shallowCopy(ManagedSharedPtr const& rhs) { m_active_pointer = rhs.m_active_pointer; @@ -278,6 +277,7 @@ CHAI_HOST Tp* msp_make_on_host(Args&&... args) { chai::SharedPtrManager* sptr_manager = chai::SharedPtrManager::getInstance(); auto cpu_allocator = sptr_manager->getAllocator(chai::CPU); + Tp* cpu_ptr = static_cast( cpu_allocator.allocate(1*sizeof(Tp)) ); new (cpu_ptr) Tp{std::forward(args)...}; @@ -303,7 +303,8 @@ ManagedSharedPtr make_shared(Args&&... args) { auto result = ManagedSharedPtr(cpu_pointer, gpu_pointer, [] CHAI_HOST_DEVICE (Tp* p){p->~Tp();} ); - //auto result = ManagedSharedPtr(cpu_pointer, gpu_pointer, [](Tp* p){delete p;}); + + result.registerTouch(chai::CPU); if (!is_CHAICopyable::value) { result.move(chai::GPU, false); diff --git a/src/chai/SharedPtrCounter.hpp b/src/chai/SharedPtrCounter.hpp index d1b619e7..16ff97b3 100644 --- a/src/chai/SharedPtrCounter.hpp +++ b/src/chai/SharedPtrCounter.hpp @@ -8,8 +8,10 @@ #ifndef CHAI_SharedPointerCounter_HPP #define CHAI_SharedPointerCounter_HPP +#include #include #include "chai/ChaiMacros.hpp" +#include "chai/ExecutionSpaces.hpp" #include "chai/SharedPtrManager.hpp" namespace chai @@ -75,6 +77,7 @@ class msp_counted_ptr final : public msp_counted_base { #include +#if defined(CHAI_GPUCC) namespace impl { template class msp_counted_deleter final : public msp_counted_base { class impl { public: - impl(Ptr h_p, Ptr d_p, Deleter d) - : m_record(SharedPtrManager::getInstance()->makeSharedPtrRecord({h_p, d_p},{chai::CPU, chai::GPU}, sizeof(std::remove_pointer_t), true)) - //: m_record(SharedPtrManager::getInstance()->makeSharedPtrRecord(h_p, d_p, sizeof(std::remove_pointer_t), true)) + impl(std::initializer_list ptrs, + std::initializer_list spaces, + Deleter 
d) + : m_record(SharedPtrManager::getInstance()-> + makeSharedPtrRecord(std::move(ptrs), + std::move(spaces), + sizeof(std::remove_pointer_t), + true)) , m_deleter(std::move(d)) {} + ~impl() { if (m_record) delete m_record; } Deleter& m_del() noexcept { return m_deleter; } msp_pointer_record* m_record; @@ -103,19 +113,33 @@ class msp_counted_deleter final : public msp_counted_base { }; public: - msp_counted_deleter(Ptr h_p, Ptr d_p, Deleter d) noexcept : m_impl(h_p, d_p, std::move(d)) {} + template + msp_counted_deleter(PtrList&& ptrs, + ExecSpaceList&& spaces, + //msp_counted_deleter(std::initializer_list ptrs, + // std::initializer_list spaces, + Deleter d) noexcept + : m_impl(std::forward(ptrs), + std::forward(spaces), + //: m_impl(std::move(ptrs), + // std::move(spaces), + std::move(d)) + {} + virtual void m_dispose() noexcept { + for (int space = CPU; space < NUM_EXECUTION_SPACES; ++space) { + Ptr ptr = (Ptr)m_impl.m_record->m_pointers[space]; + if (ptr) { + if (space == chai::CPU) m_impl.m_del()(ptr); #if defined(CHAI_GPUCC) - printf("Delete GPU Memory Here...\n"); - ::chai::impl::msp_dispose_on_device<<<1,1>>>((Ptr)m_impl.m_record->m_pointers[chai::GPU], m_impl.m_del()); - SharedPtrManager::getInstance()->free(m_impl.m_record, chai::GPU); + if (space == chai::GPU) ::chai::impl::msp_dispose_on_device<<<1,1>>>(ptr, m_impl.m_del()); #endif - - printf("Delete CPU Memory Here...\n"); - m_impl.m_del()((Ptr)m_impl.m_record->m_pointers[chai::CPU]); - SharedPtrManager::getInstance()->free(m_impl.m_record, chai::CPU); + SharedPtrManager::getInstance()->free(m_impl.m_record, ExecutionSpace(space)); + } + } } + virtual void m_destroy() noexcept { this->~msp_counted_deleter(); } virtual void moveInnerImpl() const { @@ -152,13 +176,15 @@ class msp_record_count { template explicit msp_record_count(Ptr h_p, Ptr d_p, Deleter d) - : m_pi( new msp_counted_deleter(h_p, d_p, d) ) {} + : m_pi( new msp_counted_deleter(std::initializer_list{h_p, d_p,}, 
std::initializer_list{chai::CPU, chai::GPU}, std::move(d)) ) {} CHAI_HOST_DEVICE ~msp_record_count() noexcept { #if !defined(CHAI_DEVICE_COMPILE) - if (m_pi) m_pi->m_release(); + if (m_pi) { + m_pi->m_release(); + } #endif // !defined(CHAI_DEVICE_COMPILE) } @@ -184,15 +210,13 @@ class msp_record_count { return *this; } - CHAI_HOST_DEVICE - msp_record_count& operator=(std::nullptr_t) { -#if !defined(CHAI_DEVICE_COMPILE) - std::cout << "msp_record_count = nullptr\n"; - if(m_pi) m_pi->m_release(); - //m_pi = nullptr; -#endif // !defined(CHAI_DEVICE_COMPILE) - return *this; - } +// CHAI_HOST_DEVICE +// msp_record_count& operator=(std::nullptr_t) { +//#if !defined(CHAI_DEVICE_COMPILE) +// if(m_pi) m_pi->m_release(); +//#endif // !defined(CHAI_DEVICE_COMPILE) +// return *this; +// } void m_swap(msp_record_count& rhs) noexcept { msp_counted_base* temp = rhs.m_pi; diff --git a/src/chai/SharedPtrManager.cpp b/src/chai/SharedPtrManager.cpp index f982ddb1..9b50c424 100644 --- a/src/chai/SharedPtrManager.cpp +++ b/src/chai/SharedPtrManager.cpp @@ -436,57 +436,6 @@ msp_pointer_record* SharedPtrManager::getPointerRecord(void* pointer) return record->second ? 
*record->second : &s_null_record; } -//msp_pointer_record* SharedPtrManager::makeSharedPtrRecord(void const* c_pointer, void const* c_d_pointer, -msp_pointer_record* SharedPtrManager::makeSharedPtrRecord(std::initializer_list pointers, - std::initializer_list spaces, - size_t size, - //ExecutionSpace space, - bool owned) -{ - int i = 0; - for (void const* c_ptr : pointers) { - void* ptr = const_cast(c_ptr); - - if (ptr == nullptr) return &s_null_record; - - m_resource_manager.registerAllocation(ptr, {ptr, size, m_allocators[spaces.begin()[i++]]->getAllocationStrategy()} ); - } - - void* lookup_pointer = const_cast(pointers.begin()[0]); - - auto pointer_record = getPointerRecord(lookup_pointer); - - if (pointer_record == &s_null_record) { - if (lookup_pointer) { - pointer_record = new msp_pointer_record(); - } else { - return pointer_record; - } - } - else { - CHAI_LOG(Warning, "SharedPtrManager::makeManaged found abandoned pointer record!!!"); - //callback(pointer_record, ACTION_FOUND_ABANDONED, space); - } - - i=0; - for (void const* c_ptr : pointers) { - void* ptr = const_cast(c_ptr); - chai::ExecutionSpace space = spaces.begin()[i]; - - pointer_record->m_pointers[space] = ptr; - pointer_record->m_owned[space] = owned; - registerPointer(pointer_record, space, owned); - - i++; - } - - for (int space = CPU; space < NUM_EXECUTION_SPACES; ++space) { - pointer_record->m_allocators[space] = getAllocatorId(ExecutionSpace(space)); - } - - return pointer_record; -} - msp_pointer_record* SharedPtrManager::makeSharedPtrRecord(void const* c_pointer, void const* c_d_pointer, size_t size, //ExecutionSpace space, diff --git a/src/chai/SharedPtrManager.hpp b/src/chai/SharedPtrManager.hpp index 6760d478..cf1317a2 100644 --- a/src/chai/SharedPtrManager.hpp +++ b/src/chai/SharedPtrManager.hpp @@ -264,10 +264,10 @@ class SharedPtrManager */ //CHAISHAREDDLL_API size_t getSize(void* pointer); - msp_pointer_record* makeSharedPtrRecord(std::initializer_list pointers, + template + 
msp_pointer_record* makeSharedPtrRecord(std::initializer_list pointers, std::initializer_list spaces, size_t size, - //ExecutionSpace space, bool owned); CHAISHAREDDLL_API msp_pointer_record* makeSharedPtrRecord(void const* c_pointer, void const* c_d_pointer, diff --git a/src/chai/SharedPtrManager.inl b/src/chai/SharedPtrManager.inl index bff622d9..5efd1ca0 100644 --- a/src/chai/SharedPtrManager.inl +++ b/src/chai/SharedPtrManager.inl @@ -79,6 +79,55 @@ namespace chai { // // return pointer_record->m_pointers[my_space]; //} +template +msp_pointer_record* SharedPtrManager::makeSharedPtrRecord(std::initializer_list pointers, + std::initializer_list spaces, + size_t size, + bool owned) +{ + int i = 0; + for (Ptr* ptr : pointers) { + if (ptr == nullptr) return &s_null_record; + m_resource_manager.registerAllocation(ptr, + {ptr, size, m_allocators[spaces.begin()[i++]]->getAllocationStrategy()} + ); + } + + Ptr* lookup_pointer = const_cast(pointers.begin()[0]); + + auto pointer_record = getPointerRecord(lookup_pointer); + + if (pointer_record == &s_null_record) { + if (lookup_pointer) { + pointer_record = new msp_pointer_record(); + } else { + return pointer_record; + } + } + else { + CHAI_LOG(Warning, "SharedPtrManager::makeManaged found abandoned pointer record!!!"); + //callback(pointer_record, ACTION_FOUND_ABANDONED, space); + } + + i=0; + for (void const* c_ptr : pointers) { + void* ptr = const_cast(c_ptr); + chai::ExecutionSpace space = spaces.begin()[i]; + + pointer_record->m_pointers[space] = ptr; + pointer_record->m_owned[space] = owned; + registerPointer(pointer_record, space, owned); + + i++; + } + + for (int space = CPU; space < NUM_EXECUTION_SPACES; ++space) { + pointer_record->m_allocators[space] = getAllocatorId(ExecutionSpace(space)); + } + + return pointer_record; +} + #if defined(CHAI_ENABLE_PICK) template diff --git a/tests/integration/managed_shared_ptr_tests.cpp b/tests/integration/managed_shared_ptr_tests.cpp index ed6cd254..7ddbbad8 100644 --- 
a/tests/integration/managed_shared_ptr_tests.cpp +++ b/tests/integration/managed_shared_ptr_tests.cpp @@ -184,9 +184,6 @@ GPU_TEST(managed_shared_ptr, shared_ptr_absmem) using DerivedT = BAbsMem; using BaseT = AAbsMem; - std::cout << "size of (DerivedT) : " << sizeof(DerivedT) << std::endl; - std::cout << "size of (BaseT) : " << sizeof(BaseT) << std::endl; - D d; chai::ManagedSharedPtr sptr = chai::make_shared(d); @@ -294,7 +291,6 @@ GPU_TEST(managed_shared_ptr, shared_ptr_nv) chai::ManagedSharedPtr sptr = chai::make_shared(); chai::ManagedSharedPtr sptr2 = sptr; - //chai::ManagedSharedPtr sptr2 = sptr; std::cout << "Map Sz : " << chai::SharedPtrManager::getInstance()->getPointerMap().size() << std::endl; @@ -329,10 +325,6 @@ GPU_TEST(managed_shared_ptr, shared_ptr_nv) GPU_TEST(managed_shared_ptr, shared_arr_shared_ptr_absmem) { { - - //using DerivedT = NV; - //using BaseT = NV; - using DerivedT = BAbsMem; using BaseT = AAbsMem; @@ -349,9 +341,7 @@ GPU_TEST(managed_shared_ptr, shared_arr_shared_ptr_absmem) D d; arr[0] = chai::make_shared(d); arr.registerTouch(chai::CPU); - //chai::ManagedSharedPtr sptr = chai::make_shared(d); - //arr[0] = sptr; - ////std::cout << "Use count : " << sptr.use_count() << std::endl; + std::cout << "GPU CALL...\n"; forall(gpu(), 0, 1, [=] __device__ (int i) { printf("GPU Body\n"); @@ -382,6 +372,7 @@ GPU_TEST(managed_shared_ptr, shared_arr_shared_ptr_absmem) std::cout << "Arr Map Sz : " << chai::ArrayManager::getInstance()->getPointerMap().size() << std::endl; arr.free(); + std::cout << "arr.free()\n"; } std::cout << "Sptr Map Sz : " << chai::SharedPtrManager::getInstance()->getPointerMap().size() << std::endl; From 1b4d1fa11b591ebe3f9bb74a35a07e5150153393 Mon Sep 17 00:00:00 2001 From: mdavis36 Date: Tue, 4 Jun 2024 17:36:28 -0700 Subject: [PATCH 16/44] Non GPU build compiling and passing basic tests... 
--- src/chai/ManagedArray.hpp | 1 - src/chai/ManagedSharedPtr.hpp | 55 +++++++++++-------- src/chai/SharedPtrCounter.hpp | 52 +++++++----------- .../integration/managed_shared_ptr_tests.cpp | 31 ++++++----- 4 files changed, 67 insertions(+), 72 deletions(-) diff --git a/src/chai/ManagedArray.hpp b/src/chai/ManagedArray.hpp index 2f4666d8..2498b2ad 100644 --- a/src/chai/ManagedArray.hpp +++ b/src/chai/ManagedArray.hpp @@ -444,7 +444,6 @@ class ManagedArray : public CHAICopyable CHAI_HOST bool initInner(size_t start = 0) { for (size_t i = start; i < m_size/sizeof(T); ++i) { - //m_active_base_pointer[i] = nullptr; new (&m_active_base_pointer[i]) T(); } return true; diff --git a/src/chai/ManagedSharedPtr.hpp b/src/chai/ManagedSharedPtr.hpp index 0820bedd..c53e2838 100644 --- a/src/chai/ManagedSharedPtr.hpp +++ b/src/chai/ManagedSharedPtr.hpp @@ -3,6 +3,8 @@ #include +#include "chai/config.hpp" + #include "chai/ArrayManager.hpp" #include "chai/ChaiMacros.hpp" #include "chai/ExecutionSpaces.hpp" @@ -63,8 +65,13 @@ class ManagedSharedPtr : public CHAICopyable{ //// *Default* Ctor with convertible type Yp -> Tp template> - ManagedSharedPtr(Yp* host_p, Yp* device_p, Deleter d) - : m_record_count(host_p, device_p, std::move(d)) + ManagedSharedPtr(std::initializer_list&& ptrs, + std::initializer_list&& spaces, + Deleter d) + : m_record_count(Yp{}, + std::forward>(ptrs), + std::forward>(spaces), + std::move(d)) , m_active_pointer(m_record_count.m_get_pointer(chai::CPU)) , m_resource_manager(SharedPtrManager::getInstance()) {} @@ -117,18 +124,12 @@ class ManagedSharedPtr : public CHAICopyable{ ManagedSharedPtr().swap(*this); } - //CHAI_HOST ManagedSharedPtr& operator=(std::nullptr_t) { - // reset(); - // return *this; - //} - CHAI_HOST_DEVICE void shallowCopy(ManagedSharedPtr const& rhs) { m_active_pointer = rhs.m_active_pointer; m_active_pointer=rhs.m_active_pointer; m_resource_manager=rhs.m_resource_manager; } - /* * Accessors @@ -137,7 +138,7 @@ class ManagedSharedPtr : 
public CHAICopyable{ const element_type* cget(ExecutionSpace space = chai::CPU) const noexcept { #if !defined(CHAI_DEVICE_COMPILE) if (m_active_pointer) { - move(CPU, false); + move(space, false); } #endif return m_active_pointer; @@ -146,7 +147,7 @@ class ManagedSharedPtr : public CHAICopyable{ element_type* get(ExecutionSpace space = chai::CPU) const noexcept { #if !defined(CHAI_DEVICE_COMPILE) if (m_active_pointer) { - move(CPU); + move(space); } #endif return m_active_pointer; @@ -163,7 +164,6 @@ class ManagedSharedPtr : public CHAICopyable{ //CHAI_HOST_DEVICE //element_type* m_get() const noexcept { return static_cast*>(this)->get(); } - public: long use_count() const noexcept { return m_record_count.m_get_use_count(); } @@ -176,7 +176,6 @@ class ManagedSharedPtr : public CHAICopyable{ void move(ExecutionSpace space, bool registerTouch=(!std::is_const::value || is_CHAICopyable::value)) const { ExecutionSpace prev_space = m_record_count.m_get_record()->m_last_space; - ExecutionSpace oldContext = m_resource_manager->getExecutionSpace(); if (prev_space != GPU && space == GPU) { /// Move nested ManagedArrays first, so they are working with a valid m_active_pointer for the host, // and so the meta data associated with them are updated before we move the other array down. @@ -236,6 +235,7 @@ class ManagedSharedPtr : public CHAICopyable{ namespace detail { +#if defined(CHAI_ENABLE_CUDA) namespace impl { template CHAI_INLINE @@ -297,10 +298,12 @@ ManagedSharedPtr make_shared(Args&&... 
args) { Tp* cpu_pointer = detail::msp_make_on_host(std::forward(args)...); std::cout << "CPU pointer @ " << cpu_pointer << std::endl; +#if defined(CHAI_ENABLE_CUDA) + Tp* gpu_pointer = detail::msp_make_on_device(); std::cout << "GPU pointer @ " << gpu_pointer << std::endl; cudaDeviceSynchronize(); - auto result = ManagedSharedPtr(cpu_pointer, gpu_pointer, + auto result = ManagedSharedPtr({cpu_pointer, gpu_pointer}, {CPU, GPU}, [] CHAI_HOST_DEVICE (Tp* p){p->~Tp();} ); @@ -311,21 +314,25 @@ ManagedSharedPtr make_shared(Args&&... args) { result.move(chai::CPU, false); } +#else // defined(CHAI_ENABLE_CUDA) + + auto result = ManagedSharedPtr({cpu_pointer}, {CPU}, + [] CHAI_HOST_DEVICE (Tp* p){p->~Tp();} + ); + +#endif // defined(CHAI_ENABLE_CUDA) + std::cout << "End of make_shared\n"; return result; } -template -CHAI_INLINE -CHAI_HOST -ManagedSharedPtr make_shared_deleter(Args... args, Deleter d) { - Tp* gpu_pointer = detail::msp_make_on_device(); - Tp* cpu_pointer = detail::msp_make_on_host(std::forward(args)...); - //Tp* gpu_pointer = detail::msp_make_on_device(std::forward(args)...); - std::cout << "CPU pointer @ " << cpu_pointer << std::endl; - std::cout << "GPU pointer @ " << gpu_pointer << std::endl; - return ManagedSharedPtr(cpu_pointer, gpu_pointer, std::move(d)); -} +//TODO: make_shared_deleter +//template +//CHAI_INLINE +//CHAI_HOST +//ManagedSharedPtr make_shared_deleter(Args... args, Deleter d) { +//..... 
+//} } // namespace chai diff --git a/src/chai/SharedPtrCounter.hpp b/src/chai/SharedPtrCounter.hpp index 16ff97b3..707dc84e 100644 --- a/src/chai/SharedPtrCounter.hpp +++ b/src/chai/SharedPtrCounter.hpp @@ -95,14 +95,11 @@ class msp_counted_deleter final : public msp_counted_base { class impl { public: - impl(std::initializer_list ptrs, - std::initializer_list spaces, - Deleter d) + template + impl(Ptrs&& ptrs, Spaces&& spaces, Deleter d) : m_record(SharedPtrManager::getInstance()-> - makeSharedPtrRecord(std::move(ptrs), - std::move(spaces), - sizeof(std::remove_pointer_t), - true)) + makeSharedPtrRecord(std::forward(ptrs), std::forward(spaces), + sizeof(std::remove_pointer_t), true)) , m_deleter(std::move(d)) {} ~impl() { if (m_record) delete m_record; } @@ -113,17 +110,9 @@ class msp_counted_deleter final : public msp_counted_base { }; public: - template - msp_counted_deleter(PtrList&& ptrs, - ExecSpaceList&& spaces, - //msp_counted_deleter(std::initializer_list ptrs, - // std::initializer_list spaces, - Deleter d) noexcept - : m_impl(std::forward(ptrs), - std::forward(spaces), - //: m_impl(std::move(ptrs), - // std::move(spaces), - std::move(d)) + template + msp_counted_deleter(Ptrs&& ptrs, Spaces&& spaces, Deleter d) noexcept + : m_impl(std::forward(ptrs), std::forward(spaces), std::move(d)) {} virtual void m_dispose() noexcept { @@ -170,13 +159,20 @@ class msp_record_count { CHAI_HOST_DEVICE constexpr msp_record_count() noexcept : m_pi(0) {} - template - explicit msp_record_count(Ptr h_p, Ptr d_p) - : m_pi( new msp_counted_ptr(h_p, d_p) ) {} + //template + //explicit msp_record_count(Ptr h_p, Ptr d_p) + //: m_pi( new msp_counted_ptr(h_p, d_p) ) {} + + //template + //explicit msp_record_count(Ptr h_p, Ptr d_p, Deleter d) + //: m_pi( new msp_counted_deleter(std::initializer_list{h_p, d_p,}, std::initializer_list{chai::CPU, chai::GPU}, std::move(d)) ) {} - template - explicit msp_record_count(Ptr h_p, Ptr d_p, Deleter d) - : m_pi( new 
msp_counted_deleter(std::initializer_list{h_p, d_p,}, std::initializer_list{chai::CPU, chai::GPU}, std::move(d)) ) {} + template + explicit msp_record_count(T, Ptrs&& ptrs, Spaces&& spaces, Deleter d) + : m_pi( new msp_counted_deleter( + std::forward(ptrs) + , std::forward(spaces) + , std::move(d)) ) {} CHAI_HOST_DEVICE ~msp_record_count() noexcept @@ -210,14 +206,6 @@ class msp_record_count { return *this; } -// CHAI_HOST_DEVICE -// msp_record_count& operator=(std::nullptr_t) { -//#if !defined(CHAI_DEVICE_COMPILE) -// if(m_pi) m_pi->m_release(); -//#endif // !defined(CHAI_DEVICE_COMPILE) -// return *this; -// } - void m_swap(msp_record_count& rhs) noexcept { msp_counted_base* temp = rhs.m_pi; rhs.m_pi = m_pi; diff --git a/tests/integration/managed_shared_ptr_tests.cpp b/tests/integration/managed_shared_ptr_tests.cpp index 7ddbbad8..9d546edc 100644 --- a/tests/integration/managed_shared_ptr_tests.cpp +++ b/tests/integration/managed_shared_ptr_tests.cpp @@ -30,7 +30,7 @@ #include #define BEGIN_EXEC_ON_DEVICE() \ - forall(gpu(), 0, 1, [=] __device__ (int i) { + forall(gpu(), 0, 1, [=] __device__ (int) { #define END_EXEC()\ }); \ @@ -75,7 +75,7 @@ void PrintMemory(const unsigned char* memory, M_PRINT_MEMORY(memory) \ #define GPU_PRINT_MEMORY(memory, label)\ - forall(gpu(), 0, 1, [=] __device__ (int i) { \ + forall(gpu(), 0, 1, [=] __device__ (int) { \ printf("DEVICE Memory "); printf(label); printf("\n"); \ M_PRINT_MEMORY(memory) \ }); @@ -197,7 +197,7 @@ GPU_TEST(managed_shared_ptr, shared_ptr_absmem) std::cout << "Map Sz : " << chai::SharedPtrManager::getInstance()->getPointerMap().size() << std::endl; std::cout << "GPU CALL...\n"; - forall(gpu(), 0, 1, [=] __device__ (int i) { + forall(gpu(), 0, 1, [=] __device__ (int) { printf("GPU Body\n"); sptr2->function(); sptr2->d_function(); @@ -206,7 +206,7 @@ GPU_TEST(managed_shared_ptr, shared_ptr_absmem) GPU_ERROR_CHECK( cudaDeviceSynchronize() ); std::cout << "CPU CALL...\n"; - forall(sequential(), 0, 1, [=] (int i) { + 
forall(sequential(), 0, 1, [=] (int) { printf("CPU Body\n"); sptr->set_content(0xFFFFFFFFFFFFFFFFull); sptr2->function(); @@ -214,7 +214,7 @@ GPU_TEST(managed_shared_ptr, shared_ptr_absmem) }); std::cout << "GPU CALL...\n"; - forall(gpu(), 0, 1, [=] __device__ (int i) { + forall(gpu(), 0, 1, [=] __device__ (int) { printf("GPU Body\n"); sptr2->function(); sptr2->d_function(); @@ -242,7 +242,7 @@ GPU_TEST(managed_shared_ptr, shared_ptr_const) std::cout << "Map Sz : " << chai::SharedPtrManager::getInstance()->getPointerMap().size() << std::endl; std::cout << "GPU CALL...\n"; - forall(gpu(), 0, 1, [=] __device__ (int i) { + forall(gpu(), 0, 1, [=] __device__ (int) { printf("GPU Body\n"); sptr2->function(); sptr2->d_function(); @@ -251,7 +251,7 @@ GPU_TEST(managed_shared_ptr, shared_ptr_const) GPU_ERROR_CHECK( cudaDeviceSynchronize() ); std::cout << "CPU CALL...\n"; - forall(sequential(), 0, 1, [=] (int i) { + forall(sequential(), 0, 1, [=] (int) { printf("CPU Body\n"); sptr->set_content(0xFFFFFFFFFFFFFFFFull); sptr2->function(); @@ -259,7 +259,7 @@ GPU_TEST(managed_shared_ptr, shared_ptr_const) }); std::cout << "GPU CALL...\n"; - forall(gpu(), 0, 1, [=] __device__ (int i) { + forall(gpu(), 0, 1, [=] __device__ (int) { printf("GPU Body\n"); sptr2->function(); sptr2->d_function(); @@ -295,7 +295,7 @@ GPU_TEST(managed_shared_ptr, shared_ptr_nv) std::cout << "Map Sz : " << chai::SharedPtrManager::getInstance()->getPointerMap().size() << std::endl; std::cout << "GPU CALL...\n"; - forall(gpu(), 0, 1, [=] __device__ (int i) { + forall(gpu(), 0, 1, [=] __device__ (int) { printf("GPU Body\n"); sptr2->function(); }); @@ -303,14 +303,14 @@ GPU_TEST(managed_shared_ptr, shared_ptr_nv) GPU_ERROR_CHECK( cudaDeviceSynchronize() ); std::cout << "CPU CALL...\n"; - forall(sequential(), 0, 1, [=] (int i) { + forall(sequential(), 0, 1, [=] (int) { printf("CPU Body\n"); //sptr->set_content(0xFFFFFFFFFFFFFFFFull); sptr2->function(); }); std::cout << "GPU CALL...\n"; - forall(gpu(), 0, 1, [=] 
__device__ (int i) { + forall(gpu(), 0, 1, [=] __device__ (int) { printf("GPU Body\n"); sptr2->function(); }); @@ -343,7 +343,7 @@ GPU_TEST(managed_shared_ptr, shared_arr_shared_ptr_absmem) arr.registerTouch(chai::CPU); std::cout << "GPU CALL...\n"; - forall(gpu(), 0, 1, [=] __device__ (int i) { + forall(gpu(), 0, 1, [=] __device__ (int) { printf("GPU Body\n"); arr[0]->function(); arr[0]->d_function(); @@ -352,7 +352,7 @@ GPU_TEST(managed_shared_ptr, shared_arr_shared_ptr_absmem) GPU_ERROR_CHECK( cudaDeviceSynchronize() ); std::cout << "CPU CALL...\n"; - forall(sequential(), 0, 1, [=] (int i) { + forall(sequential(), 0, 1, [=] (int) { printf("CPU Body\n"); arr[0]->set_content(0xFFFFFFFFFFFFFFFFull); arr[0]->function(); @@ -360,7 +360,7 @@ GPU_TEST(managed_shared_ptr, shared_arr_shared_ptr_absmem) }); std::cout << "GPU CALL...\n"; - forall(gpu(), 0, 1, [=] __device__ (int i) { + forall(gpu(), 0, 1, [=] __device__ (int) { printf("GPU Body\n"); arr[0]->function(); arr[0]->d_function(); @@ -371,9 +371,10 @@ GPU_TEST(managed_shared_ptr, shared_arr_shared_ptr_absmem) std::cout << "Sptr Map Sz : " << chai::SharedPtrManager::getInstance()->getPointerMap().size() << std::endl; std::cout << "Arr Map Sz : " << chai::ArrayManager::getInstance()->getPointerMap().size() << std::endl; + std::cout << "arr.free()\n"; arr.free(); + std::cout << "End of scope\n"; - std::cout << "arr.free()\n"; } std::cout << "Sptr Map Sz : " << chai::SharedPtrManager::getInstance()->getPointerMap().size() << std::endl; std::cout << "Arr Map Sz : " << chai::ArrayManager::getInstance()->getPointerMap().size() << std::endl; From d81d019d19f97aed3fae8af3c251406e3ec057e5 Mon Sep 17 00:00:00 2001 From: mdavis36 Date: Wed, 5 Jun 2024 08:51:53 -0700 Subject: [PATCH 17/44] empty map assertions for shared_ptr tests. 
--- .../integration/managed_shared_ptr_tests.cpp | 66 +++++++------------ 1 file changed, 22 insertions(+), 44 deletions(-) diff --git a/tests/integration/managed_shared_ptr_tests.cpp b/tests/integration/managed_shared_ptr_tests.cpp index 9d546edc..593a0c6c 100644 --- a/tests/integration/managed_shared_ptr_tests.cpp +++ b/tests/integration/managed_shared_ptr_tests.cpp @@ -50,35 +50,14 @@ inline void gpuErrorCheck(cudaError_t code, const char *file, int line, bool abo #define GPU_ERROR_CHECK(code) { gpuErrorCheck((code), __FILE__, __LINE__); } -void PrintMemory(const unsigned char* memory, - const char label[] = "contents") -{ - std::cout << "Memory " << label << ": \n"; - for (size_t i = 0; i < 4; i++) - { - for (size_t j = 0; j < 8; j++) - printf("%02X ", static_cast (memory[i * 8 + j])); - printf("\n"); - } -} -#define M_PRINT_MEMORY(memory) \ - for (size_t i = 0; i < 7; i++) \ - { \ - for (size_t j = 0; j < 8; j++) \ - printf("%02X ", static_cast (memory[i * 8 + j])); \ - printf("\n"); \ - } - -#define CPU_PRINT_MEMORY(memory, label)\ - printf("HOST Memory "); printf(label); printf("\n"); \ - M_PRINT_MEMORY(memory) \ - -#define GPU_PRINT_MEMORY(memory, label)\ - forall(gpu(), 0, 1, [=] __device__ (int) { \ - printf("DEVICE Memory "); printf(label); printf("\n"); \ - M_PRINT_MEMORY(memory) \ - }); +#ifdef CHAI_DISABLE_RM +#define assert_empty_array_map(IGNORED) +#define assert_empty_sptr_map(IGNORED) +#else +#define assert_empty_array_map(IGNORED) ASSERT_EQ(chai::ArrayManager::getInstance()->getPointerMap().size(),0) +#define assert_empty_sptr_map(IGNORED) ASSERT_EQ(chai::SharedPtrManager::getInstance()->getPointerMap().size(),0) +#endif class C : chai::CHAIPoly @@ -153,6 +132,16 @@ class AAbsMem : public chai::CHAICopyable , public chai::CHAIPoly CHAI_HOST_DEVICE virtual void set_content(unsigned long long) = 0; }; +class NV +{ +public: + unsigned long long content_NV; + CHAI_HOST_DEVICE NV(void) : content_NV(0xFFFFFFFFFFFFFFFFull) { printf("++ NV has been 
constructed\n"); } + CHAI_HOST_DEVICE ~NV(void) { printf("-- NV has been destructed\n"); } + CHAI_HOST_DEVICE void function(void) const { printf("%lX\n", content_NV); } +}; + + class BAbsMem : public AAbsMem { public: @@ -224,6 +213,7 @@ GPU_TEST(managed_shared_ptr, shared_ptr_absmem) } std::cout << "Map Sz : " << chai::SharedPtrManager::getInstance()->getPointerMap().size() << std::endl; + assert_empty_sptr_map(); } GPU_TEST(managed_shared_ptr, shared_ptr_const) @@ -268,25 +258,14 @@ GPU_TEST(managed_shared_ptr, shared_ptr_const) GPU_ERROR_CHECK( cudaDeviceSynchronize() ); } + assert_empty_sptr_map(); std::cout << "Map Sz : " << chai::SharedPtrManager::getInstance()->getPointerMap().size() << std::endl; } -class NV -{ -public: - unsigned long long content_NV; - CHAI_HOST_DEVICE NV(void) : content_NV(0xFFFFFFFFFFFFFFFFull) { printf("++ NV has been constructed\n"); } - CHAI_HOST_DEVICE ~NV(void) { printf("-- NV has been destructed\n"); } - CHAI_HOST_DEVICE void function(void) const { printf("%lX\n", content_NV); } -}; - GPU_TEST(managed_shared_ptr, shared_ptr_nv) { { using DerivedT = NV; - using BaseT = A; - - std::cout << "size of (DerivedT) : " << sizeof(DerivedT) << std::endl; chai::ManagedSharedPtr sptr = chai::make_shared(); @@ -305,7 +284,6 @@ GPU_TEST(managed_shared_ptr, shared_ptr_nv) std::cout << "CPU CALL...\n"; forall(sequential(), 0, 1, [=] (int) { printf("CPU Body\n"); - //sptr->set_content(0xFFFFFFFFFFFFFFFFull); sptr2->function(); }); @@ -318,6 +296,7 @@ GPU_TEST(managed_shared_ptr, shared_ptr_nv) GPU_ERROR_CHECK( cudaDeviceSynchronize() ); } + assert_empty_sptr_map(); std::cout << "Map Sz : " << chai::SharedPtrManager::getInstance()->getPointerMap().size() << std::endl; } @@ -331,9 +310,6 @@ GPU_TEST(managed_shared_ptr, shared_arr_shared_ptr_absmem) using ElemT = chai::ManagedSharedPtr; using Container = chai::ManagedArray; - std::cout << "size of (DerivedT) : " << sizeof(DerivedT) << std::endl; - std::cout << "size of (BaseT) : " << sizeof(BaseT) << 
std::endl; - std::cout << "Sptr Map Sz : " << chai::SharedPtrManager::getInstance()->getPointerMap().size() << std::endl; std::cout << "Arr Map Sz : " << chai::ArrayManager::getInstance()->getPointerMap().size() << std::endl; @@ -374,9 +350,11 @@ GPU_TEST(managed_shared_ptr, shared_arr_shared_ptr_absmem) std::cout << "arr.free()\n"; arr.free(); std::cout << "End of scope\n"; + assert_empty_array_map(); } std::cout << "Sptr Map Sz : " << chai::SharedPtrManager::getInstance()->getPointerMap().size() << std::endl; std::cout << "Arr Map Sz : " << chai::ArrayManager::getInstance()->getPointerMap().size() << std::endl; + assert_empty_sptr_map(); } From 32f81bef30b4e291f50ab5988589e54cd612da02 Mon Sep 17 00:00:00 2001 From: mdavis36 Date: Wed, 5 Jun 2024 10:14:43 -0700 Subject: [PATCH 18/44] ManagedArray Size updates from pointer record on host call. --- src/chai/ManagedArray.inl | 3 + tests/integration/managed_array_tests.cpp | 87 ++++++++++++++++++++++- 2 files changed, 87 insertions(+), 3 deletions(-) diff --git a/src/chai/ManagedArray.inl b/src/chai/ManagedArray.inl index 6a035384..78a23290 100644 --- a/src/chai/ManagedArray.inl +++ b/src/chai/ManagedArray.inl @@ -287,6 +287,9 @@ CHAI_HOST void ManagedArray::reset() template CHAI_INLINE CHAI_HOST_DEVICE size_t ManagedArray::size() const { +#if !defined(CHAI_DEVICE_COMPILE) + if (!m_is_slice) m_size = m_pointer_record->m_size; +#endif return m_size/sizeof(T); } diff --git a/tests/integration/managed_array_tests.cpp b/tests/integration/managed_array_tests.cpp index 28f4e11b..6dc59ea4 100644 --- a/tests/integration/managed_array_tests.cpp +++ b/tests/integration/managed_array_tests.cpp @@ -730,7 +730,48 @@ TEST(ManagedArray, ReallocateCPU) assert_empty_map(true); } +TEST(ManagedArray, ReallocateCopyCPU) +{ + chai::ManagedArray array(10); + auto array_copy = array; + ASSERT_EQ(array.size(), 10u); + ASSERT_EQ(array_copy.size(), 10u); + + forall(sequential(), 0, 10, [=](int i) { + array[i] = i; + ASSERT_EQ(&array[i], 
&array_copy[i]); + }); + + array.reallocate(20); + + // This will be incorrect, a call to move, data or copy needs to + // be executed in order to update the internal active pointer of + // the copied object in order to use operator[] after a reallocation. + ASSERT_NE(&array[0], &array_copy[0]); + + // This would work but for the sake of the test we will check + // operator[] is correct after lambda capture. + //ASSERT_EQ(array.data(), array_copy.data()); + + ASSERT_EQ(array.size(), 20u); + ASSERT_EQ(array_copy.size(), 20u); + + forall(sequential(), 0, 20, [=](int i) { + ASSERT_EQ(&array[i], &array_copy[i]); + if (i < 10) { + ASSERT_EQ(array[i], i); + } else { + array_copy[i] = i; + ASSERT_EQ(array[i], i); + } + }); + + array_copy.free(); + assert_empty_map(true); +} + #if defined(CHAI_ENABLE_CUDA) || defined(CHAI_ENABLE_HIP) + GPU_TEST(ManagedArray, ReallocateGPU) { chai::ManagedArray array(10); @@ -741,12 +782,52 @@ GPU_TEST(ManagedArray, ReallocateGPU) array.reallocate(20); ASSERT_EQ(array.size(), 20u); - forall(sequential(), 0, 20, [=](int i) { + forall(gpu(), 0, 20, [=]__device__(int i) { if (i < 10) { - ASSERT_EQ(array[i], i); + device_assert(array[i] == i); } else { array[i] = i; - ASSERT_EQ(array[i], i); + device_assert(array[i] == i); } }); + + array.free(); + assert_empty_map(true); +} + +GPU_TEST(ManagedArray, ReallocateCopyGPU) +{ + chai::ManagedArray array(10); + auto array_copy = array; + ASSERT_EQ(array.size(), 10u); + ASSERT_EQ(array_copy.size(), 10u); + + forall(gpu(), 0, 10, [=] __device__(int i) { + array[i] = i; + device_assert(&array[i] == &array_copy[i]); + }); + + array.reallocate(20); + ASSERT_EQ(array.size(), 20u); + + // This will be incorrect, a call to move, data or copy needs to + // be executed in order to update the internal active pointer of + // the copied object in order to use operator[] after a reallocation. 
+ ASSERT_NE(&array[0], &array_copy[0]); + + // This would work but for the sake of the test we will check + // operator[] is correct after lambda capture. + //ASSERT_EQ(array.data(), array_copy.data()); + + + forall(gpu(), 0, 20, [=]__device__(int i) { + device_assert(array.size() == array_copy.size()); + device_assert(&array[i] == &array_copy[i]); + if (i < 10) { + device_assert(array[i] == i); + } else { + array[i] = i; + device_assert(array[i] == i); } }); From d97b556d07dc0aabc9eae8e4249293c0ecfc1f5d Mon Sep 17 00:00:00 2001 From: mdavis36 Date: Wed, 5 Jun 2024 12:37:13 -0700 Subject: [PATCH 19/44] Turn off copy ctor debug output. --- src/chai/ManagedSharedPtr.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/chai/ManagedSharedPtr.hpp b/src/chai/ManagedSharedPtr.hpp index c53e2838..1c37342c 100644 --- a/src/chai/ManagedSharedPtr.hpp +++ b/src/chai/ManagedSharedPtr.hpp @@ -86,7 +86,7 @@ class ManagedSharedPtr : public CHAICopyable{ , m_resource_manager(rhs.m_resource_manager) { #if !defined(CHAI_DEVICE_COMPILE) - std::cout << "ManagedSharedPtr Copy Ctor: m_active_pointer @ " << m_active_pointer << std::endl; + //std::cout << "ManagedSharedPtr Copy Ctor: m_active_pointer @ " << m_active_pointer << std::endl; if (m_active_pointer) move(ArrayManager::getInstance()->getExecutionSpace()); // TODO: Use a generic interface for RAJA queries. //if (m_active_pointer) move(m_resource_manager->getExecutionSpace()); #endif From c9d4bd0541a8d3ff36db642e11b406d7d7c89369 Mon Sep 17 00:00:00 2001 From: mdavis36 Date: Tue, 11 Jun 2024 10:44:28 -0700 Subject: [PATCH 20/44] Guarding GPU related work that throws an error when built without GPU support. 
--- src/chai/ManagedArray.inl | 2 ++ src/chai/SharedPtrManager.inl | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/chai/ManagedArray.inl b/src/chai/ManagedArray.inl index 78a23290..5909ac23 100644 --- a/src/chai/ManagedArray.inl +++ b/src/chai/ManagedArray.inl @@ -233,12 +233,14 @@ CHAI_HOST void ManagedArray::reallocate(size_t elems) // trigger a moveInnerImpl, which expects inner values to be initialized. if (initInner(old_size/sizeof(T))) { // if we are active on the GPU, we need to send any newly initialized inner members to the device +#if defined(CHAI_ENABLE_CUDA) || defined(CHAI_ENABLE_HIP) if (m_pointer_record->m_last_space == GPU && old_size < m_size) { umpire::ResourceManager & umpire_rm = umpire::ResourceManager::getInstance(); void *src = (void *)(((char *)(m_pointer_record->m_pointers[CPU])) + old_size); void *dst = (void *)(((char *)(m_pointer_record->m_pointers[GPU])) + old_size); umpire_rm.copy(dst,src,m_size-old_size); } +#endif } CHAI_LOG(Debug, "m_active_ptr reallocated at address: " << m_active_pointer); diff --git a/src/chai/SharedPtrManager.inl b/src/chai/SharedPtrManager.inl index 5efd1ca0..66436522 100644 --- a/src/chai/SharedPtrManager.inl +++ b/src/chai/SharedPtrManager.inl @@ -16,7 +16,7 @@ #include "umpire/ResourceManager.hpp" -#include +//#include //#if defined(CHAI_ENABLE_UM) //#if !defined(CHAI_THIN_GPU_ALLOCATE) From eb72574644ebd69aeb1f5a8267e33a26fdb04b66 Mon Sep 17 00:00:00 2001 From: mdavis36 Date: Wed, 26 Jun 2024 12:44:43 -0700 Subject: [PATCH 21/44] Guarding tests when not built w/ CUDA/HIP --- tests/integration/CMakeLists.txt | 2 + tests/integration/managed_array_tests.cpp | 6 +- tests/integration/managed_ptr_tests.cpp | 109 ---------------------- 3 files changed, 5 insertions(+), 112 deletions(-) diff --git a/tests/integration/CMakeLists.txt b/tests/integration/CMakeLists.txt index c6d11008..bde1c821 100644 --- a/tests/integration/CMakeLists.txt +++ b/tests/integration/CMakeLists.txt @@ -12,6 +12,7 @@ 
blt_list_append(TO chai_integration_test_depends ELEMENTS blt::hip IF ${CHAI_ENA blt_list_append(TO chai_integration_test_depends ELEMENTS openmp IF ${CHAI_ENABLE_OPENMP}) # ManagedArray tests +if (CHAI_ENABLE_CUDA OR CHAI_ENABLE_HIP) blt_add_executable( NAME managed_array_tests SOURCES managed_array_tests.cpp @@ -37,6 +38,7 @@ target_include_directories( blt_add_test( NAME managed_shared_ptr_test COMMAND managed_shared_ptr_tests) +endif() if (CHAI_ENABLE_MANAGED_PTR) blt_add_executable( diff --git a/tests/integration/managed_array_tests.cpp b/tests/integration/managed_array_tests.cpp index 6dc59ea4..116fe744 100644 --- a/tests/integration/managed_array_tests.cpp +++ b/tests/integration/managed_array_tests.cpp @@ -733,7 +733,7 @@ TEST(ManagedArray, ReallocateCPU) TEST(ManagedArray, ReallocateCopyCPU) { chai::ManagedArray array(10); - auto array_copy = array; + chai::ManagedArray array_copy = array; ASSERT_EQ(array.size(), 10u); ASSERT_EQ(array_copy.size(), 10u); @@ -804,7 +804,7 @@ GPU_TEST(ManagedArray, ReallocateCopyGPU) forall(gpu(), 0, 10, [=] __device__(int i) { array[i] = i; - device_assert(&array[i] == &array_copy[i]); + device_assert(array.data()[i] == array_copy.data()[i]); }); array.reallocate(20); @@ -822,7 +822,7 @@ GPU_TEST(ManagedArray, ReallocateCopyGPU) forall(gpu(), 0, 20, [=]__device__(int i) { device_assert(array.size() == array_copy.size()); - device_assert(&array[i] == &array_copy[i]); + device_assert(array.data()[i] == array_copy.data()[i]); if (i < 10) { device_assert(array[i] == i); } else { diff --git a/tests/integration/managed_ptr_tests.cpp b/tests/integration/managed_ptr_tests.cpp index b0fb7481..570a3fef 100644 --- a/tests/integration/managed_ptr_tests.cpp +++ b/tests/integration/managed_ptr_tests.cpp @@ -167,115 +167,6 @@ class MultipleRawArrayClass { #define assert_empty_map(IGNORED) ASSERT_EQ(chai::SharedPtrManager::getInstance()->getPointerMap().size(),0) -inline void gpuErrorCheck(cudaError_t code, const char *file, int line, bool 
abort=true) -{ - if (code != cudaSuccess) { - fprintf(stderr, "[CHAI] GPU Error: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) { - exit(code); - } - } -} -#define GPU_ERROR_CHECK(code) { gpuErrorCheck((code), __FILE__, __LINE__); } - -GPU_TEST(managed_ptr, shared_ptralloc) -{ - - { - - chai::SharedPtrManager* sptr_manager = chai::SharedPtrManager::getInstance(); - umpire::ResourceManager& res_manager = umpire::ResourceManager::getInstance(); - - auto cpu_allocator = sptr_manager->getAllocator(chai::UM); - TestBase* cpu_ptr = static_cast( cpu_allocator.allocate(1*sizeof(TestDerived)) ); - - new(cpu_ptr) TestDerived(); - - std::cout << "check\n"; - - - TestBase* gpu_ptr = chai::msp_make_on_device(); - - forall(gpu(), 0, 1, [=] __device__ (int i) { - printf("GPU Body\n"); - gpu_ptr->doSomething(); - printf("Mem val : %d\n", gpu_ptr->getMemberValue()); - }); - - std::cout << "Ump alloc cpu : " << cpu_ptr << std::endl; - std::cout << "Ump alloc gpu : " << gpu_ptr << std::endl; - - cpu_ptr->setMemberValue(5); - - unsigned int offset = sizeof(void*); - GPU_ERROR_CHECK(cudaMemcpy((char*)gpu_ptr+offset, (char*)cpu_ptr+offset, sizeof(TestDerived)-offset, cudaMemcpyHostToDevice)); - //GPU_ERROR_CHECK(cudaMemcpy((char*)gpu_ptr, (char*)cpu_ptr, sizeof(TestDerived), cudaMemcpyHostToDevice)); - - forall(gpu(), 0, 1, [=] __device__ (int i) { - printf("GPU Body\n"); - gpu_ptr->doSomething(); - printf("Mem val : %d\n", gpu_ptr->getMemberValue()); - }); - GPU_ERROR_CHECK( cudaPeekAtLastError() ); - GPU_ERROR_CHECK( cudaDeviceSynchronize() ); - - } - //assert_empty_map(); -} - -GPU_TEST(managed_ptr, shared_ptr) -{ - - { - //chai::ManagedSharedPtr sptr(new TestDerived(), - // [](TestDerived*p){ printf("Deleter Call\n"); p->~TestDerived(); }); - //chai::ManagedSharedPtr sptr = chai::ManagedSharedPtr(new TestDerived(), - // [](TestDerived*p){ printf("Custom Deleter Call\n"); delete p; }); - //chai::ManagedSharedPtr sptr(new TestDerived()); - - //chai::ManagedSharedPtr 
sptr = chai::make_shared(); - - chai::ManagedSharedPtr sptr = chai::make_shared_deleter( - [](TestDerived* p){ printf("Custom Deleter Call\n"); p->~TestDerived(); }); - - std::cout << "use_count : " << sptr.use_count() << std::endl; - - std::cout << "Map Sz : " << chai::SharedPtrManager::getInstance()->getPointerMap().size() << std::endl; - chai::ManagedSharedPtr sptr2 = sptr; - //sptr2->doSomething(); - std::cout << "use_count : " << sptr.use_count() << std::endl; - - std::cout << "GPU CALL...\n"; - forall(gpu(), 0, 3, [=] __device__ (int i) { - printf("GPU Body\n"); - sptr->doSomething(); - //results[i] = rawArrayClass->getValue(i); - }); - - } - //assert_empty_map(); - - -} - -TEST(managed_ptr, class_with_raw_array) -{ - const int expectedValue = rand(); - - chai::ManagedArray array(1, chai::CPU); - - forall(sequential(), 0, 1, [=] (int i) { - array[i] = expectedValue; - }); - - auto rawArrayClass = chai::make_managed(chai::unpack(array)); - - ASSERT_EQ(rawArrayClass->getValue(0), expectedValue); - - rawArrayClass.free(); - array.free(); -} - TEST(managed_ptr, class_with_multiple_raw_arrays) { const int expectedValue1 = rand(); From be02184ac5058513bb7af383fb9eabe86646cfdf Mon Sep 17 00:00:00 2001 From: mdavis36 Date: Tue, 9 Jul 2024 09:56:25 -0700 Subject: [PATCH 22/44] Squash warnings when not building w/ GPU support. 
--- src/chai/SharedPtrManager.cpp | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/chai/SharedPtrManager.cpp b/src/chai/SharedPtrManager.cpp index 9b50c424..06ff7978 100644 --- a/src/chai/SharedPtrManager.cpp +++ b/src/chai/SharedPtrManager.cpp @@ -117,8 +117,8 @@ void SharedPtrManager::registerPointer( // if umpire already knows about this pointer, we want to make sure its records and ours // are consistent if (m_resource_manager.hasAllocator(pointer)) { - umpire::util::AllocationRecord *allocation_record = const_cast(m_resource_manager.findAllocationRecord(pointer)); - //allocation_record->size = record->m_size; + // umpire::util::AllocationRecord *allocation_record = const_cast(m_resource_manager.findAllocationRecord(pointer)); + // //allocation_record->size = record->m_size; } // register with umpire if it's not there so that umpire can perform data migrations else { @@ -436,13 +436,16 @@ msp_pointer_record* SharedPtrManager::getPointerRecord(void* pointer) return record->second ? *record->second : &s_null_record; } +// TODO: Need a better way of dealing with non-cuda builds here... 
msp_pointer_record* SharedPtrManager::makeSharedPtrRecord(void const* c_pointer, void const* c_d_pointer, size_t size, //ExecutionSpace space, bool owned) { void* pointer = const_cast(c_pointer); +#if defined(CHAI_ENABLE_CUDA) || defined(CHAI_ENABLE_HIP) || defined(CHAI_ENABLE_GPU_SIMULATION_MODE) void* d_pointer = const_cast(c_d_pointer); +#endif if (pointer == nullptr) { return &s_null_record ; @@ -457,9 +460,11 @@ msp_pointer_record* SharedPtrManager::makeSharedPtrRecord(void const* c_pointer, {pointer, size, m_allocators[chai::CPU]->getAllocationStrategy()}); //std::cout << "m_allocators[chai::CPU] : " << m_allocators[chai::CPU]->getName() << std::endl; +#if defined(CHAI_ENABLE_CUDA) || defined(CHAI_ENABLE_HIP) || defined(CHAI_ENABLE_GPU_SIMULATION_MODE) m_resource_manager.registerAllocation( d_pointer, {d_pointer, size, m_allocators[chai::GPU]->getAllocationStrategy()}); +#endif //std::cout << "m_allocators[chai::GPU] : " << m_allocators[chai::GPU]->getName() << std::endl; auto pointer_record = getPointerRecord(pointer); @@ -478,8 +483,10 @@ msp_pointer_record* SharedPtrManager::makeSharedPtrRecord(void const* c_pointer, pointer_record->m_pointers[chai::CPU] = pointer; pointer_record->m_owned[chai::CPU] = owned; +#if defined(CHAI_ENABLE_CUDA) || defined(CHAI_ENABLE_HIP) || defined(CHAI_ENABLE_GPU_SIMULATION_MODE) pointer_record->m_pointers[chai::GPU] = d_pointer; pointer_record->m_owned[chai::GPU] = owned; +#endif //pointer_record->m_size = size; //pointer_record->m_user_callback = [] (const msp_pointer_record*, Action, ExecutionSpace) {}; @@ -489,7 +496,9 @@ msp_pointer_record* SharedPtrManager::makeSharedPtrRecord(void const* c_pointer, if (pointer) { registerPointer(pointer_record, chai::CPU, owned); +#if defined(CHAI_ENABLE_CUDA) || defined(CHAI_ENABLE_HIP) || defined(CHAI_ENABLE_GPU_SIMULATION_MODE) registerPointer(pointer_record, chai::GPU, owned); +#endif } return pointer_record; From 79cb51f8347c05b38ec2f0feaaa640b46bd1c440 Mon Sep 17 00:00:00 2001 
From: mdavis36 Date: Tue, 9 Jul 2024 09:57:07 -0700 Subject: [PATCH 23/44] Squash warnings when building Werror or pedantic. --- src/chai/RajaExecutionSpacePlugin.cpp | 2 +- tests/integration/raja-chai-launch.cpp | 8 ++++---- tests/integration/raja-chai-nested.cpp | 8 ++++---- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/chai/RajaExecutionSpacePlugin.cpp b/src/chai/RajaExecutionSpacePlugin.cpp index 42db1e66..d399753d 100644 --- a/src/chai/RajaExecutionSpacePlugin.cpp +++ b/src/chai/RajaExecutionSpacePlugin.cpp @@ -46,7 +46,7 @@ RajaExecutionSpacePlugin::postCapture(const RAJA::util::PluginContext&) } } -RAJA_INSTANTIATE_REGISTRY(RAJA::util::PluginRegistry); +RAJA_INSTANTIATE_REGISTRY(RAJA::util::PluginRegistry) // this is needed to link a dynamic lib as RAJA does not provide an exported definition of this symbol. #if defined(_WIN32) && !defined(CHAISTATICLIB) diff --git a/tests/integration/raja-chai-launch.cpp b/tests/integration/raja-chai-launch.cpp index 9ece1e75..27f5d1f9 100644 --- a/tests/integration/raja-chai-launch.cpp +++ b/tests/integration/raja-chai-launch.cpp @@ -292,10 +292,10 @@ CUDA_TEST(Chai, LaunchMultiView) // /////////////////////////////////////////////////////////////////////////// -RAJA_INDEX_VALUE_T(IM, int, "IM"); -RAJA_INDEX_VALUE_T(ID, int, "ID"); -RAJA_INDEX_VALUE_T(IG, int, "IG"); -RAJA_INDEX_VALUE_T(IZ, int, "IZ"); +RAJA_INDEX_VALUE_T(IM, int, "IM") +RAJA_INDEX_VALUE_T(ID, int, "ID") +RAJA_INDEX_VALUE_T(IG, int, "IG") +RAJA_INDEX_VALUE_T(IZ, int, "IZ") void runLTimesTests(Index_type num_moments, Index_type num_directions, diff --git a/tests/integration/raja-chai-nested.cpp b/tests/integration/raja-chai-nested.cpp index edffd064..30c615dc 100644 --- a/tests/integration/raja-chai-nested.cpp +++ b/tests/integration/raja-chai-nested.cpp @@ -258,10 +258,10 @@ CUDA_TEST(Chai, NestedMultiView) // /////////////////////////////////////////////////////////////////////////// -RAJA_INDEX_VALUE_T(IM, int, "IM"); 
-RAJA_INDEX_VALUE_T(ID, int, "ID"); -RAJA_INDEX_VALUE_T(IG, int, "IG"); -RAJA_INDEX_VALUE_T(IZ, int, "IZ"); +RAJA_INDEX_VALUE_T(IM, int, "IM") +RAJA_INDEX_VALUE_T(ID, int, "ID") +RAJA_INDEX_VALUE_T(IG, int, "IG") +RAJA_INDEX_VALUE_T(IZ, int, "IZ") void runLTimesTests(Index_type num_moments, Index_type num_directions, From 2a53f53777a9fa98f0260a435a2283f05eb01feb Mon Sep 17 00:00:00 2001 From: mdavis36 Date: Mon, 9 Sep 2024 16:40:07 -0700 Subject: [PATCH 24/44] Revert changes to managed_ptr_tests --- tests/integration/managed_ptr_tests.cpp | 65 +++++++++---------------- 1 file changed, 22 insertions(+), 43 deletions(-) diff --git a/tests/integration/managed_ptr_tests.cpp b/tests/integration/managed_ptr_tests.cpp index 570a3fef..579dbe35 100644 --- a/tests/integration/managed_ptr_tests.cpp +++ b/tests/integration/managed_ptr_tests.cpp @@ -4,11 +4,7 @@ // // SPDX-License-Identifier: BSD-3-Clause ////////////////////////////////////////////////////////////////////////////// -#include "chai/ChaiMacros.hpp" -#include "chai/ManagedSharedPtr.hpp" -#include "chai/SharedPtrManager.hpp" #include "gtest/gtest.h" -#include "umpire/ResourceManager.hpp" #define GPU_TEST(X, Y) \ static void gpu_test_##X##Y(); \ @@ -19,7 +15,6 @@ #include "chai/ArrayManager.hpp" #include "chai/ManagedArray.hpp" #include "chai/managed_ptr.hpp" -#include "chai/ManagedSharedPtr.hpp" #include "../src/util/forall.hpp" @@ -76,31 +71,22 @@ class RawPointerClass { class TestBase { public: - CHAI_HOST_DEVICE TestBase() {printf("TestBase Ctor\n");} - CHAI_HOST_DEVICE virtual ~TestBase() {printf("TestBase Dtor\n");} + CHAI_HOST_DEVICE TestBase() {} + CHAI_HOST_DEVICE virtual ~TestBase() {} CHAI_HOST_DEVICE virtual int getValue(const int i) const = 0; - CHAI_HOST_DEVICE virtual int getMemberValue() const = 0; - CHAI_HOST_DEVICE virtual void setMemberValue(int v) = 0; - CHAI_HOST_DEVICE virtual void doSomething() const = 0; }; class TestDerived : public TestBase { public: - CHAI_HOST_DEVICE TestDerived() : 
TestBase(), m_values(nullptr) {printf("TestDerived Ctor\n");} + CHAI_HOST_DEVICE TestDerived() : TestBase(), m_values(nullptr) {} CHAI_HOST_DEVICE TestDerived(chai::ManagedArray values) : TestBase(), m_values(values) {} - CHAI_HOST_DEVICE virtual ~TestDerived() {printf("TestDerived Dtor\n");} + CHAI_HOST_DEVICE virtual ~TestDerived() {} CHAI_HOST_DEVICE virtual int getValue(const int i) const { return m_values[i]; } - CHAI_HOST_DEVICE virtual int getMemberValue() const {return m_member;} - - CHAI_HOST_DEVICE void setMemberValue(int v) { m_member = v; } - - CHAI_HOST_DEVICE virtual void doSomething() const {printf("TestDerived doSomething()\n");} private: chai::ManagedArray m_values; - int m_member = -1; }; class TestInnerBase { @@ -165,7 +151,23 @@ class MultipleRawArrayClass { int* m_values2; }; -#define assert_empty_map(IGNORED) ASSERT_EQ(chai::SharedPtrManager::getInstance()->getPointerMap().size(),0) +TEST(managed_ptr, class_with_raw_array) +{ + const int expectedValue = rand(); + + chai::ManagedArray array(1, chai::CPU); + + forall(sequential(), 0, 1, [=] (int i) { + array[i] = expectedValue; + }); + + auto rawArrayClass = chai::make_managed(chai::unpack(array)); + + ASSERT_EQ(rawArrayClass->getValue(0), expectedValue); + + rawArrayClass.free(); + array.free(); +} TEST(managed_ptr, class_with_multiple_raw_arrays) { @@ -521,7 +523,6 @@ GPU_TEST(managed_ptr, gpu_class_with_raw_array_and_callback) GPU_TEST(managed_ptr, gpu_class_with_managed_array) { const int expectedValue = rand(); - const int expectedMemberValue = rand(); chai::ManagedArray array(1, chai::CPU); @@ -530,38 +531,16 @@ GPU_TEST(managed_ptr, gpu_class_with_managed_array) }); chai::managed_ptr derived = chai::make_managed(array); - derived->setMemberValue(expectedMemberValue); - - derived.set_callback([=] (chai::Action action, chai::ExecutionSpace space, void*) mutable { - if (action == chai::ACTION_MOVE) { - //printf("trigger move : "); - //if (space == chai::NONE) printf("NONE\n"); - //if (space 
== chai::CPU) printf("CPU\n"); - //if (space == chai::GPU) printf("GPU\n"); - auto temp = array; // Trigger copy constructor in order to move inner ManagedArray to correct memory space - (void) temp; // Get rid of unused variable warnings - return true; - } - else if (action == chai::ACTION_FREE && space == chai::NONE) { - array.free(); // If TestDerived does not take ownership of the ManagedArray, you can use the callback to clean it up - return true; - } - else { - return false; - } - }); - chai::ManagedArray results(2, chai::GPU); + chai::ManagedArray results(1, chai::GPU); forall(gpu(), 0, 1, [=] __device__ (int i) { results[i] = derived->getValue(i); - results[1] = derived->getMemberValue(); }); results.move(chai::CPU); ASSERT_EQ(results[0], expectedValue); - ASSERT_EQ(results[1], expectedMemberValue); results.free(); derived.free(); From eb235813b5474fb649702f05ad4415855fdb4856 Mon Sep 17 00:00:00 2001 From: mdavis36 Date: Thu, 21 Nov 2024 13:01:10 -0800 Subject: [PATCH 25/44] RajaExec plugin construction needs to be static. --- src/chai/RajaExecutionSpacePlugin.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/chai/RajaExecutionSpacePlugin.cpp b/src/chai/RajaExecutionSpacePlugin.cpp index d399753d..04a34fa4 100644 --- a/src/chai/RajaExecutionSpacePlugin.cpp +++ b/src/chai/RajaExecutionSpacePlugin.cpp @@ -64,7 +64,7 @@ PluginStrategy::PluginStrategy() = default; #endif // Register plugin with RAJA -RAJA::util::PluginRegistry::add P( +static RAJA::util::PluginRegistry::add P( "RajaExecutionSpacePlugin", "Plugin to set CHAI execution space based on RAJA execution platform"); From 68de020714f5b9c14c292ff7d60f6d3df7a6e488 Mon Sep 17 00:00:00 2001 From: mdavis36 Date: Wed, 11 Dec 2024 16:39:52 -0800 Subject: [PATCH 26/44] HIP agnostic test macros. 
--- tests/integration/managed_array_tests.cpp | 6 ++ .../integration/managed_shared_ptr_tests.cpp | 61 ++++++++++--------- 2 files changed, 38 insertions(+), 29 deletions(-) diff --git a/tests/integration/managed_array_tests.cpp b/tests/integration/managed_array_tests.cpp index 116fe744..72dfcc74 100644 --- a/tests/integration/managed_array_tests.cpp +++ b/tests/integration/managed_array_tests.cpp @@ -11,7 +11,13 @@ static void gpu_test_##X##Y() #ifdef NDEBUG + +#ifdef CHAI_ENABLE_CUDA #define device_assert(EXP) if( !EXP ) asm ("trap;") +#else +#define device_assert(EXP) if( !EXP ) asm ("s_trap 1;") +#endif + #else #define device_assert(EXP) assert(EXP) #endif diff --git a/tests/integration/managed_shared_ptr_tests.cpp b/tests/integration/managed_shared_ptr_tests.cpp index 593a0c6c..c9f8953d 100644 --- a/tests/integration/managed_shared_ptr_tests.cpp +++ b/tests/integration/managed_shared_ptr_tests.cpp @@ -29,15 +29,7 @@ // Standard library headers #include -#define BEGIN_EXEC_ON_DEVICE() \ - forall(gpu(), 0, 1, [=] __device__ (int) { - -#define END_EXEC()\ - }); \ - GPU_ERROR_CHECK( cudaPeekAtLastError() );\ - GPU_ERROR_CHECK( cudaDeviceSynchronize() );\ - - +#if defined(CHAI_ENABLE_CUDA) inline void gpuErrorCheck(cudaError_t code, const char *file, int line, bool abort=true) { if (code != cudaSuccess) { @@ -47,8 +39,19 @@ inline void gpuErrorCheck(cudaError_t code, const char *file, int line, bool abo } } } - -#define GPU_ERROR_CHECK(code) { gpuErrorCheck((code), __FILE__, __LINE__); } +#define GPU_ERROR_CHECK(code) { gpuErrorCheck((cuda##code), __FILE__, __LINE__); } +#elif defined(CHAI_ENABLE_HIP) +inline void gpuErrorCheck(hipError_t code, const char *file, int line, bool abort=true) +{ + if (code != hipSuccess) { + fprintf(stderr, "[CHAI] GPU Error: %s %s %d\n", hipGetErrorString(code), file, line); + if (abort) { + exit(code); + } + } +} +#define GPU_ERROR_CHECK(code) { gpuErrorCheck((hip##code), __FILE__, __LINE__); } +#endif #ifdef CHAI_DISABLE_RM @@ 
-176,8 +179,8 @@ GPU_TEST(managed_shared_ptr, shared_ptr_absmem) D d; chai::ManagedSharedPtr sptr = chai::make_shared(d); - GPU_ERROR_CHECK( cudaPeekAtLastError() ); - GPU_ERROR_CHECK( cudaDeviceSynchronize() ); + GPU_ERROR_CHECK( PeekAtLastError() ); + GPU_ERROR_CHECK( DeviceSynchronize() ); chai::ManagedSharedPtr sptr2 = sptr; sptr2->function(); @@ -191,8 +194,8 @@ GPU_TEST(managed_shared_ptr, shared_ptr_absmem) sptr2->function(); sptr2->d_function(); }); - GPU_ERROR_CHECK( cudaPeekAtLastError() ); - GPU_ERROR_CHECK( cudaDeviceSynchronize() ); + GPU_ERROR_CHECK( PeekAtLastError() ); + GPU_ERROR_CHECK( DeviceSynchronize() ); std::cout << "CPU CALL...\n"; forall(sequential(), 0, 1, [=] (int) { @@ -208,8 +211,8 @@ GPU_TEST(managed_shared_ptr, shared_ptr_absmem) sptr2->function(); sptr2->d_function(); }); - GPU_ERROR_CHECK( cudaPeekAtLastError() ); - GPU_ERROR_CHECK( cudaDeviceSynchronize() ); + GPU_ERROR_CHECK( PeekAtLastError() ); + GPU_ERROR_CHECK( DeviceSynchronize() ); } std::cout << "Map Sz : " << chai::SharedPtrManager::getInstance()->getPointerMap().size() << std::endl; @@ -237,8 +240,8 @@ GPU_TEST(managed_shared_ptr, shared_ptr_const) sptr2->function(); sptr2->d_function(); }); - GPU_ERROR_CHECK( cudaPeekAtLastError() ); - GPU_ERROR_CHECK( cudaDeviceSynchronize() ); + GPU_ERROR_CHECK( PeekAtLastError() ); + GPU_ERROR_CHECK( DeviceSynchronize() ); std::cout << "CPU CALL...\n"; forall(sequential(), 0, 1, [=] (int) { @@ -254,8 +257,8 @@ GPU_TEST(managed_shared_ptr, shared_ptr_const) sptr2->function(); sptr2->d_function(); }); - GPU_ERROR_CHECK( cudaPeekAtLastError() ); - GPU_ERROR_CHECK( cudaDeviceSynchronize() ); + GPU_ERROR_CHECK( PeekAtLastError() ); + GPU_ERROR_CHECK( DeviceSynchronize() ); } assert_empty_sptr_map(); @@ -278,8 +281,8 @@ GPU_TEST(managed_shared_ptr, shared_ptr_nv) printf("GPU Body\n"); sptr2->function(); }); - GPU_ERROR_CHECK( cudaPeekAtLastError() ); - GPU_ERROR_CHECK( cudaDeviceSynchronize() ); + GPU_ERROR_CHECK( PeekAtLastError() ); + 
GPU_ERROR_CHECK( DeviceSynchronize() ); std::cout << "CPU CALL...\n"; forall(sequential(), 0, 1, [=] (int) { @@ -292,8 +295,8 @@ GPU_TEST(managed_shared_ptr, shared_ptr_nv) printf("GPU Body\n"); sptr2->function(); }); - GPU_ERROR_CHECK( cudaPeekAtLastError() ); - GPU_ERROR_CHECK( cudaDeviceSynchronize() ); + GPU_ERROR_CHECK( PeekAtLastError() ); + GPU_ERROR_CHECK( DeviceSynchronize() ); } assert_empty_sptr_map(); @@ -324,8 +327,8 @@ GPU_TEST(managed_shared_ptr, shared_arr_shared_ptr_absmem) arr[0]->function(); arr[0]->d_function(); }); - GPU_ERROR_CHECK( cudaPeekAtLastError() ); - GPU_ERROR_CHECK( cudaDeviceSynchronize() ); + GPU_ERROR_CHECK( PeekAtLastError() ); + GPU_ERROR_CHECK( DeviceSynchronize() ); std::cout << "CPU CALL...\n"; forall(sequential(), 0, 1, [=] (int) { @@ -341,8 +344,8 @@ GPU_TEST(managed_shared_ptr, shared_arr_shared_ptr_absmem) arr[0]->function(); arr[0]->d_function(); }); - GPU_ERROR_CHECK( cudaPeekAtLastError() ); - GPU_ERROR_CHECK( cudaDeviceSynchronize() ); + GPU_ERROR_CHECK( PeekAtLastError() ); + GPU_ERROR_CHECK( DeviceSynchronize() ); std::cout << "Sptr Map Sz : " << chai::SharedPtrManager::getInstance()->getPointerMap().size() << std::endl; std::cout << "Arr Map Sz : " << chai::ArrayManager::getInstance()->getPointerMap().size() << std::endl; From 9bb1281f5ceb8b39ff4c98d83c1fd80c5d814fcb Mon Sep 17 00:00:00 2001 From: mdavis36 Date: Mon, 16 Dec 2024 17:52:26 -0800 Subject: [PATCH 27/44] HIP support for ManagedSharedPtr --- src/chai/ManagedSharedPtr.hpp | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/chai/ManagedSharedPtr.hpp b/src/chai/ManagedSharedPtr.hpp index 1c37342c..dacc57e2 100644 --- a/src/chai/ManagedSharedPtr.hpp +++ b/src/chai/ManagedSharedPtr.hpp @@ -235,7 +235,7 @@ class ManagedSharedPtr : public CHAICopyable{ namespace detail { -#if defined(CHAI_ENABLE_CUDA) +#if defined(CHAI_ENABLE_CUDA) or defined(CHAI_ENABLE_HIP) namespace impl { template CHAI_INLINE @@ -298,10 +298,16 @@ 
ManagedSharedPtr make_shared(Args&&... args) { Tp* cpu_pointer = detail::msp_make_on_host(std::forward(args)...); std::cout << "CPU pointer @ " << cpu_pointer << std::endl; -#if defined(CHAI_ENABLE_CUDA) +#if defined(CHAI_ENABLE_CUDA) or defined(CHAI_ENABLE_HIP) Tp* gpu_pointer = detail::msp_make_on_device(); - std::cout << "GPU pointer @ " << gpu_pointer << std::endl; cudaDeviceSynchronize(); + std::cout << "GPU pointer @ " << gpu_pointer << std::endl; +#if defined(CHAI_ENABLE_CUDA) + cudaDeviceSynchronize(); +#endif +#if defined(CHAI_ENABLE_HIP) + hipDeviceSynchronize(); +#endif auto result = ManagedSharedPtr({cpu_pointer, gpu_pointer}, {CPU, GPU}, [] CHAI_HOST_DEVICE (Tp* p){p->~Tp();} @@ -314,13 +320,13 @@ ManagedSharedPtr make_shared(Args&&... args) { result.move(chai::CPU, false); } -#else // defined(CHAI_ENABLE_CUDA) +#else // defined(CHAI_ENABLE_CUDA) or defined(CHAI_ENABLE_HIP) auto result = ManagedSharedPtr({cpu_pointer}, {CPU}, [] CHAI_HOST_DEVICE (Tp* p){p->~Tp();} ); -#endif // defined(CHAI_ENABLE_CUDA) +#endif // defined(CHAI_ENABLE_CUDA) or defined(CHAI_ENABLE_HIP) std::cout << "End of make_shared\n"; return result; From 6dcbce4cfaf30f0e84aa91bd0400eba04b02fcfe Mon Sep 17 00:00:00 2001 From: mdavis36 Date: Mon, 3 Feb 2025 14:13:53 -0800 Subject: [PATCH 28/44] Pointing submodules to the same as develop. 
--- blt | 2 +- scripts/radiuss-spack-configs | 2 +- scripts/uberenv | 2 +- src/tpl/raja | 2 +- src/tpl/umpire | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/blt b/blt index 058b312f..9ff77344 160000 --- a/blt +++ b/blt @@ -1 +1 @@ -Subproject commit 058b312f8a5ef305e12a4380deaa13d618eff54e +Subproject commit 9ff77344f0b2a6ee345e452bddd6bfd46cbbfa35 diff --git a/scripts/radiuss-spack-configs b/scripts/radiuss-spack-configs index 8938041f..00c06c2d 160000 --- a/scripts/radiuss-spack-configs +++ b/scripts/radiuss-spack-configs @@ -1 +1 @@ -Subproject commit 8938041fb20dde5e55ae2014aa71333076d139c9 +Subproject commit 00c06c2d0258802fbf4a57ff987314d4acd9f629 diff --git a/scripts/uberenv b/scripts/uberenv index 0d00dc8e..205672b8 160000 --- a/scripts/uberenv +++ b/scripts/uberenv @@ -1 +1 @@ -Subproject commit 0d00dc8e19a889ba07ae433590b87533c4b5b3da +Subproject commit 205672b8b2520d7dc69acefe8738960cd5db0937 diff --git a/src/tpl/raja b/src/tpl/raja index 82d1b926..4d7fcba5 160000 --- a/src/tpl/raja +++ b/src/tpl/raja @@ -1 +1 @@ -Subproject commit 82d1b926ada0fbb15a4a6e0adadc30c715cfda7b +Subproject commit 4d7fcba55ebc7cb972b7cc9f6778b48e43792ea1 diff --git a/src/tpl/umpire b/src/tpl/umpire index 974ef8c1..abd729f4 160000 --- a/src/tpl/umpire +++ b/src/tpl/umpire @@ -1 +1 @@ -Subproject commit 974ef8c18f2728e75005696f6ef27dacce491b88 +Subproject commit abd729f40064175e999a83d11d6b073dac4c01d2 From c88613a90d516001622f018ab0769e3278bbdb8c Mon Sep 17 00:00:00 2001 From: mdavis36 Date: Mon, 3 Feb 2025 14:18:50 -0800 Subject: [PATCH 29/44] raja-chai-launch @ develop --- tests/integration/raja-chai-launch.cpp | 8 ++++---- tests/integration/raja-chai-nested.cpp | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/integration/raja-chai-launch.cpp b/tests/integration/raja-chai-launch.cpp index bf919341..2033a0e8 100644 --- a/tests/integration/raja-chai-launch.cpp +++ b/tests/integration/raja-chai-launch.cpp @@ -292,10 +292,10 @@ 
CUDA_TEST(Chai, LaunchMultiView) // /////////////////////////////////////////////////////////////////////////// -RAJA_INDEX_VALUE_T(IM, int, "IM") -RAJA_INDEX_VALUE_T(ID, int, "ID") -RAJA_INDEX_VALUE_T(IG, int, "IG") -RAJA_INDEX_VALUE_T(IZ, int, "IZ") +RAJA_INDEX_VALUE_T(IM, int, "IM"); +RAJA_INDEX_VALUE_T(ID, int, "ID"); +RAJA_INDEX_VALUE_T(IG, int, "IG"); +RAJA_INDEX_VALUE_T(IZ, int, "IZ"); void runLTimesTests(Index_type num_moments, Index_type num_directions, diff --git a/tests/integration/raja-chai-nested.cpp b/tests/integration/raja-chai-nested.cpp index 30c615dc..edffd064 100644 --- a/tests/integration/raja-chai-nested.cpp +++ b/tests/integration/raja-chai-nested.cpp @@ -258,10 +258,10 @@ CUDA_TEST(Chai, NestedMultiView) // /////////////////////////////////////////////////////////////////////////// -RAJA_INDEX_VALUE_T(IM, int, "IM") -RAJA_INDEX_VALUE_T(ID, int, "ID") -RAJA_INDEX_VALUE_T(IG, int, "IG") -RAJA_INDEX_VALUE_T(IZ, int, "IZ") +RAJA_INDEX_VALUE_T(IM, int, "IM"); +RAJA_INDEX_VALUE_T(ID, int, "ID"); +RAJA_INDEX_VALUE_T(IG, int, "IG"); +RAJA_INDEX_VALUE_T(IZ, int, "IZ"); void runLTimesTests(Index_type num_moments, Index_type num_directions, From 338db552e3b6b786e65740d565511acd3baa3850 Mon Sep 17 00:00:00 2001 From: mdavis36 Date: Mon, 3 Feb 2025 14:33:06 -0800 Subject: [PATCH 30/44] More updates w/ develop. --- src/chai/RajaExecutionSpacePlugin.cpp | 1 + src/chai/managed_ptr.hpp | 3 --- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/chai/RajaExecutionSpacePlugin.cpp b/src/chai/RajaExecutionSpacePlugin.cpp index 04a34fa4..9b463c6d 100644 --- a/src/chai/RajaExecutionSpacePlugin.cpp +++ b/src/chai/RajaExecutionSpacePlugin.cpp @@ -47,6 +47,7 @@ RajaExecutionSpacePlugin::postCapture(const RAJA::util::PluginContext&) } RAJA_INSTANTIATE_REGISTRY(RAJA::util::PluginRegistry) +RAJA_INSTANTIATE_REGISTRY(RAJA::util::PluginRegistry); // this is needed to link a dynamic lib as RAJA does not provide an exported definition of this symbol. 
#if defined(_WIN32) && !defined(CHAISTATICLIB) diff --git a/src/chai/managed_ptr.hpp b/src/chai/managed_ptr.hpp index 600e63ff..b5f4d7de 100644 --- a/src/chai/managed_ptr.hpp +++ b/src/chai/managed_ptr.hpp @@ -801,8 +801,6 @@ namespace chai { CHAI_GLOBAL void make_on_device(T** gpuPointer, Args... args) { *gpuPointer = new T(processArguments(args)...); - printf("On GPU @ : %p\n", gpuPointer); - printf("On GPU @ : %p\n", &gpuPointer); } /// @@ -901,7 +899,6 @@ namespace chai { arrayManager->setExecutionSpace(CPU); #endif - printf("On Host\n"); // Create on the host T* cpuPointer = new T(detail::processArguments(args)...); From 5ab9ac3d10722e77c85daee0d3e1e6af906649f5 Mon Sep 17 00:00:00 2001 From: mdavis36 Date: Mon, 3 Feb 2025 16:40:11 -0800 Subject: [PATCH 31/44] Fixing clang + cuda build on blueos. --- src/chai/RajaExecutionSpacePlugin.cpp | 1 - tests/integration/managed_array_tests.cpp | 18 ++++++++---------- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/src/chai/RajaExecutionSpacePlugin.cpp b/src/chai/RajaExecutionSpacePlugin.cpp index 9b463c6d..069477af 100644 --- a/src/chai/RajaExecutionSpacePlugin.cpp +++ b/src/chai/RajaExecutionSpacePlugin.cpp @@ -46,7 +46,6 @@ RajaExecutionSpacePlugin::postCapture(const RAJA::util::PluginContext&) } } -RAJA_INSTANTIATE_REGISTRY(RAJA::util::PluginRegistry) RAJA_INSTANTIATE_REGISTRY(RAJA::util::PluginRegistry); // this is needed to link a dynamic lib as RAJA does not provide an exported definition of this symbol. 
diff --git a/tests/integration/managed_array_tests.cpp b/tests/integration/managed_array_tests.cpp index ffa53980..5ec636e5 100644 --- a/tests/integration/managed_array_tests.cpp +++ b/tests/integration/managed_array_tests.cpp @@ -4,6 +4,14 @@ // // SPDX-License-Identifier: BSD-3-Clause ////////////////////////////////////////////////////////////////////////////// +#include "chai/config.hpp" + +#include "../src/util/forall.hpp" + +#include "chai/ManagedArray.hpp" + +#include "umpire/ResourceManager.hpp" + #include "gtest/gtest.h" #define GPU_TEST(X, Y) \ static void gpu_test_##X##Y(); \ @@ -28,16 +36,6 @@ #define assert_empty_map(IGNORED) ASSERT_EQ(chai::ArrayManager::getInstance()->getPointerMap().size(),0) #endif - -#include "chai/config.hpp" - -#include "../src/util/forall.hpp" - -#include "chai/ManagedArray.hpp" - -#include "umpire/ResourceManager.hpp" - - struct my_point { double x; double y; From abf60116bb795a03c79d02eb6a37aa78b44a4955 Mon Sep 17 00:00:00 2001 From: mdavis36 Date: Tue, 4 Feb 2025 10:27:12 -0800 Subject: [PATCH 32/44] Better comments for initInner & freeInner; Remove redudant PR inst. --- src/chai/ManagedArray.hpp | 8 ++++---- src/chai/PointerRecord.hpp | 2 -- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/chai/ManagedArray.hpp b/src/chai/ManagedArray.hpp index 509de434..48479d41 100644 --- a/src/chai/ManagedArray.hpp +++ b/src/chai/ManagedArray.hpp @@ -366,7 +366,7 @@ class ManagedArray : public CHAICopyable // shenanigan reasons need to be defined here. #if !defined(CHAI_DISABLE_RM) // if T is a CHAICopyable, then it is important to initialize all the - // ManagedArrays to nullptr at allocation, since it is extremely easy to + // elements with default constructors, since it is extremely easy to // trigger a moveInnerImpl, which expects inner values to be initialized. 
template ::value, typename std::enable_if::type = 0> @@ -385,9 +385,9 @@ class ManagedArray : public CHAICopyable { return false; } - // if T is a CHAICopyable, then it is important to initialize all the - // ManagedArrays to nullptr at allocation, since it is extremely easy to - // trigger a moveInnerImpl, which expects inner values to be initialized. + + // if T is a CHAICopyable, then it is important to free all the + // CHAICopyable containers, which expect inner values to be initialized. template ::value, typename std::enable_if::type = 0> CHAI_HOST bool freeInner(size_t start = 0) diff --git a/src/chai/PointerRecord.hpp b/src/chai/PointerRecord.hpp index ef28a185..e46ea899 100644 --- a/src/chai/PointerRecord.hpp +++ b/src/chai/PointerRecord.hpp @@ -71,8 +71,6 @@ struct PointerRecord { } }; -struct MyPointerRecord final : public PointerRecord {}; - } // end of namespace chai #endif // CHAI_PointerRecord_HPP From 6706d353a68c429ce5016813602e302809272fb1 Mon Sep 17 00:00:00 2001 From: mdavis36 Date: Tue, 4 Feb 2025 11:10:33 -0800 Subject: [PATCH 33/44] Pulling out common device helper functions and generalizing some device macros. 
--- src/chai/ArrayManager.hpp | 114 +------------------------------- src/chai/ChaiMacros.hpp | 14 ++++ src/chai/ChaiManager.hpp | 28 ++++++++ src/chai/SharedPtrManager.hpp | 4 +- src/chai/util/DeviceHelpers.hpp | 88 ++++++++++++++++++++++++ 5 files changed, 133 insertions(+), 115 deletions(-) create mode 100644 src/chai/ChaiManager.hpp create mode 100644 src/chai/util/DeviceHelpers.hpp diff --git a/src/chai/ArrayManager.hpp b/src/chai/ArrayManager.hpp index 9041ee71..4065a537 100644 --- a/src/chai/ArrayManager.hpp +++ b/src/chai/ArrayManager.hpp @@ -7,122 +7,10 @@ #ifndef CHAI_ArrayManager_HPP #define CHAI_ArrayManager_HPP -#include "chai/config.hpp" -#include "chai/ChaiMacros.hpp" -#include "chai/ExecutionSpaces.hpp" -#include "chai/PointerRecord.hpp" -#include "chai/Types.hpp" - -#if defined(CHAI_ENABLE_RAJA_PLUGIN) -#include "chai/pluginLinker.hpp" -#endif - -#include - -#include "umpire/Allocator.hpp" -#include "umpire/util/MemoryMap.hpp" - -#if defined(CHAI_ENABLE_CUDA) -#include -#endif -#if defined(CHAI_ENABLE_HIP) -#include "hip/hip_runtime_api.h" -#endif +#include "chai/ChaiManager.hpp" namespace chai { -// CHAI_GPU_ERROR_CHECK macro -#if defined(CHAI_ENABLE_CUDA) || defined(CHAI_ENABLE_HIP) - -#ifdef CHAI_ENABLE_GPU_ERROR_CHECKING - -#ifdef CHAI_ENABLE_CUDA -inline void gpuErrorCheck(cudaError_t code, const char *file, int line, bool abort=true) -{ - if (code != cudaSuccess) { - fprintf(stderr, "[CHAI] GPU Error: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) { - exit(code); - } - } -} -#elif defined(CHAI_ENABLE_HIP) -inline void gpuErrorCheck(hipError_t code, const char *file, int line, bool abort=true) -{ - if (code != hipSuccess) { - fprintf(stderr, "[CHAI] GPU Error: %s %s %d\n", hipGetErrorString(code), file, line); - if (abort) { - exit(code); - } - } -} -#endif - - -#define CHAI_GPU_ERROR_CHECK(code) { gpuErrorCheck((code), __FILE__, __LINE__); } -#else // CHAI_ENABLE_GPU_ERROR_CHECKING -#define CHAI_GPU_ERROR_CHECK(code) code 
-#endif // CHAI_ENABLE_GPU_ERROR_CHECKING - -#endif - -// wrapper for hip/cuda synchronize -inline void synchronize() { -#if defined (CHAI_ENABLE_HIP) &&!defined(__HIP_DEVICE_COMPILE__) - CHAI_GPU_ERROR_CHECK(hipDeviceSynchronize()); -#elif defined (CHAI_ENABLE_CUDA) &&!defined(__CUDA_ARCH__) - CHAI_GPU_ERROR_CHECK(cudaDeviceSynchronize()); -#endif -} - -#if defined(CHAI_GPUCC) || defined(CHAI_ENABLE_GPU_SIMULATION_MODE) - -// wrapper for hip/cuda free -CHAI_HOST inline void gpuFree(void* buffer) { -#if defined(CHAI_ENABLE_GPU_SIMULATION_MODE) - free(buffer); -#elif defined (CHAI_ENABLE_HIP) - CHAI_GPU_ERROR_CHECK(hipFree(buffer)); -#elif defined (CHAI_ENABLE_CUDA) - CHAI_GPU_ERROR_CHECK(cudaFree(buffer)); -#endif -} - -// wrapper for hip/cuda malloc -CHAI_HOST inline void gpuMalloc(void** devPtr, size_t size) { -#if defined(CHAI_ENABLE_GPU_SIMULATION_MODE) - *devPtr = (void*)malloc(size); -#elif defined (CHAI_ENABLE_HIP) - CHAI_GPU_ERROR_CHECK(hipMalloc(devPtr, size)); -#elif defined (CHAI_ENABLE_CUDA) - CHAI_GPU_ERROR_CHECK(cudaMalloc(devPtr, size)); -#endif -} - -// wrapper for hip/cuda managed malloc -CHAI_HOST inline void gpuMallocManaged(void** devPtr, size_t size) { -#if defined(CHAI_ENABLE_GPU_SIMULATION_MODE) - *devPtr = (void*)malloc(size); -#elif defined (CHAI_ENABLE_HIP) - CHAI_GPU_ERROR_CHECK(hipMallocManaged(devPtr, size)); -#elif defined (CHAI_ENABLE_CUDA) - CHAI_GPU_ERROR_CHECK(cudaMallocManaged(devPtr, size)); -#endif -} - -// wrapper for hip/cuda mem copy -CHAI_HOST inline void gpuMemcpy(void* dst, const void* src, size_t count, gpuMemcpyKind kind) { -#if defined(CHAI_ENABLE_GPU_SIMULATION_MODE) - memcpy(dst, src, count); -#elif defined (CHAI_ENABLE_HIP) - CHAI_GPU_ERROR_CHECK(hipMemcpy(dst, src, count, kind)); -#elif defined (CHAI_ENABLE_CUDA) - CHAI_GPU_ERROR_CHECK(cudaMemcpy(dst, src, count, kind)); -#endif -} - -#endif //#if defined(CHAI_GPUCC) - /*! * \brief Singleton that manages caching and movement of ManagedArray objects. 
* diff --git a/src/chai/ChaiMacros.hpp b/src/chai/ChaiMacros.hpp index 0747040c..e388b96b 100644 --- a/src/chai/ChaiMacros.hpp +++ b/src/chai/ChaiMacros.hpp @@ -11,6 +11,10 @@ #include "umpire/util/Macros.hpp" +#if defined(CHAI_ENABLE_CUDA) || defined(CHAI_ENABLE_HIP) +#define CHAI_ENABLE_DEVICE +#endif + #if defined(CHAI_ENABLE_CUDA) #include @@ -27,6 +31,11 @@ #define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyDefault cudaMemcpyDefault +#define gpuSuccess cudaSuccess +#define gpuError_t cudaError_t +#define gpuGetErrorString cudaGetErrorString +#define gpuDeviceSynchronize cudaDeviceSynchronize + // NOTE: Cannot have if defined(__HIPCC__) in the condition below, since __HIPCC__ comes from the included header hip_runtime below. #elif defined(CHAI_ENABLE_HIP) @@ -44,6 +53,11 @@ #define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyDefault hipMemcpyDefault +#define gpuSuccess hipSuccess +#define gpuError_t hipError_t +#define gpuGetErrorString hipGetErrorString +#define gpuDeviceSynchronize hipDeviceSynchronize + #else #define CHAI_HOST diff --git a/src/chai/ChaiManager.hpp b/src/chai/ChaiManager.hpp new file mode 100644 index 00000000..2675604f --- /dev/null +++ b/src/chai/ChaiManager.hpp @@ -0,0 +1,28 @@ +////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC and CHAI +// project contributors. See the CHAI LICENSE file for details. 
+// +// SPDX-License-Identifier: BSD-3-Clause +////////////////////////////////////////////////////////////////////////////// +#ifndef CHAI_ChaiManager_HPP +#define CHAI_ChaiManager_HPP + +#include "chai/ChaiMacros.hpp" +#include "chai/ExecutionSpaces.hpp" +#include "chai/Types.hpp" + +#include "chai/PointerRecord.hpp" + +#if defined(CHAI_ENABLE_RAJA_PLUGIN) +#include "chai/pluginLinker.hpp" +#endif + +#include + +#include "umpire/Allocator.hpp" +#include "umpire/util/MemoryMap.hpp" + + +#include "chai/util/DeviceHelpers.hpp" + +#endif // CHAI_ChaiManager_HPP diff --git a/src/chai/SharedPtrManager.hpp b/src/chai/SharedPtrManager.hpp index cf1317a2..87d915de 100644 --- a/src/chai/SharedPtrManager.hpp +++ b/src/chai/SharedPtrManager.hpp @@ -10,10 +10,10 @@ #include "chai/config.hpp" #include "chai/ChaiMacros.hpp" #include "chai/ExecutionSpaces.hpp" -//#include "chai/PointerRecord.hpp" + #include "chai/SharedPointerRecord.hpp" #include "chai/Types.hpp" -#include "chai/ArrayManager.hpp" +#include "chai/ChaiManager.hpp" #if defined(CHAI_ENABLE_RAJA_PLUGIN) #include "chai/pluginLinker.hpp" diff --git a/src/chai/util/DeviceHelpers.hpp b/src/chai/util/DeviceHelpers.hpp new file mode 100644 index 00000000..6bbbe0d1 --- /dev/null +++ b/src/chai/util/DeviceHelpers.hpp @@ -0,0 +1,88 @@ +#ifndef CHAI_DEVICE_HELPERS_HPP +#define CHAI_DEVICE_HELPERS_HPP + +#include "chai/config.hpp" +#include "chai/ChaiMacros.hpp" + +namespace chai +{ +// CHAI_GPU_ERROR_CHECK macro +#ifdef CHAI_ENABLE_DEVICE + +#ifdef CHAI_ENABLE_GPU_ERROR_CHECKING + +inline void gpuErrorCheck(gpuError_t code, const char *file, int line, bool abort=true) +{ + if (code != gpuSuccess) { + fprintf(stderr, "[CHAI] GPU Error: %s %s %d\n", gpuGetErrorString(code), file, line); + if (abort) { + exit(code); + } + } +} + +#define CHAI_GPU_ERROR_CHECK(code) { gpuErrorCheck((code), __FILE__, __LINE__); } +#else // CHAI_ENABLE_GPU_ERROR_CHECKING +#define CHAI_GPU_ERROR_CHECK(code) code +#endif // 
CHAI_ENABLE_GPU_ERROR_CHECKING + +#endif + +// wrapper for hip/cuda synchronize +inline void synchronize() { +#if defined(CHAI_ENABLE_DEVICE) && !defined(CHAI_DEVICE_COMPILE) + CHAI_GPU_ERROR_CHECK(gpuDeviceSynchronize()); +#endif +} + +#if defined(CHAI_GPUCC) || defined(CHAI_ENABLE_GPU_SIMULATION_MODE) + +// wrapper for hip/cuda free +CHAI_HOST inline void gpuFree(void* buffer) { +#if defined(CHAI_ENABLE_GPU_SIMULATION_MODE) + free(buffer); +#elif defined (CHAI_ENABLE_HIP) + CHAI_GPU_ERROR_CHECK(hipFree(buffer)); +#elif defined (CHAI_ENABLE_CUDA) + CHAI_GPU_ERROR_CHECK(cudaFree(buffer)); +#endif +} + +// wrapper for hip/cuda malloc +CHAI_HOST inline void gpuMalloc(void** devPtr, size_t size) { +#if defined(CHAI_ENABLE_GPU_SIMULATION_MODE) + *devPtr = (void*)malloc(size); +#elif defined (CHAI_ENABLE_HIP) + CHAI_GPU_ERROR_CHECK(hipMalloc(devPtr, size)); +#elif defined (CHAI_ENABLE_CUDA) + CHAI_GPU_ERROR_CHECK(cudaMalloc(devPtr, size)); +#endif +} + +// wrapper for hip/cuda managed malloc +CHAI_HOST inline void gpuMallocManaged(void** devPtr, size_t size) { +#if defined(CHAI_ENABLE_GPU_SIMULATION_MODE) + *devPtr = (void*)malloc(size); +#elif defined (CHAI_ENABLE_HIP) + CHAI_GPU_ERROR_CHECK(hipMallocManaged(devPtr, size)); +#elif defined (CHAI_ENABLE_CUDA) + CHAI_GPU_ERROR_CHECK(cudaMallocManaged(devPtr, size)); +#endif +} + +// wrapper for hip/cuda mem copy +CHAI_HOST inline void gpuMemcpy(void* dst, const void* src, size_t count, gpuMemcpyKind kind) { +#if defined(CHAI_ENABLE_GPU_SIMULATION_MODE) + memcpy(dst, src, count); +#elif defined (CHAI_ENABLE_HIP) + CHAI_GPU_ERROR_CHECK(hipMemcpy(dst, src, count, kind)); +#elif defined (CHAI_ENABLE_CUDA) + CHAI_GPU_ERROR_CHECK(cudaMemcpy(dst, src, count, kind)); +#endif +} + +#endif //#if defined(CHAI_GPUCC) + +} // namespace chai + +#endif // CHAI_DEVICE_HELPERS_HPP From e682c10fa4a091a323e002a7042f131a1cfceea0 Mon Sep 17 00:00:00 2001 From: mdavis36 Date: Tue, 4 Feb 2025 11:54:33 -0800 Subject: [PATCH 34/44] Squashing 
warnings in managed_shared_ptr_tests. --- .../integration/managed_shared_ptr_tests.cpp | 38 +++++++++---------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/tests/integration/managed_shared_ptr_tests.cpp b/tests/integration/managed_shared_ptr_tests.cpp index c9f8953d..1b068ea4 100644 --- a/tests/integration/managed_shared_ptr_tests.cpp +++ b/tests/integration/managed_shared_ptr_tests.cpp @@ -67,17 +67,17 @@ class C : chai::CHAIPoly { public: CHAI_HOST_DEVICE C(void) { printf("++ C has been constructed\n"); } - CHAI_HOST_DEVICE ~C(void) { printf("-- C has been destructed\n"); } + CHAI_HOST_DEVICE virtual ~C(void) { printf("-- C has been destructed\n"); } CHAI_HOST_DEVICE virtual void function(void) const = 0; }; -class D : public C +class D final : public C { public: unsigned long long content_D; CHAI_HOST_DEVICE D(void) : content_D(0xDDDDDDDDDDDDDDDDull) { printf("++ D has been constructed\n"); } CHAI_HOST_DEVICE ~D(void) { printf("-- D has been destructed\n"); } - CHAI_HOST_DEVICE virtual void function(void) const { printf("%lX\n", content_D); } + CHAI_HOST_DEVICE virtual void function(void) const { printf("%llX\n", content_D); } }; @@ -87,7 +87,7 @@ class A : chai::CHAIPoly unsigned long long content_A; D d; CHAI_HOST_DEVICE A(void) : content_A(0xAAAAAAAAAAAAAAAAull) { printf("++ A has been constructed\n"); } - CHAI_HOST_DEVICE ~A(void) { printf("-- A has been destructed\n"); } + CHAI_HOST_DEVICE virtual ~A(void) { printf("-- A has been destructed\n"); } CHAI_HOST_DEVICE virtual void function(void) const = 0; CHAI_HOST_DEVICE virtual void d_function(void) const = 0; CHAI_HOST_DEVICE virtual void set_content(unsigned long long) = 0; @@ -100,13 +100,13 @@ class A2 CHAI_HOST_DEVICE ~A2(void) { printf("-- A2 has been destructed\n"); } }; -class B : public A, public A2 +class B final : public A, public A2 { public: unsigned long long content_B; CHAI_HOST_DEVICE B(void) : content_B(0xBBBBBBBBBBBBBBBBull) { printf("++ B has been constructed\n"); } 
CHAI_HOST_DEVICE ~B(void) { printf("-- B has been destructed\n"); } - CHAI_HOST_DEVICE virtual void function(void) const override { printf("%lX\n", content_B); } + CHAI_HOST_DEVICE virtual void function(void) const override { printf("%llX\n", content_B); } CHAI_HOST_DEVICE virtual void d_function(void) const override { d.function(); } CHAI_HOST_DEVICE virtual void set_content(unsigned long long val) override { content_B = val; content_A = val; } }; @@ -115,9 +115,9 @@ class B : public A, public A2 class AAbsMem : public chai::CHAICopyable , public chai::CHAIPoly { public: - unsigned long long content_A; //chai::ManagedSharedPtr base_member; chai::ManagedSharedPtr base_member; + unsigned long long content_A; CHAI_HOST_DEVICE AAbsMem(void) : content_A(0xAAAAAAAAAAAAAAAAull) { printf("++ A has been constructed\n"); } @@ -129,23 +129,13 @@ class AAbsMem : public chai::CHAICopyable , public chai::CHAIPoly , content_A(0xAAAAAAAAAAAAAAAAull) { printf("++ A has been constructed\n"); } - CHAI_HOST_DEVICE ~AAbsMem(void) { printf("-- A has been destructed\n"); } + CHAI_HOST_DEVICE virtual ~AAbsMem(void) { printf("-- A has been destructed\n"); } CHAI_HOST_DEVICE virtual void function(void) const = 0; CHAI_HOST_DEVICE virtual void d_function(void) const = 0; CHAI_HOST_DEVICE virtual void set_content(unsigned long long) = 0; }; -class NV -{ -public: - unsigned long long content_NV; - CHAI_HOST_DEVICE NV(void) : content_NV(0xFFFFFFFFFFFFFFFFull) { printf("++ NV has been constructed\n"); } - CHAI_HOST_DEVICE ~NV(void) { printf("-- NV has been destructed\n"); } - CHAI_HOST_DEVICE void function(void) const { printf("%lX\n", content_NV); } -}; - - -class BAbsMem : public AAbsMem +class BAbsMem final : public AAbsMem { public: unsigned long long content_B; @@ -164,11 +154,19 @@ class BAbsMem : public AAbsMem } CHAI_HOST_DEVICE ~BAbsMem(void) { printf("-- B has been destructed\n"); } - CHAI_HOST_DEVICE virtual void function(void) const override { printf("%lX\n", content_B); } + 
CHAI_HOST_DEVICE virtual void function(void) const override { printf("%llX\n", content_B); } CHAI_HOST_DEVICE virtual void d_function(void) const override { base_member->function(); } CHAI_HOST_DEVICE virtual void set_content(unsigned long long val) override { content_B = val; content_A = val; } }; +class NV +{ +public: + unsigned long long content_NV; + CHAI_HOST_DEVICE NV(void) : content_NV(0xFFFFFFFFFFFFFFFFull) { printf("++ NV has been constructed\n"); } + CHAI_HOST_DEVICE ~NV(void) { printf("-- NV has been destructed\n"); } + CHAI_HOST_DEVICE void function(void) const { printf("%llX\n", content_NV); } +}; GPU_TEST(managed_shared_ptr, shared_ptr_absmem) { From 640f8712b8209865190f4032815ec7a325ed2cb6 Mon Sep 17 00:00:00 2001 From: mdavis36 Date: Tue, 4 Feb 2025 13:14:35 -0800 Subject: [PATCH 35/44] Seperate xnack host-config for amdclang --- .../amdclang-xnack.cmake | 26 +++++++++++++++++++ .../lc/toss_4_x86_64_ib_cray/amdclang.cmake | 2 +- 2 files changed, 27 insertions(+), 1 deletion(-) create mode 100644 host-configs/lc/toss_4_x86_64_ib_cray/amdclang-xnack.cmake diff --git a/host-configs/lc/toss_4_x86_64_ib_cray/amdclang-xnack.cmake b/host-configs/lc/toss_4_x86_64_ib_cray/amdclang-xnack.cmake new file mode 100644 index 00000000..f7895e1c --- /dev/null +++ b/host-configs/lc/toss_4_x86_64_ib_cray/amdclang-xnack.cmake @@ -0,0 +1,26 @@ +############################################################################## +# Copyright (c) 2024, Lawrence Livermore National Security, LLC and CHAI +# project contributors. See the CHAI LICENSE file for details. 
+# +# SPDX-License-Identifier: BSD-3-Clause +############################################################################## + +# Set up software versions +set(ROCM_VERSION "6.2.0" CACHE PATH "") +set(GCC_VERSION "12.2.1" CACHE PATH "") + +# Set up compilers +set(COMPILER_BASE "/usr/tce/packages/rocmcc/rocmcc-${ROCM_VERSION}-magic" CACHE PATH "") +set(CMAKE_C_COMPILER "${COMPILER_BASE}/bin/amdclang" CACHE PATH "") +set(CMAKE_CXX_COMPILER "${COMPILER_BASE}/bin/amdclang++" CACHE PATH "") + +# Set up compiler flags +set(GCC_HOME "/usr/tce/packages/gcc/gcc-${GCC_VERSION}" CACHE PATH "") +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} --gcc-toolchain=${GCC_HOME}" CACHE STRING "") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --gcc-toolchain=${GCC_HOME}" CACHE STRING "") + +# Set up HIP +set(ENABLE_HIP ON CACHE BOOL "") +set(ROCM_PATH "/usr/tce/packages/rocmcc/rocmcc-${ROCM_VERSION}-magic" CACHE PATH "") +set(CMAKE_HIP_ARCHITECTURES "gfx942:xnack+" CACHE STRING "") +set(AMDGPU_TARGETS "${CMAKE_HIP_ARCHITECTURES}" CACHE STRING "") diff --git a/host-configs/lc/toss_4_x86_64_ib_cray/amdclang.cmake b/host-configs/lc/toss_4_x86_64_ib_cray/amdclang.cmake index f7895e1c..e9ee1023 100644 --- a/host-configs/lc/toss_4_x86_64_ib_cray/amdclang.cmake +++ b/host-configs/lc/toss_4_x86_64_ib_cray/amdclang.cmake @@ -22,5 +22,5 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --gcc-toolchain=${GCC_HOME}" CACHE STRIN # Set up HIP set(ENABLE_HIP ON CACHE BOOL "") set(ROCM_PATH "/usr/tce/packages/rocmcc/rocmcc-${ROCM_VERSION}-magic" CACHE PATH "") -set(CMAKE_HIP_ARCHITECTURES "gfx942:xnack+" CACHE STRING "") +set(CMAKE_HIP_ARCHITECTURES "gfx942" CACHE STRING "") set(AMDGPU_TARGETS "${CMAKE_HIP_ARCHITECTURES}" CACHE STRING "") From 6bfafc90b7ad4c138f4be78beafc6afd59ad7f51 Mon Sep 17 00:00:00 2001 From: mdavis36 Date: Tue, 4 Feb 2025 13:19:32 -0800 Subject: [PATCH 36/44] Separate host-configs for amdclang w/o +xnack. 
--- .../amdclang-xnack.cmake | 26 +++++++++++++++++++ .../lc/toss_4_x86_64_ib_cray/amdclang.cmake | 2 +- 2 files changed, 27 insertions(+), 1 deletion(-) create mode 100644 host-configs/lc/toss_4_x86_64_ib_cray/amdclang-xnack.cmake diff --git a/host-configs/lc/toss_4_x86_64_ib_cray/amdclang-xnack.cmake b/host-configs/lc/toss_4_x86_64_ib_cray/amdclang-xnack.cmake new file mode 100644 index 00000000..f7895e1c --- /dev/null +++ b/host-configs/lc/toss_4_x86_64_ib_cray/amdclang-xnack.cmake @@ -0,0 +1,26 @@ +############################################################################## +# Copyright (c) 2024, Lawrence Livermore National Security, LLC and CHAI +# project contributors. See the CHAI LICENSE file for details. +# +# SPDX-License-Identifier: BSD-3-Clause +############################################################################## + +# Set up software versions +set(ROCM_VERSION "6.2.0" CACHE PATH "") +set(GCC_VERSION "12.2.1" CACHE PATH "") + +# Set up compilers +set(COMPILER_BASE "/usr/tce/packages/rocmcc/rocmcc-${ROCM_VERSION}-magic" CACHE PATH "") +set(CMAKE_C_COMPILER "${COMPILER_BASE}/bin/amdclang" CACHE PATH "") +set(CMAKE_CXX_COMPILER "${COMPILER_BASE}/bin/amdclang++" CACHE PATH "") + +# Set up compiler flags +set(GCC_HOME "/usr/tce/packages/gcc/gcc-${GCC_VERSION}" CACHE PATH "") +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} --gcc-toolchain=${GCC_HOME}" CACHE STRING "") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --gcc-toolchain=${GCC_HOME}" CACHE STRING "") + +# Set up HIP +set(ENABLE_HIP ON CACHE BOOL "") +set(ROCM_PATH "/usr/tce/packages/rocmcc/rocmcc-${ROCM_VERSION}-magic" CACHE PATH "") +set(CMAKE_HIP_ARCHITECTURES "gfx942:xnack+" CACHE STRING "") +set(AMDGPU_TARGETS "${CMAKE_HIP_ARCHITECTURES}" CACHE STRING "") diff --git a/host-configs/lc/toss_4_x86_64_ib_cray/amdclang.cmake b/host-configs/lc/toss_4_x86_64_ib_cray/amdclang.cmake index f7895e1c..e9ee1023 100644 --- a/host-configs/lc/toss_4_x86_64_ib_cray/amdclang.cmake +++ 
b/host-configs/lc/toss_4_x86_64_ib_cray/amdclang.cmake @@ -22,5 +22,5 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --gcc-toolchain=${GCC_HOME}" CACHE STRIN # Set up HIP set(ENABLE_HIP ON CACHE BOOL "") set(ROCM_PATH "/usr/tce/packages/rocmcc/rocmcc-${ROCM_VERSION}-magic" CACHE PATH "") -set(CMAKE_HIP_ARCHITECTURES "gfx942:xnack+" CACHE STRING "") +set(CMAKE_HIP_ARCHITECTURES "gfx942" CACHE STRING "") set(AMDGPU_TARGETS "${CMAKE_HIP_ARCHITECTURES}" CACHE STRING "") From 2d80d44ce5629b5f9cbbcf3754a0a3d5f2fcc0af Mon Sep 17 00:00:00 2001 From: mdavis36 Date: Tue, 4 Feb 2025 13:25:52 -0800 Subject: [PATCH 37/44] Squashing wanrings for amdclang HIP builds. --- src/chai/ChaiMacros.hpp | 6 ++++++ src/chai/ManagedArray.inl | 1 + tests/integration/managed_array_tests.cpp | 20 +++++++++++++------- tests/integration/managed_ptr_tests.cpp | 12 ++++++------ 4 files changed, 26 insertions(+), 13 deletions(-) diff --git a/src/chai/ChaiMacros.hpp b/src/chai/ChaiMacros.hpp index 0747040c..f9dd6fa0 100644 --- a/src/chai/ChaiMacros.hpp +++ b/src/chai/ChaiMacros.hpp @@ -11,6 +11,12 @@ #include "umpire/util/Macros.hpp" +#define CHAI_UNUSED_VAR RAJA_UNUSED_VAR + +#if defined(CHAI_ENABLE_CUDA) || defined(CHAI_ENABLE_HIP) +#define CHAI_ENABLE_DEVICE +#endif + #if defined(CHAI_ENABLE_CUDA) #include diff --git a/src/chai/ManagedArray.inl b/src/chai/ManagedArray.inl index 369964d6..b7e78b44 100644 --- a/src/chai/ManagedArray.inl +++ b/src/chai/ManagedArray.inl @@ -55,6 +55,7 @@ CHAI_HOST_DEVICE ManagedArray::ManagedArray( ExecutionSpace space) : ManagedArray() { + CHAI_UNUSED_VAR(elems, space); #if !defined(CHAI_DEVICE_COMPILE) this->allocate(elems, space); #endif diff --git a/tests/integration/managed_array_tests.cpp b/tests/integration/managed_array_tests.cpp index 4803b982..49858db9 100644 --- a/tests/integration/managed_array_tests.cpp +++ b/tests/integration/managed_array_tests.cpp @@ -11,7 +11,13 @@ static void gpu_test_##X##Y() #ifdef NDEBUG -#define device_assert(EXP) if( !EXP ) asm 
("trap;") + +#ifdef CHAI_ENABLE_CUDA +#define device_assert(EXP) if( !(EXP) ) asm ("trap;") +#else +#define device_assert(EXP) if( !(EXP) ) asm ("s_trap 1;") +#endif + #else #define device_assert(EXP) assert(EXP) #endif @@ -799,7 +805,7 @@ GPU_TEST(ManagedArray, dataGPU) chai::ManagedArray array; array.allocate(length, chai::GPU, - [&] (const chai::PointerRecord* record, chai::Action act, chai::ExecutionSpace s) { + [&] (const chai::PointerRecord*, chai::Action act, chai::ExecutionSpace s) { if (act == chai::ACTION_MOVE) { if (s == chai::CPU) { ++transfersD2H; @@ -827,7 +833,7 @@ GPU_TEST(ManagedArray, dataGPU) // Move data to device with touch forall(gpu(), 0, length, [=] __device__ (int i) { - int* d_data = array.data(); + array.data(); array[i] += 1; }); @@ -845,7 +851,7 @@ GPU_TEST(ManagedArray, dataGPU) // Access on device with touch (should not be moved) forall(gpu(), 0, length, [=] __device__ (int i) { - int* d_data = array.data(); + array.data(); array[i] += i; }); @@ -896,7 +902,7 @@ GPU_TEST(ManagedArray, cdataGPU) chai::ManagedArray array; array.allocate(length, chai::GPU, - [&] (const chai::PointerRecord* record, chai::Action act, chai::ExecutionSpace s) { + [&] (const chai::PointerRecord*, chai::Action act, chai::ExecutionSpace s) { if (act == chai::ACTION_MOVE) { if (s == chai::CPU) { ++transfersD2H; @@ -1791,11 +1797,11 @@ GPU_TEST(ManagedArray, CopyZero) array.allocate(0); ASSERT_EQ(array.size(), 0u); - forall(gpu(), 0, 1, [=] __device__ (int i) { + forall(gpu(), 0, 1, [=] __device__ (int) { (void) array; }); - forall(sequential(), 0, 1, [=] (int i) { + forall(sequential(), 0, 1, [=] (int) { (void) array; }); diff --git a/tests/integration/managed_ptr_tests.cpp b/tests/integration/managed_ptr_tests.cpp index 000e6210..7a017b74 100644 --- a/tests/integration/managed_ptr_tests.cpp +++ b/tests/integration/managed_ptr_tests.cpp @@ -522,7 +522,7 @@ GPU_TEST(managed_ptr, gpu_class_with_managed_array) chai::ManagedArray array(1, chai::CPU); - 
forall(sequential(), 0, 1, [=] (int i) { + forall(sequential(), 0, 1, [=] (int) { array[0] = expectedValue; }); @@ -549,7 +549,7 @@ GPU_TEST(managed_ptr, gpu_class_with_raw_ptr) chai::ManagedArray array(1, chai::CPU); - forall(sequential(), 0, 1, [=] (int i) { + forall(sequential(), 0, 1, [=] (int) { array[0] = expectedValue; }); @@ -642,7 +642,7 @@ GPU_TEST(managed_ptr, static_pointer_cast) chai::ManagedArray array(1, chai::CPU); - forall(sequential(), 0, 1, [=] (int i) { + forall(sequential(), 0, 1, [=] (int) { array[0] = expectedValue; }); @@ -675,7 +675,7 @@ GPU_TEST(managed_ptr, dynamic_pointer_cast) chai::ManagedArray array(1, chai::CPU); - forall(sequential(), 0, 1, [=] (int i) { + forall(sequential(), 0, 1, [=] (int) { array[0] = expectedValue; }); @@ -708,7 +708,7 @@ GPU_TEST(managed_ptr, const_pointer_cast) chai::ManagedArray array(1, chai::CPU); - forall(sequential(), 0, 1, [=] (int i) { + forall(sequential(), 0, 1, [=] (int) { array[0] = expectedValue; }); @@ -741,7 +741,7 @@ GPU_TEST(managed_ptr, reinterpret_pointer_cast) chai::ManagedArray array(1, chai::CPU); - forall(sequential(), 0, 1, [=] (int i) { + forall(sequential(), 0, 1, [=] (int) { array[0] = expectedValue; }); From 5119b299e3ca9a4041c9cd7bf28004b055d9a06c Mon Sep 17 00:00:00 2001 From: mdavis36 Date: Tue, 4 Feb 2025 13:38:31 -0800 Subject: [PATCH 38/44] Squashing warnings for amdclang in ManagedSharedPtr. 
--- src/chai/ManagedSharedPtr.hpp | 2 ++ src/chai/SharedPtrCounter.hpp | 1 + tests/integration/managed_array_tests.cpp | 4 ---- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/chai/ManagedSharedPtr.hpp b/src/chai/ManagedSharedPtr.hpp index dacc57e2..358e87e9 100644 --- a/src/chai/ManagedSharedPtr.hpp +++ b/src/chai/ManagedSharedPtr.hpp @@ -136,6 +136,7 @@ class ManagedSharedPtr : public CHAICopyable{ */ CHAI_HOST_DEVICE const element_type* cget(ExecutionSpace space = chai::CPU) const noexcept { + CHAI_UNUSED_VAR(space); #if !defined(CHAI_DEVICE_COMPILE) if (m_active_pointer) { move(space, false); @@ -145,6 +146,7 @@ class ManagedSharedPtr : public CHAICopyable{ } CHAI_HOST_DEVICE element_type* get(ExecutionSpace space = chai::CPU) const noexcept { + CHAI_UNUSED_VAR(space); #if !defined(CHAI_DEVICE_COMPILE) if (m_active_pointer) { move(space); diff --git a/src/chai/SharedPtrCounter.hpp b/src/chai/SharedPtrCounter.hpp index 707dc84e..a51a9d79 100644 --- a/src/chai/SharedPtrCounter.hpp +++ b/src/chai/SharedPtrCounter.hpp @@ -194,6 +194,7 @@ class msp_record_count { CHAI_HOST_DEVICE msp_record_count& operator=(msp_record_count const& rhs) noexcept { + CHAI_UNUSED_VAR(rhs); #if !defined(CHAI_DEVICE_COMPILE) msp_counted_base* temp = rhs.m_pi; if (temp != m_pi) diff --git a/tests/integration/managed_array_tests.cpp b/tests/integration/managed_array_tests.cpp index 81c07755..89648562 100644 --- a/tests/integration/managed_array_tests.cpp +++ b/tests/integration/managed_array_tests.cpp @@ -26,10 +26,6 @@ #define device_assert(EXP) if( !(EXP) ) asm ("s_trap 1;") #endif -#else -#define device_assert(EXP) if( !EXP ) asm ("s_trap 1;") -#endif - #else #define device_assert(EXP) assert(EXP) #endif From bc7d2e3626f14897676b56bf08d68bd5c744a8d2 Mon Sep 17 00:00:00 2001 From: mdavis36 Date: Tue, 4 Feb 2025 15:47:19 -0800 Subject: [PATCH 39/44] Direct CHAI_UNUSED_VAR declaration. 
--- src/chai/ChaiMacros.hpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/chai/ChaiMacros.hpp b/src/chai/ChaiMacros.hpp index f9dd6fa0..1d62ba84 100644 --- a/src/chai/ChaiMacros.hpp +++ b/src/chai/ChaiMacros.hpp @@ -11,8 +11,6 @@ #include "umpire/util/Macros.hpp" -#define CHAI_UNUSED_VAR RAJA_UNUSED_VAR - #if defined(CHAI_ENABLE_CUDA) || defined(CHAI_ENABLE_HIP) #define CHAI_ENABLE_DEVICE #endif @@ -98,4 +96,10 @@ #endif #endif +namespace chai +{ +template +CHAI_HOST_DEVICE CHAI_INLINE void CHAI_UNUSED_VAR(T &&...) noexcept {} +} // namespace chai + #endif // CHAI_ChaiMacros_HPP From 07e804cd76836efca51ed0ffa4324c5bb0ab4af9 Mon Sep 17 00:00:00 2001 From: mdavis36 Date: Wed, 12 Feb 2025 14:44:10 -0800 Subject: [PATCH 40/44] Squash warning from sync call not returning. --- src/chai/ManagedSharedPtr.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/chai/ManagedSharedPtr.hpp b/src/chai/ManagedSharedPtr.hpp index 358e87e9..b4c89ff2 100644 --- a/src/chai/ManagedSharedPtr.hpp +++ b/src/chai/ManagedSharedPtr.hpp @@ -308,7 +308,7 @@ ManagedSharedPtr make_shared(Args&&... args) { cudaDeviceSynchronize(); #endif #if defined(CHAI_ENABLE_HIP) - hipDeviceSynchronize(); + CHAI_UNUSED_VAR(hipDeviceSynchronize()); #endif auto result = ManagedSharedPtr({cpu_pointer, gpu_pointer}, {CPU, GPU}, From 292a0008ea970e79a9abc1be95db9dc242eef246 Mon Sep 17 00:00:00 2001 From: mdavis36 Date: Tue, 25 Feb 2025 11:11:02 -0800 Subject: [PATCH 41/44] Cleaning up commented code moved to ChaiManager.hpp; Commenting out callback related members in SharedPtrManager. 
--- src/chai/SharedPtrManager.cpp | 26 ++++---- src/chai/SharedPtrManager.hpp | 122 +++------------------------------- 2 files changed, 23 insertions(+), 125 deletions(-) diff --git a/src/chai/SharedPtrManager.cpp b/src/chai/SharedPtrManager.cpp index 06ff7978..71bb11a4 100644 --- a/src/chai/SharedPtrManager.cpp +++ b/src/chai/SharedPtrManager.cpp @@ -34,8 +34,8 @@ SharedPtrManager* SharedPtrManager::getInstance() SharedPtrManager::SharedPtrManager() : m_pointer_map{}, m_allocators{}, - m_resource_manager{umpire::ResourceManager::getInstance()}, - m_callbacks_active{true} + m_resource_manager{umpire::ResourceManager::getInstance()} + //,m_callbacks_active{true} { m_pointer_map.clear(); m_current_execution_space = NONE; @@ -150,17 +150,17 @@ void SharedPtrManager::deregisterPointer(msp_pointer_record* record, bool deregi } } -void * SharedPtrManager::frontOfAllocation(void * pointer) { - if (pointer) { - if (m_resource_manager.hasAllocator(pointer)) { - auto allocation_record = m_resource_manager.findAllocationRecord(pointer); - if (allocation_record) { - return allocation_record->ptr; - } - } - } - return nullptr; -} +//void * SharedPtrManager::frontOfAllocation(void * pointer) { +// if (pointer) { +// if (m_resource_manager.hasAllocator(pointer)) { +// auto allocation_record = m_resource_manager.findAllocationRecord(pointer); +// if (allocation_record) { +// return allocation_record->ptr; +// } +// } +// } +// return nullptr; +//} void SharedPtrManager::setExecutionSpace(ExecutionSpace space) { diff --git a/src/chai/SharedPtrManager.hpp b/src/chai/SharedPtrManager.hpp index 87d915de..c6aec495 100644 --- a/src/chai/SharedPtrManager.hpp +++ b/src/chai/SharedPtrManager.hpp @@ -7,122 +7,17 @@ #ifndef CHAI_SharedPtrManager_HPP #define CHAI_SharedPtrManager_HPP -#include "chai/config.hpp" -#include "chai/ChaiMacros.hpp" -#include "chai/ExecutionSpaces.hpp" - #include "chai/SharedPointerRecord.hpp" -#include "chai/Types.hpp" #include "chai/ChaiManager.hpp" -#if 
defined(CHAI_ENABLE_RAJA_PLUGIN) -#include "chai/pluginLinker.hpp" -#endif - -#include - -#include "umpire/Allocator.hpp" -#include "umpire/util/MemoryMap.hpp" - -#if defined(CHAI_ENABLE_CUDA) -#include -#endif -#if defined(CHAI_ENABLE_HIP) -#include "hip/hip_runtime_api.h" -#endif - namespace chai { -// CHAI_GPU_ERROR_CHECK macro -//#if defined(CHAI_ENABLE_CUDA) || defined(CHAI_ENABLE_HIP) -// -//#ifdef CHAI_ENABLE_GPU_ERROR_CHECKING -// -//#ifdef CHAI_ENABLE_CUDA -//inline void gpuErrorCheck(cudaError_t code, const char *file, int line, bool abort=true) -//{ -// if (code != cudaSuccess) { -// fprintf(stderr, "[CHAI] GPU Error: %s %s %d\n", cudaGetErrorString(code), file, line); -// if (abort) { -// exit(code); -// } -// } -//} -//#elif defined(CHAI_ENABLE_HIP) -//inline void gpuErrorCheck(hipError_t code, const char *file, int line, bool abort=true) -//{ -// if (code != hipSuccess) { -// fprintf(stderr, "[CHAI] GPU Error: %s %s %d\n", hipGetErrorString(code), file, line); -// if (abort) { -// exit(code); -// } -// } -//} -//#endif -// -// -//#define CHAI_GPU_ERROR_CHECK(code) { gpuErrorCheck((code), __FILE__, __LINE__); } -//#else // CHAI_ENABLE_GPU_ERROR_CHECKING -//#define CHAI_GPU_ERROR_CHECK(code) code -//#endif // CHAI_ENABLE_GPU_ERROR_CHECKING -// -//#endif - -//// wrapper for hip/cuda synchronize -//inline void synchronize() { -//#if defined (CHAI_ENABLE_HIP) &&!defined(__HIP_DEVICE_COMPILE__) -// CHAI_GPU_ERROR_CHECK(hipDeviceSynchronize()); -//#elif defined (CHAI_ENABLE_CUDA) &&!defined(__CUDA_ARCH__) -// CHAI_GPU_ERROR_CHECK(cudaDeviceSynchronize()); -//#endif -//} -// -//#if defined(CHAI_GPUCC) -// -//// wrapper for hip/cuda free -//CHAI_HOST inline void gpuFree(void* buffer) { -//#if defined (CHAI_ENABLE_HIP) -// CHAI_GPU_ERROR_CHECK(hipFree(buffer)); -//#elif defined (CHAI_ENABLE_CUDA) -// CHAI_GPU_ERROR_CHECK(cudaFree(buffer)); -//#endif -//} -// -//// wrapper for hip/cuda malloc -//CHAI_HOST inline void gpuMalloc(void** devPtr, size_t size) { 
-//#if defined (CHAI_ENABLE_HIP) -// CHAI_GPU_ERROR_CHECK(hipMalloc(devPtr, size)); -//#elif defined (CHAI_ENABLE_CUDA) -// CHAI_GPU_ERROR_CHECK(cudaMalloc(devPtr, size)); -//#endif -//} -// -//// wrapper for hip/cuda managed malloc -//CHAI_HOST inline void gpuMallocManaged(void** devPtr, size_t size) { -//#if defined (CHAI_ENABLE_HIP) -// CHAI_GPU_ERROR_CHECK(hipMallocManaged(devPtr, size)); -//#elif defined (CHAI_ENABLE_CUDA) -// CHAI_GPU_ERROR_CHECK(cudaMallocManaged(devPtr, size)); -//#endif -//} -// -//// wrapper for hip/cuda mem copy -//CHAI_HOST inline void gpuMemcpy(void* dst, const void* src, size_t count, gpuMemcpyKind kind) { -//#if defined (CHAI_ENABLE_HIP) -// CHAI_GPU_ERROR_CHECK(hipMemcpy(dst, src, count, kind)); -//#elif defined (CHAI_ENABLE_CUDA) -// CHAI_GPU_ERROR_CHECK(cudaMemcpy(dst, src, count, kind)); -//#endif -//} -// -//#endif //#if defined(CHAI_GPUCC) - /*! * \brief Singleton that manages caching and movement of ManagedArray objects. * * The SharedPtrManager class co-ordinates the allocation and movement of - * ManagedArray objects. These objects are cached, and data is only copied + * ManagedSharedPtr objects. These objects are cached, and data is only copied * between ExecutionSpaces when necessary. This functionality is typically * hidden behind a programming model layer, such as RAJA, or the exmaple * included in util/forall.hpp @@ -133,9 +28,12 @@ namespace chai * \code * const chai::SharedPtrManager* rm = chai::SharedPtrManager::getInstance(); * rm->setExecutionSpace(chai::CPU); - * // Do something in with ManagedArrays on the CPU... but they must be copied! + * // Do something with ManagedSharedPtr on the CPU... but they must be copied! * rm->setExecutionSpace(chai::NONE); * \endcode + * + * SharedPtrManager differs from ArrayManager such that it does not support + * reallocation or callbacks (at this time). 
*/ class SharedPtrManager { @@ -376,7 +274,7 @@ class SharedPtrManager * * \param pointer Pointer to address of that we want the front of the allocation for. */ - CHAISHAREDDLL_API void * frontOfAllocation(void * pointer); + //CHAISHAREDDLL_API void * frontOfAllocation(void * pointer); /*! * \brief set the allocator for an execution space. @@ -398,12 +296,12 @@ class SharedPtrManager /*! * \brief Turn callbacks on. */ - void enableCallbacks() { m_callbacks_active = true; } + //void enableCallbacks() { m_callbacks_active = true; } /*! * \brief Turn callbacks off. */ - void disableCallbacks() { m_callbacks_active = false; } + //void disableCallbacks() { m_callbacks_active = false; } /*! * \brief synchronize the device if there hasn't been a synchronize since the last kernel @@ -512,12 +410,12 @@ class SharedPtrManager /*! * \brief A callback triggered upon memory operations on all ManagedArrays. */ - UserCallback m_user_callback; + //UserCallback m_user_callback; /*! * \brief Controls whether or not callbacks are called. */ - bool m_callbacks_active; + //bool m_callbacks_active; /*! * Whether or not a synchronize has been performed since the launch of the last From ec7e084984b12fbd6832a04ef4814a02a08cc9ce Mon Sep 17 00:00:00 2001 From: mdavis36 Date: Tue, 25 Feb 2025 11:30:30 -0800 Subject: [PATCH 42/44] Removing commented code blocks unnecessary for MSPtr. 
--- src/chai/SharedPtrCounter.hpp | 8 ------ src/chai/SharedPtrManager.inl | 53 ----------------------------------- 2 files changed, 61 deletions(-) diff --git a/src/chai/SharedPtrCounter.hpp b/src/chai/SharedPtrCounter.hpp index a51a9d79..abff134d 100644 --- a/src/chai/SharedPtrCounter.hpp +++ b/src/chai/SharedPtrCounter.hpp @@ -159,14 +159,6 @@ class msp_record_count { CHAI_HOST_DEVICE constexpr msp_record_count() noexcept : m_pi(0) {} - //template - //explicit msp_record_count(Ptr h_p, Ptr d_p) - //: m_pi( new msp_counted_ptr(h_p, d_p) ) {} - - //template - //explicit msp_record_count(Ptr h_p, Ptr d_p, Deleter d) - //: m_pi( new msp_counted_deleter(std::initializer_list{h_p, d_p,}, std::initializer_list{chai::CPU, chai::GPU}, std::move(d)) ) {} - template explicit msp_record_count(T, Ptrs&& ptrs, Spaces&& spaces, Deleter d) : m_pi( new msp_counted_deleter( diff --git a/src/chai/SharedPtrManager.inl b/src/chai/SharedPtrManager.inl index 66436522..0ae13ba6 100644 --- a/src/chai/SharedPtrManager.inl +++ b/src/chai/SharedPtrManager.inl @@ -26,59 +26,6 @@ namespace chai { -//template -//CHAI_INLINE -//void* SharedPtrManager::reallocate(void* pointer, size_t elems, msp_pointer_record* pointer_record) -//{ -// ExecutionSpace my_space = CPU; -// -// for (int space = CPU; space < NUM_EXECUTION_SPACES; ++space) { -// if (pointer_record->m_pointers[space] == pointer) { -// my_space = static_cast(space); -// break; -// } -// } -// -// for (int space = CPU; space < NUM_EXECUTION_SPACES; ++space) { -// if (!pointer_record->m_owned[space]) { -// CHAI_LOG(Debug, "Cannot reallocate unowned pointer"); -// return pointer_record->m_pointers[my_space]; -// } -// } -// -// // Call callback with ACTION_FREE before changing the size -// for (int space = CPU; space < NUM_EXECUTION_SPACES; ++space) { -// if (pointer_record->m_pointers[space]) { -// callback(pointer_record, ACTION_FREE, ExecutionSpace(space)); -// } -// } -// -// // Update the pointer record size -// size_t old_size = 
pointer_record->m_size; -// size_t new_size = sizeof(T) * elems; -// pointer_record->m_size = new_size; -// -// // only copy however many bytes overlap -// size_t num_bytes_to_copy = std::min(old_size, new_size); -// -// for (int space = CPU; space < NUM_EXECUTION_SPACES; ++space) { -// void* old_ptr = pointer_record->m_pointers[space]; -// -// if (old_ptr) { -// void* new_ptr = m_allocators[space]->allocate(new_size); -// m_resource_manager.copy(new_ptr, old_ptr, num_bytes_to_copy); -// m_allocators[space]->deallocate(old_ptr); -// -// pointer_record->m_pointers[space] = new_ptr; -// callback(pointer_record, ACTION_ALLOC, ExecutionSpace(space)); -// -// m_pointer_map.erase(old_ptr); -// m_pointer_map.insert(new_ptr, pointer_record); -// } -// } -// -// return pointer_record->m_pointers[my_space]; -//} template msp_pointer_record* SharedPtrManager::makeSharedPtrRecord(std::initializer_list pointers, std::initializer_list spaces, From 0c181e3c35ab20ea497200bc15dd1e0f01cbb0c7 Mon Sep 17 00:00:00 2001 From: mdavis36 Date: Tue, 25 Feb 2025 11:31:01 -0800 Subject: [PATCH 43/44] Removing debug print statements from MSPtr development. --- src/chai/ManagedSharedPtr.hpp | 21 +-------------------- src/chai/SharedPtrManager.cpp | 34 +++++++++------------------------- src/chai/SharedPtrManager.hpp | 2 +- 3 files changed, 11 insertions(+), 46 deletions(-) diff --git a/src/chai/ManagedSharedPtr.hpp b/src/chai/ManagedSharedPtr.hpp index b4c89ff2..1c93335b 100644 --- a/src/chai/ManagedSharedPtr.hpp +++ b/src/chai/ManagedSharedPtr.hpp @@ -86,9 +86,7 @@ class ManagedSharedPtr : public CHAICopyable{ , m_resource_manager(rhs.m_resource_manager) { #if !defined(CHAI_DEVICE_COMPILE) - //std::cout << "ManagedSharedPtr Copy Ctor: m_active_pointer @ " << m_active_pointer << std::endl; - if (m_active_pointer) move(ArrayManager::getInstance()->getExecutionSpace()); // TODO: Use a generic interface for RAJA queries. 
- //if (m_active_pointer) move(m_resource_manager->getExecutionSpace()); + if (m_active_pointer) move(ArrayManager::getInstance()->getExecutionSpace()); #endif } @@ -181,14 +179,12 @@ class ManagedSharedPtr : public CHAICopyable{ if (prev_space != GPU && space == GPU) { /// Move nested ManagedArrays first, so they are working with a valid m_active_pointer for the host, // and so the meta data associated with them are updated before we move the other array down. - std::cout << "Pre-move InnerImpl\n"; moveInnerImpl(); } auto old_pointer = m_active_pointer; m_active_pointer = static_cast(m_resource_manager->move( (void *)m_active_pointer, m_record_count.m_get_record(), space, is_CHAIPoly::value)); if (old_pointer != m_active_pointer) { - std::cout << "m_active_pointer @ " << m_active_pointer << " : def touch behaviour : " << (!std::is_const::value || is_CHAICopyable::value) << std::endl; } if (registerTouch) { @@ -197,7 +193,6 @@ class ManagedSharedPtr : public CHAICopyable{ if (space != GPU && prev_space == GPU) { /// Move nested ManagedArrays after the move, so they are working with a valid m_active_pointer for the host, // and so the meta data associated with them are updated with live GPU data - std::cout << "Post-move InnerImpl\n"; moveInnerImpl(); } @@ -260,7 +255,6 @@ __global__ void msp_make_on_device(T* gpuPointer, Args&&... args) template CHAI_INLINE CHAI_HOST Tp* msp_make_on_device(Args&&... args) { - std::cout << "msp_make_on_device\n"; Tp* gpu_ptr = nullptr; chai::SharedPtrManager* sptr_manager = chai::SharedPtrManager::getInstance(); @@ -276,7 +270,6 @@ CHAI_HOST Tp* msp_make_on_device(Args&&... args) { template CHAI_INLINE CHAI_HOST Tp* msp_make_on_host(Args&&... args) { - std::cout << "msp_make_on_host\n"; chai::SharedPtrManager* sptr_manager = chai::SharedPtrManager::getInstance(); auto cpu_allocator = sptr_manager->getAllocator(chai::CPU); @@ -295,15 +288,12 @@ CHAI_INLINE CHAI_HOST ManagedSharedPtr make_shared(Args&&... 
args) { using Tp_non_const = std::remove_const_t; - std::cout << "make_shared\n"; Tp* cpu_pointer = detail::msp_make_on_host(std::forward(args)...); - std::cout << "CPU pointer @ " << cpu_pointer << std::endl; #if defined(CHAI_ENABLE_CUDA) or defined(CHAI_ENABLE_HIP) Tp* gpu_pointer = detail::msp_make_on_device(); - std::cout << "GPU pointer @ " << gpu_pointer << std::endl; #if defined(CHAI_ENABLE_CUDA) cudaDeviceSynchronize(); #endif @@ -330,18 +320,9 @@ ManagedSharedPtr make_shared(Args&&... args) { #endif // defined(CHAI_ENABLE_CUDA) or defined(CHAI_ENABLE_HIP) - std::cout << "End of make_shared\n"; return result; } -//TODO: make_shared_deleter -//template -//CHAI_INLINE -//CHAI_HOST -//ManagedSharedPtr make_shared_deleter(Args... args, Deleter d) { -//..... -//} - } // namespace chai diff --git a/src/chai/SharedPtrManager.cpp b/src/chai/SharedPtrManager.cpp index 71bb11a4..11c1400b 100644 --- a/src/chai/SharedPtrManager.cpp +++ b/src/chai/SharedPtrManager.cpp @@ -251,24 +251,24 @@ static void copy(void * dst_pointer, void * src_pointer, umpire::ResourceManager #endif - std::cout << "SPtr Manager Copy Call\n"; - std::cout << "dst_ptr @ " << dst_pointer << std::endl; - std::cout << "src_ptr @ " << src_pointer << std::endl; + //std::cout << "SPtr Manager Copy Call\n"; + //std::cout << "dst_ptr @ " << dst_pointer << std::endl; + //std::cout << "src_ptr @ " << src_pointer << std::endl; camp::resources::Resource host_resource(camp::resources::Host::get_default()); if (dst_space == GPU || src_space == GPU) { // Do the copy using the device resource - std::cout << "---- Sptr Manager Device Copy\n"; - std::cout << "---- dst_ptr @ " << dst_pointer << std::endl; - std::cout << "---- src_ptr @ " << src_pointer << std::endl; + //std::cout << "---- Sptr Manager Device Copy\n"; + //std::cout << "---- dst_ptr @ " << dst_pointer << std::endl; + //std::cout << "---- src_ptr @ " << src_pointer << std::endl; if (poly) { - std::cout << "---- POLY COPY\n"; + //std::cout << "---- 
POLY COPY\n"; std::size_t vtable_size = sizeof(void*); void* poly_src_ptr = ((char*)src_pointer + vtable_size); void* poly_dst_ptr = ((char*)dst_pointer + vtable_size); manager.copy(poly_dst_ptr, poly_src_ptr, device_resource); } else { - std::cout << "---- STD COPY\n"; + //std::cout << "---- STD COPY\n"; manager.copy(dst_pointer, src_pointer, device_resource); } @@ -399,12 +399,6 @@ void SharedPtrManager::free(msp_pointer_record* pointer_record, ExecutionSpace s } } -//size_t SharedPtrManager::getSize(void* ptr) -//{ -// // TODO -// auto pointer_record = getPointerRecord(ptr); -// return pointer_record->m_size; -//} void SharedPtrManager::setDefaultAllocationSpace(ExecutionSpace space) { @@ -439,7 +433,6 @@ msp_pointer_record* SharedPtrManager::getPointerRecord(void* pointer) // TODO: Need a better way of dealing with non-cuda builds here... msp_pointer_record* SharedPtrManager::makeSharedPtrRecord(void const* c_pointer, void const* c_d_pointer, size_t size, - //ExecutionSpace space, bool owned) { void* pointer = const_cast(c_pointer); @@ -451,21 +444,15 @@ msp_pointer_record* SharedPtrManager::makeSharedPtrRecord(void const* c_pointer, return &s_null_record ; } - //if (space == NONE) { - // space = getDefaultAllocationSpace(); - //} - m_resource_manager.registerAllocation( pointer, {pointer, size, m_allocators[chai::CPU]->getAllocationStrategy()}); - //std::cout << "m_allocators[chai::CPU] : " << m_allocators[chai::CPU]->getName() << std::endl; #if defined(CHAI_ENABLE_CUDA) || defined(CHAI_ENABLE_HIP) || defined(CHAI_ENABLE_GPU_SIMULATION_MODE) m_resource_manager.registerAllocation( d_pointer, {d_pointer, size, m_allocators[chai::GPU]->getAllocationStrategy()}); #endif - //std::cout << "m_allocators[chai::GPU] : " << m_allocators[chai::GPU]->getName() << std::endl; auto pointer_record = getPointerRecord(pointer); @@ -487,7 +474,6 @@ msp_pointer_record* SharedPtrManager::makeSharedPtrRecord(void const* c_pointer, pointer_record->m_pointers[chai::GPU] = d_pointer; 
pointer_record->m_owned[chai::GPU] = owned; #endif - //pointer_record->m_size = size; //pointer_record->m_user_callback = [] (const msp_pointer_record*, Action, ExecutionSpace) {}; for (int space = CPU; space < NUM_EXECUTION_SPACES; ++space) { @@ -507,8 +493,6 @@ msp_pointer_record* SharedPtrManager::makeSharedPtrRecord(void const* c_pointer, msp_pointer_record* SharedPtrManager::deepCopyRecord(msp_pointer_record const* record) { msp_pointer_record* new_record = new msp_pointer_record{}; - //const size_t size = record->m_size; - //new_record->m_size = size; //new_record->m_user_callback = [] (const msp_pointer_record*, Action, ExecutionSpace) {}; const ExecutionSpace last_space = record->m_last_space; @@ -547,7 +531,7 @@ SharedPtrManager::getPointerMap() const return mapCopy; } -size_t SharedPtrManager::getTotalNumArrays() const { return m_pointer_map.size(); } +size_t SharedPtrManager::getTotalNumSharedPtrs() const { return m_pointer_map.size(); } // TODO: Investigate counting memory allocated in each execution space if // possible diff --git a/src/chai/SharedPtrManager.hpp b/src/chai/SharedPtrManager.hpp index c6aec495..71339784 100644 --- a/src/chai/SharedPtrManager.hpp +++ b/src/chai/SharedPtrManager.hpp @@ -224,7 +224,7 @@ class SharedPtrManager * * \return The total number of arrays registered with the array manager. */ - CHAISHAREDDLL_API size_t getTotalNumArrays() const; + CHAISHAREDDLL_API size_t getTotalNumSharedPtrs() const; /*! * \brief Get the total amount of memory allocated. From 42b4830cd9e9ca922e7d04118258abc6b54febe8 Mon Sep 17 00:00:00 2001 From: mdavis36 Date: Tue, 25 Feb 2025 11:32:09 -0800 Subject: [PATCH 44/44] Adding ChaiManager.hpp to chai_headers. 
--- src/chai/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/src/chai/CMakeLists.txt b/src/chai/CMakeLists.txt index d653bbb2..c8397ae7 100644 --- a/src/chai/CMakeLists.txt +++ b/src/chai/CMakeLists.txt @@ -15,6 +15,7 @@ set (chai_headers ArrayManager.hpp ArrayManager.inl ChaiMacros.hpp + ChaiManager.hpp ExecutionSpaces.hpp ManagedArray.hpp ManagedArray.inl