Merge branch 'fea-remove-deprecated-funcs' of github.com:harrism/rmm into fea-remove-deprecated-funcs
harrism committed Apr 23, 2024
2 parents d81f464 + eeb1c4b commit a4d6c96
Showing 9 changed files with 45 additions and 43 deletions.
3 changes: 2 additions & 1 deletion CMakeLists.txt
@@ -114,7 +114,8 @@ endif()
# * add tests --------------------------------------------------------------------------------------

if(BUILD_TESTS AND CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
-  include(cmake/thirdparty/get_gtest.cmake)
+  include(${rapids-cmake-dir}/cpm/gtest.cmake)
+  rapids_cpm_gtest(BUILD_STATIC)
  include(CTest) # calls enable_testing()

  add_subdirectory(tests)
26 changes: 16 additions & 10 deletions README.md
@@ -375,14 +375,16 @@ this code is correct:

#### Use of `rmm::device_vector` with multiple devices

-> [!CAUTION] In contrast to the uninitialized `rmm:device_uvector`, `rmm::device_vector` **DOES
-> NOT** store the active device during construction, and therefore cannot arrange for it to be
-> active when the destructor runs. It is therefore the responsibility of the user to ensure the
-> currently active device is correct.
+`rmm::device_vector` uses an `rmm::mr::thrust_allocator` to enable `thrust::device_vector` to
+allocate and deallocate memory using RMM. As such, the usual rules for usage of the backing memory
+resource apply: the active device must match the active device at resource construction time. To
+facilitate use in an RAII setting, `rmm::mr::thrust_allocator` records the active device at
+construction time and ensures that device is active whenever it allocates or deallocates memory.
+Usage of `rmm::device_vector` with multiple devices is therefore the same as `rmm::device_buffer`.
+One must _create_ `device_vector`s with the correct device active, but it is safe to destroy them
+with a different active device.

-`rmm::device_vector` is therefore slightly less ergonomic to use in a multiple device setting since
-the caller must arrange that active devices on allocation and deallocation match. Recapitulating the
-previous example using `rmm::device_vector`:
+For example, recapitulating the previous example using `rmm::device_vector`:

```c++
{
@@ -391,12 +393,16 @@ previous example using `rmm::device_vector`:
  rmm::device_vector<int> vec(16, rmm::mr::thrust_allocator<int>(rmm::cuda_stream_default, &mr));
  RMM_CUDA_TRY(cudaSetDevice(1));
  ...
-  // ERROR: ~vec runs with device 1 active, but needs device 0 to be active
+  // No need to switch back to device 0 before ~vec runs
}
```

-A correct example adds a call to `cudaSetDevice(0)` on the line of the error comment before the dtor
-for `~vec` runs.
+> [!NOTE]
+> Although allocation and deallocation in the `thrust_allocator` run with the correct active device,
+> modification of `rmm::device_vector` might necessitate a kernel launch, and this must run with the
+> correct device active. For example, `.resize()` might both allocate _and_ launch a kernel to
+> initialize new elements: the user must arrange for this kernel launch to occur with the correct
+> device for the memory resource active.
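
The pattern the note describes, as a minimal sketch (not part of this change; it assumes two devices and the same `mr` setup as the example above): re-activate the resource's device before any call that may launch a kernel.

```c++
{
  RMM_CUDA_TRY(cudaSetDevice(0));
  rmm::mr::cuda_memory_resource mr;
  rmm::device_vector<int> vec(16, rmm::mr::thrust_allocator<int>(rmm::cuda_stream_default, &mr));
  RMM_CUDA_TRY(cudaSetDevice(1));
  // ... work with device 1 active ...
  // resize() may allocate _and_ launch a kernel to initialize the new elements,
  // so make the resource's device active first. (The allocation alone would be
  // safe, since the allocator switches devices internally.)
  RMM_CUDA_TRY(cudaSetDevice(0));
  vec.resize(32);
}  // ~vec is safe with any device active: the allocator switches back itself
```
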
## `cuda_stream_view` and `cuda_stream`

22 changes: 0 additions & 22 deletions cmake/thirdparty/get_gtest.cmake

This file was deleted.

3 changes: 0 additions & 3 deletions conda/recipes/librmm/conda_build_config.yaml
@@ -16,9 +16,6 @@ cmake_version:
fmt_version:
  - ">=10.1.1,<11"

-gtest_version:
-  - ">=1.13.0"
-
spdlog_version:
  - ">=1.12.0,<1.13"

4 changes: 0 additions & 4 deletions conda/recipes/librmm/meta.yaml
@@ -33,8 +33,6 @@ requirements:
    # added as a run requirement via the packages' run_exports.
    - fmt {{ fmt_version }}
    - spdlog {{ spdlog_version }}
-    - gtest {{ gtest_version }}
-    - gmock {{ gtest_version }}

build:
script_env:
@@ -150,8 +148,6 @@ outputs:
    {% endif %}
    - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}
    - {{ pin_subpackage('librmm', exact=True) }}
-    - gtest {{ gtest_version }}
-    - gmock {{ gtest_version }}
about:
home: https://rapids.ai/
license: Apache-2.0
1 change: 0 additions & 1 deletion include/rmm/mr/device/cuda_async_memory_resource.hpp
@@ -25,7 +25,6 @@

#include <cuda/std/type_traits>
#include <cuda_runtime_api.h>
-#include <thrust/optional.h>

#include <cstddef>
#include <limits>
1 change: 0 additions & 1 deletion include/rmm/mr/device/cuda_async_view_memory_resource.hpp
@@ -23,7 +23,6 @@
#include <rmm/mr/device/device_memory_resource.hpp>

#include <cuda_runtime_api.h>
-#include <thrust/optional.h>

#include <cstddef>
#include <limits>
9 changes: 8 additions & 1 deletion include/rmm/mr/device/thrust_allocator_adaptor.hpp
@@ -16,6 +16,7 @@

#pragma once

+#include <rmm/cuda_device.hpp>
#include <rmm/detail/thrust_namespace.h>
#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/resource_ref.hpp>
@@ -39,6 +40,9 @@ namespace rmm::mr {
* allocate objects of a specific type `T`, but can be freely rebound to other
* types.
*
+ * The allocator records the current cuda device and may only be used with a backing
+ * `device_async_resource_ref` valid for the same device.
+ *
* @tparam T The type of the objects that will be allocated by this allocator
*/
template <typename T>
@@ -92,7 +96,7 @@ class thrust_allocator : public thrust::device_malloc_allocator<T> {
*/
  template <typename U>
  thrust_allocator(thrust_allocator<U> const& other)
-    : _mr(other.resource()), _stream{other.stream()}
+    : _mr(other.resource()), _stream{other.stream()}, _device{other._device}
  {
  }

@@ -104,6 +108,7 @@ class thrust_allocator : public thrust::device_malloc_allocator<T> {
*/
  pointer allocate(size_type num)
  {
+    cuda_set_device_raii dev{_device};
    return thrust::device_pointer_cast(
      static_cast<T*>(_mr.allocate_async(num * sizeof(T), _stream)));
  }
@@ -117,6 +122,7 @@ class thrust_allocator : public thrust::device_malloc_allocator<T> {
*/
  void deallocate(pointer ptr, size_type num)
  {
+    cuda_set_device_raii dev{_device};
    return _mr.deallocate_async(thrust::raw_pointer_cast(ptr), num * sizeof(T), _stream);
  }

@@ -143,6 +149,7 @@ class thrust_allocator : public thrust::device_malloc_allocator<T> {
 private:
  cuda_stream_view _stream{};
  rmm::device_async_resource_ref _mr{rmm::mr::get_current_device_resource()};
+  cuda_device_id _device{get_current_cuda_device()};
};
/** @} */ // end of group
} // namespace rmm::mr
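
The `cuda_set_device_raii` used in `allocate` and `deallocate` above comes from `<rmm/cuda_device.hpp>`: a scope guard that activates a device and restores the previously active one on scope exit. A simplified sketch of its behavior (name and details simplified; see the header for the real implementation):

```c++
#include <rmm/cuda_device.hpp>   // rmm::cuda_device_id, rmm::get_current_cuda_device
#include <rmm/detail/error.hpp>  // RMM_ASSERT_CUDA_SUCCESS

#include <cuda_runtime_api.h>

// Simplified sketch of the scope guard: activate `dev_id` now, restore the
// previously active device when the guard goes out of scope.
struct set_device_guard {
  explicit set_device_guard(rmm::cuda_device_id dev_id)
    : old_device_{rmm::get_current_cuda_device()},
      needs_reset_{dev_id.value() != old_device_.value()}
  {
    if (needs_reset_) { RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(dev_id.value())); }
  }
  ~set_device_guard()
  {
    if (needs_reset_) { RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(old_device_.value())); }
  }
  set_device_guard(set_device_guard const&)            = delete;
  set_device_guard& operator=(set_device_guard const&) = delete;

 private:
  rmm::cuda_device_id old_device_;
  bool needs_reset_;
};
```

Because `thrust_allocator` captures `_device` at construction and wraps every `allocate`/`deallocate` in this guard, the backing resource always sees its own device while the caller's active device is left undisturbed — which is what makes destroying an `rmm::device_vector` with a different device active safe.
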
19 changes: 19 additions & 0 deletions tests/mr/device/thrust_allocator_tests.cu
@@ -16,7 +16,9 @@

#include "mr_ref_test.hpp"

+#include <rmm/cuda_device.hpp>
#include <rmm/cuda_stream_view.hpp>
+#include <rmm/detail/error.hpp>
#include <rmm/device_vector.hpp>
#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/mr/device/thrust_allocator_adaptor.hpp>
@@ -36,19 +38,36 @@ struct allocator_test : public mr_ref_test {};

TEST_P(allocator_test, first)
{
+  rmm::mr::set_current_device_resource(this->mr.get());
  auto const num_ints{100};
  rmm::device_vector<int> ints(num_ints, 1);
  EXPECT_EQ(num_ints, thrust::reduce(ints.begin(), ints.end()));
}

TEST_P(allocator_test, defaults)
{
+  rmm::mr::set_current_device_resource(this->mr.get());
  rmm::mr::thrust_allocator<int> allocator(rmm::cuda_stream_default);
  EXPECT_EQ(allocator.stream(), rmm::cuda_stream_default);
  EXPECT_EQ(allocator.get_upstream_resource(),
            rmm::device_async_resource_ref{rmm::mr::get_current_device_resource()});
}

+TEST_P(allocator_test, multi_device)
+{
+  if (rmm::get_num_cuda_devices() < 2) { GTEST_SKIP() << "Needs at least two devices"; }
+  cuda_set_device_raii with_device{rmm::get_current_cuda_device()};
+  rmm::cuda_stream stream{};
+  // make allocator on device-0
+  rmm::mr::thrust_allocator<int> allocator(stream.view(), this->ref);
+  auto const size{100};
+  EXPECT_NO_THROW([&]() {
+    auto vec = rmm::device_vector<int>(size, allocator);
+    // Destruct with device-1 active
+    RMM_CUDA_TRY(cudaSetDevice(1));
+  }());
+}

INSTANTIATE_TEST_CASE_P(ThrustAllocatorTests,
allocator_test,
::testing::Values(mr_factory{"CUDA", &make_cuda},
