Commit ea807e8

Improve the Arena allocator to reduce memory fragmentation (#916)
Currently the arena allocator divides GPU memory into a global arena and per-thread arenas. For smaller allocations, a per-thread arena allocates large chunks of memory (superblocks) from the global arena and divides them up for individual allocations. However, when deallocating from another arena (producer/consumer pattern), or when we run out of memory and return everything to the global arena, the superblock boundaries are broken. Over time, this can leave memory more and more fragmented.

This PR makes superblocks concrete objects, not just virtual boundaries, and the only units of exchange between the global arena and per-thread arenas. This should make the allocator more resistant to memory fragmentation, especially for long-running processes under constant memory pressure.

Other notable changes:

* The allocator now allocates a fixed but configurable amount of memory from CUDA. This introduces less fragmentation compared to growing the pool size gradually.
* Switched to C++17 `std::shared_mutex`.
* Added a number of unit tests.

fixes #919
fixes #906

Authors:
  - Rong Ou (https://github.com/rongou)

Approvers:
  - Jake Hemstad (https://github.com/jrhemstad)
  - Mark Harris (https://github.com/harrism)

URL: #916
1 parent 5a239d2 · commit ea807e8
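For orientation, here is a minimal usage sketch of the reworked constructor (illustrative only, not part of this commit; the 4 GiB size is an arbitrary example value). As the arena_memory_resource.hpp diff below shows, the constructor now takes a single optional arena size instead of separate initial and maximum sizes:

#include <rmm/mr/device/arena_memory_resource.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>

int main()
{
  rmm::mr::cuda_memory_resource cuda_mr;
  // Carve out a fixed 4 GiB arena up front; the pool no longer grows on demand.
  rmm::mr::arena_memory_resource<rmm::mr::cuda_memory_resource> arena_mr{&cuda_mr, 4UL << 30};

  void* ptr = arena_mr.allocate(1U << 20);  // 1 MiB, at least 256-byte aligned
  arena_mr.deallocate(ptr, 1U << 20);
  return 0;
}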

File tree: 4 files changed (+1281 -403 lines)

benchmarks/random_allocations/random_allocations.cpp (+4 -2)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -170,7 +170,9 @@ inline auto make_pool()
 
 inline auto make_arena()
 {
-  return rmm::mr::make_owning_wrapper<rmm::mr::arena_memory_resource>(make_cuda());
+  auto free = rmm::detail::available_device_memory().first;
+  constexpr auto reserve{64UL << 20};  // Leave some space for CUDA overhead.
+  return rmm::mr::make_owning_wrapper<rmm::mr::arena_memory_resource>(make_cuda(), free - reserve);
 }
 
 inline auto make_binning()
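A hypothetical follow-up to the change above (not in the benchmark): the resource returned by make_arena() can be installed as the current device resource so that ordinary RMM allocations, e.g. rmm::device_buffer, are served from the arena. set_current_device_resource and device_buffer are standard RMM APIs assumed here:

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

void run_with_arena()
{
  auto arena = make_arena();  // owning wrapper around the CUDA resource, as defined above
  rmm::mr::set_current_device_resource(arena.get());

  // This 1 MiB buffer is now allocated from the arena.
  rmm::device_buffer buf{1U << 20, rmm::cuda_stream_default};

  rmm::mr::set_current_device_resource(nullptr);  // restore the initial default resource
}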

include/rmm/mr/device/arena_memory_resource.hpp (+88 -57)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -78,26 +78,21 @@ class arena_memory_resource final : public device_memory_resource {
   * @brief Construct an `arena_memory_resource`.
   *
   * @throws rmm::logic_error if `upstream_mr == nullptr`.
-  * @throws rmm::logic_error if `initial_size` is neither the default nor aligned to a multiple of
-  * 256 bytes.
-  * @throws rmm::logic_error if `maximum_size` is neither the default nor aligned to a multiple of
-  * 256 bytes.
   *
-  * @param upstream_mr The memory resource from which to allocate blocks for the pool
-  * @param initial_size Minimum size, in bytes, of the initial global arena. Defaults to half of
-  * the available memory on the current device.
-  * @param maximum_size Maximum size, in bytes, that the global arena can grow to. Defaults to all
-  * of the available memory on the current device.
+  * @param upstream_mr The memory resource from which to allocate blocks for the global arena.
+  * @param arena_size Size in bytes of the global arena. Defaults to half of the available memory
+  * on the current device.
+  * @param dump_log_on_failure If true, dump memory log when running out of memory.
   */
  explicit arena_memory_resource(Upstream* upstream_mr,
-                                std::size_t initial_size = global_arena::default_initial_size,
-                                std::size_t maximum_size = global_arena::default_maximum_size,
-                                bool dump_log_on_failure = false)
-    : global_arena_{upstream_mr, initial_size, maximum_size},
-      dump_log_on_failure_{dump_log_on_failure}
+                                std::optional<std::size_t> arena_size = std::nullopt,
+                                bool dump_log_on_failure = false)
+    : global_arena_{upstream_mr, arena_size}, dump_log_on_failure_{dump_log_on_failure}
  {
    if (dump_log_on_failure_) {
      logger_ = spdlog::basic_logger_mt("arena_memory_dump", "rmm_arena_memory_dump.log");
+      // Set the level to `debug` for more detailed output.
+      logger_->set_level(spdlog::level::info);
    }
  }
 
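A short illustrative snippet (not from this commit) exercising the optional parameters of the new constructor above: the arena size is left at its default (half of the available device memory) and the failure dump is enabled, which writes to rmm_arena_memory_dump.log as set up in the constructor body:

#include <rmm/mr/device/arena_memory_resource.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>

#include <optional>

rmm::mr::cuda_memory_resource cuda_mr;
// Default arena size, with the memory dump log enabled on allocation failure.
rmm::mr::arena_memory_resource<rmm::mr::cuda_memory_resource> mr{
  &cuda_mr, std::nullopt, /*dump_log_on_failure=*/true};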
@@ -125,17 +120,15 @@ class arena_memory_resource final : public device_memory_resource {
   bool supports_get_mem_info() const noexcept override { return false; }
 
  private:
-  using global_arena = detail::arena::global_arena<Upstream>;
-  using arena        = detail::arena::arena<Upstream>;
-  using read_lock    = std::shared_lock<std::shared_timed_mutex>;
-  using write_lock   = std::lock_guard<std::shared_timed_mutex>;
+  using global_arena = rmm::mr::detail::arena::global_arena<Upstream>;
+  using arena        = rmm::mr::detail::arena::arena<Upstream>;
 
   /**
    * @brief Allocates memory of size at least `bytes`.
    *
    * The returned pointer has at least 256-byte alignment.
    *
-   * @throws `std::bad_alloc` if the requested allocation could not be fulfilled.
+   * @throws `rmm::out_of_memory` if no more memory is available for the requested size.
    *
    * @param bytes The size in bytes of the allocation.
    * @param stream The stream to associate this allocation with.
@@ -144,52 +137,100 @@ class arena_memory_resource final : public device_memory_resource {
   void* do_allocate(std::size_t bytes, cuda_stream_view stream) override
   {
     if (bytes <= 0) { return nullptr; }
+#ifdef RMM_ARENA_USE_SIZE_CLASSES
+    bytes = rmm::mr::detail::arena::align_to_size_class(bytes);
+#else
+    bytes = rmm::detail::align_up(bytes, rmm::detail::CUDA_ALLOCATION_ALIGNMENT);
+#endif
+    auto& arena = get_arena(stream);
 
-    bytes         = detail::arena::align_up(bytes);
-    auto& arena   = get_arena(stream);
-    void* pointer = arena.allocate(bytes);
+    {
+      std::shared_lock lock(mtx_);
+      void* pointer = arena.allocate(bytes);
+      if (pointer != nullptr) { return pointer; }
+    }
 
-    if (pointer == nullptr) {
-      write_lock lock(mtx_);
+    {
+      std::unique_lock lock(mtx_);
       defragment();
-      pointer = arena.allocate(bytes);
+      void* pointer = arena.allocate(bytes);
       if (pointer == nullptr) {
         if (dump_log_on_failure_) { dump_memory_log(bytes); }
         RMM_FAIL("Maximum pool size exceeded", rmm::out_of_memory);
       }
+      return pointer;
     }
+  }
 
-    return pointer;
+  /**
+   * @brief Defragment memory by returning all superblocks to the global arena.
+   */
+  void defragment()
+  {
+    RMM_CUDA_TRY(cudaDeviceSynchronize());
+    for (auto& thread_arena : thread_arenas_) {
+      thread_arena.second->clean();
+    }
+    for (auto& stream_arena : stream_arenas_) {
+      stream_arena.second.clean();
+    }
   }
 
   /**
    * @brief Deallocate memory pointed to by `ptr`.
    *
    * @param ptr Pointer to be deallocated.
    * @param bytes The size in bytes of the allocation. This must be equal to the
-   * value of `bytes` that was passed to the `allocate` call that returned `p`.
+   * value of `bytes` that was passed to the `allocate` call that returned `ptr`.
    * @param stream Stream on which to perform deallocation.
    */
   void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) override
   {
     if (ptr == nullptr || bytes <= 0) { return; }
+#ifdef RMM_ARENA_USE_SIZE_CLASSES
+    bytes = rmm::mr::detail::arena::align_to_size_class(bytes);
+#else
+    bytes = rmm::detail::align_up(bytes, rmm::detail::CUDA_ALLOCATION_ALIGNMENT);
+#endif
+    auto& arena = get_arena(stream);
+
+    {
+      std::shared_lock lock(mtx_);
+      // If the memory being freed does not belong to the arena, the following will return false.
+      if (arena.deallocate(ptr, bytes, stream)) { return; }
+    }
+
+    {
+      // Since we are returning this memory to another stream, we need to make sure the current
+      // stream is caught up.
+      stream.synchronize_no_throw();
 
-    bytes = detail::arena::align_up(bytes);
-    get_arena(stream).deallocate(ptr, bytes, stream);
+      std::unique_lock lock(mtx_);
+      deallocate_from_other_arena(ptr, bytes, stream);
+    }
   }
 
   /**
-   * @brief Defragment memory by returning all free blocks to the global arena.
+   * @brief Deallocate memory pointed to by `ptr` that was allocated in a different arena.
+   *
+   * @param ptr Pointer to be deallocated.
+   * @param bytes The size in bytes of the allocation. This must be equal to the
+   * value of `bytes` that was passed to the `allocate` call that returned `ptr`.
+   * @param stream Stream on which to perform deallocation.
    */
-  void defragment()
+  void deallocate_from_other_arena(void* ptr, std::size_t bytes, cuda_stream_view stream)
   {
-    RMM_CUDA_TRY(cudaDeviceSynchronize());
-    for (auto& thread_arena : thread_arenas_) {
-      thread_arena.second->clean();
-    }
-    for (auto& stream_arena : stream_arenas_) {
-      stream_arena.second.clean();
+    if (use_per_thread_arena(stream)) {
+      for (auto const& thread_arena : thread_arenas_) {
+        if (thread_arena.second->deallocate(ptr, bytes)) { return; }
+      }
+    } else {
+      for (auto& stream_arena : stream_arenas_) {
+        if (stream_arena.second.deallocate(ptr, bytes)) { return; }
+      }
     }
+
+    if (!global_arena_.deallocate(ptr, bytes)) { RMM_FAIL("allocation not found"); }
   }
 
   /**
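A hedged sketch (not from this diff) of the producer/consumer pattern the deallocation path above has to handle: memory allocated from one thread's per-thread arena and freed on another thread is routed through deallocate_from_other_arena() back to the owning superblock:

#include <rmm/mr/device/arena_memory_resource.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>

#include <thread>

void producer_consumer_example()
{
  rmm::mr::cuda_memory_resource cuda_mr;
  rmm::mr::arena_memory_resource<rmm::mr::cuda_memory_resource> mr{&cuda_mr};

  void* ptr = nullptr;
  std::thread producer([&] { ptr = mr.allocate(1U << 20); });  // allocated in the producer's arena
  producer.join();

  std::thread consumer([&] { mr.deallocate(ptr, 1U << 20); });  // freed from a different thread
  consumer.join();
}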
@@ -213,12 +254,12 @@ class arena_memory_resource final : public device_memory_resource {
   {
     auto const thread_id = std::this_thread::get_id();
     {
-      read_lock lock(mtx_);
+      std::shared_lock lock(map_mtx_);
       auto const iter = thread_arenas_.find(thread_id);
       if (iter != thread_arenas_.end()) { return *iter->second; }
     }
     {
-      write_lock lock(mtx_);
+      std::unique_lock lock(map_mtx_);
       auto thread_arena = std::make_shared<arena>(global_arena_);
       thread_arenas_.emplace(thread_id, thread_arena);
       thread_local detail::arena::arena_cleaner<Upstream> cleaner{thread_arena};
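get_arena() above follows a read-mostly pattern: a shared lock covers the common lookup, and a unique lock is taken only when a new arena has to be created. A generic, self-contained sketch of that pattern in plain standard C++ (not RMM code):

#include <map>
#include <memory>
#include <mutex>
#include <shared_mutex>
#include <thread>

class registry {
 public:
  int& get(std::thread::id tid)
  {
    {
      std::shared_lock lock(mtx_);  // many readers may search the map concurrently
      auto const iter = entries_.find(tid);
      if (iter != entries_.end()) { return *iter->second; }
    }
    std::unique_lock lock(mtx_);  // exclusive access only while inserting
    auto entry = std::make_shared<int>(0);
    return *entries_.emplace(tid, std::move(entry)).first->second;
  }

 private:
  std::map<std::thread::id, std::shared_ptr<int>> entries_;
  std::shared_mutex mtx_;
};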
@@ -235,12 +276,12 @@ class arena_memory_resource final : public device_memory_resource {
   {
     RMM_LOGGING_ASSERT(!use_per_thread_arena(stream));
     {
-      read_lock lock(mtx_);
+      std::shared_lock lock(map_mtx_);
       auto const iter = stream_arenas_.find(stream.value());
       if (iter != stream_arenas_.end()) { return iter->second; }
     }
     {
-      write_lock lock(mtx_);
+      std::unique_lock lock(map_mtx_);
       stream_arenas_.emplace(stream.value(), global_arena_);
       return stream_arenas_.at(stream.value());
     }
@@ -269,18 +310,6 @@ class arena_memory_resource final : public device_memory_resource {
     logger_->info("**************************************************");
     logger_->info("Global arena:");
     global_arena_.dump_memory_log(logger_);
-    logger_->info("Per-thread arenas:");
-    for (auto const& thread_arena : thread_arenas_) {
-      logger_->info("  Thread {}:", thread_arena.first);
-      thread_arena.second->dump_memory_log(logger_);
-    }
-    if (!stream_arenas_.empty()) {
-      logger_->info("Per-stream arenas:");
-      for (auto const& stream_arena : stream_arenas_) {
-        logger_->info("  Stream {}:", static_cast<void*>(stream_arena.first));
-        stream_arena.second.dump_memory_log(logger_);
-      }
-    }
     logger_->flush();
   }
 
@@ -304,11 +333,13 @@ class arena_memory_resource final : public device_memory_resource {
   /// Implementation note: for small sizes, map is more efficient than unordered_map.
   std::map<cudaStream_t, arena> stream_arenas_;
   /// If true, dump memory information to log on allocation failure.
-  bool dump_log_on_failure_;
+  bool dump_log_on_failure_{};
   /// The logger for memory dump.
   std::shared_ptr<spdlog::logger> logger_{};
-  /// Mutex for read and write locks.
-  mutable std::shared_timed_mutex mtx_;
+  /// Mutex for read and write locks on arena maps.
+  mutable std::shared_mutex map_mtx_;
+  /// Mutex for shared and unique locks on the mr.
+  mutable std::shared_mutex mtx_;
 };
 
 }  // namespace rmm::mr
