/*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -78,26 +78,21 @@ class arena_memory_resource final : public device_memory_resource {
   * @brief Construct an `arena_memory_resource`.
   *
   * @throws rmm::logic_error if `upstream_mr == nullptr`.
-   * @throws rmm::logic_error if `initial_size` is neither the default nor aligned to a multiple of
-   * 256 bytes.
-   * @throws rmm::logic_error if `maximum_size` is neither the default nor aligned to a multiple of
-   * 256 bytes.
   *
-   * @param upstream_mr The memory resource from which to allocate blocks for the pool
-   * @param initial_size Minimum size, in bytes, of the initial global arena. Defaults to half of
-   * the available memory on the current device.
-   * @param maximum_size Maximum size, in bytes, that the global arena can grow to. Defaults to all
-   * of the available memory on the current device.
+   * @param upstream_mr The memory resource from which to allocate blocks for the global arena.
+   * @param arena_size Size in bytes of the global arena. Defaults to half of the available memory
+   * on the current device.
+   * @param dump_log_on_failure If true, dump memory log when running out of memory.
   */
  explicit arena_memory_resource(Upstream* upstream_mr,
-                                std::size_t initial_size = global_arena::default_initial_size,
-                                std::size_t maximum_size = global_arena::default_maximum_size,
-                                bool dump_log_on_failure = false)
-    : global_arena_{upstream_mr, initial_size, maximum_size},
-      dump_log_on_failure_{dump_log_on_failure}
+                                std::optional<std::size_t> arena_size = std::nullopt,
+                                bool dump_log_on_failure              = false)
+    : global_arena_{upstream_mr, arena_size}, dump_log_on_failure_{dump_log_on_failure}
  {
    if (dump_log_on_failure_) {
      logger_ = spdlog::basic_logger_mt("arena_memory_dump", "rmm_arena_memory_dump.log");
+      // Set the level to `debug` for more detailed output.
+      logger_->set_level(spdlog::level::info);
    }
  }

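For reference, here is a minimal sketch of how the updated constructor might be used after this change; the upstream resource, the 1 GiB arena size, and enabling the failure log are illustrative choices, not values taken from this diff:

#include <rmm/mr/device/arena_memory_resource.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>

#include <cstddef>
#include <optional>

int main()
{
  rmm::mr::cuda_memory_resource upstream;
  // Explicit 1 GiB global arena; passing std::nullopt (the default) would instead
  // size the arena to half of the device's available memory.
  rmm::mr::arena_memory_resource<rmm::mr::cuda_memory_resource> mr{
    &upstream, std::size_t{1} << 30, /*dump_log_on_failure=*/true};
  return 0;
}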
@@ -125,17 +120,15 @@ class arena_memory_resource final : public device_memory_resource {
  bool supports_get_mem_info() const noexcept override { return false; }

 private:
-  using global_arena = detail::arena::global_arena<Upstream>;
-  using arena        = detail::arena::arena<Upstream>;
-  using read_lock    = std::shared_lock<std::shared_timed_mutex>;
-  using write_lock   = std::lock_guard<std::shared_timed_mutex>;
+  using global_arena = rmm::mr::detail::arena::global_arena<Upstream>;
+  using arena        = rmm::mr::detail::arena::arena<Upstream>;

  /**
   * @brief Allocates memory of size at least `bytes`.
   *
   * The returned pointer has at least 256-byte alignment.
   *
-   * @throws `std::bad_alloc` if the requested allocation could not be fulfilled.
+   * @throws `rmm::out_of_memory` if no more memory is available for the requested size.
   *
   * @param bytes The size in bytes of the allocation.
   * @param stream The stream to associate this allocation with.
@@ -144,52 +137,100 @@ class arena_memory_resource final : public device_memory_resource {
  void* do_allocate(std::size_t bytes, cuda_stream_view stream) override
  {
    if (bytes <= 0) { return nullptr; }
+#ifdef RMM_ARENA_USE_SIZE_CLASSES
+    bytes = rmm::mr::detail::arena::align_to_size_class(bytes);
+#else
+    bytes = rmm::detail::align_up(bytes, rmm::detail::CUDA_ALLOCATION_ALIGNMENT);
+#endif
+    auto& arena = get_arena(stream);

-    bytes         = detail::arena::align_up(bytes);
-    auto& arena   = get_arena(stream);
-    void* pointer = arena.allocate(bytes);
+    {
+      std::shared_lock lock(mtx_);
+      void* pointer = arena.allocate(bytes);
+      if (pointer != nullptr) { return pointer; }
+    }

-    if (pointer == nullptr) {
-      write_lock lock(mtx_);
+    {
+      std::unique_lock lock(mtx_);
      defragment();
-      pointer = arena.allocate(bytes);
+      void* pointer = arena.allocate(bytes);
      if (pointer == nullptr) {
        if (dump_log_on_failure_) { dump_memory_log(bytes); }
        RMM_FAIL("Maximum pool size exceeded", rmm::out_of_memory);
      }
+      return pointer;
    }
+  }

-    return pointer;
+  /**
+   * @brief Defragment memory by returning all superblocks to the global arena.
+   */
+  void defragment()
+  {
+    RMM_CUDA_TRY(cudaDeviceSynchronize());
+    for (auto& thread_arena : thread_arenas_) {
+      thread_arena.second->clean();
+    }
+    for (auto& stream_arena : stream_arenas_) {
+      stream_arena.second.clean();
+    }
  }

  /**
   * @brief Deallocate memory pointed to by `ptr`.
   *
   * @param ptr Pointer to be deallocated.
   * @param bytes The size in bytes of the allocation. This must be equal to the
-   * value of `bytes` that was passed to the `allocate` call that returned `p`.
+   * value of `bytes` that was passed to the `allocate` call that returned `ptr`.
   * @param stream Stream on which to perform deallocation.
   */
  void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) override
  {
    if (ptr == nullptr || bytes <= 0) { return; }
+#ifdef RMM_ARENA_USE_SIZE_CLASSES
+    bytes = rmm::mr::detail::arena::align_to_size_class(bytes);
+#else
+    bytes = rmm::detail::align_up(bytes, rmm::detail::CUDA_ALLOCATION_ALIGNMENT);
+#endif
+    auto& arena = get_arena(stream);
+
+    {
+      std::shared_lock lock(mtx_);
+      // If the memory being freed does not belong to the arena, the following will return false.
+      if (arena.deallocate(ptr, bytes, stream)) { return; }
+    }
+
+    {
+      // Since we are returning this memory to another stream, we need to make sure the current
+      // stream is caught up.
+      stream.synchronize_no_throw();

-    bytes = detail::arena::align_up(bytes);
-    get_arena(stream).deallocate(ptr, bytes, stream);
+      std::unique_lock lock(mtx_);
+      deallocate_from_other_arena(ptr, bytes, stream);
+    }
  }

  /**
-   * @brief Defragment memory by returning all free blocks to the global arena.
+   * @brief Deallocate memory pointed to by `ptr` that was allocated in a different arena.
+   *
+   * @param ptr Pointer to be deallocated.
+   * @param bytes The size in bytes of the allocation. This must be equal to the
+   * value of `bytes` that was passed to the `allocate` call that returned `ptr`.
+   * @param stream Stream on which to perform deallocation.
   */
-  void defragment()
+  void deallocate_from_other_arena(void* ptr, std::size_t bytes, cuda_stream_view stream)
  {
-    RMM_CUDA_TRY(cudaDeviceSynchronize());
-    for (auto& thread_arena : thread_arenas_) {
-      thread_arena.second->clean();
-    }
-    for (auto& stream_arena : stream_arenas_) {
-      stream_arena.second.clean();
+    if (use_per_thread_arena(stream)) {
+      for (auto const& thread_arena : thread_arenas_) {
+        if (thread_arena.second->deallocate(ptr, bytes)) { return; }
+      }
+    } else {
+      for (auto& stream_arena : stream_arenas_) {
+        if (stream_arena.second.deallocate(ptr, bytes)) { return; }
+      }
    }
+
+    if (!global_arena_.deallocate(ptr, bytes)) { RMM_FAIL("allocation not found"); }
  }

  /**
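The allocate and deallocate paths above share one pattern: try the arena under a shared lock on mtx_ first, and only take the exclusive lock to defragment or to free from another arena when that fast path fails. Below is a standalone sketch of that pattern, with hypothetical try_allocate and reclaim helpers standing in for the arena and defragment calls:

#include <cstddef>
#include <cstdlib>
#include <mutex>
#include <shared_mutex>

std::shared_mutex mtx;  // plays the role of mtx_ in the resource above

// Hypothetical stand-ins for arena.allocate() and defragment().
void* try_allocate(std::size_t bytes) { return std::malloc(bytes); }
void reclaim() { /* return free superblocks to the global arena */ }

void* allocate_with_fallback(std::size_t bytes)
{
  {
    std::shared_lock lock(mtx);  // fast path: many threads may allocate concurrently
    if (void* ptr = try_allocate(bytes)) { return ptr; }
  }
  std::unique_lock lock(mtx);  // slow path: exclusive access while reclaiming memory
  reclaim();
  return try_allocate(bytes);  // the real resource throws rmm::out_of_memory if this still fails
}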
@@ -213,12 +254,12 @@ class arena_memory_resource final : public device_memory_resource {
  {
    auto const thread_id = std::this_thread::get_id();
    {
-      read_lock lock(mtx_);
+      std::shared_lock lock(map_mtx_);
      auto const iter = thread_arenas_.find(thread_id);
      if (iter != thread_arenas_.end()) { return *iter->second; }
    }
    {
-      write_lock lock(mtx_);
+      std::unique_lock lock(map_mtx_);
      auto thread_arena = std::make_shared<arena>(global_arena_);
      thread_arenas_.emplace(thread_id, thread_arena);
      thread_local detail::arena::arena_cleaner<Upstream> cleaner{thread_arena};
@@ -235,12 +276,12 @@ class arena_memory_resource final : public device_memory_resource {
  {
    RMM_LOGGING_ASSERT(!use_per_thread_arena(stream));
    {
-      read_lock lock(mtx_);
+      std::shared_lock lock(map_mtx_);
      auto const iter = stream_arenas_.find(stream.value());
      if (iter != stream_arenas_.end()) { return iter->second; }
    }
    {
-      write_lock lock(mtx_);
+      std::unique_lock lock(map_mtx_);
      stream_arenas_.emplace(stream.value(), global_arena_);
      return stream_arenas_.at(stream.value());
    }
@@ -269,18 +310,6 @@ class arena_memory_resource final : public device_memory_resource {
    logger_->info("**************************************************");
    logger_->info("Global arena:");
    global_arena_.dump_memory_log(logger_);
-    logger_->info("Per-thread arenas:");
-    for (auto const& thread_arena : thread_arenas_) {
-      logger_->info("Thread {}:", thread_arena.first);
-      thread_arena.second->dump_memory_log(logger_);
-    }
-    if (!stream_arenas_.empty()) {
-      logger_->info("Per-stream arenas:");
-      for (auto const& stream_arena : stream_arenas_) {
-        logger_->info("Stream {}:", static_cast<void*>(stream_arena.first));
-        stream_arena.second.dump_memory_log(logger_);
-      }
-    }
    logger_->flush();
  }

@@ -304,11 +333,13 @@ class arena_memory_resource final : public device_memory_resource {
  /// Implementation note: for small sizes, map is more efficient than unordered_map.
  std::map<cudaStream_t, arena> stream_arenas_;
  /// If true, dump memory information to log on allocation failure.
-  bool dump_log_on_failure_;
+  bool dump_log_on_failure_{};
  /// The logger for memory dump.
  std::shared_ptr<spdlog::logger> logger_{};
-  /// Mutex for read and write locks.
-  mutable std::shared_timed_mutex mtx_;
+  /// Mutex for read and write locks on arena maps.
+  mutable std::shared_mutex map_mtx_;
+  /// Mutex for shared and unique locks on the mr.
+  mutable std::shared_mutex mtx_;
};

}  // namespace rmm::mr
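As a usage note, the resource is typically installed as the current device resource so that stream-ordered allocations route through the arena; a short sketch under that assumption (the buffer size is arbitrary):

#include <rmm/cuda_stream.hpp>
#include <rmm/device_buffer.hpp>
#include <rmm/mr/device/arena_memory_resource.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

int main()
{
  rmm::mr::cuda_memory_resource upstream;
  rmm::mr::arena_memory_resource<rmm::mr::cuda_memory_resource> mr{&upstream};
  rmm::mr::set_current_device_resource(&mr);  // route default allocations through the arena

  rmm::cuda_stream stream;
  // Stream-ordered allocation and deallocation go through do_allocate/do_deallocate above.
  rmm::device_buffer buffer{1 << 20, stream.view(), &mr};
  return 0;
}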