Skip to content

Commit 962545c

Browse files
committed
for flecsii
1 parent 6854662 commit 962545c

17 files changed

Lines changed: 1047 additions & 196 deletions

src/realm/deppart/byfield.cc

Lines changed: 91 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -325,13 +325,61 @@ namespace Realm {
325325
bool _exclusive)
326326
: parent_space(_parent), field_data(_field_data) {
327327
this->exclusive = _exclusive;
328-
Memory my_mem = field_data[0].inst.get_location();
329-
Processor best_proc;
330-
assert(choose_proc(best_proc, my_mem));
331-
Cuda::GPUProcessor* gpu_proc = dynamic_cast<Cuda::GPUProcessor*>(get_runtime()->get_processor_impl(best_proc));
332-
assert(gpu_proc);
333-
this->gpu = gpu_proc->gpu;
334-
this->stream = gpu_proc->gpu->get_deppart_stream();
328+
areg.force_instantiation();
329+
// GPU setup (this->gpu, this->stream) deferred to execute(), which runs on the
330+
// correct node after dispatch() has forwarded to the instance owner if needed.
331+
}
332+
333+
template<int N, typename T, typename FT>
334+
template <typename S>
335+
GPUByFieldMicroOp<N, T, FT>::GPUByFieldMicroOp(
336+
NodeID _requestor, AsyncMicroOp *_async_microop, S& s)
337+
: GPUMicroOp<N,T>(_requestor, _async_microop)
338+
, parent_space() {
339+
bool ok = true;
340+
size_t n = 0;
341+
ok = ok && (s >> parent_space);
342+
ok = ok && (s >> this->exclusive);
343+
ok = ok && (s >> n);
344+
field_data.resize(n);
345+
for(size_t i = 0; i < n && ok; i++)
346+
ok = ok && (s >> field_data[i].index_space) &&
347+
(s >> field_data[i].inst) &&
348+
(s >> field_data[i].field_offset) &&
349+
(s >> field_data[i].scratch_buffer);
350+
// Deserialize colors manually to avoid std::vector<bool> proxy issues
351+
size_t nc = 0;
352+
ok = ok && (s >> nc);
353+
for(size_t i = 0; i < nc && ok; i++) {
354+
FT c;
355+
ok = ok && (s >> c);
356+
if(ok) colors.push_back(c);
357+
}
358+
ok = ok && (s >> sparsity_outputs);
359+
assert(ok);
360+
(void)ok;
361+
}
362+
363+
template<int N, typename T, typename FT>
364+
template <typename S>
365+
bool GPUByFieldMicroOp<N, T, FT>::serialize_params(S& s) const {
366+
bool ok = true;
367+
ok = ok && (s << parent_space);
368+
ok = ok && (s << this->exclusive);
369+
ok = ok && (s << field_data.size());
370+
for(size_t i = 0; i < field_data.size() && ok; i++)
371+
ok = ok && (s << field_data[i].index_space) &&
372+
(s << field_data[i].inst) &&
373+
(s << field_data[i].field_offset) &&
374+
(s << field_data[i].scratch_buffer);
375+
// Serialize colors manually to avoid std::vector<bool> proxy issues
376+
ok = ok && (s << colors.size());
377+
for(size_t i = 0; i < colors.size() && ok; i++) {
378+
FT c = colors[i];
379+
ok = ok && (s << c);
380+
}
381+
ok = ok && (s << sparsity_outputs);
382+
return ok;
335383
}
336384

337385
template<int N, typename T, typename FT>
@@ -342,6 +390,17 @@ namespace Realm {
342390
void GPUByFieldMicroOp<N, T, FT>::dispatch(
343391
PartitioningOperation *op, bool inline_ok) {
344392

393+
// GPU by-field must execute on the node that owns the GPU memory
394+
NodeID exec_node = ID(field_data[0].inst).instance_owner_node();
395+
if(this->exclusive) {
396+
for(const auto& it : sparsity_outputs)
397+
assert(NodeID(ID(it.second).sparsity_creator_node()) == exec_node);
398+
}
399+
if(exec_node != Network::my_node_id) {
400+
PartitioningMicroOp::template forward_microop<GPUByFieldMicroOp<N,T,FT> >(exec_node, op, this);
401+
return;
402+
}
403+
345404
// We have to register ourselves as a waiter on sparse inputs before dispatching.
346405

347406
for (size_t i = 0; i < field_data.size(); i++) {
@@ -367,6 +426,10 @@ namespace Realm {
367426
sparsity_outputs[_val] = _sparsity;
368427
}
369428

429+
template <int N, typename T, typename FT>
430+
ActiveMessageHandlerReg<RemoteMicroOpMessage<GPUByFieldMicroOp<N, T, FT> > >
431+
GPUByFieldMicroOp<N, T, FT>::areg;
432+
370433
#endif
371434

372435

@@ -383,12 +446,26 @@ namespace Realm {
383446
: PartitioningOperation(reqs, _finish_event, _finish_gen)
384447
, parent(_parent)
385448
, field_data(_field_data)
449+
, exclusive_gpu_owner(exclusive_gpu_exec_node())
386450
{}
387451

388452
template <int N, typename T, typename FT>
389453
ByFieldOperation<N,T,FT>::~ByFieldOperation(void)
390454
{}
391455

456+
template <int N, typename T, typename FT>
457+
NodeID ByFieldOperation<N,T,FT>::exclusive_gpu_exec_node(void) const
458+
{
459+
if(field_data.size() != 1)
460+
return -1;
461+
462+
Memory::Kind kind = field_data[0].inst.get_location().kind();
463+
if((kind != Memory::GPU_FB_MEM) && (kind != Memory::Z_COPY_MEM))
464+
return -1;
465+
466+
return ID(field_data[0].inst).instance_owner_node();
467+
}
468+
392469
template <int N, typename T, typename FT>
393470
IndexSpace<N,T> ByFieldOperation<N,T,FT>::add_color(FT color)
394471
{
@@ -401,8 +478,13 @@ namespace Realm {
401478
subspace.bounds = parent.bounds;
402479

403480
// get a sparsity ID by round-robin'ing across the nodes that have field data
404-
int target_node = ID(field_data[colors.size() % field_data.size()].inst).instance_owner_node();
405-
SparsityMap<N,T> sparsity = get_runtime()->get_available_sparsity_impl(target_node)->me.convert<SparsityMap<N,T> >();
481+
int target_node = (exclusive_gpu_owner >= 0) ?
482+
exclusive_gpu_owner :
483+
ID(field_data[colors.size() % field_data.size()].inst).instance_owner_node();
484+
if(exclusive_gpu_owner >= 0)
485+
assert(target_node == exclusive_gpu_exec_node());
486+
SparsityMap<N,T> sparsity =
487+
create_deppart_output_sparsity(target_node).convert<SparsityMap<N, T>>();
406488
subspace.sparsity = sparsity;
407489

408490
colors.push_back(color);

src/realm/deppart/byfield.h

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,10 @@ namespace Realm {
7373
template<int N, typename T, typename FT>
7474
class GPUByFieldMicroOp : public GPUMicroOp<N, T> {
7575
public:
76+
static const int DIM = N;
77+
typedef T IDXTYPE;
78+
typedef FT FIELDTYPE;
79+
7680
GPUByFieldMicroOp(
7781
const IndexSpace<N, T> &_parent,
7882
std::vector<FieldDataDescriptor<IndexSpace<N,T>,FT> > _field_data,
@@ -87,7 +91,18 @@ namespace Realm {
8791
void add_sparsity_output(FT _val, SparsityMap<N, T> _sparsity);
8892

8993
protected:
90-
const IndexSpace<N, T> parent_space;
94+
friend struct RemoteMicroOpMessage<GPUByFieldMicroOp<N,T,FT> >;
95+
static ActiveMessageHandlerReg<RemoteMicroOpMessage<GPUByFieldMicroOp<N,T,FT> > > areg;
96+
97+
friend class PartitioningMicroOp;
98+
template <typename S>
99+
REALM_ATTR_WARN_UNUSED(bool serialize_params(S& s) const);
100+
101+
// construct from received packet
102+
template <typename S>
103+
GPUByFieldMicroOp(NodeID _requestor, AsyncMicroOp *_async_microop, S& s);
104+
105+
IndexSpace<N, T> parent_space;
91106
std::vector<FieldDataDescriptor<IndexSpace<N,T>,FT> > field_data;
92107
std::vector<FT> colors;
93108
std::map<FT, SparsityMap<N,T> > sparsity_outputs;
@@ -112,10 +127,13 @@ namespace Realm {
112127
virtual void print(std::ostream& os) const;
113128

114129
protected:
130+
NodeID exclusive_gpu_exec_node(void) const;
131+
115132
IndexSpace<N,T> parent;
116133
std::vector<FieldDataDescriptor<IndexSpace<N,T>,FT> > field_data;
117134
std::vector<FT> colors;
118135
std::vector<SparsityMap<N,T> > subspaces;
136+
int exclusive_gpu_owner;
119137
};
120138

121139
};

src/realm/deppart/byfield_gpu_impl.hpp

Lines changed: 26 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,20 @@ namespace Realm {
1717
template <int N, typename T, typename FT>
1818
void GPUByFieldMicroOp<N,T,FT>::execute()
1919
{
20+
// Resolve the local GPU processor now that we are guaranteed to be on the
21+
// correct node (dispatch() forwarded us here if the instance was remote).
22+
{
23+
Memory my_mem = field_data[0].inst.get_location();
24+
Processor best_proc;
25+
assert(choose_proc(best_proc, my_mem));
26+
Cuda::GPUProcessor *gpu_proc =
27+
dynamic_cast<Cuda::GPUProcessor *>(get_runtime()->get_processor_impl(best_proc));
28+
assert(gpu_proc);
29+
this->gpu = gpu_proc->gpu;
30+
this->stream = gpu_proc->gpu->get_deppart_stream();
31+
}
32+
33+
2034

2135
Cuda::AutoGPUContext agc(this->gpu);
2236

@@ -75,15 +89,14 @@ void GPUByFieldMicroOp<N,T,FT>::execute()
7589
}
7690

7791

78-
Memory zcpy_mem;
79-
assert(find_memory(zcpy_mem, Memory::Z_COPY_MEM, buffer_arena.location));
80-
81-
// We need to pass the accessors to the GPU so it can read field values.
82-
RegionInstance accessors_instance = this->realm_malloc(field_data.size() * sizeof(AffineAccessor<FT,N,T>), zcpy_mem);
83-
AffineAccessor<FT,N,T>* d_accessors = reinterpret_cast<AffineAccessor<FT,N,T>*>(AffineAccessor<char,1>(accessors_instance, 0).base);
92+
std::vector<AffineAccessor<FT,N,T>> h_accessors(field_data.size());
8493
for (size_t i = 0; i < field_data.size(); ++i) {
85-
d_accessors[i] = AffineAccessor<FT,N,T>(field_data[i].inst, field_data[i].field_offset);
94+
h_accessors[i] = AffineAccessor<FT,N,T>(field_data[i].inst, field_data[i].field_offset);
8695
}
96+
AffineAccessor<FT,N,T>* d_accessors = buffer_arena.alloc<AffineAccessor<FT,N,T>>(field_data.size());
97+
CUDA_CHECK(cudaMemcpyAsync(d_accessors, h_accessors.data(),
98+
field_data.size() * sizeof(AffineAccessor<FT,N,T>),
99+
cudaMemcpyHostToDevice, stream), stream);
87100

88101
buffer_arena.commit(false);
89102

@@ -103,7 +116,7 @@ void GPUByFieldMicroOp<N,T,FT>::execute()
103116
int count = 0;
104117
if (count) {}
105118
bool host_fallback = false;
106-
std::vector<RegionInstance> h_instances(colors.size(), RegionInstance::NO_INST);
119+
std::vector<Rect<N, T>*> host_rect_buffers(colors.size(), nullptr);
107120
std::vector<size_t> entry_counts(colors.size(), 0);
108121
while (num_completed < inst_space.num_entries) {
109122
try {
@@ -167,7 +180,7 @@ void GPUByFieldMicroOp<N,T,FT>::execute()
167180
});
168181

169182
if (host_fallback) {
170-
this->split_output(d_new_rects, num_new_rects, h_instances, entry_counts, buffer_arena);
183+
this->split_output(d_new_rects, num_new_rects, host_rect_buffers, entry_counts, buffer_arena);
171184
}
172185

173186
if (num_output==0 || host_fallback) {
@@ -216,7 +229,7 @@ void GPUByFieldMicroOp<N,T,FT>::execute()
216229
} else {
217230
host_fallback = true;
218231
if (num_output > 0) {
219-
this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena);
232+
this->split_output(output_start, num_output, host_rect_buffers, entry_counts, buffer_arena);
220233
}
221234
curr_tile = tile_size / 2;
222235
}
@@ -248,7 +261,7 @@ void GPUByFieldMicroOp<N,T,FT>::execute()
248261
return kv.second;
249262
});
250263
} catch (arena_oom&) {
251-
this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena);
264+
this->split_output(output_start, num_output, host_rect_buffers, entry_counts, buffer_arena);
252265
host_fallback = true;
253266
}
254267
}
@@ -261,10 +274,9 @@ void GPUByFieldMicroOp<N,T,FT>::execute()
261274
}
262275
size_t idx = color_indices.at(it.first);
263276
if (entry_counts[idx] > 0) {
264-
Rect<N, T>* h_rects = reinterpret_cast<Rect<N,T> *>(AffineAccessor<char,1>(h_instances[idx], 0).base);
265-
span<Rect<N, T>> h_rects_span(h_rects, entry_counts[idx]);
277+
span<Rect<N, T>> h_rects_span(host_rect_buffers[idx], entry_counts[idx]);
266278
impl->contribute_dense_rect_list(h_rects_span, true);
267-
h_instances[idx].destroy();
279+
deppart_host_free(host_rect_buffers[idx]);
268280
} else {
269281
impl->contribute_nothing();
270282
}

0 commit comments

Comments (0)