@@ -325,13 +325,61 @@ namespace Realm {
325325 bool _exclusive)
326326 : parent_space(_parent), field_data(_field_data) {
327327 this ->exclusive = _exclusive;
328- Memory my_mem = field_data[0 ].inst .get_location ();
329- Processor best_proc;
330- assert (choose_proc (best_proc, my_mem));
331- Cuda::GPUProcessor* gpu_proc = dynamic_cast <Cuda::GPUProcessor*>(get_runtime ()->get_processor_impl (best_proc));
332- assert (gpu_proc);
333- this ->gpu = gpu_proc->gpu ;
334- this ->stream = gpu_proc->gpu ->get_deppart_stream ();
328+ areg.force_instantiation ();
329+ // GPU setup (this->gpu, this->stream) deferred to execute(), which runs on the
330+ // correct node after dispatch() has forwarded to the instance owner if needed.
331+ }
332+
333+ template <int N, typename T, typename FT>
334+ template <typename S>
335+ GPUByFieldMicroOp<N, T, FT>::GPUByFieldMicroOp(
336+ NodeID _requestor, AsyncMicroOp *_async_microop, S& s)
337+ : GPUMicroOp<N,T>(_requestor, _async_microop)
338+ , parent_space() {
339+ bool ok = true ;
340+ size_t n = 0 ;
341+ ok = ok && (s >> parent_space);
342+ ok = ok && (s >> this ->exclusive );
343+ ok = ok && (s >> n);
344+ field_data.resize (n);
345+ for (size_t i = 0 ; i < n && ok; i++)
346+ ok = ok && (s >> field_data[i].index_space ) &&
347+ (s >> field_data[i].inst ) &&
348+ (s >> field_data[i].field_offset ) &&
349+ (s >> field_data[i].scratch_buffer );
350+ // Deserialize colors manually to avoid std::vector<bool> proxy issues
351+ size_t nc = 0 ;
352+ ok = ok && (s >> nc);
353+ for (size_t i = 0 ; i < nc && ok; i++) {
354+ FT c;
355+ ok = ok && (s >> c);
356+ if (ok) colors.push_back (c);
357+ }
358+ ok = ok && (s >> sparsity_outputs);
359+ assert (ok);
360+ (void )ok;
361+ }
362+
363+ template <int N, typename T, typename FT>
364+ template <typename S>
365+ bool GPUByFieldMicroOp<N, T, FT>::serialize_params(S& s) const {
366+ bool ok = true ;
367+ ok = ok && (s << parent_space);
368+ ok = ok && (s << this ->exclusive );
369+ ok = ok && (s << field_data.size ());
370+ for (size_t i = 0 ; i < field_data.size () && ok; i++)
371+ ok = ok && (s << field_data[i].index_space ) &&
372+ (s << field_data[i].inst ) &&
373+ (s << field_data[i].field_offset ) &&
374+ (s << field_data[i].scratch_buffer );
375+ // Serialize colors manually to avoid std::vector<bool> proxy issues
376+ ok = ok && (s << colors.size ());
377+ for (size_t i = 0 ; i < colors.size () && ok; i++) {
378+ FT c = colors[i];
379+ ok = ok && (s << c);
380+ }
381+ ok = ok && (s << sparsity_outputs);
382+ return ok;
335383 }
336384
337385 template <int N, typename T, typename FT>
@@ -342,6 +390,17 @@ namespace Realm {
342390 void GPUByFieldMicroOp<N, T, FT>::dispatch(
343391 PartitioningOperation *op, bool inline_ok) {
344392
393+ // GPU by-field must execute on the node that owns the GPU memory
394+ NodeID exec_node = ID (field_data[0 ].inst ).instance_owner_node ();
395+ if (this ->exclusive ) {
396+ for (const auto & it : sparsity_outputs)
397+ assert (NodeID (ID (it.second ).sparsity_creator_node ()) == exec_node);
398+ }
399+ if (exec_node != Network::my_node_id) {
400+ PartitioningMicroOp::template forward_microop<GPUByFieldMicroOp<N,T,FT> >(exec_node, op, this );
401+ return ;
402+ }
403+
345404 // We have to register ourselves as a waiter on sparse inputs before dispatching.
346405
347406 for (size_t i = 0 ; i < field_data.size (); i++) {
@@ -367,6 +426,10 @@ namespace Realm {
367426 sparsity_outputs[_val] = _sparsity;
368427 }
369428
429+ template <int N, typename T, typename FT>
430+ ActiveMessageHandlerReg<RemoteMicroOpMessage<GPUByFieldMicroOp<N, T, FT> > >
431+ GPUByFieldMicroOp<N, T, FT>::areg;
432+
370433#endif
371434
372435
@@ -383,12 +446,26 @@ namespace Realm {
383446 : PartitioningOperation(reqs, _finish_event, _finish_gen)
384447 , parent(_parent)
385448 , field_data(_field_data)
449+ , exclusive_gpu_owner(exclusive_gpu_exec_node())
386450 {}
387451
388452 template <int N, typename T, typename FT>
389453 ByFieldOperation<N,T,FT>::~ByFieldOperation (void )
390454 {}
391455
456+ template <int N, typename T, typename FT>
457+ NodeID ByFieldOperation<N,T,FT>::exclusive_gpu_exec_node(void ) const
458+ {
459+ if (field_data.size () != 1 )
460+ return -1 ;
461+
462+ Memory::Kind kind = field_data[0 ].inst .get_location ().kind ();
463+ if ((kind != Memory::GPU_FB_MEM) && (kind != Memory::Z_COPY_MEM))
464+ return -1 ;
465+
466+ return ID (field_data[0 ].inst ).instance_owner_node ();
467+ }
468+
392469 template <int N, typename T, typename FT>
393470 IndexSpace<N,T> ByFieldOperation<N,T,FT>::add_color(FT color)
394471 {
@@ -401,8 +478,13 @@ namespace Realm {
401478 subspace.bounds = parent.bounds ;
402479
403480 // get a sparsity ID by round-robin'ing across the nodes that have field data
404- int target_node = ID (field_data[colors.size () % field_data.size ()].inst ).instance_owner_node ();
405- SparsityMap<N,T> sparsity = get_runtime ()->get_available_sparsity_impl (target_node)->me .convert <SparsityMap<N,T> >();
481+ int target_node = (exclusive_gpu_owner >= 0 ) ?
482+ exclusive_gpu_owner :
483+ ID (field_data[colors.size () % field_data.size ()].inst ).instance_owner_node ();
484+ if (exclusive_gpu_owner >= 0 )
485+ assert (target_node == exclusive_gpu_exec_node ());
486+ SparsityMap<N,T> sparsity =
487+ create_deppart_output_sparsity (target_node).convert <SparsityMap<N, T>>();
406488 subspace.sparsity = sparsity;
407489
408490 colors.push_back (color);
0 commit comments