Skip to content

Commit 962545c

Browse files
committed
for flecsii
1 parent 6854662 commit 962545c

17 files changed

Lines changed: 1047 additions & 196 deletions

src/realm/deppart/byfield.cc

Lines changed: 91 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -325,13 +325,61 @@ namespace Realm {
325325
bool _exclusive)
326326
: parent_space(_parent), field_data(_field_data) {
327327
this->exclusive = _exclusive;
328-
Memory my_mem = field_data[0].inst.get_location();
329-
Processor best_proc;
330-
assert(choose_proc(best_proc, my_mem));
331-
Cuda::GPUProcessor* gpu_proc = dynamic_cast<Cuda::GPUProcessor*>(get_runtime()->get_processor_impl(best_proc));
332-
assert(gpu_proc);
333-
this->gpu = gpu_proc->gpu;
334-
this->stream = gpu_proc->gpu->get_deppart_stream();
328+
areg.force_instantiation();
329+
// GPU setup (this->gpu, this->stream) deferred to execute(), which runs on the
330+
// correct node after dispatch() has forwarded to the instance owner if needed.
331+
}
332+
333+
template<int N, typename T, typename FT>
334+
template <typename S>
335+
GPUByFieldMicroOp<N, T, FT>::GPUByFieldMicroOp(
336+
NodeID _requestor, AsyncMicroOp *_async_microop, S& s)
337+
: GPUMicroOp<N,T>(_requestor, _async_microop)
338+
, parent_space() {
339+
bool ok = true;
340+
size_t n = 0;
341+
ok = ok && (s >> parent_space);
342+
ok = ok && (s >> this->exclusive);
343+
ok = ok && (s >> n);
344+
field_data.resize(n);
345+
for(size_t i = 0; i < n && ok; i++)
346+
ok = ok && (s >> field_data[i].index_space) &&
347+
(s >> field_data[i].inst) &&
348+
(s >> field_data[i].field_offset) &&
349+
(s >> field_data[i].scratch_buffer);
350+
// Deserialize colors manually to avoid std::vector<bool> proxy issues
351+
size_t nc = 0;
352+
ok = ok && (s >> nc);
353+
for(size_t i = 0; i < nc && ok; i++) {
354+
FT c;
355+
ok = ok && (s >> c);
356+
if(ok) colors.push_back(c);
357+
}
358+
ok = ok && (s >> sparsity_outputs);
359+
assert(ok);
360+
(void)ok;
361+
}
362+
363+
template<int N, typename T, typename FT>
364+
template <typename S>
365+
bool GPUByFieldMicroOp<N, T, FT>::serialize_params(S& s) const {
366+
bool ok = true;
367+
ok = ok && (s << parent_space);
368+
ok = ok && (s << this->exclusive);
369+
ok = ok && (s << field_data.size());
370+
for(size_t i = 0; i < field_data.size() && ok; i++)
371+
ok = ok && (s << field_data[i].index_space) &&
372+
(s << field_data[i].inst) &&
373+
(s << field_data[i].field_offset) &&
374+
(s << field_data[i].scratch_buffer);
375+
// Serialize colors manually to avoid std::vector<bool> proxy issues
376+
ok = ok && (s << colors.size());
377+
for(size_t i = 0; i < colors.size() && ok; i++) {
378+
FT c = colors[i];
379+
ok = ok && (s << c);
380+
}
381+
ok = ok && (s << sparsity_outputs);
382+
return ok;
335383
}
336384

337385
template<int N, typename T, typename FT>
@@ -342,6 +390,17 @@ namespace Realm {
342390
void GPUByFieldMicroOp<N, T, FT>::dispatch(
343391
PartitioningOperation *op, bool inline_ok) {
344392

393+
// GPU by-field must execute on the node that owns the GPU memory
394+
NodeID exec_node = ID(field_data[0].inst).instance_owner_node();
395+
if(this->exclusive) {
396+
for(const auto& it : sparsity_outputs)
397+
assert(NodeID(ID(it.second).sparsity_creator_node()) == exec_node);
398+
}
399+
if(exec_node != Network::my_node_id) {
400+
PartitioningMicroOp::template forward_microop<GPUByFieldMicroOp<N,T,FT> >(exec_node, op, this);
401+
return;
402+
}
403+
345404
// We have to register ourselves as a waiter on sparse inputs before dispatching.
346405

347406
for (size_t i = 0; i < field_data.size(); i++) {
@@ -367,6 +426,10 @@ namespace Realm {
367426
sparsity_outputs[_val] = _sparsity;
368427
}
369428

429+
template <int N, typename T, typename FT>
430+
ActiveMessageHandlerReg<RemoteMicroOpMessage<GPUByFieldMicroOp<N, T, FT> > >
431+
GPUByFieldMicroOp<N, T, FT>::areg;
432+
370433
#endif
371434

372435

@@ -383,12 +446,26 @@ namespace Realm {
383446
: PartitioningOperation(reqs, _finish_event, _finish_gen)
384447
, parent(_parent)
385448
, field_data(_field_data)
449+
, exclusive_gpu_owner(exclusive_gpu_exec_node())
386450
{}
387451

388452
template <int N, typename T, typename FT>
389453
ByFieldOperation<N,T,FT>::~ByFieldOperation(void)
390454
{}
391455

456+
template <int N, typename T, typename FT>
457+
NodeID ByFieldOperation<N,T,FT>::exclusive_gpu_exec_node(void) const
458+
{
459+
if(field_data.size() != 1)
460+
return -1;
461+
462+
Memory::Kind kind = field_data[0].inst.get_location().kind();
463+
if((kind != Memory::GPU_FB_MEM) && (kind != Memory::Z_COPY_MEM))
464+
return -1;
465+
466+
return ID(field_data[0].inst).instance_owner_node();
467+
}
468+
392469
template <int N, typename T, typename FT>
393470
IndexSpace<N,T> ByFieldOperation<N,T,FT>::add_color(FT color)
394471
{
@@ -401,8 +478,13 @@ namespace Realm {
401478
subspace.bounds = parent.bounds;
402479

403480
// get a sparsity ID by round-robin'ing across the nodes that have field data
404-
int target_node = ID(field_data[colors.size() % field_data.size()].inst).instance_owner_node();
405-
SparsityMap<N,T> sparsity = get_runtime()->get_available_sparsity_impl(target_node)->me.convert<SparsityMap<N,T> >();
481+
int target_node = (exclusive_gpu_owner >= 0) ?
482+
exclusive_gpu_owner :
483+
ID(field_data[colors.size() % field_data.size()].inst).instance_owner_node();
484+
if(exclusive_gpu_owner >= 0)
485+
assert(target_node == exclusive_gpu_exec_node());
486+
SparsityMap<N,T> sparsity =
487+
create_deppart_output_sparsity(target_node).convert<SparsityMap<N, T>>();
406488
subspace.sparsity = sparsity;
407489

408490
colors.push_back(color);

src/realm/deppart/byfield.h

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,10 @@ namespace Realm {
7373
template<int N, typename T, typename FT>
7474
class GPUByFieldMicroOp : public GPUMicroOp<N, T> {
7575
public:
76+
static const int DIM = N;
77+
typedef T IDXTYPE;
78+
typedef FT FIELDTYPE;
79+
7680
GPUByFieldMicroOp(
7781
const IndexSpace<N, T> &_parent,
7882
std::vector<FieldDataDescriptor<IndexSpace<N,T>,FT> > _field_data,
@@ -87,7 +91,18 @@ namespace Realm {
8791
void add_sparsity_output(FT _val, SparsityMap<N, T> _sparsity);
8892

8993
protected:
90-
const IndexSpace<N, T> parent_space;
94+
friend struct RemoteMicroOpMessage<GPUByFieldMicroOp<N,T,FT> >;
95+
static ActiveMessageHandlerReg<RemoteMicroOpMessage<GPUByFieldMicroOp<N,T,FT> > > areg;
96+
97+
friend class PartitioningMicroOp;
98+
template <typename S>
99+
REALM_ATTR_WARN_UNUSED(bool serialize_params(S& s) const);
100+
101+
// construct from received packet
102+
template <typename S>
103+
GPUByFieldMicroOp(NodeID _requestor, AsyncMicroOp *_async_microop, S& s);
104+
105+
IndexSpace<N, T> parent_space;
91106
std::vector<FieldDataDescriptor<IndexSpace<N,T>,FT> > field_data;
92107
std::vector<FT> colors;
93108
std::map<FT, SparsityMap<N,T> > sparsity_outputs;
@@ -112,10 +127,13 @@ namespace Realm {
112127
virtual void print(std::ostream& os) const;
113128

114129
protected:
130+
NodeID exclusive_gpu_exec_node(void) const;
131+
115132
IndexSpace<N,T> parent;
116133
std::vector<FieldDataDescriptor<IndexSpace<N,T>,FT> > field_data;
117134
std::vector<FT> colors;
118135
std::vector<SparsityMap<N,T> > subspaces;
136+
int exclusive_gpu_owner;
119137
};
120138

121139
};

src/realm/deppart/byfield_gpu_impl.hpp

Lines changed: 26 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,20 @@ namespace Realm {
1717
template <int N, typename T, typename FT>
1818
void GPUByFieldMicroOp<N,T,FT>::execute()
1919
{
20+
// Resolve the local GPU processor now that we are guaranteed to be on the
21+
// correct node (dispatch() forwarded us here if the instance was remote).
22+
{
23+
Memory my_mem = field_data[0].inst.get_location();
24+
Processor best_proc;
25+
assert(choose_proc(best_proc, my_mem));
26+
Cuda::GPUProcessor *gpu_proc =
27+
dynamic_cast<Cuda::GPUProcessor *>(get_runtime()->get_processor_impl(best_proc));
28+
assert(gpu_proc);
29+
this->gpu = gpu_proc->gpu;
30+
this->stream = gpu_proc->gpu->get_deppart_stream();
31+
}
32+
33+
2034

2135
Cuda::AutoGPUContext agc(this->gpu);
2236

@@ -75,15 +89,14 @@ void GPUByFieldMicroOp<N,T,FT>::execute()
7589
}
7690

7791

78-
Memory zcpy_mem;
79-
assert(find_memory(zcpy_mem, Memory::Z_COPY_MEM, buffer_arena.location));
80-
81-
// We need to pass the accessors to the GPU so it can read field values.
82-
RegionInstance accessors_instance = this->realm_malloc(field_data.size() * sizeof(AffineAccessor<FT,N,T>), zcpy_mem);
83-
AffineAccessor<FT,N,T>* d_accessors = reinterpret_cast<AffineAccessor<FT,N,T>*>(AffineAccessor<char,1>(accessors_instance, 0).base);
92+
std::vector<AffineAccessor<FT,N,T>> h_accessors(field_data.size());
8493
for (size_t i = 0; i < field_data.size(); ++i) {
85-
d_accessors[i] = AffineAccessor<FT,N,T>(field_data[i].inst, field_data[i].field_offset);
94+
h_accessors[i] = AffineAccessor<FT,N,T>(field_data[i].inst, field_data[i].field_offset);
8695
}
96+
AffineAccessor<FT,N,T>* d_accessors = buffer_arena.alloc<AffineAccessor<FT,N,T>>(field_data.size());
97+
CUDA_CHECK(cudaMemcpyAsync(d_accessors, h_accessors.data(),
98+
field_data.size() * sizeof(AffineAccessor<FT,N,T>),
99+
cudaMemcpyHostToDevice, stream), stream);
87100

88101
buffer_arena.commit(false);
89102

@@ -103,7 +116,7 @@ void GPUByFieldMicroOp<N,T,FT>::execute()
103116
int count = 0;
104117
if (count) {}
105118
bool host_fallback = false;
106-
std::vector<RegionInstance> h_instances(colors.size(), RegionInstance::NO_INST);
119+
std::vector<Rect<N, T>*> host_rect_buffers(colors.size(), nullptr);
107120
std::vector<size_t> entry_counts(colors.size(), 0);
108121
while (num_completed < inst_space.num_entries) {
109122
try {
@@ -167,7 +180,7 @@ void GPUByFieldMicroOp<N,T,FT>::execute()
167180
});
168181

169182
if (host_fallback) {
170-
this->split_output(d_new_rects, num_new_rects, h_instances, entry_counts, buffer_arena);
183+
this->split_output(d_new_rects, num_new_rects, host_rect_buffers, entry_counts, buffer_arena);
171184
}
172185

173186
if (num_output==0 || host_fallback) {
@@ -216,7 +229,7 @@ void GPUByFieldMicroOp<N,T,FT>::execute()
216229
} else {
217230
host_fallback = true;
218231
if (num_output > 0) {
219-
this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena);
232+
this->split_output(output_start, num_output, host_rect_buffers, entry_counts, buffer_arena);
220233
}
221234
curr_tile = tile_size / 2;
222235
}
@@ -248,7 +261,7 @@ void GPUByFieldMicroOp<N,T,FT>::execute()
248261
return kv.second;
249262
});
250263
} catch (arena_oom&) {
251-
this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena);
264+
this->split_output(output_start, num_output, host_rect_buffers, entry_counts, buffer_arena);
252265
host_fallback = true;
253266
}
254267
}
@@ -261,10 +274,9 @@ void GPUByFieldMicroOp<N,T,FT>::execute()
261274
}
262275
size_t idx = color_indices.at(it.first);
263276
if (entry_counts[idx] > 0) {
264-
Rect<N, T>* h_rects = reinterpret_cast<Rect<N,T> *>(AffineAccessor<char,1>(h_instances[idx], 0).base);
265-
span<Rect<N, T>> h_rects_span(h_rects, entry_counts[idx]);
277+
span<Rect<N, T>> h_rects_span(host_rect_buffers[idx], entry_counts[idx]);
266278
impl->contribute_dense_rect_list(h_rects_span, true);
267-
h_instances[idx].destroy();
279+
deppart_host_free(host_rect_buffers[idx]);
268280
} else {
269281
impl->contribute_nothing();
270282
}

0 commit comments

Comments (0)