From 64c32c78dc8292bb16b4912a2b31f636be9cb15b Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Fri, 9 Jan 2026 12:14:21 -0800 Subject: [PATCH 01/32] cleaned up diff --- cmake/deppart_tmpl.cu.in | 20 + src/CMakeLists.txt | 36 +- src/realm/deppart/byfield.cc | 120 +- src/realm/deppart/byfield.h | 24 + src/realm/deppart/byfield_gpu_impl.hpp | 155 ++ src/realm/deppart/byfield_gpu_kernels.hpp | 57 + src/realm/deppart/byfield_gpu_tmpl.cu | 64 + src/realm/deppart/byfield_tmpl.cc | 15 +- src/realm/deppart/image.cc | 402 ++++- src/realm/deppart/image.h | 291 ++-- src/realm/deppart/image_gpu_impl.hpp | 446 +++++ src/realm/deppart/image_gpu_kernels.hpp | 167 ++ src/realm/deppart/image_gpu_tmpl.cu | 62 + src/realm/deppart/image_tmpl.cc | 7 +- src/realm/deppart/partitions.cc | 27 +- src/realm/deppart/partitions.h | 241 +++ src/realm/deppart/partitions_gpu.cu | 29 + src/realm/deppart/partitions_gpu_impl.hpp | 1604 +++++++++++++++++ src/realm/deppart/partitions_gpu_kernels.hpp | 811 +++++++++ src/realm/deppart/setops.cc | 26 +- src/realm/deppart/sparsity_impl.cc | 181 +- src/realm/deppart/sparsity_impl.h | 6 + src/realm/deppart/untemplated_gpu_kernels.cu | 119 ++ src/realm/indexspace.h | 20 + src/realm/indexspace.inl | 193 ++- src/realm/inst_layout.inl | 10 +- src/realm/sparsity.h | 23 +- src/realm/sparsity.inl | 32 +- tests/CMakeLists.txt | 4 + tests/deppart.cc | 1621 +++++++++++++++++- tests/gpu_deppart_1d.cc | 327 ++++ 31 files changed, 6742 insertions(+), 398 deletions(-) create mode 100644 cmake/deppart_tmpl.cu.in create mode 100644 src/realm/deppart/byfield_gpu_impl.hpp create mode 100644 src/realm/deppart/byfield_gpu_kernels.hpp create mode 100644 src/realm/deppart/byfield_gpu_tmpl.cu create mode 100644 src/realm/deppart/image_gpu_impl.hpp create mode 100644 src/realm/deppart/image_gpu_kernels.hpp create mode 100644 src/realm/deppart/image_gpu_tmpl.cu create mode 100644 src/realm/deppart/partitions_gpu.cu create mode 100644 src/realm/deppart/partitions_gpu_impl.hpp 
create mode 100644 src/realm/deppart/partitions_gpu_kernels.hpp create mode 100644 src/realm/deppart/untemplated_gpu_kernels.cu create mode 100644 tests/gpu_deppart_1d.cc diff --git a/cmake/deppart_tmpl.cu.in b/cmake/deppart_tmpl.cu.in new file mode 100644 index 0000000000..01978e21ac --- /dev/null +++ b/cmake/deppart_tmpl.cu.in @@ -0,0 +1,20 @@ +/* + * Copyright 2025 Stanford University, NVIDIA Corporation + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#cmakedefine INST_N1 @INST_N1@ +#cmakedefine INST_N2 @INST_N2@ +#include "@SRCFILE@_gpu_tmpl.cu" \ No newline at end of file diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 7054eb2e94..fd0b1fb81a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -38,7 +38,6 @@ set(REALM_SOURCES nodeset.cc operation.cc proc_impl.cc - realm_assert.cc repl_heap.cc rsrv_impl.cc runtime_impl.cc @@ -64,12 +63,13 @@ set(REALM_SOURCES deppart/partitions.cc deppart/setops.cc deppart/sparsity_impl.cc + deppart/untemplated_gpu_kernels.cu numa/numa_module.cc numa/numasysif.cc procset/procset_module.cc ) -if(REALM_USE_CUDA) +if(TARGET CUDA::cuda_driver AND REALM_USE_CUDA) list(APPEND REALM_SOURCES cuda/cuda_module.cc cuda/cuda_internal.cc cuda/cuda_access.cc) if(REALM_USE_NVTX) list(APPEND REALM_SOURCES nvtx.cc) @@ -77,15 +77,15 @@ if(REALM_USE_CUDA) list(APPEND REALM_CUDA_SOURCES cuda/cuda_memcpy.cu) endif() -if(REALM_USE_HIP) +if(TARGET hip::host) list(APPEND REALM_SOURCES hip/hip_module.cc hip/hip_internal.cc hip/hip_access.cc) endif() -if(REALM_USE_LLVM) +if(TARGET LLVM::LLVM) list(APPEND REALM_SOURCES llvmjit/llvmjit_internal.cc llvmjit/llvmjit_module.cc) endif() -if(REALM_USE_HDF5) +if(TARGET hdf5::hdf5) list(APPEND REALM_SOURCES hdf5/hdf5_module.cc hdf5/hdf5_internal.cc hdf5/hdf5_access.cc) endif() @@ -100,11 +100,11 @@ if(REALM_USE_PREALM) list(APPEND REALM_SOURCES prealm/prealm.cc) endif() -if(REALM_USE_PYTHON) +if(TARGET Python3::Python) list(APPEND REALM_SOURCES python/python_module.cc python/python_source.cc) endif() -if(REALM_USE_UCX) +if(TARGET ucx::ucp) list( APPEND REALM_SOURCES @@ -119,14 +119,12 @@ if(REALM_USE_UCX) ) endif() -if(REALM_USE_GASNETEX) - if (NOT REALM_ENABLE_GASNETEX_WRAPPER) - list(APPEND REALM_SOURCES gasnet1/gasnet1_module.cc gasnet1/gasnetmsg.cc) - endif() +if(TARGET GASNet::GASNet) + list(APPEND REALM_SOURCES gasnet1/gasnet1_module.cc gasnet1/gasnetmsg.cc) list(APPEND REALM_SOURCES gasnetex/gasnetex_module.cc 
gasnetex/gasnetex_internal.cc) endif() -if(REALM_USE_MPI) +if(TARGET MPI::MPI_CXX) list(APPEND REALM_SOURCES mpi/mpi_module.cc mpi/am_mpi.cc) endif() @@ -145,7 +143,7 @@ configure_file( @ONLY ) -# generate per-dimension object files for deppart stuff +# Generate per-dimension object files for CPU deppart. foreach(INST_N1 RANGE 1 ${REALM_MAX_DIM}) foreach(INST_N2 RANGE 1 ${REALM_MAX_DIM}) foreach(SRCFILE realm/deppart/image realm/deppart/preimage realm/deppart/byfield) @@ -157,6 +155,18 @@ foreach(INST_N1 RANGE 1 ${REALM_MAX_DIM}) endforeach() endforeach() +# Generate per-dimension object files for GPU deppart. +foreach(INST_N1 RANGE 1 ${REALM_MAX_DIM}) + foreach(INST_N2 RANGE 1 ${REALM_MAX_DIM}) + foreach(SRCFILE realm/deppart/byfield realm/deppart/image) + set(_result_file "${CMAKE_CURRENT_BINARY_DIR}/${SRCFILE}_gpu_${INST_N1}_${INST_N2}.cu") + # use cmake's configure_file for a portable way of creating wrapper source files + configure_file("${PROJECT_SOURCE_DIR}/cmake/deppart_tmpl.cu.in" "${_result_file}") + list(APPEND REALM_SOURCES "${_result_file}") + endforeach() + endforeach() +endforeach() + set(REALM_SOURCES ${REALM_SOURCES} PARENT_SCOPE diff --git a/src/realm/deppart/byfield.cc b/src/realm/deppart/byfield.cc index cc6a0d6cc4..51b106f519 100644 --- a/src/realm/deppart/byfield.cc +++ b/src/realm/deppart/byfield.cc @@ -277,8 +277,55 @@ namespace Realm { (void)ok; } - template - ActiveMessageHandlerReg > > ByFieldMicroOp::areg; + template + ActiveMessageHandlerReg > > ByFieldMicroOp::areg; + + + //////////////////////////////////////////////////////////////////////// + // + // class GPUByFieldMicroOp + + template + GPUByFieldMicroOp::GPUByFieldMicroOp( + const IndexSpace &_parent, + std::vector, FT> > _field_data, + bool _exclusive) + : parent_space(_parent), field_data(_field_data) { + this->exclusive = _exclusive; + } + + template + GPUByFieldMicroOp::~GPUByFieldMicroOp() { + } + + template + void GPUByFieldMicroOp::dispatch( + PartitioningOperation *op, 
bool inline_ok) { + + // We have to register ourselves as a waiter on sparse inputs before dispatching. + + for (size_t i = 0; i < field_data.size(); i++) { + IndexSpace inst_space = field_data[i].index_space; + if (!inst_space.dense()) { + bool registered = SparsityMapImpl::lookup(inst_space.sparsity)->add_waiter(this, true /*precise*/); + if (registered) + this->wait_count.fetch_add(1); + } + } + + if (!parent_space.dense()) { + bool registered = SparsityMapImpl::lookup(parent_space.sparsity)->add_waiter(this, true /*precise*/); + if (registered) this->wait_count.fetch_add(1); + } + this->finish_dispatch(op, inline_ok); + } + + template + void GPUByFieldMicroOp::add_sparsity_output( + FT _val, SparsityMap _sparsity) { + colors.push_back(_val); + sparsity_outputs[_val] = _sparsity; + } //////////////////////////////////////////////////////////////////////// @@ -322,21 +369,44 @@ namespace Realm { return subspace; } - template - void ByFieldOperation::execute(void) - { - for(size_t i = 0; i < subspaces.size(); i++) - SparsityMapImpl::lookup(subspaces[i])->set_contributor_count(field_data.size()); - - for(size_t i = 0; i < field_data.size(); i++) { - ByFieldMicroOp *uop = new ByFieldMicroOp(parent, - field_data[i].index_space, - field_data[i].inst, - field_data[i].field_offset); - for(size_t j = 0; j < colors.size(); j++) - uop->add_sparsity_output(colors[j], subspaces[j]); - //uop.set_value_set(colors); - uop->dispatch(this, true /* ok to run in this thread */); + template + void ByFieldOperation::execute(void) { + + + // If the field data is on the GPU, we need to launch a GPUByFieldMicroOp. + // Rather than one micro-op per field, we can do them all in one micro-op. + // Launching multiple GPU micro-ops just adds overhead, and + // there isn't enough work to need multiple GPUs. 
+ std::vector,FT> > gpu_field_data; + std::vector,FT> > cpu_field_data; + for (size_t i = 0; i < field_data.size(); i++) { + if (field_data[i].inst.get_location().kind() == Memory::GPU_FB_MEM) { + gpu_field_data.push_back(field_data[i]); + } else { + cpu_field_data.push_back(field_data[i]); + } + } + if (!cpu_field_data.empty()) { + for (size_t i = 0; i < subspaces.size(); i++) + SparsityMapImpl::lookup(subspaces[i])->set_contributor_count(cpu_field_data.size() + (gpu_field_data.empty() ? 0 : 1)); + for (size_t i = 0; i < cpu_field_data.size(); i++) { + ByFieldMicroOp *uop = new ByFieldMicroOp(parent, + cpu_field_data[i].index_space, + cpu_field_data[i].inst, + cpu_field_data[i].field_offset); + for (size_t j = 0; j < colors.size(); j++) + uop->add_sparsity_output(colors[j], subspaces[j]); + + uop->dispatch(this, true /* ok to run in this thread */); + } + } + if (!gpu_field_data.empty()) { + GPUByFieldMicroOp *uop = new GPUByFieldMicroOp(parent, gpu_field_data, cpu_field_data.empty()); + for (size_t i = 0; i < colors.size(); i++) { + uop->add_sparsity_output(colors[i], subspaces[i]); + } + uop->dispatch(this, false); + } } @@ -345,20 +415,4 @@ namespace Realm { { os << "ByFieldOperation(" << parent << ")"; } - -#define DOIT(N,T,F) \ - template class ByFieldMicroOp; \ - template class ByFieldOperation; \ - template ByFieldMicroOp::ByFieldMicroOp(NodeID, AsyncMicroOp *, Serialization::FixedBufferDeserializer&); \ - template Event IndexSpace::create_subspaces_by_field(const std::vector,F> >&, \ - const std::vector&, \ - std::vector >&, \ - const ProfilingRequestSet &, \ - Event) const; -#ifndef REALM_TEMPLATES_ONLY - FOREACH_NTF(DOIT) -#endif - - // instantiations of point/rect-field templates handled in byfield_tmpl.cc - }; diff --git a/src/realm/deppart/byfield.h b/src/realm/deppart/byfield.h index 1ff62b415e..92902efbd1 100644 --- a/src/realm/deppart/byfield.h +++ b/src/realm/deppart/byfield.h @@ -21,6 +21,7 @@ #define REALM_DEPPART_BYFIELD_H #include 
"realm/deppart/partitions.h" +#include "realm/deppart/rectlist.h" namespace Realm { @@ -67,6 +68,29 @@ namespace Realm { std::map > sparsity_outputs; }; + template + class GPUByFieldMicroOp : public GPUMicroOp { + public: + GPUByFieldMicroOp( + const IndexSpace &_parent, + std::vector,FT> > _field_data, + bool _exclusive); + + virtual ~GPUByFieldMicroOp(void); + + virtual void execute(void); + + void dispatch(PartitioningOperation *op, bool inline_ok); + + void add_sparsity_output(FT _val, SparsityMap _sparsity); + + protected: + const IndexSpace parent_space; + std::vector,FT> > field_data; + std::vector colors; + std::map > sparsity_outputs; + }; + template class ByFieldOperation : public PartitioningOperation { public: diff --git a/src/realm/deppart/byfield_gpu_impl.hpp b/src/realm/deppart/byfield_gpu_impl.hpp new file mode 100644 index 0000000000..f2aa8c3288 --- /dev/null +++ b/src/realm/deppart/byfield_gpu_impl.hpp @@ -0,0 +1,155 @@ +#pragma once +#include "realm/deppart/byfield.h" +#include "realm/deppart/byfield_gpu_kernels.hpp" +#include "realm/deppart/partitions_gpu_impl.hpp" +#include +#include "realm/nvtx.h" + +namespace Realm { + +/* + * Input (stored in MicroOp): Array of field instances, a parent index space, and a list of colors + * Output: A list of (potentially overlapping) points in original instances ∩ parent index space marked with their color, + * which it then sends off to complete_pipeline. + * Approach: Intersect all instance rectangles with parent rectangles in parallel. For surviving rectangles, use + * prefix sum + binary search to iterate over these in parallel and mark each point with its color. + */ +template +void GPUByFieldMicroOp::execute() +{ + + // For profiling. 
+ NVTX_DEPPART(byfield_gpu); + + cudaStream_t stream = Cuda::get_task_cuda_stream(); + + Memory my_mem = field_data[0].inst.get_location(); + + collapsed_space inst_space; + + const char* val = std::getenv("TILE_SIZE"); // or any env var + size_t tile_size = 100000000; //default + if (val) { + tile_size = atoi(val); + } + + RegionInstance fixed_buffer = this->realm_malloc(tile_size, my_mem); + Arena buffer_arena(reinterpret_cast(AffineAccessor(fixed_buffer, 0).base), tile_size); + + inst_space.offsets = buffer_arena.alloc(field_data.size() + 1); + inst_space.num_children = field_data.size(); + + GPUMicroOp::collapse_multi_space(field_data, inst_space, buffer_arena, stream); + + collapsed_space collapsed_parent; + + // We collapse the parent space to undifferentiate between dense and sparse and match downstream APIs. + GPUMicroOp::collapse_parent_space(parent_space, collapsed_parent, buffer_arena, stream); + + + // This is used for count + emit: first pass counts how many rectangles survive intersection, second pass uses the counter + // to figure out where to write each rectangle. + RegionInstance inst_counters_instance = this->realm_malloc((2*field_data.size() + 1) * sizeof(uint32_t), my_mem); + uint32_t* d_inst_counters = reinterpret_cast(AffineAccessor(inst_counters_instance, 0).base); + + // This will be a prefix sum over the counters, used first to figure out where to write in the emit phase, and second + // to track which instance each rectangle came from in the populate phase. + uint32_t* d_inst_prefix = d_inst_counters + field_data.size(); + size_t num_valid_rects = 0; + Rect* d_valid_rects; + + // Here we intersect the instance spaces with the parent space, and make sure we know which instance each resulting rectangle came from. + GPUMicroOp::template construct_input_rectlist>(inst_space, collapsed_parent, d_valid_rects, num_valid_rects, d_inst_counters, d_inst_prefix, buffer_arena, stream); + + + // Early out if we don't have any rectangles. 
+ if (num_valid_rects == 0) { + for (std::pair> it : sparsity_outputs) { + SparsityMapImpl *impl = SparsityMapImpl::lookup(it.second); + if (this->exclusive) { + impl->gpu_finalize(); + } else { + impl->contribute_nothing(); + } + } + inst_counters_instance.destroy(); + return; + } + + + // Prefix sum the valid rectangles by volume. + size_t total_pts; + + size_t* d_prefix_rects; + GPUMicroOp::volume_prefix_sum(d_valid_rects, num_valid_rects, d_prefix_rects, total_pts, buffer_arena, stream); + + // Now we have everything we need to actually populate our outputs. + RegionInstance points_instance = this->realm_malloc(total_pts * sizeof(PointDesc), my_mem); + PointDesc* d_points = reinterpret_cast*>(AffineAccessor(points_instance, 0).base); + + FT* d_colors; + RegionInstance colors_instance; + + + // Memcpying a boolean vector breaks things for some reason so we have this disgusting workaround. + if constexpr(std::is_same_v) { + std::vector flat_colors(colors.size()); + for (size_t i = 0; i < colors.size(); i++) { + flat_colors[i] = colors[i] ? 1 : 0; + } + colors_instance = this->realm_malloc(total_pts * sizeof(PointDesc), my_mem); + uint8_t* d_flat_colors = reinterpret_cast(AffineAccessor(colors_instance, 0).base); + CUDA_CHECK(cudaMemcpyAsync(d_flat_colors, flat_colors.data(), colors.size() * sizeof(uint8_t), cudaMemcpyHostToDevice, stream), stream); + d_colors = reinterpret_cast(d_flat_colors); + } else { + colors_instance = this->realm_malloc(colors.size() * sizeof(FT), my_mem); + d_colors = reinterpret_cast(AffineAccessor(colors_instance, 0).base); + CUDA_CHECK(cudaMemcpyAsync(d_colors, colors.data(), colors.size() * sizeof(FT), cudaMemcpyHostToDevice, stream), stream); + } + + + Memory zcpy_mem; + assert(find_memory(zcpy_mem, Memory::Z_COPY_MEM)); + + // We need to pass the accessors to the GPU so it can read field values. 
+ RegionInstance accessors_instance = this->realm_malloc(field_data.size() * sizeof(AffineAccessor), zcpy_mem); + AffineAccessor* d_accessors = reinterpret_cast*>(AffineAccessor(accessors_instance, 0).base); + for (size_t i = 0; i < field_data.size(); ++i) { + d_accessors[i] = AffineAccessor(field_data[i].inst, field_data[i].field_offset); + } + + + + // This is where the work is actually done - each thread figures out which points to read, reads it, marks a PointDesc with its color, and writes it out. + byfield_gpuPopulateBitmasksKernel<<>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, d_colors, total_pts, colors.size(), num_valid_rects, field_data.size(), d_points); + KERNEL_CHECK(stream); + + + // Map colors to their output index to match send output iterator. + std::map color_indices; + for (size_t i = 0; i < colors.size(); i++) { + color_indices[colors[i]] = i; + } + + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + colors_instance.destroy(); + accessors_instance.destroy(); + inst_counters_instance.destroy(); + + // Ship off the points for final processing. 
+ size_t out_rects = 0; + RectDesc* trash; + this->complete_pipeline(d_points, total_pts, trash, out_rects, buffer_arena, + /* the Container: */ sparsity_outputs, + /* getIndex: */ [&](auto const& kv){ + // elem is a SparsityMap from the vector + return color_indices.at(kv.first); + }, + /* getMap: */ [&](auto const& kv){ + // return the SparsityMap key itself + return kv.second; + }); + + points_instance.destroy(); +} +} diff --git a/src/realm/deppart/byfield_gpu_kernels.hpp b/src/realm/deppart/byfield_gpu_kernels.hpp new file mode 100644 index 0000000000..f1ec217f9b --- /dev/null +++ b/src/realm/deppart/byfield_gpu_kernels.hpp @@ -0,0 +1,57 @@ +#pragma once +#include "realm/deppart/byfield.h" +#include "realm/deppart/partitions_gpu_kernels.hpp" + +namespace Realm { + + +template < + int N, typename T, typename FT +> +__global__ +void byfield_gpuPopulateBitmasksKernel( + AffineAccessor* accessors, + Rect* rects, + size_t* prefix, + uint32_t* inst_prefix, + FT* d_colors, + size_t numPoints, + size_t numColors, + size_t numRects, + size_t num_insts, + PointDesc *d_points +) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= numPoints) return; + + // Binary search to find which rectangle this point belongs to. + uint32_t r = bsearch(prefix, numRects, idx); + + // Binary search to find which instance this rectangle belongs to. + size_t inst_idx = bsearch(inst_prefix, num_insts, r); + + // Now we know which rectangle we're in, figure out the point coordinates. + size_t offset = idx - prefix[r]; + Point p; + for (int k = N-1; k >= 0; --k) { + size_t dim = rects[r].hi[k] + 1 - rects[r].lo[k]; + p[k] = rects[r].lo[k] + (offset % dim); + offset /= dim; + } + + // Read the field value at that point. + FT ptr = accessors[inst_idx].read(p); + + // Find our color's idx and write output. 
+ PointDesc point_desc; + point_desc.point = p; + for (size_t i = 0; i < numColors; ++i) { + if (ptr == d_colors[i]) { + point_desc.src_idx = i; + break; + } + } + d_points[idx] = point_desc; +} + +} // namespace Realm \ No newline at end of file diff --git a/src/realm/deppart/byfield_gpu_tmpl.cu b/src/realm/deppart/byfield_gpu_tmpl.cu new file mode 100644 index 0000000000..807fc1ad0b --- /dev/null +++ b/src/realm/deppart/byfield_gpu_tmpl.cu @@ -0,0 +1,64 @@ +/* Copyright 2024 Stanford University, NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// per‐dimension instantiator for the GPU By Field Operation +// Mirrors CPU Approach (byfield_tmpl.cc) + +#define REALM_TEMPLATES_ONLY +#include "realm/deppart/byfield_gpu_impl.hpp" +#include "realm/deppart/inst_helper.h" + + +#ifndef INST_N1 + #error "INST_N1 must be defined before including byfield_gpu_tmpl.cu" +#endif +#ifndef INST_N2 + #error "INST_N2 must be defined before including byfield_gpu_tmpl.cu" +#endif + +#define FOREACH_TT(__func__) \ + __func__(int, int) \ + __func__(int, unsigned) \ + __func__(int, long long) \ + __func__(unsigned,int) \ + __func__(unsigned,unsigned) \ + __func__(unsigned,long long) \ + __func__(long long, int) \ + __func__(long long, unsigned) \ + __func__(long long, long long) + +#define FOREACH_T(__func__) \ + __func__(int) \ + __func__(unsigned) \ + __func__(long long) + +namespace Realm { + #define N1 INST_N1 + #define N2 INST_N2 + + #define ZP(N,T) Point + #define ZR(N,T) Rect + + #define DO_WITH_FT(N, T, FT) \ + template class ByFieldMicroOp; \ + template class GPUByFieldMicroOp; + + #define DOIT(T1,T2) \ + DO_WITH_FT(N1,T1,ZP(N2,T2)) + + FOREACH_TT(DOIT) + + FOREACH_NTF(DO_WITH_FT) +} // namespace Realm \ No newline at end of file diff --git a/src/realm/deppart/byfield_tmpl.cc b/src/realm/deppart/byfield_tmpl.cc index 7c58bc725b..38a95a040d 100644 --- a/src/realm/deppart/byfield_tmpl.cc +++ b/src/realm/deppart/byfield_tmpl.cc @@ -17,7 +17,7 @@ // per-dimension instantiator for byfield.cc -#define REALM_TEMPLATES_ONLY +#undef REALM_TEMPLATES_ONLY #include "./byfield.cc" #ifndef INST_N1 @@ -43,6 +43,19 @@ namespace Realm { #define N1 INST_N1 #define N2 INST_N2 +#define DOIT(N,T,F) \ + template class ByFieldMicroOp; \ + template class GPUByFieldMicroOp; \ + template class ByFieldOperation; \ + template ByFieldMicroOp::ByFieldMicroOp(NodeID, AsyncMicroOp *, Serialization::FixedBufferDeserializer&); \ + template Event IndexSpace::create_subspaces_by_field(const std::vector,F> >&, \ + const std::vector&, \ + std::vector 
>&, \ + const ProfilingRequestSet &, \ + Event) const; + +FOREACH_NTF(DOIT) + #define ZP(N,T) Point #define ZR(N,T) Rect #define DOIT2(T1,T2) \ diff --git a/src/realm/deppart/image.cc b/src/realm/deppart/image.cc index e598c22033..660d0f77ad 100644 --- a/src/realm/deppart/image.cc +++ b/src/realm/deppart/image.cc @@ -30,6 +30,77 @@ namespace Realm { extern Logger log_part; extern Logger log_uop_timing; + template + template + Event IndexSpace::gpu_subspaces_by_image( + const DomainTransform &domain_transform, + const std::vector> &sources, + std::vector> &images, const ProfilingRequestSet &reqs, + std::pair &sizes, RegionInstance buffer, Event wait_on) const { + // output vector should start out empty + assert(images.empty()); + + if (buffer==RegionInstance::NO_INST) { + size_t optimal_size = 0; + for (size_t i = 0; i < sources.size(); i++) { + optimal_size += 5 * sources[i].volume() * sizeof(RectDesc); + } + size_t minimal_size = 0; + size_t source_entries = 0; + bool bvh = false; + for (size_t i = 0; i < sources.size(); ++i) { + IndexSpace my_space = sources[i]; + if (my_space.dense()) { + source_entries += 1; + } else { + bvh = true; + source_entries += my_space.sparsity.impl()->get_entries().size(); + } + } + minimal_size += sizeof(Rect) * source_entries; + if (this->dense()) { + minimal_size += sizeof(Rect); + } else { + minimal_size += sizeof(Rect) * this->sparsity.impl()->get_entries().size(); + } + if (bvh) { + minimal_size += + (source_entries * sizeof(uint64_t)) + + (source_entries * sizeof(size_t)) + + ((2*source_entries - 1) * sizeof(Rect)) + + (2 * (2*source_entries - 1) * sizeof(int)) + + sizeof(Rect) + + (2 * source_entries * sizeof(uint64_t)) + + (source_entries * sizeof(uint64_t)); + } + sizes = std::make_pair(minimal_size, minimal_size + optimal_size); + return Event::NO_EVENT; + } + + GenEventImpl *finish_event = GenEventImpl::create_genevent(); + Event e = finish_event->current_event(); + + GPUImageOperation *op = new GPUImageOperation( + *this, 
domain_transform, reqs, sizes.first, buffer, finish_event, ID(e).event_generation()); + + size_t n = sources.size(); + images.resize(n); + for (size_t i = 0; i < n; i++) { + images[i] = op->add_source(sources[i]); + + if(!images[i].dense()) { + e = Event::merge_events( + {e, SparsityMapRefCounter(images[i].sparsity.id).add_references(1)}); + } + + log_dpops.info() << "image: " << *this << " src=" << sources[i] << " -> " + << images[i] << " (" << e << ")"; + } + + op->launch(wait_on); + return e; + } + template template Event IndexSpace::create_subspaces_by_image( @@ -495,23 +566,83 @@ namespace Realm { target_node = ID(source.sparsity).sparsity_creator_node(); else if(!domain_transform.ptr_data.empty()) - target_node = ID(domain_transform.ptr_data[sources.size() % domain_transform.ptr_data.size()].inst).instance_owner_node(); + target_node = ID(domain_transform.ptr_data[sources.size() % domain_transform.ptr_data.size()].inst).instance_owner_node(); else - target_node = ID(domain_transform.range_data[sources.size() % domain_transform.range_data.size()].inst).instance_owner_node(); + target_node = ID(domain_transform.range_data[sources.size() % domain_transform.range_data.size()].inst).instance_owner_node(); + + SparsityMap sparsity = get_runtime()->get_available_sparsity_impl(target_node)->me.convert >(); + image.sparsity = sparsity; + + sources.push_back(source); + diff_rhss.push_back(diff_rhs); + images.push_back(sparsity); + is_intersection = false; + + return image; + } + + template + IndexSpace ImageOperation::add_source_with_intersection(const IndexSpace& source, + const IndexSpace& diff_rhs) + { + // try to filter out obviously empty sources + if(parent.empty() || source.empty()) + return IndexSpace::make_empty(); + + // otherwise it'll be something smaller than the current parent + IndexSpace image; + image.bounds = parent.bounds; + + // if the source has a sparsity map, use the same node - otherwise + // get a sparsity ID by round-robin'ing across the nodes 
that have field data + int target_node; + if(!source.dense()) + target_node = ID(source.sparsity).sparsity_creator_node(); + else + if(!domain_transform.ptr_data.empty()) + target_node = ID(domain_transform.ptr_data[sources.size() % domain_transform.ptr_data.size()].inst).instance_owner_node(); + else + target_node = ID(domain_transform.range_data[sources.size() % domain_transform.range_data.size()].inst).instance_owner_node(); + SparsityMap sparsity = get_runtime()->get_available_sparsity_impl(target_node)->me.convert >(); image.sparsity = sparsity; sources.push_back(source); diff_rhss.push_back(diff_rhs); images.push_back(sparsity); + is_intersection = true; return image; } template void ImageOperation::execute(void) { - if (domain_transform.type == - DomainTransform::DomainTransformType::STRUCTURED) { + + std::vector,Point> > gpu_ptr_data; + std::vector,Point> > cpu_ptr_data; + std::vector,Rect> > gpu_rect_data; + std::vector,Rect> > cpu_rect_data; + for (size_t i = 0; i < domain_transform.ptr_data.size(); i++) { + if (domain_transform.ptr_data[i].inst.get_location().kind() == + Memory::GPU_FB_MEM) { + gpu_ptr_data.push_back(domain_transform.ptr_data[i]); + } else { + cpu_ptr_data.push_back(domain_transform.ptr_data[i]); + } + } + for (size_t i = 0; i < domain_transform.range_data.size(); i++) { + if (domain_transform.range_data[i].inst.get_location().kind() == + Memory::GPU_FB_MEM) { + gpu_rect_data.push_back(domain_transform.range_data[i]); + } else { + cpu_rect_data.push_back(domain_transform.range_data[i]); + } + } + bool gpu_data = !gpu_ptr_data.empty() || !gpu_rect_data.empty(); + bool cpu_data = !cpu_ptr_data.empty() || !cpu_rect_data.empty(); + if (domain_transform.type == + DomainTransform::DomainTransformType::STRUCTURED && !gpu_data) { + for (size_t i = 0; i < sources.size(); i++) { SparsityMapImpl::lookup(images[i])->set_contributor_count(1); } @@ -523,64 +654,89 @@ namespace Realm { for (size_t j = 0; j < sources.size(); j++) { 
micro_op->add_sparsity_output(sources[j], images[j]); } - micro_op->dispatch(this, /*inline_ok=*/true); - } else { - if (!DeppartConfig::cfg_disable_intersection_optimization) { - // build the overlap tester based on the field index spaces - they're more - // likely to be known and - // denser - ComputeOverlapMicroOp *uop = - new ComputeOverlapMicroOp(this); + } else if (!DeppartConfig::cfg_disable_intersection_optimization && !gpu_data) { + // build the overlap tester based on the field index spaces - they're more + // likely to be known and + // denser + ComputeOverlapMicroOp *uop = + new ComputeOverlapMicroOp(this); - for (size_t i = 0; i < domain_transform.ptr_data.size(); i++) - uop->add_input_space(domain_transform.ptr_data[i].index_space); + for (size_t i = 0; i < domain_transform.ptr_data.size(); i++) + uop->add_input_space(domain_transform.ptr_data[i].index_space); - for (size_t i = 0; i < domain_transform.range_data.size(); i++) - uop->add_input_space(domain_transform.range_data[i].index_space); + for (size_t i = 0; i < domain_transform.range_data.size(); i++) + uop->add_input_space(domain_transform.range_data[i].index_space); - // we will ask this uop to also prefetch the sources we will intersect test - // against it - for (size_t i = 0; i < sources.size(); i++) - uop->add_extra_dependency(sources[i]); + // we will ask this uop to also prefetch the sources we will intersect test + // against it + for (size_t i = 0; i < sources.size(); i++) + uop->add_extra_dependency(sources[i]); - uop->dispatch(this, true /* ok to run in this thread */); + uop->dispatch(this, true /* ok to run in this thread */); } else { - // launch full cross-product of image micro ops right away - for (size_t i = 0; i < sources.size(); i++) - SparsityMapImpl::lookup(images[i])->set_contributor_count( - domain_transform.ptr_data.size() + - domain_transform.range_data.size()); - - for (size_t i = 0; i < domain_transform.ptr_data.size(); i++) { - ImageMicroOp *uop = new ImageMicroOp( - 
parent, domain_transform.ptr_data[i].index_space, - domain_transform.ptr_data[i].inst, - domain_transform.ptr_data[i].field_offset, false /*ptrs*/); - for (size_t j = 0; j < sources.size(); j++) - if (diff_rhss.empty()) - uop->add_sparsity_output(sources[j], images[j]); - else - uop->add_sparsity_output_with_difference(sources[j], diff_rhss[j], - images[j]); - - uop->dispatch(this, true /* ok to run in this thread */); - } - - for (size_t i = 0; i < domain_transform.range_data.size(); i++) { - ImageMicroOp *uop = new ImageMicroOp( - parent, domain_transform.range_data[i].index_space, - domain_transform.range_data[i].inst, - domain_transform.range_data[i].field_offset, true /*ranges*/); - for (size_t j = 0; j < sources.size(); j++) - if (diff_rhss.empty()) - uop->add_sparsity_output(sources[j], images[j]); - else - uop->add_sparsity_output_with_difference(sources[j], diff_rhss[j], - images[j]); - - uop->dispatch(this, true /* ok to run in this thread */); - } + if (cpu_data) { + // launch full cross-product of image micro ops right away + for (size_t i = 0; i < sources.size(); i++) + SparsityMapImpl::lookup(images[i])->set_contributor_count( + cpu_ptr_data.size() + + cpu_rect_data.size() + (gpu_data ? 
1 : 0)); + + for (size_t i = 0; i < cpu_ptr_data.size(); i++) { + ImageMicroOp *uop = new ImageMicroOp( + parent, cpu_ptr_data[i].index_space, + cpu_ptr_data[i].inst, + cpu_ptr_data[i].field_offset, false /*ptrs*/); + for (size_t j = 0; j < sources.size(); j++) + if (diff_rhss.empty()) + uop->add_sparsity_output(sources[j], images[j]); + else + uop->add_sparsity_output_with_difference(sources[j], diff_rhss[j], + images[j]); + + uop->dispatch(this, true /* ok to run in this thread */); + } + + for (size_t i = 0; i < cpu_rect_data.size(); i++) { + ImageMicroOp *uop = new ImageMicroOp( + parent, cpu_rect_data[i].index_space, + cpu_rect_data[i].inst, + cpu_rect_data[i].field_offset, true /*ranges*/); + for (size_t j = 0; j < sources.size(); j++) + if (diff_rhss.empty()) + uop->add_sparsity_output(sources[j], images[j]); + else + uop->add_sparsity_output_with_difference(sources[j], diff_rhss[j], + images[j]); + + uop->dispatch(this, true /* ok to run in this thread */); + } + } + if (gpu_data) { + std::swap(domain_transform.ptr_data, gpu_ptr_data); + std::swap(domain_transform.range_data, gpu_rect_data); + const char* val = std::getenv("TILE_SIZE"); // or any env var + size_t tile_size = 100000000; //default + if (val) { + tile_size = atoi(val); + } + std::vector byte_fields = {sizeof(char)}; + IndexSpace<1> instance_index_space(Rect<1>(0, tile_size-1)); + RegionInstance buffer; + Memory my_mem; + if (domain_transform.ptr_data.size() > 0) { + my_mem = domain_transform.ptr_data[0].inst.get_location(); + } else { + my_mem = domain_transform.range_data[0].inst.get_location(); + } + RegionInstance::create_instance(buffer, my_mem, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); + GPUImageMicroOp *micro_op = + new GPUImageMicroOp( + parent, domain_transform, !cpu_data, tile_size, buffer); + for (size_t j = 0; j < sources.size(); j++) { + micro_op->add_sparsity_output(sources[j], images[j]); + } + micro_op->dispatch(this, true); } } } @@ -662,6 
+818,74 @@ namespace Realm { os << "ImageOperation(" << parent << ")"; } + //////////////////////////////////////////////////////////////////////// + // + // class GPUImageOperation + + template + GPUImageOperation::GPUImageOperation( + const IndexSpace &_parent, + const DomainTransform &_domain_transform, + const ProfilingRequestSet &reqs, size_t _buffer_size, RegionInstance _buffer, + GenEventImpl *_finish_event, EventImpl::gen_t _finish_gen) + : PartitioningOperation(reqs, _finish_event, _finish_gen), + parent(_parent), + domain_transform(_domain_transform), + buffer_size(_buffer_size), + buffer(_buffer) {} + + template + GPUImageOperation::~GPUImageOperation(void) + {} + + template + IndexSpace GPUImageOperation::add_source(const IndexSpace& source) + { + // try to filter out obviously empty sources + if(parent.empty() || source.empty()) + return IndexSpace::make_empty(); + + // otherwise it'll be something smaller than the current parent + IndexSpace image; + image.bounds = parent.bounds; + + // if the source has a sparsity map, use the same node - otherwise + // get a sparsity ID by round-robin'ing across the nodes that have field data + int target_node = 0; + if(!source.dense()) + target_node = ID(source.sparsity).sparsity_creator_node(); + else + if(!domain_transform.ptr_data.empty()) + target_node = ID(domain_transform.ptr_data[sources.size() % domain_transform.ptr_data.size()].inst).instance_owner_node(); + else + target_node = ID(domain_transform.range_data[sources.size() % domain_transform.range_data.size()].inst).instance_owner_node(); + + SparsityMap sparsity = get_runtime()->get_available_sparsity_impl(target_node)->me.convert >(); + image.sparsity = sparsity; + + sources.push_back(source); + images.push_back(sparsity); + + return image; + } + + template + void GPUImageOperation::execute(void) { + GPUImageMicroOp *micro_op = + new GPUImageMicroOp( + parent, domain_transform, true, buffer_size, buffer); + for (size_t j = 0; j < sources.size(); j++) { 
+ micro_op->add_sparsity_output(sources[j], images[j]); + } + micro_op->dispatch(this, true); + } + + template + void GPUImageOperation::print(std::ostream& os) const + { + os << "ImageOperation(" << parent << ")"; + } + //////////////////////////////////////////////////////////////////////// // // class StructuredImageMicroOp @@ -783,6 +1007,72 @@ namespace Realm { } } + //////////////////////////////////////////////////////////////////////// + // + // class StructuredImageMicroOp + + template + GPUImageMicroOp::GPUImageMicroOp( + const IndexSpace &_parent, + const DomainTransform &_domain_transform, + bool _exclusive, size_t _fixed_buffer_size, RegionInstance _buffer) + : parent_space(_parent), domain_transform(_domain_transform), fixed_buffer_size(_fixed_buffer_size), buffer(_buffer) + { + this->exclusive = _exclusive; + } + + template + GPUImageMicroOp::~GPUImageMicroOp() {} + + template + void GPUImageMicroOp::dispatch( + PartitioningOperation *op, bool inline_ok) { + + for (size_t i = 0; i < domain_transform.ptr_data.size(); i++) { + IndexSpace inst_space = domain_transform.ptr_data[i].index_space; + if (!inst_space.dense()) { + // it's safe to add the count after the registration only because we initialized + // the count to 2 instead of 1 + bool registered = SparsityMapImpl::lookup(inst_space.sparsity)->add_waiter(this, true /*precise*/); + if(registered) + this->wait_count.fetch_add(1); + } + } + + for (size_t i = 0; i < sources.size(); i++) { + if (!sources[i].dense()) { + bool registered = SparsityMapImpl::lookup(sources[i].sparsity) + ->add_waiter(this, true /*precise*/); + if (registered) this->wait_count.fetch_add(1); + } + } + + if (!parent_space.dense()) { + bool registered = SparsityMapImpl::lookup(parent_space.sparsity) + ->add_waiter(this, true /*precise*/); + if (registered) this->wait_count.fetch_add(1); + } + this->finish_dispatch(op, inline_ok); + } + + template + void GPUImageMicroOp::add_sparsity_output( + IndexSpace _source, SparsityMap 
_sparsity) { + sources.push_back(_source); + // TODO(apryakhin): Handle and test this sparsity ref-count path. + sparsity_outputs.push_back(_sparsity); + } + + template + void GPUImageMicroOp::execute(void) { + TimeStamp ts("StructuredImageMicroOp::execute", true, &log_uop_timing); + if (domain_transform.ptr_data.size() > 0) { + gpu_populate_ptrs(); + } else { + gpu_populate_rngs(); + } + } + //////////////////////////////////////////////////////////////////////// // instantiations of templates handled in image_tmpl.cc diff --git a/src/realm/deppart/image.h b/src/realm/deppart/image.h index 0e0fbfe03f..82b6393eb7 100644 --- a/src/realm/deppart/image.h +++ b/src/realm/deppart/image.h @@ -24,117 +24,188 @@ #include "realm/deppart/rectlist.h" namespace Realm { + template + class ImageMicroOp : public PartitioningMicroOp { + public: + static const int DIM = N; + typedef T IDXTYPE; + static const int DIM2 = N2; + typedef T2 IDXTYPE2; - template - class ImageMicroOp : public PartitioningMicroOp { - public: - static const int DIM = N; - typedef T IDXTYPE; - static const int DIM2 = N2; - typedef T2 IDXTYPE2; - - ImageMicroOp(IndexSpace _parent_space, IndexSpace _inst_space, - RegionInstance _inst, size_t _field_offset, bool _is_ranged); - - virtual ~ImageMicroOp(void); - - void add_sparsity_output(IndexSpace _source, SparsityMap _sparsity); - void add_sparsity_output_with_difference(IndexSpace _source, - IndexSpace _diff_rhs, - SparsityMap _sparsity); - void add_approx_output(int index, PartitioningOperation *op); - - virtual void execute(void); - - void dispatch(PartitioningOperation *op, bool inline_ok); - - protected: - friend struct RemoteMicroOpMessage >; - static ActiveMessageHandlerReg > > areg; - - friend class PartitioningMicroOp; - template - REALM_ATTR_WARN_UNUSED(bool serialize_params(S& s) const); - - // construct from received packet - template - ImageMicroOp(NodeID _requestor, AsyncMicroOp *_async_microop, S& s); - - template - void 
populate_bitmasks_ptrs(std::map& bitmasks); - - template - void populate_bitmasks_ranges(std::map& bitmasks); - - template - void populate_approx_bitmask_ptrs(BM& bitmask); - - template - void populate_approx_bitmask_ranges(BM& bitmask); - - IndexSpace parent_space; - IndexSpace inst_space; - RegionInstance inst; - size_t field_offset; - bool is_ranged; - std::vector > sources; - std::vector > diff_rhss; - std::vector > sparsity_outputs; - int approx_output_index; - intptr_t approx_output_op; - }; - - template - class ImageOperation : public PartitioningOperation { - public: - ImageOperation(const IndexSpace& _parent, - const DomainTransform& _domain_transform, - const ProfilingRequestSet& reqs, GenEventImpl* _finish_event, - EventImpl::gen_t _finish_gen); - - virtual ~ImageOperation(void); - - IndexSpace add_source(const IndexSpace& source); - IndexSpace add_source_with_difference( - const IndexSpace& source, const IndexSpace& diff_rhs); - - virtual void execute(void); - - virtual void print(std::ostream& os) const; - - virtual void set_overlap_tester(void* tester); - - protected: - IndexSpace parent; - DomainTransform domain_transform; - std::vector> sources; - std::vector> diff_rhss; - std::vector> images; - }; - - template - class StructuredImageMicroOp : public PartitioningMicroOp { - public: - StructuredImageMicroOp( - const IndexSpace& _parent, - const StructuredTransform& _transform); - - virtual ~StructuredImageMicroOp(void); - virtual void execute(void); - - virtual void populate(std::map*>& bitmasks); - - void dispatch(PartitioningOperation* op, bool inline_ok); - void add_sparsity_output(IndexSpace _source, - SparsityMap _sparsity); - - protected: - IndexSpace parent_space; - StructuredTransform transform; - std::vector> sources; - std::vector> sparsity_outputs; - }; + ImageMicroOp(IndexSpace _parent_space, IndexSpace _inst_space, + RegionInstance _inst, size_t _field_offset, bool _is_ranged); - }; // namespace Realm + virtual ~ImageMicroOp(void); + + 
void add_sparsity_output(IndexSpace _source, SparsityMap _sparsity); + + void add_sparsity_output_with_difference(IndexSpace _source, + IndexSpace _diff_rhs, + SparsityMap _sparsity); + + void add_sparsity_output_with_intersection(IndexSpace _source, + IndexSpace _diff_rhs, + SparsityMap _sparsity); + + void add_approx_output(int index, PartitioningOperation *op); + + virtual void execute(void); + + void dispatch(PartitioningOperation *op, bool inline_ok); + + protected: + friend struct RemoteMicroOpMessage >; + static ActiveMessageHandlerReg > > areg; + + friend class PartitioningMicroOp; + + template + REALM_ATTR_WARN_UNUSED(bool serialize_params(S& s) const); + + // construct from received packet + template + ImageMicroOp(NodeID _requestor, AsyncMicroOp *_async_microop, S &s); + + template + void populate_bitmasks_ptrs(std::map &bitmasks); + + template + void populate_bitmasks_ranges(std::map &bitmasks); + + template + void populate_approx_bitmask_ptrs(BM &bitmask); + + template + void populate_approx_bitmask_ranges(BM &bitmask); + + IndexSpace parent_space; + IndexSpace inst_space; + RegionInstance inst; + size_t field_offset; + bool is_ranged; + bool is_intersection; + std::vector > sources; + std::vector > diff_rhss; + std::vector > sparsity_outputs; + int approx_output_index; + intptr_t approx_output_op; + }; + + template + class ImageOperation : public PartitioningOperation { + public: + ImageOperation(const IndexSpace &_parent, + const DomainTransform &_domain_transform, + const ProfilingRequestSet &reqs, GenEventImpl *_finish_event, + EventImpl::gen_t _finish_gen); + + virtual ~ImageOperation(void); + + IndexSpace add_source(const IndexSpace &source); + + IndexSpace add_source_with_difference( + const IndexSpace &source, const IndexSpace &diff_rhs); + + IndexSpace add_source_with_intersection( + const IndexSpace &source, const IndexSpace &diff_rhs); + + virtual void execute(void); + + virtual void print(std::ostream &os) const; + + virtual void 
set_overlap_tester(void *tester); + + protected: + IndexSpace parent; + DomainTransform domain_transform; + std::vector > sources; + std::vector > diff_rhss; + std::vector > images; + bool is_intersection; + }; + + template + class StructuredImageMicroOp : public PartitioningMicroOp { + public: + StructuredImageMicroOp( + const IndexSpace &_parent, + const StructuredTransform &_transform); + + virtual ~StructuredImageMicroOp(void); + + virtual void execute(void); + + virtual void populate(std::map *> &bitmasks); + + void dispatch(PartitioningOperation *op, bool inline_ok); + + void add_sparsity_output(IndexSpace _source, + SparsityMap _sparsity); + + protected: + IndexSpace parent_space; + StructuredTransform transform; + std::vector > sources; + std::vector > sparsity_outputs; + }; + + template + class GPUImageOperation : public PartitioningOperation { + public: + GPUImageOperation(const IndexSpace &_parent, + const DomainTransform &_domain_transform, + const ProfilingRequestSet &reqs, + size_t _buffer_size, + RegionInstance _buffer, + GenEventImpl *_finish_event, + EventImpl::gen_t _finish_gen); + + virtual ~GPUImageOperation(void); + + IndexSpace add_source(const IndexSpace &source); + + virtual void execute(void); + + virtual void print(std::ostream &os) const; + + protected: + IndexSpace parent; + DomainTransform domain_transform; + std::vector > sources; + std::vector > images; + size_t buffer_size; + RegionInstance buffer; + }; + + template + class GPUImageMicroOp : public GPUMicroOp { + public: + GPUImageMicroOp( + const IndexSpace &_parent, + const DomainTransform &_domain_transform, + bool _exclusive, size_t fixed_buffer_size = 0, RegionInstance buffer = RegionInstance::NO_INST); + + virtual ~GPUImageMicroOp(void); + + virtual void execute(void); + + virtual void gpu_populate_ptrs(); + + virtual void gpu_populate_rngs(); + + void dispatch(PartitioningOperation *op, bool inline_ok); + + void add_sparsity_output(IndexSpace _source, + SparsityMap _sparsity); 
+ + protected: + IndexSpace parent_space; + DomainTransform domain_transform; + std::vector > sources; + std::vector > sparsity_outputs; + size_t fixed_buffer_size; + RegionInstance buffer; + }; +}; // namespace Realm #endif // REALM_DEPPART_IMAGE_H diff --git a/src/realm/deppart/image_gpu_impl.hpp b/src/realm/deppart/image_gpu_impl.hpp new file mode 100644 index 0000000000..6abb27c043 --- /dev/null +++ b/src/realm/deppart/image_gpu_impl.hpp @@ -0,0 +1,446 @@ +#pragma once +#include "realm/deppart/image.h" +#include "realm/deppart/image_gpu_kernels.hpp" +#include "realm/deppart/partitions_gpu_impl.hpp" +#include +#include +#include +#include "realm/nvtx.h" + +namespace Realm { + +//TODO: INTERSECTING INPUT/OUTPUT RECTS CAN BE DONE WITH BVH IF BECOME EXPENSIVE + +template +struct RectDescVolumeOp { + __device__ __forceinline__ + size_t operator()(const RectDesc& rd) const { + return rd.rect.volume(); + } +}; + +template +struct SparsityMapEntryVolumeOp { + __device__ __forceinline__ + size_t operator()(const SparsityMapEntry& entry) const { + return entry.bounds.volume(); + } +}; + + /* + * Input (stored in MicroOp): Array of field instances, a parent index space, and a list of source index spaces + * Output: A list of (potentially overlapping) rectangles that result from chasing all the pointers in the source index spaces + * through the provided instances and emitting only those that intersect the parent index space labeled by which source they came from, + * which are then sent off to complete_rect_pipeline. + * Approach: Intersect all instance rectangles with source rectangles in parallel. Prefix sum + binary search to iterate over these in + * parallel and chase all the pointers in the source rectangles to their corresponding rectangle. Finally, intersect the output rectangles + * with the parent rectangles in parallel. 
+ */ +template +void GPUImageMicroOp::gpu_populate_rngs() +{ + + if (sources.size() == 0) { + return; + } + + NVTX_DEPPART(gpu_image); + + Memory my_mem = domain_transform.range_data[0].inst.get_location(); + + cudaStream_t stream = Cuda::get_task_cuda_stream(); + + const char* val = std::getenv("TILE_SIZE"); // or any env var + size_t tile_size = 100000000; //default + if (val) { + tile_size = atoi(val); + } + + RegionInstance fixed_buffer = this->realm_malloc(tile_size, my_mem); + Arena buffer_arena(reinterpret_cast(AffineAccessor(fixed_buffer, 0).base), tile_size); + + collapsed_space src_space; + RegionInstance offsets_instance = this->realm_malloc((sources.size()+1) * sizeof(size_t), my_mem); + src_space.offsets = reinterpret_cast(AffineAccessor(offsets_instance, 0).base); + src_space.num_children = sources.size(); + + GPUMicroOp::collapse_multi_space(sources, src_space, buffer_arena, stream); + + collapsed_space inst_space; + + // We combine all of our instances into one to batch work, tracking the offsets between instances. + RegionInstance inst_offsets_instance = this->realm_malloc((domain_transform.range_data.size() + 1) * sizeof(size_t), my_mem); + inst_space.offsets = reinterpret_cast(AffineAccessor(inst_offsets_instance, 0).base); + inst_space.num_children = domain_transform.range_data.size(); + + GPUMicroOp::collapse_multi_space(domain_transform.range_data, inst_space, buffer_arena, stream); + + // This is used for count + emit: first pass counts how many rectangles survive intersection, second pass uses the counter + // to figure out where to write each rectangle. 
+ RegionInstance inst_counters_instance = this->realm_malloc((2*domain_transform.range_data.size() + 1) * sizeof(uint32_t), my_mem); + uint32_t* d_inst_counters = reinterpret_cast(AffineAccessor(inst_counters_instance, 0).base); + + // This will be a prefix sum over the counters, used first to figure out where to write in the emit phase, and second + // to track which instance each rectangle came from in the populate phase. + uint32_t* d_inst_prefix = d_inst_counters + domain_transform.range_data.size(); + RegionInstance valid_rects_instance; + size_t num_valid_rects; + RectDesc* d_valid_rects; + + // Here we intersect the instance spaces with the parent space, and make sure we know which instance each resulting rectangle came from. + GPUMicroOp::template construct_input_rectlist>(inst_space, src_space, d_valid_rects, num_valid_rects, d_inst_counters, d_inst_prefix, buffer_arena, stream); + inst_offsets_instance.destroy(); + + if (num_valid_rects == 0) { + for (SparsityMap it : sparsity_outputs) { + SparsityMapImpl *impl = SparsityMapImpl::lookup(it); + if (this->exclusive) { + impl->gpu_finalize(); + } else { + impl->contribute_nothing(); + } + } + valid_rects_instance.destroy(); + inst_counters_instance.destroy(); + return; + } + + // Prefix sum the valid rectangles by volume. 
+ size_t* d_prefix_rects; + size_t total_pts; + + GPUMicroOp::volume_prefix_sum(d_valid_rects, num_valid_rects, d_prefix_rects, total_pts, buffer_arena, stream); + + RegionInstance rngs_instance = this->realm_malloc(total_pts * sizeof(RectDesc), my_mem); + RectDesc* d_rngs = reinterpret_cast*>(AffineAccessor(rngs_instance, 0).base); + + + Memory zcpy_mem; + assert(find_memory(zcpy_mem, Memory::Z_COPY_MEM)); + RegionInstance accessors_instance = this->realm_malloc(domain_transform.range_data.size() * sizeof(AffineAccessor,N2,T2>), zcpy_mem); + AffineAccessor,N2,T2>* d_accessors = reinterpret_cast,N2,T2>*>(AffineAccessor(accessors_instance, 0).base); + for (size_t i = 0; i < domain_transform.range_data.size(); ++i) { + d_accessors[i] = AffineAccessor,N2,T2>(domain_transform.range_data[i].inst, domain_transform.range_data[i].field_offset); + } + + image_gpuPopulateBitmasksRngsKernel<<>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, total_pts, num_valid_rects, domain_transform.range_data.size(), d_rngs); + KERNEL_CHECK(stream); + + RegionInstance parent_entries_instance; + collapsed_space collapsed_parent; + + // We collapse the parent space to undifferentiate between dense and sparse and match downstream APIs. 
+ GPUMicroOp::collapse_parent_space(parent_space, collapsed_parent, buffer_arena, stream); + + + RegionInstance src_counters_instance = this->realm_malloc(sources.size() * sizeof(uint32_t), my_mem); + uint32_t* d_src_counters = reinterpret_cast(AffineAccessor(src_counters_instance, 0).base); + CUDA_CHECK(cudaMemsetAsync(d_src_counters, 0, sources.size() * sizeof(uint32_t), stream), stream); + + + //Finally, we do another two pass count + emit to intersect with the parent rectangles + image_intersect_output<<>>(collapsed_parent.entries_buffer, d_rngs, nullptr, collapsed_parent.num_entries, total_pts, d_src_counters, nullptr); + KERNEL_CHECK(stream); + + std::vector h_src_counters(sources.size()+1); + h_src_counters[0] = 0; // prefix sum starts at 0 + CUDA_CHECK(cudaMemcpyAsync(h_src_counters.data()+1, d_src_counters, sources.size() * sizeof(uint32_t), cudaMemcpyDeviceToHost, stream), stream); + + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + valid_rects_instance.destroy(); + accessors_instance.destroy(); + + for (size_t i = 0; i < sources.size(); ++i) { + h_src_counters[i+1] += h_src_counters[i]; + } + + size_t num_valid_output = h_src_counters[sources.size()]; + + if (num_valid_output == 0) { + for (SparsityMap it : sparsity_outputs) { + SparsityMapImpl *impl = SparsityMapImpl::lookup(it); + if (this->exclusive) { + impl->gpu_finalize(); + } else { + impl->contribute_nothing(); + } + } + parent_entries_instance.destroy(); + src_counters_instance.destroy(); + rngs_instance.destroy(); + return; + } + + + RegionInstance valid_intersect_instance = this->realm_malloc(num_valid_output * sizeof(RectDesc), my_mem); + RectDesc* d_valid_intersect = reinterpret_cast*>(AffineAccessor(valid_intersect_instance, 0).base); + + RegionInstance src_prefix_instance = this->realm_malloc((sources.size() + 1) * sizeof(uint32_t), my_mem); + uint32_t* d_src_prefix = reinterpret_cast(AffineAccessor(src_prefix_instance, 0).base); + CUDA_CHECK(cudaMemcpyAsync(d_src_prefix, 
h_src_counters.data(), (sources.size() + 1) * sizeof(size_t), cudaMemcpyHostToDevice, stream), stream); + + CUDA_CHECK(cudaMemsetAsync(d_src_counters, 0, sources.size() * sizeof(uint32_t), stream), stream); + + image_intersect_output<<>>(collapsed_parent.entries_buffer, d_rngs, d_src_prefix, collapsed_parent.num_entries, total_pts, d_src_counters, d_valid_intersect); + KERNEL_CHECK(stream); + + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + src_prefix_instance.destroy(); + parent_entries_instance.destroy(); + src_counters_instance.destroy(); + rngs_instance.destroy(); + + size_t out_rects = 0; + RectDesc* trash; + this->complete_rect_pipeline(d_valid_intersect, num_valid_output, trash, out_rects, buffer_arena, + /* the Container: */ sparsity_outputs, + /* getIndex: */ [&](auto const& elem){ + // elem is a SparsityMap from the vector + return size_t(&elem - sparsity_outputs.data()); + }, + /* getMap: */ [&](auto const& elem){ + // return the SparsityMap key itself + return elem; + }); + + valid_intersect_instance.destroy(); + +} + + /* + * Input (stored in MicroOp): Array of field instances, a parent index space, and a list of source index spaces + * Output: A list of (potentially overlapping) points that result from chasing all the pointers in the source index spaces + * through the provided instances and emitting only points in the parent index space labeled by which source they came from, + * which are then sent off to complete_pipeline. + * Approach: Intersect all instance rectangles with source rectangles in parallel. Prefix sum + binary search to iterate over these in + * parallel and chase all the pointers in the source rectangles to their corresponding point. Here, the pointer chasing is also a count + emit, + * where only points that are in the parent index space are counted/emitted. 
+ */ +template +void GPUImageMicroOp::gpu_populate_ptrs() +{ + if (sources.size() == 0) { + return; + } + + NVTX_DEPPART(gpu_image); + + Memory sysmem; + find_memory(sysmem, Memory::SYSTEM_MEM); + + cudaStream_t stream = Cuda::get_task_cuda_stream(); + + size_t tile_size = fixed_buffer_size; + RegionInstance fixed_buffer = buffer; + Arena buffer_arena(reinterpret_cast(AffineAccessor(fixed_buffer, 0).base), tile_size); + + collapsed_space src_space; + src_space.offsets = buffer_arena.alloc(sources.size()+1); + src_space.num_children = sources.size(); + + GPUMicroOp::collapse_multi_space(sources, src_space, buffer_arena, stream); + + collapsed_space inst_space; + + // We combine all of our instances into one to batch work, tracking the offsets between instances. + inst_space.offsets = buffer_arena.alloc(domain_transform.ptr_data.size()+1); + inst_space.num_children = domain_transform.ptr_data.size(); + + Arena no; + GPUMicroOp::collapse_multi_space(domain_transform.ptr_data, inst_space, no, stream); + + // This is used for count + emit: first pass counts how many rectangles survive intersection, second pass uses the counter + // to figure out where to write each rectangle. + uint32_t* d_inst_counters = buffer_arena.alloc(2*domain_transform.ptr_data.size()+1); + + + // This will be a prefix sum over the counters, used first to figure out where to write in the emit phase, and second + // to track which instance each rectangle came from in the populate phase. + uint32_t* d_inst_prefix = d_inst_counters + domain_transform.ptr_data.size(); + size_t num_valid_rects = tile_size; + + //Uniform for all tiles + collapsed_space collapsed_parent; + + // We collapse the parent space to undifferentiate between dense and sparse and match downstream APIs. 
+ GPUMicroOp::collapse_parent_space(parent_space, collapsed_parent, buffer_arena, stream); + + Memory zcpy_mem; + assert(find_memory(zcpy_mem, Memory::Z_COPY_MEM)); + RegionInstance accessors_instance = this->realm_malloc(domain_transform.ptr_data.size() * sizeof(AffineAccessor,N2,T2>), zcpy_mem); + AffineAccessor,N2,T2>* d_accessors = reinterpret_cast,N2,T2>*>(AffineAccessor(accessors_instance, 0).base); + for (size_t i = 0; i < domain_transform.ptr_data.size(); ++i) { + d_accessors[i] = AffineAccessor,N2,T2>(domain_transform.ptr_data[i].inst, domain_transform.ptr_data[i].field_offset); + } + + uint32_t* d_prefix_points = buffer_arena.alloc(domain_transform.ptr_data.size()+1); + + buffer_arena.commit(false); + + size_t left = buffer_arena.used(); + + //Here we iterate over the tiles + size_t num_output = 0; + RectDesc* output_start = nullptr; + size_t num_completed = 0; + size_t curr_tile = tile_size / 2; + int count = 0; + while (num_completed < inst_space.num_entries) { + try { + std::cout << "Tile iteration " << count++ << ", completed " << num_completed << " / " << inst_space.num_entries << " entries." 
<< std::endl; + buffer_arena.start(); + std::cout << "Amount Used: " << buffer_arena.used() << std::endl; + std::cout << "Expected Amount Used: " << left + num_output * sizeof(RectDesc) << std::endl; + if (num_completed + curr_tile > inst_space.num_entries) { + curr_tile = inst_space.num_entries - num_completed; + } + collapsed_space inst_space_tile = inst_space; + inst_space_tile.num_entries = curr_tile; + inst_space_tile.entries_buffer = buffer_arena.alloc>(curr_tile); + CUDA_CHECK(cudaMemcpyAsync(inst_space_tile.entries_buffer, inst_space.entries_buffer + num_completed, curr_tile * sizeof(SparsityMapEntry), cudaMemcpyHostToDevice, stream), stream); + + RectDesc* d_valid_rects; + GPUMicroOp::template construct_input_rectlist>(inst_space_tile, src_space, d_valid_rects, num_valid_rects, d_inst_counters, d_inst_prefix, buffer_arena, stream); + + if (num_valid_rects == std::numeric_limits::max()) { + curr_tile /= 2; + continue; + } + + if (num_valid_rects == 0) { + num_completed += curr_tile; + curr_tile = tile_size / 2; + subtract_const<<>>(inst_space.offsets, domain_transform.ptr_data.size()+1, curr_tile); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + continue; + } + + // Prefix sum the valid rectangles by volume. 
+ size_t* d_prefix_rects; + size_t total_pts; + + GPUMicroOp::volume_prefix_sum(d_valid_rects, num_valid_rects, d_prefix_rects, total_pts, buffer_arena, stream); + + CUDA_CHECK(cudaMemsetAsync(d_inst_counters, 0, (domain_transform.ptr_data.size()) * sizeof(uint32_t), stream), stream); + + //We do a two pass count + emit to chase all the pointers in parallel and check for membership in the parent index space + image_gpuPopulateBitmasksPtrsKernel<<>>(d_accessors, d_valid_rects, collapsed_parent.entries_buffer, d_prefix_rects, d_inst_prefix, nullptr, total_pts, num_valid_rects, domain_transform.ptr_data.size(), collapsed_parent.num_entries, d_inst_counters, nullptr); + KERNEL_CHECK(stream); + + std::vector h_inst_counters(domain_transform.ptr_data.size()+1); + h_inst_counters[0] = 0; // prefix sum starts at 0 + CUDA_CHECK(cudaMemcpyAsync(h_inst_counters.data()+1, d_inst_counters, domain_transform.ptr_data.size() * sizeof(uint32_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + for (size_t i = 0; i < domain_transform.ptr_data.size(); ++i) { + h_inst_counters[i+1] += h_inst_counters[i]; + } + + size_t num_valid_points = h_inst_counters[domain_transform.ptr_data.size()]; + + if (num_valid_points == 0) { + num_completed += curr_tile; + curr_tile = tile_size / 2; + subtract_const<<>>(inst_space.offsets, domain_transform.ptr_data.size()+1, curr_tile); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + continue; + } + + CUDA_CHECK(cudaMemcpyAsync(d_prefix_points, h_inst_counters.data(), (domain_transform.ptr_data.size() + 1) * sizeof(uint32_t), cudaMemcpyHostToDevice, stream), stream); + + buffer_arena.flip_parity(); + PointDesc* d_valid_points = buffer_arena.alloc>(num_valid_points); + + CUDA_CHECK(cudaMemsetAsync(d_inst_counters, 0, (domain_transform.ptr_data.size()) * sizeof(uint32_t), stream), stream); + + image_gpuPopulateBitmasksPtrsKernel<<>>(d_accessors, d_valid_rects, collapsed_parent.entries_buffer, 
d_prefix_rects, d_inst_prefix, d_prefix_points, total_pts, num_valid_rects, domain_transform.ptr_data.size(), collapsed_parent.num_entries, d_inst_counters, d_valid_points); + KERNEL_CHECK(stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + + size_t num_new_rects = 1; + assert(!buffer_arena.get_parity()); + RectDesc* d_new_rects; + + //Send it off for processing + this->complete_pipeline(d_valid_points, num_valid_points, d_new_rects, num_new_rects, buffer_arena, + /* the Container: */ sparsity_outputs, + /* getIndex: */ [&](auto const& elem){ + // elem is a SparsityMap from the vector + return size_t(&elem - sparsity_outputs.data()); + }, + /* getMap: */ [&](auto const& elem){ + // return the SparsityMap key itself + return elem; + }); + + if (num_output==0) { + buffer_arena.flip_parity(); + buffer_arena.reset(true); + output_start = buffer_arena.alloc>(num_new_rects); + buffer_arena.commit(true); + CUDA_CHECK(cudaMemcpyAsync(output_start, d_new_rects, num_new_rects * sizeof(RectDesc), cudaMemcpyDeviceToDevice, stream), stream); + num_output = num_new_rects; + num_completed += curr_tile; + subtract_const<<>>(inst_space.offsets, domain_transform.ptr_data.size()+1, curr_tile); + curr_tile = tile_size / 2; + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + continue; + } + + RectDesc* d_old_rects = buffer_arena.alloc>(num_output); + assert(d_old_rects == d_new_rects + num_new_rects); + CUDA_CHECK(cudaMemcpyAsync(d_old_rects, output_start, num_output * sizeof(RectDesc), cudaMemcpyDeviceToDevice, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + size_t num_final_rects = 1; + + //Send it off for processing + this->complete_rect_pipeline(d_new_rects, num_output + num_new_rects, output_start, num_final_rects, buffer_arena, + /* the Container: */ sparsity_outputs, + /* getIndex: */ [&](auto const& elem){ + // elem is a SparsityMap from the vector + return size_t(&elem - sparsity_outputs.data()); + }, + /* getMap: */ [&](auto const& 
elem){ + // return the SparsityMap key itself + return elem; + }); + num_completed += curr_tile; + num_output = num_final_rects; + subtract_const<<>>(inst_space.offsets, domain_transform.ptr_data.size()+1, curr_tile); + curr_tile = tile_size / 2; + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + } + catch (arena_oom&) { + std::cout << "Caught arena_oom, reducing tile size from " << curr_tile << " to " << curr_tile / 2 << std::endl; + std::cout << buffer_arena.used() << " bytes used in arena." << std::endl; + curr_tile /= 2; + if (curr_tile == 0) { + throw; + } + } + } + + if (num_output == 0) { + for (SparsityMap it : sparsity_outputs) { + SparsityMapImpl *impl = SparsityMapImpl::lookup(it); + if (this->exclusive) { + impl->gpu_finalize(); + } else { + impl->contribute_nothing(); + } + } + return; + } + this->send_output(output_start, num_output, buffer_arena, sparsity_outputs, + /* getIndex: */ [&](auto const& elem){ + // elem is a SparsityMap from the vector + return size_t(&elem - sparsity_outputs.data()); + }, + /* getMap: */ [&](auto const& elem){ + // return the SparsityMap key itself + return elem; + }); +} +} \ No newline at end of file diff --git a/src/realm/deppart/image_gpu_kernels.hpp b/src/realm/deppart/image_gpu_kernels.hpp new file mode 100644 index 0000000000..146d4e781f --- /dev/null +++ b/src/realm/deppart/image_gpu_kernels.hpp @@ -0,0 +1,167 @@ +#pragma once +#include "realm/deppart/image.h" + +namespace Realm { + +//Device helper to check parent space for membership +//TODO: if expensive, may benefit from BVH +template +__device__ bool image_isInIndexSpace( + const Point& p, + const SparsityMapEntry* parent_entries, + size_t numRects) +{ + // for each rectangle, check all dims… + for(size_t i = 0; i < numRects; ++i) { + const auto &r = parent_entries[i].bounds; + bool inside = true; + #pragma unroll + for(int d = 0; d < N; ++d) { + if(p[d] < r.lo[d] || p[d] > r.hi[d]) { + inside = false; + break; + } + } + if(inside) return true; + } + 
return false; +} + +//Count + emit to chase pointers and check for membership in parent space +template < + int N, typename T, + int N2, typename T2 +> +__global__ +void image_gpuPopulateBitmasksPtrsKernel( + AffineAccessor,N2,T2> *accessors, + RectDesc* rects, + SparsityMapEntry* parent_entries, + size_t* prefix, + uint32_t *inst_offsets, + uint32_t *d_inst_prefix, + size_t numPoints, + size_t numRects, + size_t num_insts, + size_t numParentRects, + uint32_t* d_inst_counters, + PointDesc *d_points +) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= numPoints) return; + size_t low = 0, high = numRects; + while (low < high) { + size_t mid = (low + high) >> 1; + if (prefix[mid+1] <= idx) low = mid + 1; + else high = mid; + } + size_t r = low; + bool found = false; + size_t inst_idx; + for (inst_idx = 0; inst_idx < num_insts; ++inst_idx) { + if (inst_offsets[inst_idx] <= r && inst_offsets[inst_idx+1] > r) { + found = true; + break; + } + } + assert(found); + size_t offset = idx - prefix[r]; + Point p; + for (int k = N2-1; k >= 0; --k) { + size_t dim = rects[r].rect.hi[k] + 1 - rects[r].rect.lo[k]; + p[k] = rects[r].rect.lo[k] + (offset % dim); + offset /= dim; + } + Point ptr = accessors[inst_idx].read(p); + if (image_isInIndexSpace(ptr, parent_entries, numParentRects)) { + uint32_t local = atomicAdd(&d_inst_counters[inst_idx], 1); + if (d_points != nullptr) { + uint32_t out_idx = d_inst_prefix[inst_idx] + local; + PointDesc point_desc; + point_desc.src_idx = rects[r].src_idx; + point_desc.point = ptr; + d_points[out_idx] = point_desc; + } + } + +} + +//Same as image_intersect_input, but for output rectangles and parent entries +//rather than input rectangles and parent rectangles + template +__global__ void image_intersect_output( + const SparsityMapEntry* d_parent_entries, + const RectDesc* d_output_rngs, + const uint32_t* d_src_prefix, + size_t numParentRects, + size_t numOutputRects, + uint32_t* d_src_counters, + RectDesc* d_rects +) { + size_t 
idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= numParentRects * numOutputRects) return; + size_t idx_x = idx % numParentRects; + size_t idx_y = idx / numParentRects; + const auto parent_entry = d_parent_entries[idx_x]; + const auto output_entry = d_output_rngs[idx_y]; + RectDesc rect_output; + rect_output.rect = parent_entry.bounds.intersection(output_entry.rect); + if (!rect_output.rect.empty()) { + uint32_t local = atomicAdd(&d_src_counters[output_entry.src_idx], 1); + if (d_rects != nullptr) { + rect_output.src_idx = output_entry.src_idx; + size_t out_idx = d_src_prefix[output_entry.src_idx] + local; + d_rects[out_idx] = rect_output; + } + } +} + +//Single pass function to chase pointers to rectangles. + template < + int N, typename T, + int N2, typename T2 +> +__global__ +void image_gpuPopulateBitmasksRngsKernel( + AffineAccessor,N2,T2> *accessors, + RectDesc* rects, + size_t* prefix, + uint32_t *inst_offsets, + size_t numPoints, + size_t numRects, + size_t num_insts, + RectDesc *d_rects +) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= numPoints) return; + size_t low = 0, high = numRects; + while (low < high) { + size_t mid = (low + high) >> 1; + if (prefix[mid+1] <= idx) low = mid + 1; + else high = mid; + } + size_t r = low; + bool found = false; + size_t inst_idx; + for (inst_idx = 0; inst_idx < num_insts; ++inst_idx) { + if (inst_offsets[inst_idx] <= r && inst_offsets[inst_idx+1] > r) { + found = true; + break; + } + } + assert(found); + size_t offset = idx - prefix[r]; + Point p; + for (int k = N2-1; k >= 0; --k) { + size_t dim = rects[r].rect.hi[k] + 1 - rects[r].rect.lo[k]; + p[k] = rects[r].rect.lo[k] + (offset % dim); + offset /= dim; + } + Rect rng = accessors[inst_idx].read(p); + RectDesc rect_desc; + rect_desc.src_idx = rects[r].src_idx; + rect_desc.rect = rng; + d_rects[idx] = rect_desc; +} + +} // namespace Realm \ No newline at end of file diff --git a/src/realm/deppart/image_gpu_tmpl.cu 
b/src/realm/deppart/image_gpu_tmpl.cu new file mode 100644 index 0000000000..6af4dcde61 --- /dev/null +++ b/src/realm/deppart/image_gpu_tmpl.cu @@ -0,0 +1,62 @@ +/* Copyright 2024 Stanford University, NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// per‐dimension instantiator for the GPU Image Operation +// Mirrors CPU Approach (image_tmpl.cc) + + +#include "realm/deppart/image_gpu_kernels.hpp" +#include "realm/deppart/image_gpu_impl.hpp" +#include "realm/deppart/inst_helper.h" + +#ifndef INST_N1 + #error "INST_N1 must be defined before including image_gpu_tmpl.cu" +#endif +#ifndef INST_N2 + #error "INST_N2 must be defined before including image_gpu_tmpl.cu" +#endif + +#define FOREACH_TT(__func__) \ + __func__(int, int) \ + __func__(int, unsigned) \ + __func__(int, long long) \ + __func__(unsigned,int) \ + __func__(unsigned,unsigned) \ + __func__(unsigned,long long) \ + __func__(long long, int) \ + __func__(long long, unsigned) \ + __func__(long long, long long) + +#define FOREACH_T(__func__) \ + __func__(int) \ + __func__(unsigned) \ + __func__(long long) + +namespace Realm { + #define N1 INST_N1 + #define N2 INST_N2 + + + #define DO_DOUBLE(T1,T2) \ + template class ImageMicroOp; \ + template class GPUImageMicroOp; + + FOREACH_TT(DO_DOUBLE) + + #undef DO_DOUBLE + #undef N1 + #undef N2 + +} // namespace Realm \ No newline at end of file diff --git a/src/realm/deppart/image_tmpl.cc b/src/realm/deppart/image_tmpl.cc index 
578a78226b..c12dfdb138 100644 --- a/src/realm/deppart/image_tmpl.cc +++ b/src/realm/deppart/image_tmpl.cc @@ -46,13 +46,18 @@ namespace Realm { #define DOIT(T1,T2) \ template class StructuredImageMicroOp; \ - template class ImageMicroOp; \ + template class ImageMicroOp; \ + template class GPUImageMicroOp; \ template class ImageOperation; \ template ImageMicroOp::ImageMicroOp(NodeID, AsyncMicroOp *, Serialization::FixedBufferDeserializer&); \ template Event IndexSpace::create_subspaces_by_image( \ const DomainTransform &, const std::vector > &, \ std::vector > &, const ProfilingRequestSet &, Event) \ const; \ + template Event IndexSpace::gpu_subspaces_by_image( \ + const DomainTransform &, const std::vector > &, \ + std::vector > &, const ProfilingRequestSet &, std::pair &, \ + RegionInstance, Event) const; \ template Event IndexSpace::create_subspaces_by_image_with_difference( \ const DomainTransform &, \ const std::vector >&, \ diff --git a/src/realm/deppart/partitions.cc b/src/realm/deppart/partitions.cc index b023f468fc..f342519f71 100644 --- a/src/realm/deppart/partitions.cc +++ b/src/realm/deppart/partitions.cc @@ -71,7 +71,7 @@ namespace Realm { size_t start, size_t count, size_t volume, IndexSpace *results, size_t first_result, size_t last_result, - const std::vector >& entries) + const span >& entries) { // should never be here with empty bounds assert(!bounds.empty()); @@ -111,13 +111,11 @@ namespace Realm { size_t lo_volume[N]; for(int i = 0; i < N; i++) lo_volume[i] = 0; - for(typename std::vector >::const_iterator it = entries.begin(); - it != entries.end(); - it++) { + for(size_t j = 0; j < entries.size(); j++) { for(int i = 0; i < N; i++) - lo_volume[i] += it->bounds.intersection(lo_half[i]).volume(); + lo_volume[i] += entries[j].bounds.intersection(lo_half[i]).volume(); } - // now compute how many subspaces would fall in each half and the + // now compute how many subspaces would fall in each half and the // inefficiency of the split size_t 
lo_count[N], inefficiency[N]; for(int i = 0; i < N; i++) { @@ -233,7 +231,7 @@ namespace Realm { // TODO: sparse case where we have to wait SparsityMapPublicImpl *impl = sparsity.impl(); assert(impl->is_valid()); - const std::vector >& entries = impl->get_entries(); + const span >& entries = impl->get_entries(); // initially every subspace will be a copy of this one, and then // we'll decompose the bounds subspace = *this; @@ -307,7 +305,7 @@ namespace Realm { // TODO: sparse case where we have to wait SparsityMapPublicImpl *impl = sparsity.impl(); assert(impl->is_valid()); - const std::vector >& entries = impl->get_entries(); + span> entries = impl->get_entries(); // initially every subspace will be a copy of this one, and then // we'll decompose the bounds subspaces.resize(count, *this); @@ -498,7 +496,7 @@ namespace Realm { template class RectListAdapter { public: - RectListAdapter(const std::vector >& _rects) + RectListAdapter(const span >& _rects) : rects(_rects.empty() ? 0 : &_rects[0]), count(_rects.size()) {} RectListAdapter(const Rect<1,T> *_rects, size_t _count) : rects(_rects), count(_count) {} @@ -583,7 +581,6 @@ namespace Realm { os << "AsyncMicroOp(" << (void *)uop << ")"; } - //////////////////////////////////////////////////////////////////////// // // class PartitioningMicroOp @@ -666,6 +663,16 @@ namespace Realm { } } + RegionInstance PartitioningMicroOp::realm_malloc(size_t size, Memory location) { + assert(location != Memory::NO_MEMORY); + assert(size > 0); + std::vector byte_fields = {sizeof(char)}; + IndexSpace<1> instance_index_space(Rect<1>(0, size-1)); + RegionInstance result; + RegionInstance::create_instance(result, location, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); + return result; + } + //////////////////////////////////////////////////////////////////////// // // class ComputeOverlapMicroOp diff --git a/src/realm/deppart/partitions.h b/src/realm/deppart/partitions.h index 7bb68c3630..4ec4560984 100644 
--- a/src/realm/deppart/partitions.h +++ b/src/realm/deppart/partitions.h @@ -35,11 +35,211 @@ #include "realm/deppart/inst_helper.h" #include "realm/bgwork.h" +struct CUstream_st; +typedef CUstream_st* cudaStream_t; + namespace Realm { class PartitioningMicroOp; class PartitioningOperation; + template + constexpr std::string_view type_name() { + #if defined(__clang__) + std::string_view p = __PRETTY_FUNCTION__; + return {p.data() + 34, p.size() - 34 - 1}; + #elif defined(__GNUC__) + std::string_view p = __PRETTY_FUNCTION__; + return {p.data() + 49, p.size() - 49 - 1}; + #elif defined(_MSC_VER) + std::string_view p = __FUNCSIG__; + return {p.data() + 84, p.size() - 84 - 7}; + #else + return "unknown"; + #endif + } + + template + struct HiFlag { + T hi; + uint8_t head; + }; + + struct DeltaFlag { + int32_t delta; + uint8_t head; + }; + + // Data representations for GPU micro-ops + // src idx tracks which subspace each rect/point + // belongs to and allows multiple subspaces to be + // computed together in a micro-op + template + struct RectDesc { + Rect rect; + size_t src_idx; + }; + + template + struct PointDesc { + Point point; + size_t src_idx; + }; + + // Combines one or multiple index spaces into a single struct + // If multiple, offsets tracks transitions between spaces + template + struct collapsed_space { + SparsityMapEntry* entries_buffer; + size_t num_entries; + size_t* offsets; + size_t num_children; + Rect bounds; + }; + + // Stores everything necessary to query a BVH + // Used with GPUMicroOp::build_bvh + template + struct BVH { + int root; + size_t num_leaves; + Rect* boxes; + uint64_t* indices; + size_t* labels; + int* childLeft; + int* childRight; + }; + + struct arena_oom : std::bad_alloc { + const char* what() const noexcept override { return "arena_oom"; } + }; + + class Arena { + public: + using byte = std::byte; + + Arena() noexcept : base_(nullptr), cap_(0), parity_(false), left_(0), right_(0), base_left_(0), base_right_(0) {} + Arena(void* 
buffer, size_t bytes) noexcept + : base_(reinterpret_cast(buffer)), cap_(bytes), parity_(false), left_(0), right_(0), base_left_(0), base_right_(0) {} + + size_t capacity() const noexcept { return cap_; } + size_t used() const noexcept { return left_ + right_; } + + size_t mark() const noexcept { + return parity_ ? right_ : left_; + } + + void rollback(size_t mark) noexcept { + if (parity_) { + right_ = mark; + } else { + left_ = mark; + } + } + + template + T* alloc(size_t count = 1) { + try { + if (parity_) { + return alloc_right(count); + } else { + return alloc_left(count); + } + } catch (arena_oom&) { + std::cout << "Arena OOM: requested " << count << " of " << type_name() + << " capacity " << cap_ << " bytes, " + << " used " << used() << " bytes, " + << " left " << (cap_ - left_ - right_) << " bytes.\n"; + throw arena_oom{}; + } + } + + void flip_parity(void) noexcept { + if (parity_) { + // switching from right to left + left_ = base_left_; + } else { + // switching from left to right + right_ = base_right_; + } + parity_ = !parity_; + } + + void commit(bool parity) noexcept { + if (parity) { + base_right_ = right_; + } else { + base_left_ = left_; + } + } + + void reset(bool parity) noexcept { + if (parity) { + base_right_ = 0; + right_ = 0; + } else { + base_left_ = 0; + left_ = 0; + } + } + + bool get_parity(void) const noexcept { + return parity_; + } + + void start(void) noexcept { + left_ = base_left_; + right_ = base_right_; + parity_ = false; + } + + private: + + void* alloc_left_bytes(size_t bytes, size_t align = alignof(std::max_align_t)) { + const size_t aligned = align_up(left_, align); + if (aligned + bytes + right_ > cap_) throw arena_oom{}; + void* p = base_ + aligned; + left_ = aligned + bytes; + return p; + } + + void* alloc_right_bytes(size_t bytes, size_t align = alignof(std::max_align_t)) { + if (bytes + right_ > cap_) throw arena_oom{}; + const size_t aligned = align_down(cap_ - right_ - bytes, align); + if (aligned < left_) throw 
arena_oom{}; + void *p = base_ + aligned; + right_ = cap_ - aligned; + return p; + } + + template + T* alloc_left(size_t count = 1) { + static_assert(!std::is_void_v, "alloc is invalid"); + return reinterpret_cast(alloc_left_bytes(sizeof(T) * count, alignof(T))); + } + + template + T* alloc_right(size_t count = 1) { + static_assert(!std::is_void_v, "alloc is invalid"); + return reinterpret_cast(alloc_right_bytes(sizeof(T) * count, alignof(T))); + } + + static size_t align_up(size_t x, size_t a) noexcept { + return (x + (a - 1)) & ~(a - 1); + } + + static size_t align_down(size_t x, size_t a) noexcept { + return x & ~(a - 1); + } + + byte* base_; + size_t cap_; + bool parity_; + size_t left_; + size_t right_; + size_t base_left_; + size_t base_right_; + }; template class OverlapTester { @@ -108,6 +308,8 @@ namespace Realm { template void sparsity_map_ready(SparsityMapImpl *sparsity, bool precise); + static RegionInstance realm_malloc(size_t size, Memory location = Memory::NO_MEMORY); + IntrusiveListLink uop_link; REALM_PMTA_DEFN(PartitioningMicroOp,IntrusiveListLink,uop_link); typedef IntrusiveList MicroOpList; @@ -147,6 +349,45 @@ namespace Realm { std::vector *> extra_deps; }; + //The parent class for all GPU partitioning micro-ops. 
Provides output utility functions + + template + class GPUMicroOp : public PartitioningMicroOp { + public: + GPUMicroOp(void) = default; + virtual ~GPUMicroOp(void) = default; + + virtual void execute(void) = 0; + + template + static void collapse_multi_space(const std::vector& field_data, collapsed_space &out_space, Arena &my_arena, cudaStream_t stream); + + static void collapse_parent_space(const IndexSpace& parent_space, collapsed_space &out_space, Arena &my_arena, cudaStream_t stream); + + static void build_bvh(const collapsed_space &space, BVH &bvh, Arena &my_arena, cudaStream_t stream); + + template + static void construct_input_rectlist(const collapsed_space &lhs, const collapsed_space &rhs, out_t* &d_valid_rects, size_t& out_size, uint32_t* counters, uint32_t* out_offsets, Arena &my_arena, cudaStream_t stream); + + template + static void volume_prefix_sum(const out_t* d_rects, size_t total_rects, size_t* &d_prefix_rects, size_t& num_pts, Arena &my_arena, cudaStream_t stream); + + template + void complete_pipeline(PointDesc* d_points, size_t total_pts, RectDesc* &d_out_rects, size_t &out_rects, Arena &my_arena, const Container& ctr, IndexFn getIndex, MapFn getMap); + + template + void complete_rect_pipeline(RectDesc* d_rects, size_t total_rects, RectDesc* &d_out_rects, size_t &out_rects, Arena &my_arena, const Container& ctr, IndexFn getIndex, MapFn getMap); + + template + void complete1d_pipeline(RectDesc* d_rects, size_t total_rects, RectDesc* &d_out_rects, size_t &out_rects, Arena &my_arena, const Container& ctr, IndexFn getIndex, MapFn getMap); + + template + void send_output(RectDesc* d_rects, size_t total_rects, Arena &my_arena, const Container& ctr, IndexFn getIndex, MapFn getMap); + + bool exclusive = false; + + }; + //////////////////////////////////////// // diff --git a/src/realm/deppart/partitions_gpu.cu b/src/realm/deppart/partitions_gpu.cu new file mode 100644 index 0000000000..b842e93f58 --- /dev/null +++ b/src/realm/deppart/partitions_gpu.cu 
@@ -0,0 +1,29 @@ +/* Copyright 2024 Stanford University, NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// per-dimension instantiator for the shared GPU partitioning +// micro-op utilities (GPUMicroOp) + + +#include "realm/deppart/partitions_gpu_impl.hpp" +#include "realm/deppart/inst_helper.h" + +namespace Realm { + #define DOIT(N,T) \ + template class GPUMicroOp; + + FOREACH_NT(DOIT) + +} // namespace Realm \ No newline at end of file diff --git a/src/realm/deppart/partitions_gpu_impl.hpp b/src/realm/deppart/partitions_gpu_impl.hpp new file mode 100644 index 0000000000..678102b56f --- /dev/null +++ b/src/realm/deppart/partitions_gpu_impl.hpp @@ -0,0 +1,1604 @@ +#pragma once +#include "deppart_config.h" +#include "partitions.h" +#ifdef REALM_USE_NVTX +#include "realm/nvtx.h" +#endif +#include "realm/cuda/cuda_internal.h" +#include "realm/deppart/partitions_gpu_kernels.hpp" +#include + +//CUDA ERROR CHECKING MACROS + +#define CUDA_CHECK(call, stream) \ + do { \ + cudaError_t err = (call); \ + if (err != cudaSuccess) { \ + std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__ \ + << " '" #call "' failed with " \ + << cudaGetErrorString(err) << " (" << err << ")\n"; \ + assert(false); \ + } \ + } while (0) + +#define KERNEL_CHECK(stream) \ + do { \ + cudaError_t err = cudaGetLastError(); \ + if (err != cudaSuccess) { \ + std::cerr << "Kernel launch failed at " << __FILE__ << ":" << __LINE__ \ + << ": " <<
cudaGetErrorString(err) << "\n"; \ + assert(false); \ + } \ + } while (0) + +#define THREADS_PER_BLOCK 256 + +#define COMPUTE_GRID(num_items) \ + (((num_items) + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK) + + +//NVTX macros to only add ranges if defined. +#ifdef REALM_USE_NVTX + + #define NVTX_CAT(a,b) a##b + + #define NVTX_DEPPART(message) \ + nvtxScopedRange NVTX_CAT(nvtx_, message)("cuda", #message, 0) + +#else + + #define NVTX_DEPPART(message) do { } while (0) + +#endif + +namespace Realm { + + // Used by cub::DeviceReduce to compute bad GPU approximation. + template + struct UnionRectOp { + __host__ __device__ + Rect operator()(const Rect& a, + const Rect& b) const { + Rect r; + for(int d=0; d b.hi[d] ? a.hi[d] : b.hi[d]; + } + return r; + } + }; + + // Used to compute prefix sum by volume for an array of Rects or RectDescs. + template + struct RectVolumeOp { + __device__ __forceinline__ + size_t operator()(const out_t& r) const { + if constexpr (std::is_same_v, out_t>) { + return r.volume(); + } else { + return r.rect.volume(); + } + } + }; + + // Finds a memory of the specified kind. Returns true on success, false otherwise. 
+ inline bool find_memory(Memory &output, Memory::Kind kind) + { + bool found = false; + Machine machine = Machine::get_machine(); + std::set all_memories; + machine.get_all_memories(all_memories); + for(auto& memory : all_memories) { + if(memory.kind() == kind) { + output = memory; + found = true; + break; + } + } + return found; + } + + //Given a list of spaces, compacts them all into one collapsed_space + template + template + void GPUMicroOp::collapse_multi_space(const std::vector& spaces, collapsed_space &out_space, Arena &my_arena, cudaStream_t stream) + { + + char *val = std::getenv("SHATTER_SIZE"); // or any env var + int shatter_size = 1; //default + if (val) { + shatter_size = atoi(val); + } + // We need space_offsets to preserve which space each rectangle came from + std::vector space_offsets(spaces.size() + 1); + + // Determine size of allocation for combined rects. + out_space.num_entries = 0; + + for (size_t i = 0; i < spaces.size(); ++i) { + space_offsets[i] = out_space.num_entries; + IndexSpace my_space; + if constexpr (std::is_same_v>) { + my_space = spaces[i]; + } else { + my_space = spaces[i].index_space; + } + if (my_space.dense()) { + if constexpr (std::is_same_v>) { + out_space.num_entries += 1; + } else { + out_space.num_entries += shatter_size; + } + } else { + out_space.num_entries += my_space.sparsity.impl()->get_entries().size(); + } + } + space_offsets[spaces.size()] = out_space.num_entries; + + //We copy into one contiguous host buffer, then copy to device + Memory sysmem; + assert(find_memory(sysmem, Memory::SYSTEM_MEM)); + + + RegionInstance h_instance = realm_malloc(out_space.num_entries * sizeof(SparsityMapEntry), sysmem); + SparsityMapEntry* h_entries = reinterpret_cast*>(AffineAccessor(h_instance, 0).base); + + if (my_arena.capacity()==0) { + out_space.entries_buffer = reinterpret_cast*>(AffineAccessor(h_instance, 0).base); + } else { + out_space.entries_buffer = my_arena.alloc >(out_space.num_entries); + } + + + //Now we fill the 
host array with all rectangles + size_t pos = 0; + for (size_t i = 0; i < spaces.size(); ++i) { + IndexSpace my_space; + if constexpr (std::is_same_v>) { + my_space = spaces[i]; + } else { + my_space = spaces[i].index_space; + } + if (my_space.dense()) { + if constexpr (std::is_same_v>) { + SparsityMapEntry entry; + entry.bounds = my_space.bounds; + memcpy(h_entries + pos, &entry, sizeof(SparsityMapEntry)); + ++pos; + } else { + std::vector > tmp(shatter_size); + int ppt = (my_space.bounds.hi[0] - my_space.bounds.lo[0]+1) / shatter_size; + for (int i = 0; i < shatter_size; ++i) { + Rect new_rect = my_space.bounds; + new_rect.lo[0] = my_space.bounds.lo[0] + i * ppt; + new_rect.hi[0] = (i == shatter_size - 1) ? my_space.bounds.hi[0] : (new_rect.lo[0] + ppt - 1); + SparsityMapEntry entry; + entry.bounds = new_rect; + entry.sparsity.id = 0; + entry.bitmap = 0; + tmp[i] = entry; + } + memcpy(h_entries + pos, tmp.data(), tmp.size() * sizeof(SparsityMapEntry)); + pos += shatter_size; + } + } else { + span> tmp = my_space.sparsity.impl()->get_entries(); + memcpy(h_entries + pos, tmp.data(), tmp.size() * sizeof(SparsityMapEntry)); + pos += tmp.size(); + } + } + + //Now we copy our entries and offsets to the device + CUDA_CHECK(cudaMemcpyAsync(out_space.offsets, space_offsets.data(), (spaces.size() + 1) * sizeof(size_t), cudaMemcpyHostToDevice, stream), stream); + if (my_arena.capacity() != 0) { + CUDA_CHECK(cudaMemcpyAsync(out_space.entries_buffer, h_entries, out_space.num_entries * sizeof(SparsityMapEntry), cudaMemcpyHostToDevice, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + h_instance.destroy(); + } + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + } + + // Only real work here is getting dense/sparse into a single collapsed_space. 
+ template + void GPUMicroOp::collapse_parent_space(const IndexSpace& parent_space, collapsed_space &out_space, Arena &my_arena, cudaStream_t stream) + { + if (parent_space.dense()) { + SparsityMapEntry entry; + entry.bounds = parent_space.bounds; + out_space.entries_buffer = my_arena.alloc>(1); + out_space.num_entries = 1; + CUDA_CHECK(cudaMemcpyAsync(out_space.entries_buffer, &entry, sizeof(SparsityMapEntry), cudaMemcpyHostToDevice, stream), stream); + } else { + span> tmp = parent_space.sparsity.impl()->get_entries(); + out_space.num_entries = tmp.size(); + out_space.entries_buffer = my_arena.alloc>(tmp.size()); + out_space.bounds = parent_space.bounds; + CUDA_CHECK(cudaMemcpyAsync(out_space.entries_buffer, tmp.data(), tmp.size() * sizeof(SparsityMapEntry), cudaMemcpyHostToDevice, stream), stream); + } + out_space.offsets = nullptr; + out_space.num_children = 1; + } + + // Given a collapsed space, builds a (potentially marked) bvh over that space. + // Based on Tero Karras' Maximizing Parallelism in the Construction of BVHs, Octrees, and k-d Trees + template + void GPUMicroOp::build_bvh(const collapsed_space &space, BVH &result, Arena &my_arena, cudaStream_t stream) + { + + //We want to keep the entire BVH that we return in one instance for convenience. + size_t indices_instance_size = space.num_entries * sizeof(uint64_t); + size_t labels_instance_size = space.offsets == nullptr ? 0 : space.num_entries * sizeof(size_t); + size_t boxes_instance_size = (2*space.num_entries - 1) * sizeof(Rect); + size_t child_instance_size = (2*space.num_entries - 1) * sizeof(int); + + size_t total_instance_size = indices_instance_size + labels_instance_size + boxes_instance_size + 2 * child_instance_size; + char* bvh_ptr = my_arena.alloc(total_instance_size); + + result.num_leaves = space.num_entries; + + size_t curr_idx = 0; + result.indices = reinterpret_cast(bvh_ptr + curr_idx); + curr_idx += indices_instance_size; + result.labels = space.offsets == nullptr ? 
nullptr : reinterpret_cast(bvh_ptr + curr_idx); + curr_idx += labels_instance_size; + result.boxes = reinterpret_cast*>(bvh_ptr + curr_idx); + curr_idx += boxes_instance_size; + result.childLeft = reinterpret_cast(bvh_ptr + curr_idx); + curr_idx += child_instance_size; + result.childRight = reinterpret_cast(bvh_ptr + curr_idx); + + size_t prev = my_arena.mark(); + + // Bounds used for morton code computation. + Rect* d_global_bounds = my_arena.alloc>(1); + CUDA_CHECK(cudaMemcpyAsync(d_global_bounds, &space.bounds, sizeof(Rect), cudaMemcpyHostToDevice, stream), stream); + + // These are intermediate instances we'll destroy before returning. + char* d_morton_visit = my_arena.alloc(2 * space.num_entries * max(sizeof(uint64_t), sizeof(int))); + uint64_t* d_morton_codes = reinterpret_cast(d_morton_visit); + + size_t intermed = my_arena.mark(); + + uint64_t* d_indices_in = my_arena.alloc(space.num_entries); + + // We compute morton codes for each leaf and sort, labeling if necessary. + bvh_build_morton_codes<<>>(space.entries_buffer, space.offsets, d_global_bounds, space.num_entries, space.num_children, d_morton_codes, d_indices_in, result.labels); + KERNEL_CHECK(stream); + + uint64_t* d_morton_codes_out = d_morton_codes + space.num_entries; + uint64_t* d_indices_out = result.indices; + + void *bvh_temp = nullptr; + size_t bvh_temp_bytes = 0; + cub::DeviceRadixSort::SortPairs(bvh_temp, bvh_temp_bytes, d_morton_codes, d_morton_codes_out, d_indices_in, + d_indices_out, space.num_entries, 0, 64, stream); + bvh_temp = reinterpret_cast(my_arena.alloc(bvh_temp_bytes)); + cub::DeviceRadixSort::SortPairs(bvh_temp, bvh_temp_bytes, d_morton_codes, d_morton_codes_out, d_indices_in, + d_indices_out, space.num_entries, 0, 64, stream); + + std::swap(d_morton_codes, d_morton_codes_out); + + my_arena.rollback(intermed); + + + // Another temporary instance. 
+ int* d_parent = my_arena.alloc(2*space.num_entries - 1); + CUDA_CHECK(cudaMemsetAsync(d_parent, -1, (2*space.num_entries - 1) * sizeof(int), stream), stream); + + // Here's where we actually build the BVH + int n = (int) space.num_entries; + bvh_build_radix_tree_kernel<<< COMPUTE_GRID(space.num_entries - 1), THREADS_PER_BLOCK, 0, stream>>>(d_morton_codes, result.indices, n, result.childLeft, result.childRight, d_parent); + KERNEL_CHECK(stream); + + // Figure out which node didn't get its parent set. + int* d_root = my_arena.alloc(1); + + CUDA_CHECK(cudaMemsetAsync(d_root, -1, sizeof(int), stream), stream); + + bvh_build_root_kernel<<< COMPUTE_GRID(2 * space.num_entries - 1), THREADS_PER_BLOCK, 0, stream>>>(d_root, d_parent, space.num_entries); + KERNEL_CHECK(stream); + + CUDA_CHECK(cudaMemcpyAsync(&result.root, d_root, sizeof(int), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + // Now we materialize the tree into something the client can query. + bvh_init_leaf_boxes_kernel<<>>(space.entries_buffer, result.indices, space.num_entries, result.boxes); + KERNEL_CHECK(stream); + + int* d_visitCount = reinterpret_cast(d_morton_visit); + CUDA_CHECK(cudaMemsetAsync(d_visitCount, 0, (2*space.num_entries - 1) * sizeof(int), stream), stream); + + bvh_merge_internal_boxes_kernel < N, T ><<< COMPUTE_GRID(space.num_entries), THREADS_PER_BLOCK, 0, stream>>>(space.num_entries, result.childLeft, result.childRight, d_parent, result.boxes, d_visitCount); + KERNEL_CHECK(stream); + + // Cleanup. + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + my_arena.rollback(prev); + + } + + // Intersects two collapsed spaces, where lhs is always instances and rhs is either parent or sources/targets. + // If rhs is sources/targets, we mark the intersected rectangles by where they came from. + // If the intersection is costly, we accelerate with a BVH. 
+ template + template + void GPUMicroOp::construct_input_rectlist(const collapsed_space &lhs, const collapsed_space &rhs, out_t* &d_valid_rects, size_t& out_size, uint32_t* counters, uint32_t* out_offsets, Arena &my_arena, cudaStream_t stream) + { + + CUDA_CHECK(cudaMemsetAsync(counters, 0, (lhs.num_children) * sizeof(uint32_t), stream), stream); + + BVH my_bvh; + bool bvh_valid = rhs.num_children < rhs.num_entries; + if (bvh_valid) { + build_bvh(rhs, my_bvh, my_arena, stream); + } + + // First pass: figure out how many rectangles survive intersection. + if (!bvh_valid) { + intersect_input_rects<<>>(lhs.entries_buffer, rhs.entries_buffer, lhs.offsets, nullptr, rhs.offsets, lhs.num_entries, rhs.num_entries, lhs.num_children, rhs.num_children, counters, nullptr); + } else { + query_input_bvh<<>>(lhs.entries_buffer, lhs.offsets, my_bvh.root, my_bvh.childLeft, my_bvh.childRight, my_bvh.indices, my_bvh.labels, my_bvh.boxes, lhs.num_entries, my_bvh.num_leaves, lhs.num_children, nullptr, counters, nullptr); + } + KERNEL_CHECK(stream); + + + // Prefix sum over instances (small enough to keep on host). + std::vector h_inst_counters(lhs.num_children+1); + h_inst_counters[0] = 0; // prefix sum starts at 0 + CUDA_CHECK(cudaMemcpyAsync(h_inst_counters.data()+1, counters, lhs.num_children * sizeof(uint32_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + for (size_t i = 0; i < lhs.num_children; ++i) { + h_inst_counters[i+1] += h_inst_counters[i]; + } + + out_size = h_inst_counters[lhs.num_children]; + + if (out_size==0) { + return; + } + + //Moving on... + my_arena.flip_parity(); + + // Non-empty rectangles from the intersection. + d_valid_rects = my_arena.alloc(out_size); + + // Where each instance should start writing its rectangles. + CUDA_CHECK(cudaMemcpyAsync(out_offsets, h_inst_counters.data(), (lhs.num_children + 1) * sizeof(uint32_t), cudaMemcpyHostToDevice, stream), stream); + + // Reset counters. 
+ CUDA_CHECK(cudaMemsetAsync(counters, 0, lhs.num_children * sizeof(uint32_t), stream), stream); + + // Second pass: recompute intersection, but this time write to output. + if (!bvh_valid) { + intersect_input_rects<<>>(lhs.entries_buffer, rhs.entries_buffer, lhs.offsets, out_offsets, rhs.offsets, lhs.num_entries, rhs.num_entries, lhs.num_children, rhs.num_children, counters, d_valid_rects); + } else { + query_input_bvh<<>>(lhs.entries_buffer, lhs.offsets, my_bvh.root, my_bvh.childLeft, my_bvh.childRight, my_bvh.indices, my_bvh.labels, my_bvh.boxes, lhs.num_entries, my_bvh.num_leaves, lhs.num_children, out_offsets, counters, d_valid_rects); + } + KERNEL_CHECK(stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + } + + // Prefix sum an array of Rects or RectDescs by volume. + template + template + void GPUMicroOp::volume_prefix_sum(const out_t* d_rects, size_t total_rects, size_t* &d_prefix_rects, size_t& num_pts, Arena &my_arena, cudaStream_t stream) + { + d_prefix_rects = my_arena.alloc(total_rects+1); + CUDA_CHECK(cudaMemsetAsync(d_prefix_rects, 0, sizeof(size_t), stream), stream); + + size_t prev = my_arena.mark(); + + // Build the CUB transform‐iterator. 
+ using VolIter = cub::TransformInputIterator< + size_t, // output type + RectVolumeOp, // functor + const out_t* // underlying input iterator + >; + VolIter d_volumes(d_rects, RectVolumeOp()); + + void* d_temp = nullptr; + size_t rect_temp_bytes = 0; + cub::DeviceScan::InclusiveSum( + /* d_temp_storage */ nullptr, + /* temp_bytes */ rect_temp_bytes, + /* d_in */ d_volumes, + /* d_out */ d_prefix_rects + 1, // shift by one so prefix[1]..prefix[n] + /* num_items */ total_rects, stream); + + d_temp = reinterpret_cast(my_arena.alloc(rect_temp_bytes)); + cub::DeviceScan::InclusiveSum( + /* d_temp_storage */ d_temp, + /* temp_bytes */ rect_temp_bytes, + /* d_in */ d_volumes, + /* d_out */ d_prefix_rects + 1, + /* num_items */ total_rects, stream); + + + //Number of points across all rectangles (also our total output count). + CUDA_CHECK(cudaMemcpyAsync(&num_pts, &d_prefix_rects[total_rects], sizeof(size_t), cudaMemcpyDeviceToHost, stream), stream); + + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + my_arena.rollback(prev); + } + + template + struct SegmentedMax { + __device__ __forceinline__ + HiFlag operator()(HiFlag a, HiFlag b) const { + // if b.head==1, start new segment at b; otherwise merge with running max + return b.head + ? b + : HiFlag{ a.hi > b.hi ? a.hi : b.hi , a.head }; + } + }; + + struct SegmentedSum { + __device__ __forceinline__ + DeltaFlag operator()(DeltaFlag a, DeltaFlag b) const { + // if b.head==1, start new segment at b; otherwise merge with running sum + return b.head + ? b + : DeltaFlag{ a.delta + b.delta , a.head }; + } + }; + + struct CustomSum + { + template + __device__ __forceinline__ + T operator()(const T &a, const T &b) const { + return b+a; + } + }; + + + /* + * Input: An array of rectangles (potentially overlapping) with associated + * src indices, where all the rectangles with a given src idx together represent an exact covering + * of the partitioning output for that index. 
+ * Output: A disjoint, coalesced array of rectangles sorted by src idx that it then sends off + * to the send output function, which constructs the final sparsity map. + * Approach: The difficult part is constructing a disjoint covering. To do so, collect all the corners from all the + * rectangles as the unique "boundaries" for each dimension and mark them with the parity for the number of dimensions + * in which they are the hi+1 coord (we add 1 to make intervals half-open). This means that if you prefix sum in each dimension, + * for any given rectangle anything internal will sum to 1, and anything external will sum to 0. To understand the intuition, + * see the illustration below for the rectangle [(0,0), (2,2)] + * Corners: (0,0), (0,3), (3,0), (3,3) + * Parities: 0 hi-> +1, 1 hi -> -1, 1 hi -> -1, 2 hi -> +1 + * Computation: + * Initial Markings + * 0 1 2 3 4 ... + * 0 +1 -1 + * 1 + * 2 + * 3 -1 +1 + * 4 + * ... + * Prefix sum by Y + * 0 1 2 3 4 ... + * 0 +1 -1 + * 1 1 -1 + * 2 1 -1 + * 3 0 0 + * 4 0 0 + * ... + * Prefix sum by X + * 0 1 2 3 4 ... + * 0 +1 1 1 0 0 ... + * 1 1 1 1 0 0 ... + * 2 1 1 1 0 0 ... + * 3 0 0 0 0 0 ... + * 4 0 0 0 0 0 ... + * ... + * Note that all the points in the rectangle end up labeled 1, and all the points outside labeled 0. In the actual computation, we use segments + * rather than points, where a segment accounts for all points between two consecutive boundaries. Because a prefix sum is a linear operator, when + * we extend the computation above to multiple overlapping rectangles, you end up with included segments labeled with a count of how many rectangles include them, + * and excluded segments labeled with 0. Thus, for the last dimension, we emit all segments with sums > 0 as disjoint output rectangles. We can then dump these + * into the sort + coalesce pipeline. 
+ */ + template + template + void GPUMicroOp::complete_rect_pipeline(RectDesc* d_rects, size_t total_rects, RectDesc* &d_out_rects, size_t &out_rects, Arena &my_arena, const Container& ctr, IndexFn getIndex, MapFn getMap) + { + + //1D case is much simpler + if (N==1) { + this->complete1d_pipeline(d_rects, total_rects, d_out_rects, out_rects, my_arena, ctr, getIndex, getMap); + return; + } + NVTX_DEPPART(complete_rect_pipeline); + cudaStream_t stream = Cuda::get_task_cuda_stream(); + + Memory my_mem; + bool found = find_memory(my_mem, Memory::GPU_FB_MEM); + assert(found); + + RegionInstance srcs_instance = this->realm_malloc(4*total_rects*sizeof(int32_t), my_mem); + RegionInstance crds_instance = this->realm_malloc(4*total_rects*sizeof(T), my_mem); + RegionInstance heads_instance = this->realm_malloc(2*total_rects * sizeof(uint8_t), my_mem); + RegionInstance sum_instance = this->realm_malloc(2*total_rects * sizeof(size_t), my_mem); + + RegionInstance B_src_inst[N]; + RegionInstance B_coord_inst[N]; + + size_t *B_starts[N]; + size_t *B_ends[N]; + + T* B_coord[N]; + size_t B_size[N]; + + RegionInstance B_ptrs_instance = this->realm_malloc(2 * N * sizeof(size_t*), my_mem); + size_t** B_start_ptrs = reinterpret_cast(AffineAccessor(B_ptrs_instance, 0).base); + size_t** B_end_ptrs = reinterpret_cast(AffineAccessor(B_ptrs_instance, 0).base) + N; + + RegionInstance B_coord_ptrs_instance = this->realm_malloc(N * sizeof(T*), my_mem); + T** B_coord_ptrs = reinterpret_cast(AffineAccessor(B_coord_ptrs_instance, 0).base); + + int threads_per_block = 256; + size_t grid_size = (total_rects + threads_per_block - 1) / threads_per_block; + + RegionInstance tmp_instance; + size_t orig_tmp = 0; + void *tmp_storage = nullptr; + + //Our first step is to find all the unique "boundaries" in each dimension (lo coord or hi+1 coord) + { + NVTX_DEPPART(mark_endpoints); + for (int d = 0; d < N; ++d) { + + //We need the coordinates to be sorted by our curent dim and separated by src idx + 
grid_size = (total_rects + threads_per_block - 1) / threads_per_block; + uint32_t* d_srcs_in = reinterpret_cast(AffineAccessor(srcs_instance, 0).base); + uint32_t* d_srcs_out = reinterpret_cast(AffineAccessor(srcs_instance, 0).base) + 2* total_rects; + T* d_coord_keys_in = reinterpret_cast(AffineAccessor(crds_instance,0).base); + T* d_coord_keys_out = reinterpret_cast(AffineAccessor(crds_instance,0).base) + 2 * total_rects; + mark_endpoints<<>>(d_rects, total_rects, d, d_srcs_in, d_coord_keys_in); + KERNEL_CHECK(stream); + size_t temp_bytes; + cub::DeviceRadixSort::SortPairs(nullptr, temp_bytes, + d_coord_keys_in, d_coord_keys_out, + d_srcs_in, d_srcs_out, + 2 * total_rects, 0, 8*sizeof(T), stream); + if (temp_bytes > orig_tmp) { + if (orig_tmp > 0) { + tmp_instance.destroy(); + } + tmp_instance = this->realm_malloc(temp_bytes, my_mem); + orig_tmp = temp_bytes; + tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); + } + cub::DeviceRadixSort::SortPairs(tmp_storage, temp_bytes, + d_coord_keys_in, d_coord_keys_out, + d_srcs_in, d_srcs_out, + 2 * total_rects, 0, 8*sizeof(T), stream); + std::swap(d_srcs_in, d_srcs_out); + std::swap(d_coord_keys_in, d_coord_keys_out); + cub::DeviceRadixSort::SortPairs(nullptr, temp_bytes, + d_srcs_in, d_srcs_out, + d_coord_keys_in, d_coord_keys_out, + 2 * total_rects, 0, 8*sizeof(uint32_t), stream); + if (temp_bytes > orig_tmp) { + if (orig_tmp > 0) { + tmp_instance.destroy(); + } + tmp_instance = this->realm_malloc(temp_bytes, my_mem); + orig_tmp = temp_bytes; + tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); + } + cub::DeviceRadixSort::SortPairs(tmp_storage, temp_bytes, + d_srcs_in, d_srcs_out, + d_coord_keys_in, d_coord_keys_out, + 2 * total_rects, 0, 8*sizeof(uint32_t), stream); + + //Now mark the unique keys + grid_size = (2*total_rects + threads_per_block - 1) / threads_per_block; + uint8_t * d_heads = reinterpret_cast(AffineAccessor(heads_instance, 0).base); + size_t *d_output = 
reinterpret_cast(AffineAccessor(sum_instance, 0).base); + mark_heads<<>>(d_srcs_out, d_coord_keys_out, 2 * total_rects, d_heads); + KERNEL_CHECK(stream); + + cub::DeviceScan::ExclusiveSum(nullptr, temp_bytes, d_heads, d_output, 2 * total_rects, stream); + if (temp_bytes > orig_tmp) { + if (orig_tmp > 0) { + tmp_instance.destroy(); + } + tmp_instance = this->realm_malloc(temp_bytes, my_mem); + orig_tmp = temp_bytes; + tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); + } + cub::DeviceScan::ExclusiveSum(tmp_storage, temp_bytes, d_heads, d_output, 2 * total_rects, stream); + + size_t num_unique; + uint8_t last_bit; + CUDA_CHECK(cudaMemcpyAsync(&num_unique, &d_output[2*total_rects-1], sizeof(size_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaMemcpyAsync(&last_bit, &d_heads[2*total_rects-1], sizeof(uint8_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + num_unique += last_bit; + + //Collect all the data we'll need later for this dimension - starts/ends by src, unique boundaries, unique boundaries count + B_coord_inst[d] = this->realm_malloc(num_unique * sizeof(T), my_mem); + B_src_inst[d] = this->realm_malloc(2*ctr.size() * sizeof(size_t), my_mem); + B_starts[d] = reinterpret_cast(AffineAccessor(B_src_inst[d], 0).base); + B_ends[d] = reinterpret_cast(AffineAccessor(B_src_inst[d], 0).base) + ctr.size(); + B_coord[d] = reinterpret_cast(AffineAccessor(B_coord_inst[d], 0).base); + B_size[d] = num_unique; + CUDA_CHECK(cudaMemsetAsync(B_starts[d], 0, ctr.size() * sizeof(size_t), stream), stream); + CUDA_CHECK(cudaMemsetAsync(B_ends[d], 0, ctr.size() * sizeof(size_t), stream), stream); + scatter_unique<<>>(d_srcs_out, d_coord_keys_out, d_output, d_heads, 2 * total_rects, B_starts[d], B_ends[d], B_coord[d]); + KERNEL_CHECK(stream); + std::vector d_starts_host(ctr.size()), d_ends_host(ctr.size()); + CUDA_CHECK(cudaMemcpyAsync(d_starts_host.data(), B_starts[d], ctr.size() * sizeof(size_t), 
cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaMemcpyAsync(d_ends_host.data(), B_ends[d], ctr.size() * sizeof(size_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + for (size_t i = 1; i < ctr.size(); i++) { + if (d_starts_host[i] < d_ends_host[i-1]) { + d_starts_host[i] = d_ends_host[i-1]; + d_ends_host[i] = d_ends_host[i-1]; + } + } + CUDA_CHECK(cudaMemcpyAsync(B_starts[d], d_starts_host.data(), ctr.size() * sizeof(size_t), cudaMemcpyHostToDevice, stream), stream); + CUDA_CHECK(cudaMemcpyAsync(B_ends[d], d_ends_host.data(), ctr.size() * sizeof(size_t), cudaMemcpyHostToDevice, stream), stream); + } + + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + } + srcs_instance.destroy(); + crds_instance.destroy(); + heads_instance.destroy(); + sum_instance.destroy(); + + + //We need the arrays themselves on the device + CUDA_CHECK(cudaMemcpyAsync(B_coord_ptrs, B_coord, N * sizeof(T*), cudaMemcpyHostToDevice, stream), stream); + CUDA_CHECK(cudaMemcpyAsync(B_start_ptrs, B_starts, N * sizeof(size_t*), cudaMemcpyHostToDevice, stream), stream); + CUDA_CHECK(cudaMemcpyAsync(B_end_ptrs, B_ends, N * sizeof(size_t*), cudaMemcpyHostToDevice, stream), stream); + + //Next up, we generate all the corners of all the rectangles and mark them by parity + size_t num_corners = (1 << N); + RegionInstance corners_instance = this->realm_malloc(2 * num_corners * total_rects * sizeof(CornerDesc), my_mem); + CornerDesc* d_corners_in = reinterpret_cast*>(AffineAccessor(corners_instance, 0).base); + CornerDesc* d_corners_out = reinterpret_cast*>(AffineAccessor(corners_instance, 0).base) + num_corners * total_rects; + + populate_corners<<>>(d_rects, total_rects, d_corners_in); + KERNEL_CHECK(stream); + + + // We have a LOT of bookkeeping to do + std::set RLE_alloc_events; + + size_t alloc_size_1 = std::max({sizeof(size_t), sizeof(T), sizeof(int32_t), sizeof(DeltaFlag)}); + + RegionInstance shared_instance = this->realm_malloc(2 * 
num_corners * total_rects * alloc_size_1, my_mem); + + RegionInstance flags_instance = this->realm_malloc(num_corners * total_rects * sizeof(uint8_t), my_mem); + + RegionInstance exc_sum_instance = this->realm_malloc(num_corners * total_rects * sizeof(size_t), my_mem); + + size_t per_elem_size = 2*alloc_size_1 + sizeof(uint8_t) + sizeof(size_t); + + size_t* d_src_keys_in = reinterpret_cast(AffineAccessor(shared_instance, 0).base); + size_t* d_src_keys_out = reinterpret_cast(AffineAccessor(shared_instance, 0).base) + num_corners * total_rects; + T* d_coord_keys_in = reinterpret_cast(AffineAccessor(shared_instance, 0).base); + T* d_coord_keys_out = reinterpret_cast(AffineAccessor(shared_instance, 0).base) + num_corners * total_rects; + int32_t* d_deltas = reinterpret_cast(AffineAccessor(shared_instance, 0).base); + int32_t* d_deltas_out = reinterpret_cast(AffineAccessor(shared_instance, 0).base) + num_corners * total_rects; + DeltaFlag* d_delta_flags_in = reinterpret_cast(AffineAccessor(shared_instance, 0).base); + DeltaFlag* d_delta_flags_out = reinterpret_cast(AffineAccessor(shared_instance, 0).base) + num_corners * total_rects; + uint8_t* d_flags = reinterpret_cast(AffineAccessor(flags_instance, 0).base); + size_t* d_exc_sum = reinterpret_cast(AffineAccessor(exc_sum_instance, 0).base); + + RegionInstance seg_bound_instance; + size_t* seg_starts; + size_t* seg_ends; + + RegionInstance seg_counters; + uint32_t* d_seg_counters; + + RegionInstance seg_counters_out; + uint32_t* d_seg_counters_out; + + grid_size = (num_corners * total_rects + threads_per_block - 1) / threads_per_block; + + //We need to reduce duplicate corners by their parity, so we sort to get duplicates next to each other and then reduce by key + { + NVTX_DEPPART(sort_corners); + for (int dim = 0; dim < N; dim++) { + build_coord_key<<>>(d_coord_keys_in, d_corners_in, num_corners * total_rects, dim); + KERNEL_CHECK(stream); + size_t temp_bytes; + cub::DeviceRadixSort::SortPairs(nullptr, temp_bytes, + 
d_coord_keys_in, d_coord_keys_out, + d_corners_in, d_corners_out, + num_corners * total_rects, 0, 8*sizeof(T), stream); + if (temp_bytes > orig_tmp) { + tmp_instance.destroy(); + tmp_instance = this->realm_malloc(temp_bytes, my_mem); + orig_tmp = temp_bytes; + tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); + } + cub::DeviceRadixSort::SortPairs(tmp_storage, temp_bytes, + d_coord_keys_in, d_coord_keys_out, + d_corners_in, d_corners_out, + num_corners * total_rects, 0, 8*sizeof(T), stream); + + std::swap(d_corners_in, d_corners_out); + + } + } + + size_t temp_bytes; + build_src_key<<>>(d_src_keys_in, d_corners_in, num_corners * total_rects); + KERNEL_CHECK(stream); + cub::DeviceRadixSort::SortPairs(nullptr, temp_bytes, + d_src_keys_in, d_src_keys_out, + d_corners_in, d_corners_out, + num_corners * total_rects, 0, 8*sizeof(size_t), stream); + if (temp_bytes > orig_tmp) { + tmp_instance.destroy(); + tmp_instance = this->realm_malloc(temp_bytes, my_mem); + orig_tmp = temp_bytes; + tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); + } + cub::DeviceRadixSort::SortPairs(tmp_storage, temp_bytes, + d_src_keys_in, d_src_keys_out, + d_corners_in, d_corners_out, + num_corners * total_rects, 0, 8*sizeof(size_t), stream); + + std::swap(d_corners_in, d_corners_out); + get_delta<<>>(d_deltas, d_corners_in, num_corners * total_rects); + KERNEL_CHECK(stream); + + RegionInstance num_runs_instance = this->realm_malloc(sizeof(int), my_mem); + int* d_num_runs = reinterpret_cast(AffineAccessor(num_runs_instance, 0).base); + + //See above, we have custom equality and reduction operators for CornerDesc + CustomSum red_op; + cub::DeviceReduce::ReduceByKey( + nullptr, temp_bytes, + d_corners_in, d_corners_out, + d_deltas, d_deltas_out, + d_num_runs, + red_op, + /*num_items=*/(int) (num_corners * total_rects), + /*stream=*/stream); + + if (temp_bytes > orig_tmp) { + tmp_instance.destroy(); + tmp_instance = this->realm_malloc(temp_bytes, my_mem); + 
orig_tmp = temp_bytes; + tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); + } + cub::DeviceReduce::ReduceByKey( + tmp_storage, temp_bytes, + d_corners_in, d_corners_out, + d_deltas, d_deltas_out, + d_num_runs, + red_op, + /*num_items=*/(int) (num_corners * total_rects), + /*stream=*/stream); + + int num_unique_corners; + CUDA_CHECK(cudaMemcpyAsync(&num_unique_corners, d_num_runs, sizeof(int), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + num_runs_instance.destroy(); + + grid_size = (num_unique_corners + threads_per_block - 1) / threads_per_block; + set_delta<<>>(d_deltas_out, d_corners_out, num_unique_corners); + KERNEL_CHECK(stream); + + std::swap(d_corners_out, d_corners_in); + + size_t num_intermediate = num_unique_corners; + size_t num_segments; + + //This is where the real work is done. In each dimension, we do a segmented prefix sum of the parity markings keyed on (src idx, {every dim but d}) for all active segments. + // Then, for each unique boundary b in dim d, for each segment s keyed on (src idx, {every dim but d}), we evaluate s's prefix sum value at b. If nonzero, we emit a segment + // for s between b and the next boundary in d with all the other coords set to s's coords. These become the active segments for the next pass. In the last pass (d = 0), rather + // than emitting segments, we emit rectangles for all segments with nonzero prefix sums (in fact they must also be nonnegative - recall the model is > 0 for included, 0 for excluded + // by the end). + { + NVTX_DEPPART(collapse_higher_dims); + for (int d = N-1; d >= 0; d--) { + grid_size = (num_intermediate + threads_per_block - 1) / threads_per_block; + + //Our least significant sort is by d. 
+ build_coord_key<<>>(d_coord_keys_in, d_corners_in, num_intermediate, d); + KERNEL_CHECK(stream); + size_t temp_bytes; + cub::DeviceRadixSort::SortPairs(nullptr, temp_bytes, + d_coord_keys_in, d_coord_keys_out, + d_corners_in, d_corners_out, + num_intermediate, 0, 8*sizeof(T), stream); + if (temp_bytes > orig_tmp) { + tmp_instance.destroy(); + tmp_instance = this->realm_malloc(temp_bytes, my_mem); + orig_tmp = temp_bytes; + tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); + } + cub::DeviceRadixSort::SortPairs(tmp_storage, temp_bytes, + d_coord_keys_in, d_coord_keys_out, + d_corners_in, d_corners_out, + num_intermediate, 0, 8*sizeof(T), stream); + + std::swap(d_corners_in, d_corners_out); + + //We need to key segments on every dimension but d and src idx, so we do a series of stable sorts to get there + for (int dim = 0; dim < N; dim++) { + if (dim == d) { + continue; + } + build_coord_key<<>>(d_coord_keys_in, d_corners_in, num_intermediate, dim); + KERNEL_CHECK(stream); + size_t temp_bytes; + cub::DeviceRadixSort::SortPairs(nullptr, temp_bytes, + d_coord_keys_in, d_coord_keys_out, + d_corners_in, d_corners_out, + num_intermediate, 0, 8*sizeof(T), stream); + if (temp_bytes > orig_tmp) { + tmp_instance.destroy(); + tmp_instance = this->realm_malloc(temp_bytes, my_mem); + orig_tmp = temp_bytes; + tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); + } + cub::DeviceRadixSort::SortPairs(tmp_storage, temp_bytes, + d_coord_keys_in, d_coord_keys_out, + d_corners_in, d_corners_out, + num_intermediate, 0, 8*sizeof(T), stream); + + std::swap(d_corners_in, d_corners_out); + + } + + build_src_key<<>>(d_src_keys_in, d_corners_in, num_intermediate); + KERNEL_CHECK(stream); + cub::DeviceRadixSort::SortPairs(nullptr, temp_bytes, + d_src_keys_in, d_src_keys_out, + d_corners_in, d_corners_out, + num_intermediate, 0, 8*sizeof(size_t), stream); + if (temp_bytes > orig_tmp) { + tmp_instance.destroy(); + tmp_instance = 
this->realm_malloc(temp_bytes, my_mem); + orig_tmp = temp_bytes; + tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); + } + cub::DeviceRadixSort::SortPairs(tmp_storage, temp_bytes, + d_src_keys_in, d_src_keys_out, + d_corners_in, d_corners_out, + num_intermediate, 0, 8*sizeof(size_t), stream); + + std::swap(d_corners_in, d_corners_out); + + //This serves 2 purposes + // 1) Our segmented prefix sum needs to know where to start and stop + // 2) We need to know how many unique segments (keyed on (src_idx, {every dimension but d}) we have + mark_deltas_heads<<>>(d_corners_in, num_intermediate, d, d_flags, d_delta_flags_in); + KERNEL_CHECK(stream); + + cub::DeviceScan::InclusiveSum(nullptr, temp_bytes, d_flags, d_exc_sum, num_intermediate, stream); + if (temp_bytes > orig_tmp) { + if (orig_tmp > 0) { + tmp_instance.destroy(); + } + tmp_instance = this->realm_malloc(temp_bytes, my_mem); + orig_tmp = temp_bytes; + tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); + } + cub::DeviceScan::InclusiveSum(tmp_storage, temp_bytes, d_flags, d_exc_sum, num_intermediate, stream); + + CUDA_CHECK(cudaMemcpyAsync(&num_segments, &d_exc_sum[num_intermediate-1], sizeof(size_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + //Mark the beginning and end of each segment for our kernel to use in binary search + seg_bound_instance = this->realm_malloc(2 * num_segments * sizeof(size_t), my_mem); + seg_starts = reinterpret_cast(AffineAccessor(seg_bound_instance, 0).base); + seg_ends = reinterpret_cast(AffineAccessor(seg_bound_instance, 0).base) + num_segments; + + seg_boundaries<<>>(d_flags, d_exc_sum, num_intermediate, seg_starts, seg_ends); + KERNEL_CHECK(stream); + + //Segmented prefix sum using our flags constructed above + cub::DeviceScan::InclusiveScan( + /*d_temp=*/ nullptr, + /*bytes=*/ temp_bytes, + /*in=*/ d_delta_flags_in, + /*out=*/ d_delta_flags_out, + /*op=*/ SegmentedSum(), + /*num_items=*/ 
num_intermediate, + /*stream=*/ stream + ); + + if (temp_bytes > orig_tmp) { + tmp_instance.destroy(); + tmp_instance = this->realm_malloc(temp_bytes, my_mem); + orig_tmp = temp_bytes; + tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); + } + + cub::DeviceScan::InclusiveScan( + /*d_temp=*/ tmp_storage, + /*bytes=*/ temp_bytes, + /*in=*/ d_delta_flags_in, + /*out=*/ d_delta_flags_out, + /*op=*/ SegmentedSum(), + /*num_items=*/ num_intermediate, + /*stream=*/ stream + ); + + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + //Per usual, we do a count + emit pass to track active segments and limit memory usage. If the evaluated prefix sum for a boundary within a segment + //is 0, we can skip it because it won't contribute anything to future sums and also won't be emitted. + seg_counters = this->realm_malloc(num_segments * sizeof(uint32_t), my_mem); + d_seg_counters = reinterpret_cast(AffineAccessor(seg_counters, 0).base); + CUDA_CHECK(cudaMemsetAsync(d_seg_counters, 0, num_segments * sizeof(uint32_t), stream), stream); + + grid_size = ((num_segments*B_size[d]) + threads_per_block - 1) / threads_per_block; + count_segments<<>>(d_delta_flags_out, seg_starts, seg_ends, B_starts[d], B_ends[d], d_corners_in, B_coord[d], B_size[d], num_segments, d, d_seg_counters); + KERNEL_CHECK(stream); + + seg_counters_out = this->realm_malloc(num_segments * sizeof(uint32_t), my_mem); + d_seg_counters_out = reinterpret_cast(AffineAccessor(seg_counters_out, 0).base); + + cub::DeviceScan::ExclusiveSum(nullptr, temp_bytes, d_seg_counters, d_seg_counters_out, num_segments, stream); + if (temp_bytes > orig_tmp) { + if (orig_tmp > 0) { + tmp_instance.destroy(); + } + tmp_instance = this->realm_malloc(temp_bytes, my_mem); + orig_tmp = temp_bytes; + tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); + } + cub::DeviceScan::ExclusiveSum(tmp_storage, temp_bytes, d_seg_counters, d_seg_counters_out, num_segments, stream); + + uint32_t next_round; + uint32_t 
last_count; + CUDA_CHECK(cudaMemcpyAsync(&next_round, &d_seg_counters_out[num_segments-1], sizeof(uint32_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaMemcpyAsync(&last_count, &d_seg_counters[num_segments-1], sizeof(uint32_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + next_round += last_count; + if (out_rects > 0 && (next_round + last_count) * per_elem_size > out_rects) { + shared_instance.destroy(); + flags_instance.destroy(); + exc_sum_instance.destroy(); + seg_bound_instance.destroy(); + seg_counters.destroy(); + seg_counters_out.destroy(); + corners_instance.destroy(); + out_rects = std::numeric_limits::max(); + return; + } + + num_intermediate = next_round; + + //In this case we exit out to emit rectangles rather than segments + if (d==0) { + break; + } + + RegionInstance next_corners_instance = this->realm_malloc(2 * next_round * sizeof(CornerDesc), my_mem); + CornerDesc* d_next_corners = reinterpret_cast*>(AffineAccessor(next_corners_instance, 0).base); + CUDA_CHECK(cudaMemsetAsync(d_seg_counters, 0, num_segments*sizeof(uint32_t), stream), stream); + + write_segments<<>>(d_delta_flags_out, seg_starts, seg_ends, B_starts[d], B_ends[d], d_corners_in, B_coord[d], d_seg_counters_out, B_size[d], num_segments, d, d_seg_counters, d_next_corners); + KERNEL_CHECK(stream); + + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + corners_instance.destroy(); + corners_instance = next_corners_instance; + d_corners_in = d_next_corners; + d_corners_out = d_next_corners + next_round; + + //The segment count in each iter is not monotonic, so we have to realloc each time + + shared_instance.destroy(); + flags_instance.destroy(); + exc_sum_instance.destroy(); + seg_bound_instance.destroy(); + seg_counters.destroy(); + seg_counters_out.destroy(); + + shared_instance = this->realm_malloc(2 * num_intermediate * alloc_size_1, my_mem); + flags_instance = this->realm_malloc(num_intermediate * sizeof(uint8_t), 
my_mem); + exc_sum_instance = this->realm_malloc(num_intermediate * sizeof(size_t), my_mem); + + d_src_keys_in = reinterpret_cast(AffineAccessor(shared_instance, 0).base); + d_src_keys_out = reinterpret_cast(AffineAccessor(shared_instance, 0).base) + num_intermediate; + d_coord_keys_in = reinterpret_cast(AffineAccessor(shared_instance, 0).base); + d_coord_keys_out = reinterpret_cast(AffineAccessor(shared_instance, 0).base) + num_intermediate; + d_deltas = reinterpret_cast(AffineAccessor(shared_instance, 0).base); + d_deltas_out = reinterpret_cast(AffineAccessor(shared_instance, 0).base) + num_intermediate; + + d_flags = reinterpret_cast(AffineAccessor(flags_instance, 0).base); + d_exc_sum = reinterpret_cast(AffineAccessor(exc_sum_instance, 0).base); + d_delta_flags_in = reinterpret_cast(AffineAccessor(shared_instance, 0).base); + d_delta_flags_out = reinterpret_cast(AffineAccessor(shared_instance, 0).base) + num_intermediate; + + } + } + + + //For our last dim, we emit rectangles rather than segments. These rectangles are a disjoint, precise covering of the original set. 
+ RegionInstance rects_out_instance = this->realm_malloc(2 * num_intermediate * sizeof(RectDesc), my_mem); + RectDesc* d_rects_out = reinterpret_cast*>(AffineAccessor(rects_out_instance, 0).base); + RectDesc* d_rects_in = reinterpret_cast*>(AffineAccessor(rects_out_instance, 0).base) + num_intermediate; + CUDA_CHECK(cudaMemsetAsync(d_seg_counters, 0, num_segments*sizeof(uint32_t), stream), stream); + + write_segments<<>>(d_delta_flags_out, seg_starts, seg_ends, B_start_ptrs, B_end_ptrs, d_corners_in, B_coord_ptrs, d_seg_counters_out, B_size[0], num_segments, d_seg_counters, d_rects_out); + KERNEL_CHECK(stream); + + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + //Don't need these anymore + flags_instance.destroy(); + exc_sum_instance.destroy(); + seg_bound_instance.destroy(); + seg_counters.destroy(); + seg_counters_out.destroy(); + corners_instance.destroy(); + for (int d = 0; d < N; d++) { + B_coord_inst[d].destroy(); + B_src_inst[d].destroy(); + } + B_ptrs_instance.destroy(); + B_coord_ptrs_instance.destroy(); + + std::swap(d_rects_out, d_rects_in); + + shared_instance.destroy(); + size_t alloc_size_2 = max(sizeof(size_t), sizeof(T)); + + shared_instance = this->realm_malloc(2 * num_intermediate * alloc_size_2, my_mem); + + d_src_keys_in = reinterpret_cast(AffineAccessor(shared_instance, 0).base); + d_src_keys_out = reinterpret_cast(AffineAccessor(shared_instance, 0).base) + num_intermediate; + d_coord_keys_in = reinterpret_cast(AffineAccessor(shared_instance, 0).base); + d_coord_keys_out = reinterpret_cast(AffineAccessor(shared_instance, 0).base) + num_intermediate; + + RegionInstance break_points_instance = this->realm_malloc(num_intermediate * sizeof(uint8_t), my_mem); + uint8_t* break_points = reinterpret_cast(AffineAccessor(break_points_instance, 0).base); + + size_t* group_ids = reinterpret_cast(AffineAccessor(shared_instance, 0).base); + + //Now that we have disjoint rectangles, we can do our usual sort and coalesce pass + size_t last = INT_MAX; + 
{ + NVTX_DEPPART(compact_disjoint_rects); + while (last > num_intermediate) { + last = num_intermediate; + + bool done = false; + for (int dim = 1; !done; dim++) { + if (dim == N) { + dim = 0; // wrap around to 0 + done = true; + } + grid_size = (num_intermediate + threads_per_block - 1) / threads_per_block; + + build_lo_key<<>>(d_coord_keys_in, d_rects_in, num_intermediate, dim); + KERNEL_CHECK(stream); + size_t temp_bytes; + cub::DeviceRadixSort::SortPairs(nullptr, temp_bytes, + d_coord_keys_in, d_coord_keys_out, + d_rects_in, d_rects_out, + num_intermediate, 0, 8*sizeof(T), stream); + if (temp_bytes > orig_tmp) { + tmp_instance.destroy(); + tmp_instance = this->realm_malloc(temp_bytes, my_mem); + orig_tmp = temp_bytes; + tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); + } + cub::DeviceRadixSort::SortPairs(tmp_storage, temp_bytes, + d_coord_keys_in, d_coord_keys_out, + d_rects_in, d_rects_out, + num_intermediate, 0, 8*sizeof(T), stream); + + std::swap(d_rects_in, d_rects_out); + for (int d = 0; d < N; d++) { + if (d == dim) { + continue; + } + build_hi_key<<>>(d_coord_keys_in, d_rects_in, num_intermediate, d); + KERNEL_CHECK(stream); + size_t temp_bytes; + cub::DeviceRadixSort::SortPairs(nullptr, temp_bytes, + d_coord_keys_in, d_coord_keys_out, + d_rects_in, d_rects_out, + num_intermediate, 0, 8*sizeof(T), stream); + if (temp_bytes > orig_tmp) { + tmp_instance.destroy(); + tmp_instance = this->realm_malloc(temp_bytes, my_mem); + orig_tmp = temp_bytes; + tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); + } + cub::DeviceRadixSort::SortPairs(tmp_storage, temp_bytes, + d_coord_keys_in, d_coord_keys_out, + d_rects_in, d_rects_out, + num_intermediate, 0, 8*sizeof(T), stream); + + std::swap(d_rects_in, d_rects_out); + build_lo_key<<>>(d_coord_keys_in, d_rects_in, num_intermediate, d); + KERNEL_CHECK(stream); + cub::DeviceRadixSort::SortPairs(nullptr, temp_bytes, + d_coord_keys_in, d_coord_keys_out, + d_rects_in, d_rects_out, + 
num_intermediate, 0, 8*sizeof(T), stream); + if (temp_bytes > orig_tmp) { + tmp_instance.destroy(); + tmp_instance = this->realm_malloc(temp_bytes, my_mem); + orig_tmp = temp_bytes; + tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); + } + cub::DeviceRadixSort::SortPairs(tmp_storage, temp_bytes, + d_coord_keys_in, d_coord_keys_out, + d_rects_in, d_rects_out, + num_intermediate, 0, 8*sizeof(T), stream); + + std::swap(d_rects_in, d_rects_out); + + } + + build_src_key<<>>(d_src_keys_in, d_rects_in, num_intermediate); + KERNEL_CHECK(stream); + cub::DeviceRadixSort::SortPairs(nullptr, temp_bytes, + d_src_keys_in, d_src_keys_out, + d_rects_in, d_rects_out, + num_intermediate, 0, 8*sizeof(size_t), stream); + if (temp_bytes > orig_tmp) { + tmp_instance.destroy(); + tmp_instance = this->realm_malloc(temp_bytes, my_mem); + orig_tmp = temp_bytes; + tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); + } + cub::DeviceRadixSort::SortPairs(tmp_storage, temp_bytes, + d_src_keys_in, d_src_keys_out, + d_rects_in, d_rects_out, + num_intermediate, 0, 8*sizeof(size_t), stream); + + std::swap(d_rects_in, d_rects_out); + + mark_breaks_dim<<>>(d_rects_in, break_points, num_intermediate, dim); + KERNEL_CHECK(stream); + + cub::DeviceScan::InclusiveSum(nullptr, temp_bytes, break_points, group_ids, num_intermediate, stream); + + if (temp_bytes > orig_tmp) { + tmp_instance.destroy(); + tmp_instance = this->realm_malloc(temp_bytes, my_mem); + orig_tmp = temp_bytes; + tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); + } + + cub::DeviceScan::InclusiveSum(tmp_storage, temp_bytes, break_points, group_ids, num_intermediate, stream); + + size_t last_grp; + CUDA_CHECK(cudaMemcpyAsync(&last_grp, &group_ids[num_intermediate-1], sizeof(size_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + init_rects_dim<<>>(d_rects_in, break_points, group_ids, d_rects_out, num_intermediate, dim); + 
KERNEL_CHECK(stream); + + num_intermediate = last_grp; + std::swap(d_rects_in, d_rects_out); + } + } + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + } + + heads_instance.destroy(); + shared_instance.destroy(); + tmp_instance.destroy(); + + //And... we're done + if (out_rects > 0) { + d_out_rects = d_rects_in; + out_rects = num_intermediate; + } else { + this->send_output(d_rects_in, num_intermediate, my_arena, ctr, getIndex, getMap); + rects_out_instance.destroy(); + } + + } + + /* + * Input: An array of 1D rectangles (potentially overlapping) with associated + * src indices, where all the rectangles with a given src idx together represent an exact covering + * of the partitioning output for that index. + * Output: A disjoint, coalesced array of rectangles sorted by src idx that it then sends off + * to the send output function, which constructs the final sparsity map. + * Approach: The canonical 1D rectangle merge, in parallel. Sort the rectangles by (src_idx, lo). Then + * prefix max by hi segmented by src_idx to find overlapping rectangles. Then, RLE by starting a new rectangle + * when in a new src or lo > current max hi and merging otherwise. 
+ */ + template + template + void GPUMicroOp::complete1d_pipeline(RectDesc* d_rects, size_t total_rects, RectDesc* &d_out_rects, size_t &out_rects, Arena &my_arena, const Container& ctr, IndexFn getIndex, MapFn getMap) + { + + NVTX_DEPPART(complete1d_pipeline); + cudaStream_t stream = Cuda::get_task_cuda_stream(); + + RectDesc* d_rects_in = d_rects; + + size_t bytes_T = total_rects * sizeof(T); + size_t bytes_S = total_rects * sizeof(size_t); + size_t bytes_HF = total_rects * sizeof(HiFlag); + size_t max_bytes = std::max({bytes_T, bytes_HF, bytes_S}); + + char* aux_ptr = my_arena.alloc(2 * max_bytes); + + uint8_t* break_points = my_arena.alloc(total_rects); + size_t* group_ids = my_arena.alloc(total_rects); + + T* d_keys_in = reinterpret_cast(aux_ptr); + T* d_keys_out = reinterpret_cast(aux_ptr + max_bytes); + + size_t* d_src_keys_in = reinterpret_cast(aux_ptr); + size_t* d_src_keys_out = reinterpret_cast(aux_ptr + max_bytes); + + HiFlag* d_hi_flags_in = reinterpret_cast*>(aux_ptr); + HiFlag* d_hi_flags_out = reinterpret_cast*>(aux_ptr + max_bytes); + + size_t num_intermediate = total_rects; + + const size_t prev = my_arena.mark(); + RectDesc* d_rects_out = my_arena.alloc>(total_rects); + + size_t t1=0, t2 = 0, t3 = 0, t4 = 0; + cub::DeviceRadixSort::SortPairs(nullptr, t1, + d_keys_in, d_keys_out, d_rects_in, d_rects_out, num_intermediate, + 0, 8*sizeof(T), stream); + // exclusive scan + cub::DeviceScan::ExclusiveScan(nullptr, t2, + d_hi_flags_in, d_hi_flags_out, + SegmentedMax(), HiFlag{std::numeric_limits::min(), 0}, + num_intermediate, stream); + // inclusive sum + cub::DeviceScan::InclusiveSum(nullptr, t3, + break_points, group_ids, + num_intermediate, stream); + + cub::DeviceRadixSort::SortPairs(nullptr, t4, d_src_keys_in, d_src_keys_out, d_rects_in, d_rects_out, num_intermediate, 0, 8*sizeof(size_t), stream); + + size_t temp_bytes = std::max({t1, t2, t3, t4}); + size_t use_bytes = temp_bytes; + void *temp_storage = my_arena.alloc(temp_bytes); + + int 
threads_per_block = 256; + size_t grid_size = (num_intermediate + threads_per_block - 1) / threads_per_block; + + //Sort the rectangles keyed by (src, lo) + { + NVTX_DEPPART(sort_rects); + + build_lo_key<<>>(d_keys_in, d_rects_in, num_intermediate, 0); + KERNEL_CHECK(stream); + cub::DeviceRadixSort::SortPairs(temp_storage, use_bytes, d_keys_in, d_keys_out, d_rects_in, d_rects_out, num_intermediate, 0, 8*sizeof(T), stream); + std::swap(d_rects_in, d_rects_out); + + build_src_key<<>>(d_src_keys_in, d_rects_in, num_intermediate); + KERNEL_CHECK(stream); + + use_bytes = temp_bytes; + cub::DeviceRadixSort::SortPairs(temp_storage, use_bytes, d_src_keys_in, d_src_keys_out, d_rects_in, d_rects_out, num_intermediate, 0, 8*sizeof(size_t), stream); + std::swap(d_rects_in, d_rects_out); + } + + //Prefix max by hi segmented by src, then RLE to merge. + { + NVTX_DEPPART(run_length_encode); + build_hi_flag<<>>(d_hi_flags_in, d_rects_in, num_intermediate, 0); + KERNEL_CHECK(stream); + + + use_bytes = temp_bytes; + cub::DeviceScan::ExclusiveScan( + /*d_temp=*/ temp_storage, + /*bytes=*/ use_bytes, + /*in=*/ d_hi_flags_in, + /*out=*/ d_hi_flags_out, + /*op=*/ SegmentedMax(), + HiFlag{std::numeric_limits::min(), 0}, + /*num_items=*/ num_intermediate, + /*stream=*/ stream + ); + + threads_per_block = 256; + grid_size = (num_intermediate + threads_per_block - 1) / threads_per_block; + mark_breaks_dim<<>>(d_hi_flags_in, d_hi_flags_out, d_rects_in, break_points, num_intermediate, 0); + KERNEL_CHECK(stream); + use_bytes = temp_bytes; + cub::DeviceScan::InclusiveSum(temp_storage, use_bytes, break_points, group_ids, num_intermediate, stream); + + size_t last_grp; + CUDA_CHECK(cudaMemcpyAsync(&last_grp, &group_ids[num_intermediate-1], sizeof(size_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + my_arena.rollback(prev); + my_arena.flip_parity(); + assert(my_arena.get_parity()); + my_arena.reset(true); + d_rects_out = 
my_arena.alloc>(last_grp); + my_arena.commit(true); + + init_rects_dim<<>>(d_rects_in, d_hi_flags_out, break_points, group_ids, d_rects_out, num_intermediate, 0); + KERNEL_CHECK(stream); + + num_intermediate = last_grp; + std::swap(d_rects_in, d_rects_out); + + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + } + + if (out_rects > 0) { + d_out_rects = d_rects_in; + out_rects = num_intermediate; + } else { + this->send_output(d_rects_in, num_intermediate, my_arena, ctr, getIndex, getMap); + } + } + + /* + * Input: An array of points (potentially with duplicates) with associated + * src indices, where all the points with a given src idx together represent an exact covering + * of the partitioning output for that index. + * Output: A disjoint, coalesced array of rectangles sorted by src idx that it then sends off + * to the send output function, which constructs the final sparsity map. + * Approach: Sort the points by (x0,x1,...,xN-1,src) (right is MSB). Convert them to singleton rects. + * Run-length encode along each dimension (N-1...0). 
+ */ + template + template + void GPUMicroOp::complete_pipeline(PointDesc* d_points, size_t total_pts, RectDesc* &d_out_rects, size_t &out_rects, Arena &my_arena, const Container& ctr, IndexFn getIndex, MapFn getMap) + { + + NVTX_DEPPART(complete_pipeline); + + size_t prev = my_arena.mark(); + + cudaStream_t stream = Cuda::get_task_cuda_stream(); + + size_t bytes_T = total_pts * sizeof(T); + size_t bytes_S = total_pts * sizeof(size_t); + size_t bytes_R = total_pts * sizeof(RectDesc); + size_t bytes_p = total_pts * sizeof(PointDesc); + size_t max_aux_bytes = std::max({bytes_T, bytes_S, bytes_R}); + size_t max_pg_bytes = std::max({bytes_p, bytes_S}); + + + // Instance shared by coordinate keys, source keys, and rectangle outputs + char* aux_ptr = my_arena.alloc(2 * max_aux_bytes); + + //Instance shared by group ids (RLE) and intermediate points in sorting + char* pg_ptr = my_arena.alloc(max_pg_bytes); + + uint8_t* break_points = my_arena.alloc(total_pts); + + T* d_keys_in = reinterpret_cast(aux_ptr); + T* d_keys_out = reinterpret_cast(aux_ptr + max_aux_bytes); + + PointDesc* d_points_in = d_points; + PointDesc* d_points_out = reinterpret_cast*>(pg_ptr); + + size_t* group_ids = reinterpret_cast(pg_ptr); + + RectDesc* d_rects_in = reinterpret_cast*>(aux_ptr); + RectDesc *d_rects_out = reinterpret_cast*>(aux_ptr + max_aux_bytes); + + size_t* d_src_keys_in = reinterpret_cast(aux_ptr); + size_t* d_src_keys_out = reinterpret_cast(aux_ptr + max_aux_bytes); + + size_t t1=0, t2=0, t3=0; + cub::DeviceRadixSort::SortPairs(nullptr, t1, d_keys_in, d_keys_out, d_points_in, d_points_out, total_pts, 0, 8*sizeof(T), stream); + cub::DeviceRadixSort::SortPairs(nullptr, t2, d_src_keys_in, d_src_keys_out, d_points_in, d_points_out, total_pts, 0, 8*sizeof(size_t), stream); + cub::DeviceScan::InclusiveSum(nullptr, t3, break_points, group_ids, total_pts, stream); + + //Temporary storage instance shared by CUB operations. 
+ size_t temp_bytes = std::max({t1, t2, t3}); + void *temp_storage = my_arena.alloc(temp_bytes); + + + //Sort along each dimension from LSB to MSB (0 to N-1) + size_t use_bytes = temp_bytes; + + { + NVTX_DEPPART(sort_valid_points); + for (int dim = 0; dim < N; ++dim) { + build_coord_key<<>>(d_keys_in, d_points_in, total_pts, dim); + KERNEL_CHECK(stream); + cub::DeviceRadixSort::SortPairs(temp_storage, use_bytes, d_keys_in, d_keys_out, d_points_in, d_points_out, total_pts, 0, 8*sizeof(T), stream); + std::swap(d_keys_in, d_keys_out); + std::swap(d_points_in, d_points_out); + } + + //Sort by source index now to keep individual partitions separate + build_src_key<<>>(d_src_keys_in, d_points_in, total_pts); + KERNEL_CHECK(stream); + use_bytes = temp_bytes; + cub::DeviceRadixSort::SortPairs(temp_storage, use_bytes, d_src_keys_in, d_src_keys_out, d_points_in, d_points_out, total_pts, 0, 8*sizeof(size_t), stream); + } + + + points_to_rects<<>>(d_points_out, d_rects_in, total_pts); + KERNEL_CHECK(stream); + + size_t num_intermediate = total_pts; + + { + NVTX_DEPPART(run_length_encode); + + for (int dim = N-1; dim >= 0; --dim) { + + // Step 1: Mark rectangle starts + // e.g. [1, 2, 4, 5, 6, 8] -> [1, 0, 1, 0, 0, 1] + mark_breaks_dim<<>>(d_rects_in, break_points, num_intermediate, dim); + KERNEL_CHECK(stream); + + // Step 2: Inclusive scan of break points to get group ids + // e.g. 
[1, 0, 1, 0, 0, 1] -> [1, 1, 2, 2, 2, 3] + use_bytes = temp_bytes; + cub::DeviceScan::InclusiveSum(temp_storage, use_bytes, break_points, group_ids, num_intermediate, stream); + + //Determine new number of intermediate rectangles + size_t last_grp; + CUDA_CHECK(cudaMemcpyAsync(&last_grp, &group_ids[num_intermediate-1], sizeof(size_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + //Step 3: Write output rectangles, where rect starts write lo and rect ends write hi + init_rects_dim<<>>(d_rects_in, break_points, group_ids, d_rects_out, num_intermediate, dim); + KERNEL_CHECK(stream); + + num_intermediate = last_grp; + std::swap(d_rects_in, d_rects_out); + } + my_arena.rollback(prev); + d_out_rects = my_arena.alloc>(num_intermediate); + CUDA_CHECK(cudaMemcpyAsync(d_out_rects, d_rects_in, num_intermediate * sizeof(RectDesc), cudaMemcpyDeviceToDevice, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + } + + if (out_rects==1) { + out_rects = num_intermediate; + } else { + this->send_output(d_rects_in, num_intermediate, my_arena, ctr, getIndex, getMap); + my_arena.rollback(prev); + } + } + + /* + * Input: An array of disjoint rectangles sorted by src idx. + * Output: Fills the sparsity output for each src with a host region instance + * containing the entries/approx entries and calls gpu_finalize on the SparsityMapImpl. 
+ * Approach: Segments the rectangles by their src idx and copies them back to the host, + */ + + template + template + void GPUMicroOp::send_output(RectDesc* d_rects, size_t total_rects, Arena &my_arena, const Container& ctr, IndexFn getIndex, MapFn getMap) + { + NVTX_DEPPART(send_output); + + size_t prev = my_arena.mark(); + + cudaStream_t stream = Cuda::get_task_cuda_stream(); + + std::set output_allocs; + + SparsityMapEntry* final_entries = my_arena.alloc>(total_rects); + Rect* final_rects = my_arena.alloc>(total_rects); + + size_t* d_starts = my_arena.alloc(2 * ctr.size()); + size_t* d_ends = d_starts + ctr.size(); + + CUDA_CHECK(cudaMemsetAsync(d_starts, 0, ctr.size()*sizeof(size_t),stream), stream); + CUDA_CHECK(cudaMemsetAsync(d_ends, 0, ctr.size()*sizeof(size_t),stream), stream); + + + //Convert RectDesc to SparsityMapEntry and determine where each src's rectangles start and end. + build_final_output<<>>(d_rects, final_entries, final_rects, d_starts, d_ends, total_rects); + KERNEL_CHECK(stream); + + + //Copy starts and ends back to host and handle empty partitions + std::vector d_starts_host(ctr.size()), d_ends_host(ctr.size()); + CUDA_CHECK(cudaMemcpyAsync(d_starts_host.data(), d_starts, ctr.size() * sizeof(size_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaMemcpyAsync(d_ends_host.data(), d_ends, ctr.size() * sizeof(size_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + for (size_t i = 1; i < ctr.size(); i++) { + if (d_starts_host[i] < d_ends_host[i-1]) { + d_starts_host[i] = d_ends_host[i-1]; + d_ends_host[i] = d_ends_host[i-1]; + } + } + + if (!this->exclusive) { + for (auto const& elem : ctr) { + size_t idx = getIndex(elem); + auto mapOpj = getMap(elem); + SparsityMapImpl *impl = SparsityMapImpl::lookup(mapOpj); + if (d_ends_host[idx] > d_starts_host[idx]) { + size_t end = d_ends_host[idx]; + size_t start = d_starts_host[idx]; + std::vector> h_rects(end - start); + 
CUDA_CHECK(cudaMemcpyAsync(h_rects.data(), final_rects + start, (end - start) * sizeof(Rect), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + impl->contribute_dense_rect_list(h_rects, true); + } else { + impl->contribute_nothing(); + } + } + } else { + Memory sysmem; + assert(find_memory(sysmem, Memory::SYSTEM_MEM)); + + //Use provided lambdas to iterate over sparsity output container (map or vector) + for (auto const& elem : ctr) { + size_t idx = getIndex(elem); + auto mapOpj = getMap(elem); + SparsityMapImpl *impl = SparsityMapImpl::lookup(mapOpj); + if (d_ends_host[idx] > d_starts_host[idx]) { + size_t end = d_ends_host[idx]; + size_t start = d_starts_host[idx]; + RegionInstance entries = this->realm_malloc((end - start) * sizeof(SparsityMapEntry), sysmem); + SparsityMapEntry *h_entries = reinterpret_cast *>(AffineAccessor(entries, 0).base); + CUDA_CHECK(cudaMemcpyAsync(h_entries, final_entries + start, (end - start) * sizeof(SparsityMapEntry), cudaMemcpyDeviceToHost, stream), stream); + + Rect *approx_rects; + size_t num_approx; + if (end - start <= ((size_t) DeppartConfig::cfg_max_rects_in_approximation)) { + approx_rects = final_rects + start; + num_approx = end - start; + } else { + //TODO: Maybe add a better GPU approx here when given more rectangles + //Use CUB to compute a bad approx on the GPU (union of all rectangles) + approx_rects = my_arena.alloc>(1); + num_approx = 1; + void* d_temp = nullptr; + size_t temp_sz = 0; + Rect identity_rect; + for(int d=0; d::max(); + identity_rect.hi[d] = std::numeric_limits::min(); + } + cub::DeviceReduce::Reduce( + d_temp, temp_sz, + final_rects + start, + approx_rects, + (end - start), + UnionRectOp(), + identity_rect, + stream + ); + d_temp = reinterpret_cast(my_arena.alloc(temp_sz)); + cub::DeviceReduce::Reduce( + d_temp, temp_sz, + final_rects + start, + approx_rects, + end - start, + UnionRectOp(), + identity_rect, + stream + ); + 
CUDA_CHECK(cudaStreamSynchronize(stream), stream); + } + RegionInstance approx_entries = this->realm_malloc(num_approx * sizeof(Rect), sysmem); + SparsityMapEntry *h_approx_entries = reinterpret_cast *>(AffineAccessor(approx_entries, 0).base); + CUDA_CHECK(cudaMemcpyAsync(h_approx_entries, approx_rects, num_approx * sizeof(Rect), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + impl->set_instance(entries, end - start); + impl->set_approx_instance(approx_entries, num_approx); + } + } + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + for (auto const& elem : ctr) { + SparsityMapImpl *impl = SparsityMapImpl::lookup(getMap(elem)); + impl->gpu_finalize(); + } + } + my_arena.rollback(prev); + } + + +} \ No newline at end of file diff --git a/src/realm/deppart/partitions_gpu_kernels.hpp b/src/realm/deppart/partitions_gpu_kernels.hpp new file mode 100644 index 0000000000..f3c1dd514e --- /dev/null +++ b/src/realm/deppart/partitions_gpu_kernels.hpp @@ -0,0 +1,811 @@ +#pragma once +#include "realm/deppart/partitions.h" + +namespace Realm { + +template +__device__ __forceinline__ size_t bsearch(const T* arr, size_t len, T val) { + size_t low = 0, high = len; + while (low < high) { + size_t mid = low + ((high - low) >> 1); + if (arr[mid + 1] <= val) + low = mid + 1; + else + high = mid; + } + return low; +} + +template +__global__ void subtract_const( + T* d_data, + size_t num_elems, + T value +) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= num_elems) return; + d_data[idx] = d_data[idx] <= value ? 0 : d_data[idx] - value; +} + +// Intersect all instance rectangles with all parent rectangles in parallel. +// Used for both count and emit depending on whether the output array is null. 
+ +template +__global__ void intersect_input_rects( + const SparsityMapEntry* d_lhs_entries, + const SparsityMapEntry* d_rhs_entries, + const size_t *d_lhs_offsets, + const uint32_t *d_lhs_prefix, + const size_t* d_rhs_offsets, + size_t numLHSRects, + size_t numRHSRects, + size_t numLHSChildren, + size_t numRHSChildren, + uint32_t *d_lhs_counters, + out_t* d_rects +) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= numLHSRects * numRHSRects) return; + size_t idx_x = idx % numRHSRects; + size_t idx_y = idx / numRHSRects; + assert(idx_x < numRHSRects); + assert(idx_y < numLHSRects); + const SparsityMapEntry rhs_entry = d_rhs_entries[idx_x]; + const SparsityMapEntry lhs_entry = d_lhs_entries[idx_y]; + Rect rect_output = lhs_entry.bounds.intersection(rhs_entry.bounds); + if (rect_output.empty()) { + return; + } + size_t lhs_idx = bsearch(d_lhs_offsets, numLHSChildren, idx_y); + uint32_t local = atomicAdd(&d_lhs_counters[lhs_idx], 1); + if (d_rects != nullptr) { + // If d_rects is not null, we write the output rect + uint32_t out_idx = d_lhs_prefix[lhs_idx] + local; + if constexpr (std::is_same_v>) { + d_rects[out_idx].src_idx = bsearch(d_rhs_offsets, numRHSChildren, idx_x); + d_rects[out_idx].rect = rect_output; + } else { + d_rects[out_idx] = rect_output; + } + } +} + +template +__device__ __forceinline__ uint64_t bvh_morton_code(const Rect& rect, + const Rect& globalBounds) { + // bits per axis (floor) + constexpr int bits = 64 / N; + constexpr uint64_t maxQ = (bits == 64 ? 
~0ULL + : (1ULL << bits) - 1); + + uint64_t coords[N]; +#pragma unroll + for(int d = 0; d < N; ++d) { + // 1) compute centroid in dimension d + float center = 0.5f * (float(rect.lo[d]) + float(rect.hi[d]) + 1.0f); + + // 2) normalize into [0,1] using globalBounds + float span = float(globalBounds.hi[d] + 1 - globalBounds.lo[d]); + float norm = (center - float(globalBounds.lo[d])) / span; + + // 3) quantize to [0 … maxQ] + uint64_t q = uint64_t(norm * float(maxQ) + 0.5f); + coords[d] = (q > maxQ ? maxQ : q); + } + + // 4) interleave bits MSB→LSB across all dims + uint64_t code = 0; + for(int b = bits - 1; b >= 0; --b) { +#pragma unroll + for(int d = 0; d < N; ++d) { + code = (code << 1) | ((coords[d] >> b) & 1ULL); + } + } + + return code; +} + +template +__global__ void bvh_build_morton_codes( + const SparsityMapEntry* d_targets_entries, + const size_t* d_offsets_rects, + const Rect* d_global_bounds, + size_t total_rects, + size_t num_targets, + uint64_t* d_morton_codes, + uint64_t* d_indices, + uint64_t* d_targets_indices) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= total_rects) return; + const auto &entry = d_targets_entries[idx]; + d_morton_codes[idx] = bvh_morton_code(entry.bounds, *d_global_bounds); + d_indices[idx] = idx; + if (d_offsets_rects != nullptr) { + d_targets_indices[idx] = bsearch(d_offsets_rects, num_targets, idx); + } +} + + __global__ +void bvh_build_radix_tree_kernel( + const uint64_t *morton, // [n] + const uint64_t *leafIdx, // [n] (unused here but kept for symmetry) + int n, + int *childLeft, // [2n−1] + int *childRight, // [2n−1] + int *parent); // [2n−1], pre‐initialized to −1 + +__global__ +void bvh_build_root_kernel( + int *root, + int *parent, + size_t total_rects); + +template +__global__ +void bvh_init_leaf_boxes_kernel( + const SparsityMapEntry *rects, // [G] all flattened Rects + const uint64_t *leafIdx, // [n] maps leaf→orig Rect index + size_t total_rects, + Rect *boxes) // [(2n−1)] +{ + int k = 
blockIdx.x*blockDim.x + threadIdx.x; + if (k >= total_rects) return; + + size_t orig = leafIdx[k]; + boxes[k + total_rects - 1] = rects[orig].bounds; +} + +template +__global__ +void bvh_merge_internal_boxes_kernel( + size_t total_rects, + const int *childLeft, // [(2n−1)] + const int *childRight, // [(2n−1)] + const int *parent, // [(2n−1)] + Rect *boxes, // [(2n−1)×N] + int *visitCount) // [(2n−1)] initialized to zero +{ + int leaf = blockIdx.x*blockDim.x + threadIdx.x; + if (leaf >= total_rects) return; + + int cur = leaf + total_rects - 1; + int p = parent[cur]; + + while(p >= 0) { + // increment visit count; the second arrival merges + int prev = atomicAdd(&visitCount[p], 1); + if (prev == 1) { + // both children ready, do the merge + int c0 = childLeft[p], c1 = childRight[p]; + boxes[p] = boxes[c0].union_bbox(boxes[c1]); + // climb + cur = p; + p = parent[cur]; + } else { + // first child arrived, wait for sibling + break; + } + } +} + +template +__global__ +void query_input_bvh( + SparsityMapEntry* queries, + size_t* d_query_offsets, + int root, + int *childLeft, + int *childRight, + uint64_t *indices, + uint64_t *labels, + Rect *boxes, + size_t numQueries, + size_t numBoxes, + size_t numLHSChildren, + uint32_t* d_inst_prefix, + uint32_t* d_inst_counters, + out_t *d_rects +) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= numQueries) return; + Rect in_rect = queries[idx].bounds; + size_t lhs_idx = bsearch(d_query_offsets, numLHSChildren, idx); + + constexpr int MAX_STACK = 64; // max stack size for BVH traversal + int stack[MAX_STACK]; + int sp = 0; + + // start at the root + stack[sp++] = -1; + int node = root; + do + { + + int left = childLeft[node]; + int right = childRight[node]; + + bool overlapL = boxes[left].overlaps(in_rect); + bool overlapR = boxes[right].overlaps(in_rect); + + if (overlapL && left >= numBoxes - 1) { + uint64_t rect_idx = indices[left - (numBoxes - 1)]; + uint32_t local = atomicAdd(&d_inst_counters[lhs_idx], 1); 
+ if (d_rects != nullptr) { + uint32_t out_idx = d_inst_prefix[lhs_idx] + local; + Rect out_rect = boxes[left].intersection(in_rect); + if constexpr (std::is_same_v>) { + d_rects[out_idx].rect = out_rect; + d_rects[out_idx].src_idx = labels[rect_idx]; + } else { + d_rects[out_idx] = out_rect; + } + } + } + if (overlapR && right >= numBoxes - 1) { + uint64_t rect_idx = indices[right - (numBoxes - 1)]; + uint32_t local = atomicAdd(&d_inst_counters[lhs_idx], 1); + if (d_rects != nullptr) { + uint32_t out_idx = d_inst_prefix[lhs_idx] + local; + Rect out_rect = boxes[right].intersection(in_rect); + if constexpr (std::is_same_v>) { + d_rects[out_idx].rect = out_rect; + d_rects[out_idx].src_idx = labels[rect_idx]; + } else { + d_rects[out_idx] = out_rect; + } + } + } + + bool traverseL = overlapL && left < numBoxes - 1; + bool traverseR = overlapR && right < numBoxes - 1; + + if (!traverseL && !traverseR) { + node = stack[--sp]; + } else { + node = (traverseL ? left : right); + if (traverseL && traverseR) { + stack[sp++] = right; + } + } + } while (node != -1); +} + +template +struct CornerDesc { + uint32_t src_idx; + T coord[N]; + int32_t delta; + + // Equality for ReduceByKey: compare key fields only (src_idx, coords) + __host__ __device__ __forceinline__ + bool operator==(const CornerDesc& rhs) const { + if (src_idx != rhs.src_idx) return false; + for (int d = 0; d < N; ++d) + if (coord[d] != rhs.coord[d]) return false; + return true; + } +}; + +template +__global__ void mark_endpoints(const RectDesc* d_rects, + size_t M, + int dim, + uint32_t* d_src_keys, + T* d_crd_keys) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if(i >= M) return; + d_src_keys[2*i] = d_rects[i].src_idx; + d_src_keys[2*i+1] = d_rects[i].src_idx; + d_crd_keys[2*i] = d_rects[i].rect.lo[dim]; + d_crd_keys[2*i+1] = d_rects[i].rect.hi[dim] + 1; +} + +template +__global__ void mark_heads(const uint32_t* d_src_keys, + const T* d_crd_keys, + size_t M, + uint8_t* d_heads) { + size_t i = 
blockIdx.x * blockDim.x + threadIdx.x; + if(i >= M) return; + if (i==0) d_heads[0] = 1; + else { + d_heads[i] = d_src_keys[i] != d_src_keys[i-1] || d_crd_keys[i] != d_crd_keys[i-1]; + } +} + +template +__global__ void seg_boundaries(const uint8_t* d_flags, + const T* d_exc_sum, + size_t M, + size_t *d_starts, + size_t *d_ends) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if(i >= M) return; + if (d_flags[i]) { + d_starts[d_exc_sum[i]-1] = i; + } + if (i== M-1 || d_flags[i+1]) { + d_ends[d_exc_sum[i]-1] = i + 1; + } +} + +template +__global__ void scatter_unique(const uint32_t* d_src_keys, + const T* d_crd_keys, + const size_t* d_output, + const uint8_t* d_heads, + size_t M, + size_t *d_starts, + size_t *d_ends, + T* d_boundaries) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if(i >= M) return; + size_t u = d_output[i] - (d_heads[i] ? 0 : 1); + d_boundaries[u] = d_crd_keys[i]; + if (i == 0 || d_src_keys[i] != d_src_keys[i-1]) { + d_starts[d_src_keys[i]] = u; + } + if (i== M-1 || d_src_keys[i] != d_src_keys[i+1]) { + d_ends[d_src_keys[i]] = u + 1; + } +} + +template +__global__ void mark_deltas_heads(const CornerDesc* d_corners, + size_t M, + int dim, + uint8_t* d_heads, + DeltaFlag* d_deltas) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if(i >= M) return; + uint8_t head = 1; + if (i>0) { + head = 0; + for (int j = 0; j < N; j++) { + if (j== dim) continue; + if (d_corners[i].coord[j] != d_corners[i-1].coord[j]) { + head = 1; + break; + } + } + head = head || d_corners[i].src_idx != d_corners[i-1].src_idx; + } + d_heads[i] = head; + d_deltas[i].delta = d_corners[i].delta; + d_deltas[i].head = head; +} + +// For each segment and each boundary, determine whether to emit a new subsegment +template +__global__ void count_segments(const DeltaFlag* d_delta_flags, + const size_t *d_segment_starts, + const size_t *d_segment_ends, + const size_t *d_boundary_starts, + const size_t *d_boundary_ends, + const CornerDesc* d_corners, + const T* 
d_boundaries, + size_t num_boundaries, + size_t num_segments, + int dim, + uint32_t *seg_counters) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if(i >= num_segments * num_boundaries) return; + size_t bnd_idx = i % num_boundaries; + size_t seg_idx = i / num_boundaries; + int my_src = d_corners[d_segment_starts[seg_idx]].src_idx; + + //No boundaries for this src + if (d_boundary_starts[my_src]>= d_boundary_ends[my_src]) return; + + //This boundary is not a subsegment start for this segment's src + if (bnd_idx < d_boundary_starts[my_src] || bnd_idx >= d_boundary_ends[my_src]-1) return; + + //Binary search the segment to find the first subsegment whose start is > boundary + size_t low = d_segment_starts[seg_idx]; + size_t high = d_segment_ends[seg_idx]; + while (low < high) { + int mid = (low + high) / 2; + if (d_corners[mid].coord[dim] <= d_boundaries[bnd_idx]) { + low = mid + 1; + } else { + high = mid; + } + } + + //The prefix sum for this boundary within this segment is the delta of the corner just before it (if any) + int my_delta = (low == d_segment_starts[seg_idx] ? 0 : d_delta_flags[low-1].delta); + + //We emit if it's non-zero, and strengthen the requirement to > 0 for dim 0. 
+ if (my_delta != 0 && (dim !=0 || my_delta > 0)) { + atomicAdd(&seg_counters[seg_idx], 1); + } +} + +//Do the same computation as above, but this time emit the actual subsegment +template +__global__ void write_segments(const DeltaFlag* d_delta_flags, + const size_t *d_segment_starts, + const size_t *d_segment_ends, + const size_t *d_boundary_starts, + const size_t *d_boundary_ends, + const CornerDesc* d_corners, + const T* d_boundaries, + const uint32_t *seg_offsets, + size_t num_boundaries, + size_t num_segments, + int dim, + uint32_t *seg_counters, + CornerDesc* d_out_corners) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if(i >= num_segments * num_boundaries) return; + size_t bnd_idx = i % num_boundaries; + size_t seg_idx = i / num_boundaries; + int my_src = d_corners[d_segment_starts[seg_idx]].src_idx; + if (d_boundary_starts[my_src]>= d_boundary_ends[my_src]) return; + if (bnd_idx < d_boundary_starts[my_src] || bnd_idx >= d_boundary_ends[my_src]-1) return; + size_t low = d_segment_starts[seg_idx]; + size_t high = d_segment_ends[seg_idx]; + while (low < high) { + int mid = (low + high) / 2; + if (d_corners[mid].coord[dim] <= d_boundaries[bnd_idx]) { + low = mid + 1; + } else { + high = mid; + } + } + int my_delta = (low == d_segment_starts[seg_idx] ? 
0 : d_delta_flags[low-1].delta); + + //To emit, we keep everything the same except the current dim - set that to the boundary value + if (my_delta != 0 && (dim !=0 || my_delta > 0)) { + uint32_t my_idx = seg_offsets[seg_idx] + atomicAdd(&seg_counters[seg_idx], 1); + CornerDesc my_corner = d_corners[low-1]; + my_corner.coord[dim] = d_boundaries[bnd_idx]; + my_corner.delta = my_delta; + d_out_corners[my_idx] = my_corner; + } +} + +//Again, do the same computation as above, but this time emit the actual rectangle +template +__global__ void write_segments(const DeltaFlag* d_delta_flags, + const size_t *d_segment_starts, + const size_t *d_segment_ends, + size_t **d_boundary_starts, + size_t **d_boundary_ends, + const CornerDesc* d_corners, + T** d_boundaries, + const uint32_t *seg_offsets, + size_t num_boundaries, + size_t num_segments, + uint32_t *seg_counters, + RectDesc* d_out_rects) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if(i >= num_segments * num_boundaries) return; + size_t bnd_idx = i % num_boundaries; + size_t seg_idx = i / num_boundaries; + int my_src = d_corners[d_segment_starts[seg_idx]].src_idx; + if (d_boundary_starts[0][my_src]>= d_boundary_ends[0][my_src]) return; + if (bnd_idx < d_boundary_starts[0][my_src] || bnd_idx >= d_boundary_ends[0][my_src]-1) return; + + size_t low = d_segment_starts[seg_idx]; + size_t high = d_segment_ends[seg_idx]; + while (low < high) { + int mid = (low + high) / 2; + if (d_corners[mid].coord[0] <= d_boundaries[0][bnd_idx]) { + low = mid + 1; + } else { + high = mid; + } + } + int my_delta = (low == d_segment_starts[seg_idx] ? 
0 : d_delta_flags[low-1].delta); + if (my_delta==0) return; + int my_corner_idx = low - 1; + uint32_t my_idx = seg_offsets[seg_idx] + atomicAdd(&seg_counters[seg_idx], 1); + RectDesc my_output; + my_output.src_idx = my_src; + my_output.rect.lo[0] = d_boundaries[0][bnd_idx]; + + //Remember we marked each boundary as hi+1, so need to revert + my_output.rect.hi[0] = d_boundaries[0][bnd_idx+1] - 1; + + //For every other dimension, map segment -> rect by finding the two boundaries that surround the segment's corner + for (int d = 1; d < N; d++) { + low = d_boundary_starts[d][my_src]; + high = d_boundary_ends[d][my_src]; + while (low < high) { + int mid = (low + high) / 2; + if (d_boundaries[d][mid] <= d_corners[my_corner_idx].coord[d]) { + low = mid + 1; + } else { + high = mid; + } + } + my_output.rect.lo[d] = d_boundaries[d][low-1]; + my_output.rect.hi[d] = d_boundaries[d][low] - 1; + } + d_out_rects[my_idx] = my_output; +} + + template + __global__ void populate_corners(const RectDesc* __restrict__ d_rects, + size_t M, + CornerDesc* __restrict__ d_corners) +{ + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= M) return; + + const auto& r = d_rects[i]; // assumes r.rect.lo[d], r.rect.hi[d], r.src_idx + const uint32_t src = r.src_idx; + + const size_t corners_per_rect = size_t(1) << N; + const size_t base = i * corners_per_rect; + + // emit 2^N corners. Each 1 in the mask -> use hi[d]+1, each 0 -> use lo[d] + for (unsigned mask = 0; mask < corners_per_rect; ++mask) { + CornerDesc c; + c.src_idx = src; + // sign = +1 for even popcount(mask), -1 for odd + c.delta = (__popc(mask) & 1) ? -1 : +1; + + #pragma unroll + for (int d = 0; d < N; ++d) { + const T lo = r.rect.lo[d]; + const T hip1 = r.rect.hi[d] + T(1); // half-open (hi+1) + c.coord[d] = ( (mask & (1u << d)) ? 
hip1 : lo ); + } + + d_corners[base + mask] = c; + } +} + + +template +__global__ void build_coord_key(T* d_keys, + const PointDesc* d_pts, + size_t M, + int dim) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if(i < M) d_keys[i] = d_pts[i].point[dim]; +} + + +template +__global__ void build_coord_key(T* d_keys, + const CornerDesc* d_corners, + size_t M, + int dim) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if(i < M) d_keys[i] = d_corners[i].coord[dim]; +} + +template +__global__ void get_delta(int32_t* d_deltas, + const CornerDesc* d_corners, + size_t M) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if(i < M) d_deltas[i] = d_corners[i].delta; +} + +template +__global__ void set_delta(const int32_t* d_deltas, + CornerDesc* d_corners, + size_t M) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if(i < M) d_corners[i].delta = d_deltas[i]; +} + + + template +__global__ void build_lo_key(T* d_keys, + const RectDesc* d_rects, + size_t M, + int dim) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if(i < M) d_keys[i] = d_rects[i].rect.lo[dim]; +} + + template +__global__ void build_hi_key(T* d_keys, + const RectDesc* d_rects, + size_t M, + int dim) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if(i < M) d_keys[i] = d_rects[i].rect.hi[dim]; +} + + template +__global__ void build_hi_flag(HiFlag* d_flags, + const RectDesc* d_rects, + size_t M, + int dim) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if(i >= M) return; + d_flags[i].hi = d_rects[i].rect.hi[dim]; + d_flags[i].head = i==0 || d_rects[i].src_idx != d_rects[i-1].src_idx; +} + + template +__global__ void build_src_key(size_t* d_keys, + const RectDesc* d_rects, + size_t M) { + size_t i = blockIdx.x*blockDim.x + threadIdx.x; + if(i < M) d_keys[i] = d_rects[i].src_idx; +} + + template +__global__ void build_src_key(size_t* d_keys, + const CornerDesc *d_corners, + size_t M) { + size_t i = blockIdx.x*blockDim.x + threadIdx.x; + if(i < M) d_keys[i] = 
d_corners[i].src_idx; +} + +template +__global__ void build_src_key(size_t* d_keys, + const PointDesc* d_pts, + size_t M) { + size_t i = blockIdx.x*blockDim.x + threadIdx.x; + if(i < M) d_keys[i] = d_pts[i].src_idx; +} + + +template +__global__ +void points_to_rects(const PointDesc* pts, + RectDesc* rects, + size_t M) +{ + size_t i = blockIdx.x*blockDim.x + threadIdx.x; + if(i >= M) return; + rects[i].src_idx = pts[i].src_idx; + rects[i].rect.lo = pts[i].point; + rects[i].rect.hi = pts[i].point; +} + +// 1) mark breaks on RectDesc array at pass d +// Starts a new rectangle if src or lo/hi in any dimension but d doesn't match, +// or if dim d doesn't match or advance by +1 +//NOTE: ONLY WORKS IF WE STARTED WITH DISJOINT RECTANGLES +template +__global__ +void mark_breaks_dim(const RectDesc* in, + uint8_t* brk, + size_t M, + int d) +{ + size_t i = blockIdx.x*blockDim.x + threadIdx.x; + if(i >= M) return; + if(i == 0) { brk[0] = 1; return; } + + const auto &p = in[i].rect, &q = in[i-1].rect; + bool split = (in[i].src_idx != in[i-1].src_idx); + + // more‐significant dims 0..d-1 must match [lo,hi] + #pragma unroll + for(int k = 0; k < d && !split; ++k) + if(p.lo[k] != q.lo[k] || p.hi[k] != q.hi[k]) split = true; + + // already‐processed dims d+1..N-1 must match [lo,hi] + #pragma unroll + for(int k = d+1; k < N && !split; ++k) + if((p.lo[k] != q.lo[k]) || (p.hi[k] != q.hi[k])) + split = true; + + // current dim d must equal or advance by +1 in lo + if(!split && (p.lo[d] != (q.hi[d] + 1)) && (p.lo[d] != q.lo[d])) + split = true; + + brk[i] = split ? 
1 : 0; +} + +//1) Mark breaks for 1D rectangle merge - if low > hi + 1, must start new rect + template +__global__ +void mark_breaks_dim(const HiFlag* hi_flag_in, + const HiFlag* hi_flag_out, + const RectDesc* in, + uint8_t* brk, + size_t M, + int d) +{ + size_t i = blockIdx.x*blockDim.x + threadIdx.x; + if(i >= M) return; + brk[i] = hi_flag_in[i].head || in[i].rect.lo[d] > hi_flag_out[i].hi + 1; +} + +// 2) Write output rectangles for ND disjoint rects RLE +// Starts write lo, ends write hi, everyone else no-ops +template +__global__ +void init_rects_dim(const RectDesc* in, + const uint8_t* brk, + const size_t* gid, + RectDesc* out, + size_t M, + int d) +{ + size_t i = blockIdx.x*blockDim.x + threadIdx.x; + if(i >= M) return; + + bool is_end = (i == M-1) || (gid[i+1] != gid[i]); + if (!brk[i] && !is_end) return; + + size_t g = gid[i] - 1; // zero-based rectangle index + const Rect &r = in[i].rect; + out[g].src_idx = in[i].src_idx; + + #pragma unroll + for(int k = 0; k < N; ++k) { + if (brk[i]) { + out[g].rect.lo[k] = r.lo[k]; + } + if (is_end) { + out[g].rect.hi[k] = r.hi[k]; + } + } +} + + // 2) Write output rectangles for 1D rects RLE + // Starts write lo, ends write max(hi, prefix max hi) because the max was exclusive + template + __global__ + void init_rects_dim(const RectDesc* in, + const HiFlag *hi_flag_out, + const uint8_t* brk, + const size_t* gid, + RectDesc* out, + size_t M, + int d) +{ + size_t i = blockIdx.x*blockDim.x + threadIdx.x; + if(i >= M) return; + + bool is_end = (i == M-1) || (gid[i+1] != gid[i]); + if (!brk[i] && !is_end) return; + + size_t g = gid[i] - 1; // zero-based + const auto &r = in[i].rect; + out[g].src_idx = in[i].src_idx; + + // copy dims ≠ d +#pragma unroll + for(int k = 0; k < N; ++k) { + if (brk[i]) { + out[g].rect.lo[k] = r.lo[k]; + } + if (k != d || (brk[i] && is_end)) { + out[g].rect.hi[k] = r.hi[k]; + } else if (is_end) { + out[g].rect.hi[k] = r.hi[k] > hi_flag_out[i].hi ? 
r.hi[k] : hi_flag_out[i].hi; + } + } +} + +//Convert RectDesc to sparsity output and determine [d_start[i], d_end[i]) for each src i +template +__global__ +void build_final_output(const RectDesc* d_rects, + SparsityMapEntry* d_entries_out, + Rect* d_rects_out, + size_t* d_starts, + size_t* d_ends, + size_t numRects) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= numRects) return; + d_rects_out[idx] = d_rects[idx].rect; + d_entries_out[idx].bounds = d_rects[idx].rect; + d_entries_out[idx].sparsity.id = 0; + d_entries_out[idx].bitmap = 0; + + //Checks if we're the first value for a given src + if (idx == 0 || d_rects[idx].src_idx != d_rects[idx-1].src_idx) { + d_starts[d_rects[idx].src_idx] = idx; + } + + //Checks if we're the last value for a given src + if (idx == numRects-1 || d_rects[idx].src_idx != d_rects[idx+1].src_idx) { + d_ends[d_rects[idx].src_idx] = idx+1; + } +} + +} // namespace Realm \ No newline at end of file diff --git a/src/realm/deppart/setops.cc b/src/realm/deppart/setops.cc index 2ab367f13a..d8cdbc902d 100644 --- a/src/realm/deppart/setops.cc +++ b/src/realm/deppart/setops.cc @@ -1073,15 +1073,14 @@ namespace Realm { bitmask.add_rect(it->bounds); } else { SparsityMapImpl *impl = SparsityMapImpl::lookup(it->sparsity); - const std::vector >& entries = impl->get_entries(); - for(typename std::vector >::const_iterator it2 = entries.begin(); - it2 != entries.end(); - it2++) { - Rect isect = it->bounds.intersection(it2->bounds); + span> entries = impl->get_entries(); + for(size_t i = 0; i < entries.size(); i++) { + SparsityMapEntry entry = entries[i]; + Rect isect = it->bounds.intersection(entry.bounds); if(isect.empty()) continue; - assert(!it2->sparsity.exists()); - assert(it2->bitmap == 0); + assert(!entry.sparsity.exists()); + assert(entry.bitmap == 0); bitmask.add_rect(isect); } } @@ -1440,15 +1439,14 @@ namespace Realm { todo.push_back(lhs.bounds); } else { SparsityMapImpl *l_impl = SparsityMapImpl::lookup(lhs.sparsity); - 
const std::vector >& entries = l_impl->get_entries(); - for(typename std::vector >::const_iterator it = entries.begin(); - it != entries.end(); - it++) { - Rect isect = lhs.bounds.intersection(it->bounds); + span> entries = l_impl->get_entries(); + for(size_t i = 0; i < entries.size(); i++) { + SparsityMapEntry entry = entries[i]; + Rect isect = lhs.bounds.intersection(entry.bounds); if(isect.empty()) continue; - assert(!it->sparsity.exists()); - assert(it->bitmap == 0); + assert(!entry.sparsity.exists()); + assert(entry.bitmap == 0); todo.push_back(isect); } } diff --git a/src/realm/deppart/sparsity_impl.cc b/src/realm/deppart/sparsity_impl.cc index e1cf66c2c9..c674a98b32 100644 --- a/src/realm/deppart/sparsity_impl.cc +++ b/src/realm/deppart/sparsity_impl.cc @@ -353,6 +353,7 @@ namespace Realm { if(map_impl.compare_exchange(impl, new_impl)) { map_deleter = [](void *map_impl) { + delete static_cast *>(map_impl); }; return new_impl; @@ -416,36 +417,30 @@ namespace Realm { // full cross-product test for now - for larger rectangle lists, consider // an acceleration structure? 
if(approx) { - const std::vector> &rects1 = get_approx_rects(); - const std::vector> &rects2 = other->get_approx_rects(); - for(typename std::vector>::const_iterator it1 = rects1.begin(); - it1 != rects1.end(); it1++) { - Rect isect = it1->intersection(bounds); + span> rects1 = get_approx_rects(); + span> rects2 = other->get_approx_rects(); + for(size_t i = 0; i < rects1.size(); i++) { + Rect isect = rects1[i].intersection(bounds); if(isect.empty()) continue; - for(typename std::vector>::const_iterator it2 = rects2.begin(); - it2 != rects2.end(); it2++) { - if(it2->overlaps(isect)) + for(size_t j = 0; j < rects2.size(); j++) { + if(rects2[j].overlaps(isect)) return true; } } } else { - const std::vector> &entries1 = get_entries(); - const std::vector> &entries2 = other->get_entries(); - for(typename std::vector>::const_iterator it1 = - entries1.begin(); - it1 != entries1.end(); it1++) { - Rect isect = it1->bounds.intersection(bounds); + span> entries1 = get_entries(); + span> entries2 = other->get_entries(); + for(size_t i = 0; i < entries1.size(); i++) { + Rect isect = entries1[i].bounds.intersection(bounds); if(isect.empty()) continue; - for(typename std::vector>::const_iterator it2 = - entries2.begin(); - it2 != entries2.end(); it2++) { - if(!it2->bounds.overlaps(isect)) + for(size_t j = 0; j < entries2.size(); j++) { + if(!entries2[j].bounds.overlaps(isect)) continue; // TODO: handle further sparsity in either side - assert(!it1->sparsity.exists() && (it1->bitmap == 0) && - !it2->sparsity.exists() && (it2->bitmap == 0)); + assert(!entries1[i].sparsity.exists() && (entries1[i].bitmap == 0) && + !entries2[j].sparsity.exists() && (entries2[j].bitmap == 0)); return true; } } @@ -907,6 +902,18 @@ namespace Realm { , sparsity_comm(_sparsity_comm) {} +template +SparsityMapImpl::~SparsityMapImpl(void) +{ + //We are responsible for our instances + //if (this->entries_instance.exists()) { + // this->entries_instance.destroy(); + //} + //if 
(this->approx_instance.exists()) { + // this->approx_instance.destroy(); + //} +} + template inline /*static*/ SparsityMapImpl * SparsityMapImpl::lookup(SparsityMap sparsity) @@ -1192,8 +1199,7 @@ namespace Realm { old_data.swap(this->entries); size_t i = 0; size_t n = 0; - typename std::vector>::const_iterator old_it = - old_data.begin(); + typename std::vector>::iterator old_it = old_data.begin(); while((i < count) && (old_it != old_data.end())) { if(rects[i].hi[0] < (old_it->bounds.lo[0] - 1)) { this->entries.resize(n + 1); @@ -1494,17 +1500,16 @@ namespace Realm { assert(false); // scan the entry list, sending bitmaps first and making a list of rects std::vector> rects; - for(typename std::vector>::const_iterator it = - this->entries.begin(); - it != this->entries.end(); it++) { - if(it->bitmap) { + for(size_t i = 0; i < this->get_entries().size(); i++) { + const SparsityMapEntry &entry = this->get_entries()[i]; + if(entry.bitmap) { // TODO: send bitmap assert(0); - } else if(it->sparsity.exists()) { + } else if(entry.sparsity.exists()) { // TODO: ? 
assert(0); } else { - rects.push_back(it->bounds); + rects.push_back(entry.bounds); } } @@ -1557,7 +1562,7 @@ namespace Realm { }; template - static void compute_approximation(const std::vector> &entries, + static void compute_approximation(const span> &entries, std::vector> &approx_rects, int max_rects) { size_t n = entries.size(); @@ -1579,7 +1584,7 @@ namespace Realm { } template - static void compute_approximation(const std::vector> &entries, + static void compute_approximation(const span> &entries, std::vector> &approx_rects, int max_rects) { int n = entries.size(); @@ -1693,6 +1698,9 @@ namespace Realm { template void SparsityMapImpl::finalize(void) { + + this->from_gpu = false; + // in order to organize the data a little better and handle common coalescing // cases, we do N sort/merging passes, with each dimension appearing last // in the sort order at least once (so that we can merge in that dimension) @@ -1748,7 +1756,7 @@ namespace Realm { // now that we've got our entries nice and tidy, build a bounded approximation of them if(true /*ID(me).sparsity_creator_node() == Network::my_node_id*/) { assert(!this->approx_valid.load()); - compute_approximation(this->entries, this->approx_rects, + compute_approximation(span>(this->entries.data(), this->entries.size()), this->approx_rects, DeppartConfig::cfg_max_rects_in_approximation); this->approx_valid.store_release(true); } @@ -1830,6 +1838,117 @@ namespace Realm { if(trigger_precise.exists()) GenEventImpl::trigger(trigger_precise, false /*!poisoned*/); + + } + + + //Here, we copy everything the CPU finalize does except manipulating the entries further + //and we indicate that the sparsity map was constructed from the cpu + + template + void SparsityMapImpl::gpu_finalize(void) + { + this->from_gpu = true; + + if(true /*ID(me).sparsity_creator_node() == Network::my_node_id*/) { + assert(!this->approx_valid.load()); + this->approx_valid.store_release(true); + } + + { + LoggerMessage msg = log_part.info(); + 
if(msg.is_active()) { + msg << "finalizing " << me << "(" << this << "), " << this->entries.size() + << " entries"; + for(size_t i = 0; i < this->entries.size(); i++) + msg << "\n [" << i << "]: bounds=" << this->entries[i].bounds + << " sparsity=" << this->entries[i].sparsity + << " bitmap=" << this->entries[i].bitmap; + } + } + +#ifdef DEBUG_PARTITIONING + std::cout << "finalizing " << this << ", " << this->entries.size() << " entries" + << std::endl; + for(size_t i = 0; i < this->entries.size(); i++) + std::cout << " [" << i << "]: bounds=" << this->entries[i].bounds + << " sparsity=" << this->entries[i].sparsity + << " bitmap=" << this->entries[i].bitmap << std::endl; +#endif + NodeSet sendto_precise, sendto_approx; + Event trigger_precise = Event::NO_EVENT; + Event trigger_approx = Event::NO_EVENT; + std::vector precise_waiters_copy, approx_waiters_copy; + { + AutoLock<> al(mutex); + + assert(!this->entries_valid.load()); + this->entries_valid.store_release(true); + + precise_requested = false; + if(precise_ready_event.exists()) { + trigger_precise = precise_ready_event; + precise_ready_event = Event::NO_EVENT; + } + + precise_waiters_copy.swap(precise_waiters); + approx_waiters_copy.swap(approx_waiters); + + remote_precise_waiters.swap(sendto_precise); + remote_approx_waiters.swap(sendto_approx); + } + + for(std::vector::const_iterator it = + precise_waiters_copy.begin(); + it != precise_waiters_copy.end(); it++) + (*it)->sparsity_map_ready(this, true); + + for(std::vector::const_iterator it = + approx_waiters_copy.begin(); + it != approx_waiters_copy.end(); it++) + (*it)->sparsity_map_ready(this, false); + + if(!sendto_approx.empty()) { + for(NodeID i = 0; (i <= Network::max_node_id) && !sendto_approx.empty(); i++) + if(sendto_approx.contains(i)) { + bool also_precise = sendto_precise.contains(i); + if(also_precise) + sendto_precise.remove(i); + remote_data_reply(i, also_precise, true); + sendto_approx.remove(i); + } + } + + if(!sendto_precise.empty()) { + 
for(NodeID i = 0; (i <= Network::max_node_id) && !sendto_precise.empty(); i++) + if(sendto_precise.contains(i)) { + remote_data_reply(i, true, false); + sendto_precise.remove(i); + } + } + + if(trigger_approx.exists()) + GenEventImpl::trigger(trigger_approx, false /*!poisoned*/); + + if(trigger_precise.exists()) + GenEventImpl::trigger(trigger_precise, false /*!poisoned*/); + } + + + //Allows a GPU deppart client to set the entries directly with a host region instance + template + void SparsityMapImpl::set_instance(RegionInstance _entries_instance, size_t size) + { + this->entries_instance = _entries_instance; + this->num_entries = size; + } + + //Allows a GPU deppart client to set the approx rects directly with a host region instance + template + void SparsityMapImpl::set_approx_instance(RegionInstance _approx_instance, size_t size) + { + this->approx_instance = _approx_instance; + this->num_approx = size; } template diff --git a/src/realm/deppart/sparsity_impl.h b/src/realm/deppart/sparsity_impl.h index 4a3ed14349..2618f4decc 100644 --- a/src/realm/deppart/sparsity_impl.h +++ b/src/realm/deppart/sparsity_impl.h @@ -109,6 +109,8 @@ namespace Realm { SparsityMapImpl(SparsityMap _me, NodeSet &subscribers, SparsityMapCommunicator *_sparsity_comm); + ~SparsityMapImpl(); + // actual implementation - SparsityMapPublicImpl's version just calls this one Event make_valid(bool precise = true); @@ -136,6 +138,10 @@ namespace Realm { void remote_data_request(NodeID requestor, bool send_precise, bool send_approx); void remote_data_reply(NodeID requestor, bool send_precise, bool send_approx); + void set_instance(RegionInstance _entries_instance, size_t size); + void set_approx_instance(RegionInstance _approx_instance, size_t size); + void gpu_finalize(void); + SparsityMap me; struct RemoteSparsityRequest { diff --git a/src/realm/deppart/untemplated_gpu_kernels.cu b/src/realm/deppart/untemplated_gpu_kernels.cu new file mode 100644 index 0000000000..a45e8f8962 --- /dev/null +++ 
b/src/realm/deppart/untemplated_gpu_kernels.cu @@ -0,0 +1,119 @@ +#include "realm/deppart/partitions.h" + +namespace Realm { + +__device__ __forceinline__ +int bvh_common_prefix(const uint64_t *morton, const uint64_t *leafIdx, int i, int j, int n) { + if (j < 0 || j >= n) return -1; + uint64_t x = morton[i] ^ morton[j]; + uint64_t y = leafIdx[i] ^ leafIdx[j]; + if (x == 0) { + return 64 + __clzll(y); + } + return __clzll(x); +} + +__global__ +void bvh_build_radix_tree_kernel( + const uint64_t *morton, // [n] + const uint64_t *leafIdx, // [n] (unused here but kept for symmetry) + int n, + int *childLeft, // [2n−1] + int *childRight, // [2n−1] + int *parent) // [2n−1], pre‐initialized to −1 +{ + int idx = blockIdx.x*blockDim.x + threadIdx.x; + int i = idx; + if (i >= n-1) return; // we only build n−1 internal nodes + + int left, right; + int dL = bvh_common_prefix(morton, leafIdx, i, i-1, n); + int dR = bvh_common_prefix(morton, leafIdx, i, i+1, n); + int d = (dR > dL ? +1 : -1); + int deltaMin = (dR > dL ? 
dL : dR); + + // 3) find j by exponential + binary search + int l_max = 2; + int delta = -1; + int i_tmp = i + d * l_max; + if (0 <= i_tmp && i_tmp < n) { + delta = bvh_common_prefix(morton, leafIdx, i, i_tmp, n); + } + while (delta > deltaMin) { + l_max <<= 1; + i_tmp = i + d * l_max; + delta = -1; + if (0 <= i_tmp && i_tmp < n) { + delta = bvh_common_prefix(morton, leafIdx, i, i_tmp, n); + } + } + int l = 0; + int t = (l_max) >> 1; + while (t > 0) { + i_tmp = i + d*(l + t); + delta = -1; + if (0 <= i_tmp && i_tmp < n) { + delta = bvh_common_prefix(morton, leafIdx, i, i_tmp, n); + } + if (delta > deltaMin) { + l += t; + } + t >>= 1; + } + if (d < 0) { + right = i; + left = i + d*l; + } else { + left = i; + right = i + d*l; + } + + int gamma; + if (morton[left] == morton[right] && leafIdx[left] == leafIdx[right]) { + gamma = (left+right) >> 1; + } else { + int deltaNode = bvh_common_prefix(morton, leafIdx, left, right, n); + int split = left; + int stride = right - left; + do { + stride = (stride + 1) >> 1; + int middle = split + stride; + if (middle < right) { + int delta = bvh_common_prefix(morton, leafIdx, left, middle, n); + if (delta > deltaNode) { + split = middle; + } + } + } while (stride > 1); + gamma = split; + } + + int left_node = gamma; + int right_node = gamma + 1; + if (left == gamma) { + left_node += n-1; + } + if (right == gamma + 1) { + right_node += n-1; + } + + childLeft [idx] = left_node; + childRight[idx] = right_node; + parent[left_node] = idx; + parent[right_node] = idx; +} + +__global__ +void bvh_build_root_kernel( + int *root, + int *parent, + size_t total_rects) { + + int tid = blockIdx.x*blockDim.x + threadIdx.x; + if (tid >= 2 * total_rects - 1) return; + if (parent[tid] == -1) { + *root = tid; + } +} + +} \ No newline at end of file diff --git a/src/realm/indexspace.h b/src/realm/indexspace.h index 842213c467..b61a77d689 100644 --- a/src/realm/indexspace.h +++ b/src/realm/indexspace.h @@ -29,6 +29,7 @@ #include "realm/realm_c.h" 
#include "realm/realm_config.h" +#include "realm/realm_assert.h" #include "realm/sparsity.h" #include "realm/dynamic_templates.h" @@ -782,6 +783,17 @@ namespace Realm { Event wait_on = Event::NO_EVENT) const; ///@} + ///@{ + /// + + template + REALM_PUBLIC_API Event gpu_subspaces_by_image( + const DomainTransform &domain_transform, + const std::vector> &sources, + std::vector> &images, const ProfilingRequestSet &reqs, + std::pair &sizes, RegionInstance buffer = RegionInstance::NO_INST, Event wait_on = Event::NO_EVENT) const; + ///@} + ///@{ /** * Computes subspaces of this index space by determining what subsets are @@ -813,6 +825,14 @@ namespace Realm { std::vector> &images, const ProfilingRequestSet &reqs, Event wait_on = Event::NO_EVENT) const; + template + REALM_PUBLIC_API Event gpu_subspaces_by_image( + const std::vector, Point>> + &field_data, + const std::vector> &sources, + std::vector> &images, const ProfilingRequestSet &reqs, + std::pair &sizes, RegionInstance buffer = RegionInstance::NO_INST, Event wait_on = Event::NO_EVENT) const; + // range versions template REALM_PUBLIC_API Event create_subspace_by_image( diff --git a/src/realm/indexspace.inl b/src/realm/indexspace.inl index cb0a83e6cb..c633aa5e46 100644 --- a/src/realm/indexspace.inl +++ b/src/realm/indexspace.inl @@ -488,13 +488,12 @@ namespace Realm { SparsityMapPublicImpl *impl = sparsity.impl(); // if we don't have the data, it's too late - somebody should have waited - // we should have the metadata valid REALM_ASSERT(impl->is_valid(precise)); // always use precise info if it's available if(impl->is_valid(true /*precise*/)) { IndexSpace result; - const std::vector> &entries = impl->get_entries(); + span> entries = impl->get_entries(); // three cases: // 1) empty index space if(entries.empty()) { @@ -534,7 +533,7 @@ namespace Realm { log_dpops.info() << "tighten: " << *this << " = " << result; return result; } else { - const std::vector> &approx_rects = impl->get_approx_rects(); + span> 
approx_rects = impl->get_approx_rects(); // two cases: // 1) empty index space @@ -561,7 +560,7 @@ namespace Realm { // the index of the entry that contains the point, or the first one to appear after // that point template - static size_t bsearch_map_entries(const std::vector> &entries, + static size_t bsearch_map_entries(const span> &entries, const Point &p) { assert(N == 1); @@ -592,41 +591,40 @@ namespace Realm { if(dense()) return true; - SparsityMapPublicImpl *impl = sparsity.impl(); - const std::vector> &entries = impl->get_entries(); + SparsityMapPublicImpl *impl = sparsity.impl(); + span> entries = impl->get_entries(); if(N == 1) { // binary search to find the element we want - size_t idx = bsearch_map_entries(entries, p); - if(idx >= entries.size()) - return false; + size_t idx = bsearch_map_entries(entries, p); + if(idx >= entries.size()) return false; - const SparsityMapEntry &e = entries[idx]; + const SparsityMapEntry& e = entries[idx]; // the search guaranteed we're below the upper bound of the returned entry, // but we might be below the lower bound if(p[0] < e.bounds.lo[0]) - return false; + return false; if(e.sparsity.exists()) { - assert(0); + assert(0); } if(e.bitmap != 0) { - assert(0); + assert(0); } return true; } else { - for(typename std::vector>::const_iterator it = - entries.begin(); - it != entries.end(); it++) { - if(!it->bounds.contains(p)) - continue; - if(it->sparsity.exists()) { - assert(0); - } else if(it->bitmap != 0) { - assert(0); - } else { - return true; - } + for(size_t i = 0; i < entries.size(); i++) { + SparsityMapEntry entry = entries[i]; + if(!entry.bounds.contains(p)) { + continue; + } + if(entry.sparsity.exists()) { + assert(0); + } else if(entry.bitmap != 0) { + assert(0); + } else { + return true; + } } } @@ -644,21 +642,19 @@ namespace Realm { if(!dense()) { // test against sparsity map too size_t total_volume = 0; - SparsityMapPublicImpl *impl = sparsity.impl(); - const std::vector> &entries = impl->get_entries(); - 
for(typename std::vector>::const_iterator it = - entries.begin(); - it != entries.end(); it++) { - if(!it->bounds.overlaps(r)) - continue; - if(it->sparsity.exists()) { - assert(0); - } else if(it->bitmap != 0) { - assert(0); - } else { - Rect isect = it->bounds.intersection(r); + SparsityMapPublicImpl *impl = sparsity.impl(); + span> entries = impl->get_entries(); + for(size_t i = 0; i < entries.size(); i++) { + SparsityMapEntry entry = entries[i]; + if(!entry.bounds.overlaps(r)) continue; + if(entry.sparsity.exists()) { + assert(0); + } else if(entry.bitmap != 0) { + assert(0); + } else { + Rect isect = entry.bounds.intersection(r); total_volume += isect.volume(); - } + } } // did we miss anything? @@ -678,22 +674,20 @@ namespace Realm { if(!dense()) { // test against sparsity map too - SparsityMapPublicImpl *impl = sparsity.impl(); - const std::vector> &entries = impl->get_entries(); - for(typename std::vector>::const_iterator it = - entries.begin(); - it != entries.end(); it++) { - if(!it->bounds.overlaps(r)) - continue; - if(it->sparsity.exists()) { - assert(0); - } else if(it->bitmap != 0) { - assert(0); - } else { - return true; - } + SparsityMapPublicImpl *impl = sparsity.impl(); + span> entries = impl->get_entries(); + for(size_t i = 0; i < entries.size(); i++) { + SparsityMapEntry entry = entries[i]; + if(!entry.bounds.overlaps(r)) continue; + if(entry.sparsity.exists()) { + assert(0); + } else if(entry.bitmap != 0) { + assert(0); + } else { + return true; + } } - + return false; } @@ -732,15 +726,15 @@ namespace Realm { size_t total = 0; SparsityMapPublicImpl *impl = sparsity.impl(); - const std::vector> &entries = impl->get_entries(); - for(typename std::vector>::const_iterator it = entries.begin(); - it != entries.end(); it++) { - Rect isect = bounds.intersection(it->bounds); + span> entries = impl->get_entries(); + for(size_t i = 0; i < entries.size(); i++) { + SparsityMapEntry entry = entries[i]; + Rect isect = bounds.intersection(entry.bounds); 
if(isect.empty()) continue; - if(it->sparsity.exists()) { + if(entry.sparsity.exists()) { assert(0); - } else if(it->bitmap != 0) { + } else if(entry.bitmap != 0) { assert(0); } else { total += isect.volume(); @@ -764,19 +758,20 @@ namespace Realm { if(dense()) return true; - SparsityMapPublicImpl *impl = sparsity.impl(); - const std::vector> &approx_rects = impl->get_approx_rects(); - for(typename std::vector>::const_iterator it = approx_rects.begin(); - it != approx_rects.end(); it++) - if(it->contains(p)) - return true; + SparsityMapPublicImpl *impl = sparsity.impl(); + span> approx_rects = impl->get_approx_rects(); + for(size_t i = 0; i < approx_rects.size(); i++) { + Rect entry = approx_rects[i]; + if(entry.contains(p)) + return true; + } // no entries matched, so the point is definitely not contained in this space return false; } template - inline bool IndexSpace::contains_all_approx(const Rect &r) const + inline bool IndexSpace::contains_all_approx(const Rect& r) const { // test on bounding box first if(!bounds.contains(r)) @@ -786,14 +781,14 @@ namespace Realm { if(dense()) return true; - SparsityMapPublicImpl *impl = sparsity.impl(); - const std::vector> &approx_rects = impl->get_approx_rects(); - for(typename std::vector>::const_iterator it = approx_rects.begin(); - it != approx_rects.end(); it++) { - if(it->contains(r)) - return true; - if(it->overlaps(r)) - assert(0); + SparsityMapPublicImpl *impl = sparsity.impl(); + span> approx_rects = impl->get_approx_rects(); + for(size_t i = 0; i < approx_rects.size(); i++) { + Rect entry = approx_rects[i]; + if(entry.contains(r)) + return true; + if(entry.overlaps(r)) + assert(0); } // no entries matched, so the point is definitely not contained in this space @@ -811,12 +806,12 @@ namespace Realm { if(dense()) return true; - SparsityMapPublicImpl *impl = sparsity.impl(); - const std::vector> &approx_rects = impl->get_approx_rects(); - for(typename std::vector>::const_iterator it = approx_rects.begin(); - it != 
approx_rects.end(); it++) { - if(it->overlaps(r)) - return true; + SparsityMapPublicImpl *impl = sparsity.impl(); + span> approx_rects = impl->get_approx_rects(); + for(size_t i = 0; i < approx_rects.size(); i++) { + Rect entry = approx_rects[i]; + if(entry.overlaps(r)) + return true; } // no entries matched, so the point is definitely not contained in this space @@ -838,29 +833,27 @@ namespace Realm { return contains_any_approx(other.bounds); // both sparse case can be expensive... - SparsityMapPublicImpl *impl = sparsity.impl(); - SparsityMapPublicImpl *other_impl = other.sparsity.impl(); + SparsityMapPublicImpl *impl = sparsity.impl(); + SparsityMapPublicImpl *other_impl = other.sparsity.impl(); // overlap can only be within intersecion of bounds - Rect isect = bounds.intersection(other.bounds); + Rect isect = bounds.intersection(other.bounds); return impl->overlaps(other_impl, isect, true /*approx*/); } - // approximage number of points in index space (may be less than volume of bounding box, - // but larger than + // approximage number of points in index space (may be less than volume of bounding box, but larger than // actual volume) template - inline size_t IndexSpace::volume_approx(void) const + inline size_t IndexSpace::volume_approx(void) const { if(dense()) return bounds.volume(); size_t total = 0; - SparsityMapPublicImpl *impl = sparsity.impl(); - const std::vector> &approx_rects = impl->get_approx_rects(); - for(typename std::vector>::const_iterator it = approx_rects.begin(); - it != approx_rects.end(); it++) - total += it->volume(); + SparsityMapPublicImpl *impl = sparsity.impl(); + span> approx_rects = impl->get_approx_rects(); + for(size_t i = 0; i < approx_rects.size(); i++) + total += approx_rects[i].volume(); return total; } @@ -981,6 +974,18 @@ namespace Realm { images, reqs, wait_on); } + template + template + inline Event IndexSpace::gpu_subspaces_by_image( + const std::vector, Point>> &field_data, + const std::vector> &sources, + std::vector> 
&images, const ProfilingRequestSet &reqs, + std::pair &sizes, RegionInstance buffer, Event wait_on) const + { + return gpu_subspaces_by_image(DomainTransform(field_data), sources, + images, reqs, sizes, buffer, wait_on); + } + template template inline Event IndexSpace::create_subspaces_by_image( @@ -1320,7 +1325,7 @@ namespace Realm { rect = Rect::make_empty(); - const std::vector> &entries = s_impl->get_entries(); + span> entries = s_impl->get_entries(); // find the first entry that overlaps our restriction - speed this up with a // binary search on the low end of the restriction if we're 1-D @@ -1356,7 +1361,7 @@ namespace Realm { // TODO: handle iteration within a sparsity entry // move onto the next sparsity entry (that overlaps our restriction) - const std::vector> &entries = s_impl->get_entries(); + const span> entries = s_impl->get_entries(); for(cur_entry++; cur_entry < entries.size(); cur_entry++) { const SparsityMapEntry &e = entries[cur_entry]; rect = restriction.intersection(e.bounds); diff --git a/src/realm/inst_layout.inl b/src/realm/inst_layout.inl index 0ee4db6960..acb2896e41 100644 --- a/src/realm/inst_layout.inl +++ b/src/realm/inst_layout.inl @@ -90,13 +90,13 @@ namespace Realm { // we need precise data for non-dense index spaces (the original // 'bounds' on the IndexSpace is often VERY conservative) SparsityMapPublicImpl *impl = is.sparsity.impl(); - const std::vector> &entries = impl->get_entries(); + span> entries = impl->get_entries(); if(!entries.empty()) { // TODO: set some sort of threshold for merging entries - typename std::vector>::const_iterator it = entries.begin(); - Rect bbox = is.bounds.intersection(it->bounds); - while(++it != entries.end()) - bbox = bbox.union_bbox(is.bounds.intersection(it->bounds)); + size_t i = 0; + Rect bbox = is.bounds.intersection(entries[i].bounds); + while(++i < entries.size()) + bbox = bbox.union_bbox(is.bounds.intersection(entries[i].bounds)); if(!bbox.empty()) piece_bounds.push_back(bbox); } diff --git 
a/src/realm/sparsity.h b/src/realm/sparsity.h index 1dc402a709..bf46284c6d 100644 --- a/src/realm/sparsity.h +++ b/src/realm/sparsity.h @@ -205,7 +205,7 @@ namespace Realm { * @return the entries of this sparsity map */ REALM_PUBLIC_API - const std::vector> &get_entries(void); + const span> get_entries(void); /** * Get the approximate rectangles of this sparsity map. @@ -215,7 +215,7 @@ namespace Realm { * @return the approximate rectangles of this sparsity map */ REALM_PUBLIC_API - const std::vector> &get_approx_rects(void); + const span> get_approx_rects(void); /** * Check if this sparsity map overlaps another sparsity map. @@ -246,8 +246,23 @@ namespace Realm { protected: atomic entries_valid{false}, approx_valid{false}; - std::vector> entries; - std::vector> approx_rects; + + //BOTH RegionInstance and vector are returned as a span + //only on can be valid (i.e. only finalize or gpu_finalize can be called, not both) + + //Stores rectangles for CPU deppart (easy manipulation for sort/merge entries) + std::vector > entries; + std::vector > approx_rects; + + //Stores rectangles for GPU deppart (allows fast copy after merged on GPU) + RegionInstance entries_instance = RegionInstance::NO_INST; + size_t num_entries = 0; + + RegionInstance approx_instance = RegionInstance::NO_INST; + size_t num_approx = 0; + + //Tracks whether to use instance or vector + bool from_gpu = false; }; }; // namespace Realm diff --git a/src/realm/sparsity.inl b/src/realm/sparsity.inl index a4a72fec05..7ff00ef552 100644 --- a/src/realm/sparsity.inl +++ b/src/realm/sparsity.inl @@ -18,9 +18,9 @@ // sparsity maps for Realm // nop, but helps IDEs +#include "realm/inst_layout.h" #include "realm/sparsity.h" -#include "realm/realm_assert.h" #include "realm/serialize.h" TEMPLATE_TYPE_IS_SERIALIZABLE2(int N, typename T, Realm::SparsityMap); @@ -84,19 +84,37 @@ namespace Realm { } template - inline const std::vector> & - SparsityMapPublicImpl::get_entries(void) + inline const span> 
SparsityMapPublicImpl::get_entries(void) { REALM_ASSERT(entries_valid.load_acquire()); - return entries; + if(from_gpu) { + if (num_entries == 0) { + return span>(); + } + return span>( + reinterpret_cast *>(entries_instance.pointer_untyped( + 0, num_entries * sizeof(SparsityMapEntry))), + num_entries); + } else { + return span>(entries.data(), entries.size()); + } } template - inline const std::vector> & - SparsityMapPublicImpl::get_approx_rects(void) + inline const span> SparsityMapPublicImpl::get_approx_rects(void) { REALM_ASSERT(approx_valid.load_acquire()); - return approx_rects; + if(from_gpu) { + if (num_approx == 0) { + return span>(); + } + return span>( + reinterpret_cast *>( + approx_instance.pointer_untyped(0, num_approx * sizeof(Rect))), + num_approx); + } else { + return span>(approx_rects.data(), approx_rects.size()); + } } }; // namespace Realm diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index a6213d8b46..e166888637 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -439,6 +439,10 @@ if(TEST_USE_GPU) task_stream "${REALM_TEST_DIR}/task_stream.cc" "${REALM_TEST_DIR}/task_stream_gpu.cu" ) target_link_libraries(task_stream ${TEST_GPU_LIBS}) + set(gpu_deppart_1d_ARGS -ll:gpu 1) + set(gpu_deppart_1d_RESOURCE_LOCK gpu) + add_integration_test(gpu_deppart_1d "${REALM_TEST_DIR}/gpu_deppart_1d.cc") + target_link_libraries(gpu_deppart_1d ${TEST_GPU_LIBS}) endif() #### C API tests diff --git a/tests/deppart.cc b/tests/deppart.cc index e33708daf0..815f2cb490 100644 --- a/tests/deppart.cc +++ b/tests/deppart.cc @@ -41,6 +41,10 @@ enum { TOP_LEVEL_TASK = Processor::TASK_ID_FIRST_AVAILABLE + 0, INIT_CIRCUIT_DATA_TASK, + INIT_BASIC_DATA_TASK, + INIT_TILE_DATA_TASK, + INIT_RANGE_DATA_TASK, + INIT_2D_DATA_TASK, INIT_PENNANT_DATA_TASK, INIT_MINIAERO_DATA_TASK, }; @@ -87,14 +91,14 @@ void dump_sparse_index_space(const char *pfx, IndexSpace is) if(!is.sparsity.exists()) return; SparsityMapPublicImpl *impl = is.sparsity.impl(); - const 
std::vector> &entries = impl->get_entries(); - for(typename std::vector>::const_iterator it = entries.begin(); - it != entries.end(); it++) { - std::cout << " " << it->bounds; - if(it->bitmap) - std::cout << " bitmap(" << it->bitmap << ")"; - if(it->sparsity.exists()) - std::cout << " sparsity(" << it->sparsity << ")"; + span> entries = impl->get_entries(); + for(size_t i = 0; i < entries.size(); i++) { + SparsityMapEntry entry = entries[i]; + std::cout << " " << entry.bounds; + if(entry.bitmap) + std::cout << " bitmap(" << entry.bitmap << ")"; + if(entry.sparsity.exists()) + std::cout << " sparsity(" << entry.sparsity << ")"; std::cout << "\n"; } } @@ -161,6 +165,1571 @@ int find_split(const std::vector &cuts, T v) return 0; } +/* + * Basic test - create a graph, partition it by + * node subgraph id and then check that the partitioning + * is correct + */ +class BasicTest : public TestInterface { +public: + // graph config parameters + int num_nodes = 1000; + int num_edges = 1000; + int num_pieces = 4; + std::string filename; + + BasicTest(int argc, const char *argv[]) + { + for(int i = 1; i < argc; i++) { + + if(!strcmp(argv[i], "-p")) { + num_pieces = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-n")) { + num_nodes = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-e")) { + num_edges = atoi(argv[++i]); + continue; + } + } + + + if (num_nodes <= 0 || num_pieces <= 0 || num_edges <= 0) { + log_app.error() << "Invalid config: nodes=" << num_nodes << " edges=" << num_edges << " pieces=" << num_pieces << "\n"; + exit(1); + } + } + + struct InitDataArgs { + int index; + RegionInstance ri_nodes; + RegionInstance ri_edges; + }; + + enum PRNGStreams + { + NODE_SUBGRAPH_STREAM, + }; + + // assign subgraph ids to nodes + void random_node_data(int idx, int &subgraph) + { + if(random_colors) + subgraph = + Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_pieces); + else + subgraph = idx * num_pieces / num_nodes; + } + + void 
random_edge_data(int idx, int& src, int& dst) + { + src = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_nodes); + dst = Philox_2x32<>::rand_int(random_seed, idx + 1, NODE_SUBGRAPH_STREAM, num_nodes); + } + + static void init_data_task_wrapper(const void *args, size_t arglen, + const void *userdata, size_t userlen, Processor p) + { + BasicTest *me = (BasicTest *)testcfg; + me->init_data_task(args, arglen, p); + } + + //Each piece has a task to initialize its data + void init_data_task(const void *args, size_t arglen, Processor p) + { + const InitDataArgs &i_args = *(const InitDataArgs *)args; + + log_app.info() << "init task #" << i_args.index << " (ri_nodes=" << i_args.ri_nodes + << ")"; + + i_args.ri_nodes.fetch_metadata(p).wait(); + i_args.ri_edges.fetch_metadata(p).wait(); + + IndexSpace<1> is_nodes = i_args.ri_nodes.get_indexspace<1>(); + IndexSpace<1> is_edges = i_args.ri_edges.get_indexspace<1>(); + + log_app.debug() << "N: " << is_nodes; + log_app.debug() << "E: " << is_edges; + + //For each node in the graph, mark it with a random (or deterministic) subgraph id + { + AffineAccessor a_piece_id(i_args.ri_nodes, 0 /* offset */); + + for(int i = is_nodes.bounds.lo; i <= is_nodes.bounds.hi; i++) { + int subgraph; + random_node_data(i, subgraph); + a_piece_id.write(i, subgraph); + } + + AffineAccessor,1> a_src(i_args.ri_edges, 0 /* offset */); + AffineAccessor,1> a_dst(i_args.ri_edges, sizeof(Point<1>)/* offset */); + + for(int i = is_edges.bounds.lo; i <= is_edges.bounds.hi; i++) { + int src, dst; + random_edge_data(i, src, dst); + a_src.write(i, Point<1>(src)); + a_dst.write(i, Point<1>(dst)); + } + } + + //Optionally print out the assigned subgraph ids + if(show_graph) { + AffineAccessor a_piece_id(i_args.ri_nodes, 0 /* offset */); + + for(int i = is_nodes.bounds.lo; i <= is_nodes.bounds.hi; i++) + log_app.info() << "piece_id[" << i << "] = " << a_piece_id.read(i) << "\n"; + + AffineAccessor,1> a_src(i_args.ri_edges, 0 /* offset */); + 
AffineAccessor,1> a_dst(i_args.ri_edges, sizeof(Point<1>)/* offset */); + + for(int i = is_edges.bounds.lo; i <= is_edges.bounds.hi; i++) + log_app.info() << "src, dst[" << i << "] = " << a_src.read(i) << ", " << a_dst.read(i) << "\n"; + } + } + + IndexSpace<1> is_nodes, is_edges; + std::vector ri_nodes, ri_edges; + std::vector, int> > piece_id_field_data; + std::vector, Point<1> > > src_node_field_data, dst_node_field_data; + + virtual void print_info(void) + { + printf("Realm dependent partitioning test - basic: %d nodes, %d edges, %d pieces\n", + (int)num_nodes, (int) num_edges, (int)num_pieces); + } + + virtual Event initialize_data(const std::vector &memories, + const std::vector &procs) + { + // now create index space for nodes + is_nodes = Rect<1>(0, num_nodes - 1); + is_edges = Rect<1>(0, num_edges - 1); + + // equal partition is used to do initial population of edges and nodes + std::vector > ss_nodes_eq; + std::vector > ss_edges_eq; + + log_app.info() << "Creating equal subspaces\n"; + + is_nodes.create_equal_subspaces(num_pieces, 1, ss_nodes_eq, Realm::ProfilingRequestSet()).wait(); + is_edges.create_equal_subspaces(num_pieces, 1, ss_edges_eq, Realm::ProfilingRequestSet()).wait(); + + log_app.debug() << "Initial partitions:"; + for(size_t i = 0; i < ss_nodes_eq.size(); i++) + log_app.debug() << " Nodes #" << i << ": " << ss_nodes_eq[i]; + for(size_t i = 0; i < ss_edges_eq.size(); i++) + log_app.debug() << " Edges #" << i << ": " << ss_edges_eq[i]; + + // create instances for each of these subspaces + std::vector node_fields, edge_fields; + node_fields.push_back(sizeof(int)); // piece_id + assert(sizeof(int) == sizeof(Point<1>)); + edge_fields.push_back(sizeof(Point<1>)); // src_node + edge_fields.push_back(sizeof(Point<1>)); // dst_node + + ri_nodes.resize(num_pieces); + piece_id_field_data.resize(num_pieces); + + for(size_t i = 0; i < ss_nodes_eq.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, memories[i % memories.size()], 
ss_nodes_eq[i], + node_fields, 0 /*SOA*/, + Realm::ProfilingRequestSet()) + .wait(); + ri_nodes[i] = ri; + + piece_id_field_data[i].index_space = ss_nodes_eq[i]; + piece_id_field_data[i].inst = ri_nodes[i]; + piece_id_field_data[i].field_offset = 0; + } + + + // Fire off tasks to initialize data + ri_edges.resize(num_pieces); + src_node_field_data.resize(num_pieces); + dst_node_field_data.resize(num_pieces); + + for(size_t i = 0; i < ss_edges_eq.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, + memories[i % memories.size()], + ss_edges_eq[i], + edge_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + ri_edges[i] = ri; + + src_node_field_data[i].index_space = ss_edges_eq[i]; + src_node_field_data[i].inst = ri_edges[i]; + src_node_field_data[i].field_offset = 0 * sizeof(Point<1>); + + dst_node_field_data[i].index_space = ss_edges_eq[i]; + dst_node_field_data[i].inst = ri_edges[i]; + dst_node_field_data[i].field_offset = 1 * sizeof(Point<1>); + } + + // fire off tasks to initialize data + std::set events; + for(int i = 0; i < num_pieces; i++) { + Processor p = procs[i % procs.size()]; + InitDataArgs args; + args.index = i; + args.ri_nodes = ri_nodes[i]; + args.ri_edges = ri_edges[i]; + Event e = p.spawn(INIT_BASIC_DATA_TASK, &args, sizeof(args)); + events.insert(e); + } + + return Event::merge_events(events); + } + + // the outputs of our partitioning will be: + // p_nodes - nodes partitioned by subgraph id (from GPU) + // p_nodes_cpu - nodes partitioned by subgraph id (from CPU) + + + std::vector > p_nodes, p_rd; + std::vector > p_edges, p_preimage_edges; + + std::vector > p_nodes_cpu, p_rd_cpu; + std::vector > p_edges_cpu, p_preimage_edges_cpu; + + virtual Event perform_partitioning(void) + { + // Partition nodes by subgraph id - do this twice, once on CPU and once on GPU + // Ensure that the results are identical + + std::vector colors(num_pieces); + for(int i = 0; i < num_pieces; i++) + colors[i] = i; + + // We need a GPU 
memory for GPU partitioning + Memory gpu_memory; + bool found_gpu_memory = false; + Machine machine = Machine::get_machine(); + std::set all_memories; + machine.get_all_memories(all_memories); + for(Memory memory : all_memories) { + if(memory.kind() == Memory::GPU_FB_MEM) { + gpu_memory = memory; + found_gpu_memory = true; + break; + } + } + if (!found_gpu_memory) { + log_app.error() << "No GPU memory found for partitioning test\n"; + return Event::NO_EVENT; + } + std::vector edge_fields; + edge_fields.push_back(sizeof(Point<1>)); + edge_fields.push_back(sizeof(Point<1>)) ; + std::vector node_fields; + node_fields.push_back(sizeof(int)); + + std::vector, Point<1> > > src_field_data_gpu; + std::vector, Point<1> > > dst_field_data_gpu; + std::vector, int> > piece_field_data_gpu; + piece_field_data_gpu.resize(num_pieces); + src_field_data_gpu.resize(num_pieces); + dst_field_data_gpu.resize(num_pieces); + for (int i = 0; i < num_pieces; i++) { + RegionInstance src_gpu_instance; + RegionInstance dst_gpu_instance; + RegionInstance piece_gpu_instance; + RegionInstance::create_instance(src_gpu_instance, + gpu_memory, + src_node_field_data[i].index_space, + edge_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + RegionInstance::create_instance(dst_gpu_instance, + gpu_memory, + dst_node_field_data[i].index_space, + edge_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + RegionInstance::create_instance(piece_gpu_instance, + gpu_memory, + piece_id_field_data[i].index_space, + node_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + CopySrcDstField src_gpu_field, src_cpu_field, dst_gpu_field, dst_cpu_field, piece_gpu_field, piece_cpu_field; + src_gpu_field.inst = src_gpu_instance; + src_gpu_field.size = sizeof(Point<1>); + src_gpu_field.field_id = 0; + src_cpu_field.inst = src_node_field_data[i].inst; + src_cpu_field.size = sizeof(Point<1>); + src_cpu_field.field_id = 0; + dst_gpu_field.inst = dst_gpu_instance; + dst_gpu_field.size = 
sizeof(Point<1>); + dst_gpu_field.field_id = sizeof(Point<1>); + dst_cpu_field.inst = dst_node_field_data[i].inst; + dst_cpu_field.size = sizeof(Point<1>); + dst_cpu_field.field_id = sizeof(Point<1>); + piece_gpu_field.inst = piece_gpu_instance; + piece_gpu_field.size = sizeof(int); + piece_gpu_field.field_id = 0; + piece_cpu_field.inst = piece_id_field_data[i].inst; + piece_cpu_field.size = sizeof(int); + piece_cpu_field.field_id = 0; + std::vector src_cpu_data, src_gpu_data, dst_cpu_data, dst_gpu_data, piece_cpu_data, piece_gpu_data; + src_cpu_data.push_back(src_cpu_field); + dst_cpu_data.push_back(dst_cpu_field); + src_gpu_data.push_back(src_gpu_field); + dst_gpu_data.push_back(dst_gpu_field); + piece_gpu_data.push_back(piece_gpu_field); + piece_cpu_data.push_back(piece_cpu_field); + Event copy_event = src_node_field_data[i].index_space.copy(src_cpu_data, src_gpu_data, Realm::ProfilingRequestSet()); + copy_event.wait(); + Event second_copy_event = dst_node_field_data[i].index_space.copy(dst_cpu_data, dst_gpu_data, Realm::ProfilingRequestSet()); + second_copy_event.wait(); + Event third_copy_event = piece_id_field_data[i].index_space.copy(piece_cpu_data, piece_gpu_data, Realm::ProfilingRequestSet()); + third_copy_event.wait(); + src_field_data_gpu[i].inst = src_gpu_instance; + src_field_data_gpu[i].index_space = src_node_field_data[i].index_space; + src_field_data_gpu[i].field_offset = 0; + dst_field_data_gpu[i].inst = dst_gpu_instance; + dst_field_data_gpu[i].index_space = dst_node_field_data[i].index_space; + dst_field_data_gpu[i].field_offset = 1 * sizeof(Point<1>); + piece_field_data_gpu[i].inst = piece_gpu_instance; + piece_field_data_gpu[i].index_space = piece_id_field_data[i].index_space; + piece_field_data_gpu[i].field_offset = 0; + } + wait_on_events = true; + log_app.info() << "warming up" << Clock::current_time_in_microseconds() << "\n"; + std::vector > p_garbage_nodes, p_garbage_edges, p_garbage_rd, p_garbage_preimage_edges; + Event e01 = 
is_nodes.create_subspaces_by_field(piece_field_data_gpu, + colors, + p_garbage_nodes, + Realm::ProfilingRequestSet()); + if (wait_on_events) e01.wait(); + Event e02 = is_edges.create_subspaces_by_preimage(dst_node_field_data, + p_garbage_nodes, + p_garbage_edges, + Realm::ProfilingRequestSet(), + e01); + if(wait_on_events) e02.wait(); + std::pair estimate; + Event _e = is_nodes.gpu_subspaces_by_image(src_field_data_gpu, + p_garbage_edges, + p_garbage_rd, + Realm::ProfilingRequestSet(), + estimate, + RegionInstance::NO_INST, + e02); + std::cout << "Minimum size: " << estimate.first << " bytes, " + << "Maximum size: " << estimate.second << " bytes\n"; + + // an image of p_edges through out_node gives us all the shared nodes, along + // with some private nodes + const char* val = std::getenv("TILE_SIZE"); // or any env var + size_t tile_size = 100000000; //default + if (val) { + tile_size = atoi(val); + } + std::vector byte_fields = {sizeof(char)}; + IndexSpace<1> instance_index_space(Rect<1>(0, tile_size-1)); + RegionInstance buffer; + RegionInstance::create_instance(buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); + estimate.first = tile_size; + Event e03 = is_nodes.gpu_subspaces_by_image(src_field_data_gpu, + p_garbage_edges, + p_garbage_rd, + Realm::ProfilingRequestSet(), + estimate, + buffer, + e02); + if(wait_on_events) e03.wait(); + + Event e04 = is_edges.create_subspaces_by_preimage(dst_node_field_data, + p_garbage_rd, + p_garbage_preimage_edges, + Realm::ProfilingRequestSet(), + e03); + e04.wait(); + log_app.info() << "warming up complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting GPU Partitioning " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting GPU By Field " << Clock::current_time_in_microseconds() << "\n"; + Event e1 = is_nodes.create_subspaces_by_field(piece_field_data_gpu, + colors, + p_nodes, + Realm::ProfilingRequestSet()); + 
if(wait_on_events) e1.wait(); + log_app.info() << "GPU By Field complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting GPU Preimage " << Clock::current_time_in_microseconds() << "\n"; + // now compute p_edges based on the color of their in_node (i.e. a preimage) + Event e2 = is_edges.create_subspaces_by_preimage(dst_node_field_data, + p_nodes, + p_edges, + Realm::ProfilingRequestSet(), + e1); + if(wait_on_events) e2.wait(); + log_app.info() << "GPU Preimage complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting GPU Image " << Clock::current_time_in_microseconds() << "\n"; + + // an image of p_edges through out_node gives us all the shared nodes, along + // with some private nodes + Event e3 = is_nodes.gpu_subspaces_by_image(src_field_data_gpu, + p_edges, + p_rd, + Realm::ProfilingRequestSet(), + estimate, + buffer, + e2); + if(wait_on_events) e3.wait(); + log_app.info() << "GPU Image complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting second GPU preimage " << Clock::current_time_in_microseconds() << "\n"; + + Event e4 = is_edges.create_subspaces_by_preimage(dst_node_field_data, + p_rd, + p_preimage_edges, + Realm::ProfilingRequestSet(), + e3); + e4.wait(); + log_app.info() << "Second GPU preimage complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "GPU Partitioning complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting CPU Partitioning " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting CPU By Field " << Clock::current_time_in_microseconds() << "\n"; + Event e5 = is_nodes.create_subspaces_by_field(piece_id_field_data, + colors, + p_nodes_cpu, + Realm::ProfilingRequestSet()); + if(wait_on_events) e5.wait(); + log_app.info() << "CPU By Field complete " << Clock::current_time_in_microseconds() << "\n"; + // now compute p_edges based on the color of their in_node 
(i.e. a preimage) + log_app.info() << "Starting CPU Preimage " << Clock::current_time_in_microseconds() << "\n"; + Event e6 = is_edges.create_subspaces_by_preimage(dst_node_field_data, + p_nodes_cpu, + p_edges_cpu, + Realm::ProfilingRequestSet(), + e5); + if(wait_on_events) e6.wait(); + log_app.info() << "CPU Preimage complete " << Clock::current_time_in_microseconds() << "\n"; + + // an image of p_edges through out_node gives us all the shared nodes, along + // with some private nodes + log_app.info() << "Starting CPU Image " << Clock::current_time_in_microseconds() << "\n"; + Event e7 = is_nodes.create_subspaces_by_image(src_node_field_data, + p_edges_cpu, + p_rd_cpu, + Realm::ProfilingRequestSet(), + e6); + if(wait_on_events) e7.wait(); + log_app.info() << "CPU Image complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting second CPU preimage " << Clock::current_time_in_microseconds() << "\n"; + + Event e8 = is_edges.create_subspaces_by_preimage(dst_node_field_data, + p_rd_cpu, + p_preimage_edges_cpu, + Realm::ProfilingRequestSet(), + e7); + e8.wait(); + log_app.info() << "Second CPU preimage complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "CPU Partitioning complete " << Clock::current_time_in_microseconds() << "\n"; + return e8; + } + + virtual int perform_dynamic_checks(void) + { + // Nothing to do here + return 0; + } + + virtual int check_partitioning(void) + { + int errors = 0; + + if (!p_nodes.size()) { + return 0; + } + + log_app.info() << "Checking correctness of partitioning " << "\n"; + + for(int i = 0; i < num_pieces; i++) { + for(IndexSpaceIterator<1> it(p_nodes[i]); it.valid; it.step()) { + for(PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_nodes_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! 
GPU has extra byfield point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for(IndexSpaceIterator<1> it(p_nodes_cpu[i]); it.valid; it.step()) { + for(PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_nodes[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing byfield point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_edges[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_edges_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU has extra preimage edge " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_edges_cpu[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_edges[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing preimage edge " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_rd[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_rd_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU has extra image node " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_rd_cpu[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_rd[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing image node " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_preimage_edges[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_preimage_edges_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! 
GPU has extra second preimage edge " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_preimage_edges_cpu[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_preimage_edges[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing second preimage edge " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + + } + return errors; + } +}; + +class TileTest : public TestInterface { +public: + // graph config parameters + int num_nodes = 1000; + int num_edges = 1000; + int num_pieces = 4; + int num_tiles = 1; + std::string filename; + + TileTest(int argc, const char *argv[]) + { + for(int i = 1; i < argc; i++) { + + if(!strcmp(argv[i], "-p")) { + num_pieces = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-n")) { + num_nodes = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-e")) { + num_edges = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-t")) { + num_tiles = atoi(argv[++i]); + continue; + } + } + + + if (num_nodes <= 0 || num_pieces <= 0 || num_edges <= 0) { + log_app.error() << "Invalid config: nodes=" << num_nodes << " edges=" << num_edges << " pieces=" << num_pieces << "\n"; + exit(1); + } + } + + struct InitDataArgs { + int index; + RegionInstance ri_nodes; + RegionInstance ri_edges; + }; + + enum PRNGStreams + { + NODE_SUBGRAPH_STREAM, + }; + + // assign subgraph ids to nodes + void random_node_data(int idx, int &subgraph) + { + if(random_colors) + subgraph = + Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_pieces); + else + subgraph = idx * num_pieces / num_nodes; + } + + void random_edge_data(int idx, int& src, int& dst) + { + src = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_nodes); + dst = Philox_2x32<>::rand_int(random_seed, idx + 1, NODE_SUBGRAPH_STREAM, num_nodes); + } + + static void init_data_task_wrapper(const void *args, size_t arglen, + const 
void *userdata, size_t userlen, Processor p) + { + TileTest *me = (TileTest *)testcfg; + me->init_data_task(args, arglen, p); + } + + //Each piece has a task to initialize its data + void init_data_task(const void *args, size_t arglen, Processor p) + { + const InitDataArgs &i_args = *(const InitDataArgs *)args; + + log_app.info() << "init task #" << i_args.index << " (ri_nodes=" << i_args.ri_nodes + << ")"; + + i_args.ri_nodes.fetch_metadata(p).wait(); + i_args.ri_edges.fetch_metadata(p).wait(); + + IndexSpace<1> is_nodes = i_args.ri_nodes.get_indexspace<1>(); + IndexSpace<1> is_edges = i_args.ri_edges.get_indexspace<1>(); + + log_app.debug() << "N: " << is_nodes; + log_app.debug() << "E: " << is_edges; + + //For each node in the graph, mark it with a random (or deterministic) subgraph id + { + AffineAccessor a_piece_id(i_args.ri_nodes, 0 /* offset */); + + for(int i = is_nodes.bounds.lo; i <= is_nodes.bounds.hi; i++) { + int subgraph; + random_node_data(i, subgraph); + a_piece_id.write(i, subgraph); + } + + AffineAccessor,1> a_src(i_args.ri_edges, 0 /* offset */); + AffineAccessor,1> a_dst(i_args.ri_edges, sizeof(Point<1>)/* offset */); + + for(int i = is_edges.bounds.lo; i <= is_edges.bounds.hi; i++) { + int src, dst; + random_edge_data(i, src, dst); + a_src.write(i, Point<1>(src)); + a_dst.write(i, Point<1>(dst)); + } + } + + //Optionally print out the assigned subgraph ids + if(show_graph) { + AffineAccessor a_piece_id(i_args.ri_nodes, 0 /* offset */); + + for(int i = is_nodes.bounds.lo; i <= is_nodes.bounds.hi; i++) + log_app.info() << "piece_id[" << i << "] = " << a_piece_id.read(i) << "\n"; + + AffineAccessor,1> a_src(i_args.ri_edges, 0 /* offset */); + AffineAccessor,1> a_dst(i_args.ri_edges, sizeof(Point<1>)/* offset */); + + for(int i = is_edges.bounds.lo; i <= is_edges.bounds.hi; i++) + log_app.info() << "src, dst[" << i << "] = " << a_src.read(i) << ", " << a_dst.read(i) << "\n"; + } + } + + IndexSpace<1> is_nodes, is_edges; + std::vector ri_nodes, 
ri_edges; + std::vector, int> > piece_id_field_data; + std::vector, Point<1> > > src_node_field_data, dst_node_field_data; + + virtual void print_info(void) + { + printf("Realm dependent partitioning test - tile: %d nodes, %d edges, %d pieces, %d tiles\n", + (int)num_nodes, (int) num_edges, (int)num_pieces, (int)num_tiles); + } + + virtual Event initialize_data(const std::vector &memories, + const std::vector &procs) + { + // now create index space for nodes + is_nodes = Rect<1>(0, num_nodes - 1); + is_edges = Rect<1>(0, num_edges - 1); + // equal partition is used to do initial population of edges and nodes + std::vector > ss_nodes_eq; + std::vector > ss_edges_eq; + + log_app.info() << "Creating equal subspaces\n"; + + is_nodes.create_equal_subspaces(num_pieces, 1, ss_nodes_eq, Realm::ProfilingRequestSet()).wait(); + is_edges.create_equal_subspaces(num_pieces, 1, ss_edges_eq, Realm::ProfilingRequestSet()).wait(); + + log_app.debug() << "Initial partitions:"; + for(size_t i = 0; i < ss_nodes_eq.size(); i++) + log_app.debug() << " Nodes #" << i << ": " << ss_nodes_eq[i]; + for(size_t i = 0; i < ss_edges_eq.size(); i++) + log_app.debug() << " Edges #" << i << ": " << ss_edges_eq[i]; + + // create instances for each of these subspaces + std::vector node_fields, edge_fields; + node_fields.push_back(sizeof(int)); // piece_id + assert(sizeof(int) == sizeof(Point<1>)); + edge_fields.push_back(sizeof(Point<1>)); // src_node + edge_fields.push_back(sizeof(Point<1>)); // dst_node + + ri_nodes.resize(num_pieces); + piece_id_field_data.resize(num_pieces); + + for(size_t i = 0; i < ss_nodes_eq.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, memories[i % memories.size()], ss_nodes_eq[i], + node_fields, 0 /*SOA*/, + Realm::ProfilingRequestSet()) + .wait(); + ri_nodes[i] = ri; + + piece_id_field_data[i].index_space = ss_nodes_eq[i]; + piece_id_field_data[i].inst = ri_nodes[i]; + piece_id_field_data[i].field_offset = 0; + } + + + // Fire off tasks to 
initialize data + ri_edges.resize(num_pieces); + src_node_field_data.resize(num_pieces); + dst_node_field_data.resize(num_pieces); + + for(size_t i = 0; i < ss_edges_eq.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, + memories[i % memories.size()], + ss_edges_eq[i], + edge_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + ri_edges[i] = ri; + + src_node_field_data[i].index_space = ss_edges_eq[i]; + src_node_field_data[i].inst = ri_edges[i]; + src_node_field_data[i].field_offset = 0 * sizeof(Point<1>); + + dst_node_field_data[i].index_space = ss_edges_eq[i]; + dst_node_field_data[i].inst = ri_edges[i]; + dst_node_field_data[i].field_offset = 1 * sizeof(Point<1>); + } + + // fire off tasks to initialize data + std::set events; + for(int i = 0; i < num_pieces; i++) { + Processor p = procs[i % procs.size()]; + InitDataArgs args; + args.index = i; + args.ri_nodes = ri_nodes[i]; + args.ri_edges = ri_edges[i]; + Event e = p.spawn(INIT_TILE_DATA_TASK, &args, sizeof(args)); + events.insert(e); + } + + return Event::merge_events(events); + } + + // the outputs of our partitioning will be: + // p_nodes - nodes partitioned by subgraph id (from GPU) + // p_nodes_cpu - nodes partitioned by subgraph id (from CPU) + + + std::vector > p_nodes, p_rd; + std::vector > p_edges, p_preimage_edges; + + std::vector > p_nodes_cpu, p_rd_cpu; + std::vector > p_edges_cpu, p_preimage_edges_cpu; + + virtual Event perform_partitioning(void) + { + // Partition nodes by subgraph id - do this twice, once on CPU and once on GPU + // Ensure that the results are identical + + std::vector colors(num_pieces); + for(int i = 0; i < num_pieces; i++) + colors[i] = i; + + // We need a GPU memory for GPU partitioning + Memory gpu_memory; + bool found_gpu_memory = false; + Machine machine = Machine::get_machine(); + std::set all_memories; + machine.get_all_memories(all_memories); + for(Memory memory : all_memories) { + if(memory.kind() == Memory::GPU_FB_MEM) { + gpu_memory 
= memory; + found_gpu_memory = true; + break; + } + } + if (!found_gpu_memory) { + log_app.error() << "No GPU memory found for partitioning test\n"; + return Event::NO_EVENT; + } + std::vector edge_fields; + edge_fields.push_back(sizeof(Point<1>)); + edge_fields.push_back(sizeof(Point<1>)) ; + std::vector node_fields; + node_fields.push_back(sizeof(int)); + + std::vector, Point<1> > > src_field_data_gpu; + std::vector, Point<1> > > dst_field_data_gpu; + std::vector, int> > piece_field_data_gpu; + piece_field_data_gpu.resize(num_pieces); + src_field_data_gpu.resize(num_pieces); + dst_field_data_gpu.resize(num_pieces); + for (int i = 0; i < num_pieces; i++) { + RegionInstance src_gpu_instance; + RegionInstance dst_gpu_instance; + RegionInstance piece_gpu_instance; + RegionInstance::create_instance(src_gpu_instance, + gpu_memory, + src_node_field_data[i].index_space, + edge_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + RegionInstance::create_instance(dst_gpu_instance, + gpu_memory, + dst_node_field_data[i].index_space, + edge_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + RegionInstance::create_instance(piece_gpu_instance, + gpu_memory, + piece_id_field_data[i].index_space, + node_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + CopySrcDstField src_gpu_field, src_cpu_field, dst_gpu_field, dst_cpu_field, piece_gpu_field, piece_cpu_field; + src_gpu_field.inst = src_gpu_instance; + src_gpu_field.size = sizeof(Point<1>); + src_gpu_field.field_id = 0; + src_cpu_field.inst = src_node_field_data[i].inst; + src_cpu_field.size = sizeof(Point<1>); + src_cpu_field.field_id = 0; + dst_gpu_field.inst = dst_gpu_instance; + dst_gpu_field.size = sizeof(Point<1>); + dst_gpu_field.field_id = sizeof(Point<1>); + dst_cpu_field.inst = dst_node_field_data[i].inst; + dst_cpu_field.size = sizeof(Point<1>); + dst_cpu_field.field_id = sizeof(Point<1>); + piece_gpu_field.inst = piece_gpu_instance; + piece_gpu_field.size = sizeof(int); + 
piece_gpu_field.field_id = 0; + piece_cpu_field.inst = piece_id_field_data[i].inst; + piece_cpu_field.size = sizeof(int); + piece_cpu_field.field_id = 0; + std::vector src_cpu_data, src_gpu_data, dst_cpu_data, dst_gpu_data, piece_cpu_data, piece_gpu_data; + src_cpu_data.push_back(src_cpu_field); + dst_cpu_data.push_back(dst_cpu_field); + src_gpu_data.push_back(src_gpu_field); + dst_gpu_data.push_back(dst_gpu_field); + piece_gpu_data.push_back(piece_gpu_field); + piece_cpu_data.push_back(piece_cpu_field); + Event copy_event = src_node_field_data[i].index_space.copy(src_cpu_data, src_gpu_data, Realm::ProfilingRequestSet()); + copy_event.wait(); + Event second_copy_event = dst_node_field_data[i].index_space.copy(dst_cpu_data, dst_gpu_data, Realm::ProfilingRequestSet()); + second_copy_event.wait(); + Event third_copy_event = piece_id_field_data[i].index_space.copy(piece_cpu_data, piece_gpu_data, Realm::ProfilingRequestSet()); + third_copy_event.wait(); + src_field_data_gpu[i].inst = src_gpu_instance; + src_field_data_gpu[i].index_space = src_node_field_data[i].index_space; + src_field_data_gpu[i].field_offset = 0; + dst_field_data_gpu[i].inst = dst_gpu_instance; + dst_field_data_gpu[i].index_space = dst_node_field_data[i].index_space; + dst_field_data_gpu[i].field_offset = 1 * sizeof(Point<1>); + piece_field_data_gpu[i].inst = piece_gpu_instance; + piece_field_data_gpu[i].index_space = piece_id_field_data[i].index_space; + piece_field_data_gpu[i].field_offset = 0; + } + wait_on_events = true; + log_app.info() << "warming up" << Clock::current_time_in_microseconds() << "\n"; + std::vector > p_garbage_nodes, p_garbage_edges, p_garbage_rd, p_garbage_preimage_edges; + Event e01 = is_nodes.create_subspaces_by_field(piece_field_data_gpu, + colors, + p_garbage_nodes, + Realm::ProfilingRequestSet()); + if (wait_on_events) e01.wait(); + Event e02 = is_edges.create_subspaces_by_preimage(dst_node_field_data, + p_garbage_nodes, + p_garbage_edges, + Realm::ProfilingRequestSet(), + 
e01); + if(wait_on_events) e02.wait(); + + // an image of p_edges through out_node gives us all the shared nodes, along + // with some private nodes + Event e03 = is_nodes.create_subspaces_by_image(src_field_data_gpu, + p_garbage_edges, + p_garbage_rd, + Realm::ProfilingRequestSet(), + e02); + if(wait_on_events) e03.wait(); + + Event e04 = is_edges.create_subspaces_by_preimage(dst_node_field_data, + p_garbage_rd, + p_garbage_preimage_edges, + Realm::ProfilingRequestSet(), + e03); + e04.wait(); + log_app.info() << "warming up complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting GPU Partitioning " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting GPU By Field " << Clock::current_time_in_microseconds() << "\n"; + Event e1 = is_nodes.create_subspaces_by_field(piece_field_data_gpu, + colors, + p_nodes, + Realm::ProfilingRequestSet()); + if(wait_on_events) e1.wait(); + log_app.info() << "GPU By Field complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting GPU Preimage " << Clock::current_time_in_microseconds() << "\n"; + // now compute p_edges based on the color of their in_node (i.e. 
a preimage) + Event e2 = is_edges.create_subspaces_by_preimage(dst_node_field_data, + p_nodes, + p_edges, + Realm::ProfilingRequestSet(), + e1); + if(wait_on_events) e2.wait(); + log_app.info() << "GPU Preimage complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting GPU Image " << Clock::current_time_in_microseconds() << "\n"; + + // an image of p_edges through out_node gives us all the shared nodes, along + // with some private nodes + Event e3 = is_nodes.create_subspaces_by_image(src_field_data_gpu, + p_edges, + p_rd, + Realm::ProfilingRequestSet(), + e2); + if(wait_on_events) e3.wait(); + log_app.info() << "GPU Image complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting second GPU preimage " << Clock::current_time_in_microseconds() << "\n"; + + Event e4 = is_edges.create_subspaces_by_preimage(dst_node_field_data, + p_rd, + p_preimage_edges, + Realm::ProfilingRequestSet(), + e3); + e4.wait(); + log_app.info() << "Second GPU preimage complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "GPU Partitioning complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting CPU Partitioning " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting CPU By Field " << Clock::current_time_in_microseconds() << "\n"; + Event e5 = is_nodes.create_subspaces_by_field(piece_id_field_data, + colors, + p_nodes_cpu, + Realm::ProfilingRequestSet()); + if(wait_on_events) e5.wait(); + log_app.info() << "CPU By Field complete " << Clock::current_time_in_microseconds() << "\n"; + // now compute p_edges based on the color of their in_node (i.e. 
a preimage) + log_app.info() << "Starting CPU Preimage " << Clock::current_time_in_microseconds() << "\n"; + Event e6 = is_edges.create_subspaces_by_preimage(dst_node_field_data, + p_nodes_cpu, + p_edges_cpu, + Realm::ProfilingRequestSet(), + e5); + if(wait_on_events) e6.wait(); + log_app.info() << "CPU Preimage complete " << Clock::current_time_in_microseconds() << "\n"; + + // an image of p_edges through out_node gives us all the shared nodes, along + // with some private nodes + log_app.info() << "Starting CPU Image " << Clock::current_time_in_microseconds() << "\n"; + Event e7 = is_nodes.create_subspaces_by_image(src_node_field_data, + p_edges_cpu, + p_rd_cpu, + Realm::ProfilingRequestSet(), + e6); + if(wait_on_events) e7.wait(); + log_app.info() << "CPU Image complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting second CPU preimage " << Clock::current_time_in_microseconds() << "\n"; + + Event e8 = is_edges.create_subspaces_by_preimage(dst_node_field_data, + p_rd_cpu, + p_preimage_edges_cpu, + Realm::ProfilingRequestSet(), + e7); + e8.wait(); + log_app.info() << "Second CPU preimage complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "CPU Partitioning complete " << Clock::current_time_in_microseconds() << "\n"; + return e8; + } + + virtual int perform_dynamic_checks(void) + { + // Nothing to do here + return 0; + } + + virtual int check_partitioning(void) + { + int errors = 0; + + if (!p_nodes.size()) { + return 0; + } + + log_app.info() << "Checking correctness of partitioning " << "\n"; + + for(int i = 0; i < num_pieces; i++) { + for(IndexSpaceIterator<1> it(p_nodes[i]); it.valid; it.step()) { + for(PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_nodes_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! 
GPU has extra byfield point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for(IndexSpaceIterator<1> it(p_nodes_cpu[i]); it.valid; it.step()) { + for(PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_nodes[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing byfield point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_edges[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_edges_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU has extra preimage edge " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_edges_cpu[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_edges[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing preimage edge " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_rd[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_rd_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU has extra image node " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_rd_cpu[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_rd[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing image node " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_preimage_edges[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_preimage_edges_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! 
GPU has extra second preimage edge " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_preimage_edges_cpu[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_preimage_edges[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing second preimage edge " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + + } + return errors; + } +}; + +class RangeTest : public TestInterface { +public: + // graph config parameters + int num_nodes = 1000; + int num_rects = 1000; + int max_rect_size = 10; + int num_pieces = 4; + std::string filename; + + RangeTest(int argc, const char *argv[]) + { + for(int i = 1; i < argc; i++) { + + if(!strcmp(argv[i], "-p")) { + num_pieces = atoi(argv[++i]); + continue; + } + + if(!strcmp(argv[i], "-n")) { + num_nodes = atoi(argv[++i]); + continue; + } + + if(!strcmp(argv[i], "-r")) { + num_rects = atoi(argv[++i]); + continue; + } + + if(!strcmp(argv[i], "-m")) { + max_rect_size = atoi(argv[++i]); + continue; + } + } + + + + if (num_nodes <= 0 || num_rects <= 0) { + log_app.error() << "Invalid graph dimensions in input file: rects=" << num_rects << " nodes=" << num_nodes; + exit(1); + } + + } + + + + struct InitDataArgs { + int index; + RegionInstance ri_nodes; + RegionInstance ri_rects; + }; + + enum PRNGStreams { + NODE_SUBGRAPH_STREAM, + }; + + void random_rect_data(int idx, int& subgraph) + { + if(random_colors) + subgraph = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_pieces); + else + subgraph = idx * num_pieces / num_rects; + } + + void random_node_data(int idx, int& subgraph) + { + if(true) + subgraph = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_pieces); + else + subgraph = idx * num_pieces / num_nodes; + } + + void initialize_rect_data(int idx, Rect<1> &rect, int max_rect_size = 10) + { + + int first = Philox_2x32<>::rand_int(random_seed, idx, 
NODE_SUBGRAPH_STREAM, num_nodes); + int amount = Philox_2x32<>::rand_int(random_seed, idx + 1, NODE_SUBGRAPH_STREAM, max_rect_size); + rect = Rect<1>(first, first + amount); + } + + + static void init_data_task_wrapper(const void *args, size_t arglen, + const void *userdata, size_t userlen, Processor p) + { + RangeTest *me = (RangeTest *)testcfg; + me->init_data_task(args, arglen, p); + } + + void init_data_task(const void *args, size_t arglen, Processor p) + { + const InitDataArgs& i_args = *(const InitDataArgs *)args; + + log_app.info() << "init task #" << i_args.index << " (ri_nodes=" << i_args.ri_nodes << ", ri_rects=" << i_args.ri_rects << ")"; + + i_args.ri_nodes.fetch_metadata(p).wait(); + i_args.ri_rects.fetch_metadata(p).wait(); + + IndexSpace<1> is_nodes = i_args.ri_nodes.get_indexspace<1>(); + IndexSpace<1> is_rects = i_args.ri_rects.get_indexspace<1>(); + + log_app.debug() << "N: " << is_nodes; + log_app.debug() << "E: " << is_rects; + + //Write out colors and rectangles + + { + AffineAccessor a_rect_id(i_args.ri_rects, 0 /* offset */); + + for(int i = is_rects.bounds.lo; i <= is_rects.bounds.hi; i++) { + int subgraph; + random_rect_data(i, subgraph); + a_rect_id.write(i, subgraph); + } + } + { + AffineAccessor a_piece_id(i_args.ri_nodes, 0 /* offset */); + + for(int i = is_nodes.bounds.lo; i <= is_nodes.bounds.hi; i++) { + int subgraph; + random_node_data(i, subgraph); + a_piece_id.write(i, subgraph); + } + } + + + { + + AffineAccessor, 1> a_rect_val(i_args.ri_rects, 1 * sizeof(int) /* offset */); + + for(int i = is_rects.bounds.lo; i <= is_rects.bounds.hi; i++) { + Rect<1> rect; + initialize_rect_data(i, rect, max_rect_size); + a_rect_val.write(i, rect); + } + } + + if(show_graph) { + AffineAccessor a_piece_id(i_args.ri_nodes, 0 /* offset */); + + for(int i = is_nodes.bounds.lo; i <= is_nodes.bounds.hi; i++) + log_app.info() << "node_id[" << i << "] = " << a_piece_id.read(i) << "\n"; + + AffineAccessor a_rect_id(i_args.ri_rects, 0 * sizeof(Point<1>) 
/* offset */); + + for(int i = is_rects.bounds.lo; i <= is_rects.bounds.hi; i++) + log_app.info() << "rect_id[" << i << "] = " << a_rect_id.read(i) << "\n"; + + AffineAccessor,1> a_rect_val(i_args.ri_rects, 1 * sizeof(int) /* offset */); + + for(int i = is_rects.bounds.lo; i <= is_rects.bounds.hi; i++) + log_app.info() << "rect_val[" << i << "] = " << a_rect_val.read(i) << "\n"; + } + } + + IndexSpace<1> is_nodes, is_rects; + std::vector ri_nodes; + std::vector, int> > node_id_field_data; + std::vector ri_rects; + std::vector, int> > rect_id_field_data; + std::vector, Rect<1> > > rect_val_field_data; + + virtual void print_info(void) + { + printf("Realm dependent partitioning test - ranges: %d nodes, %d rects, %d pieces\n", + (int)num_nodes, (int)num_rects, (int)num_pieces); + } + + virtual Event initialize_data(const std::vector& memories, + const std::vector& procs) + { + // now create index spaces for nodes and edges + is_nodes = Rect<1>(0, num_nodes - 1); + is_rects = Rect<1>(0, num_rects - 1); + + // equal partition is used to do initial population of edges and nodes + std::vector > ss_nodes_eq; + std::vector > ss_rects_eq; + + log_app.info() << "Creating equal subspaces" << "\n"; + + is_nodes.create_equal_subspaces(num_pieces, 1, ss_nodes_eq, Realm::ProfilingRequestSet()).wait(); + is_rects.create_equal_subspaces(num_pieces, 1, ss_rects_eq, Realm::ProfilingRequestSet()).wait(); + + log_app.debug() << "Initial partitions:"; + for(size_t i = 0; i < ss_nodes_eq.size(); i++) + log_app.debug() << " Nodes #" << i << ": " << ss_nodes_eq[i]; + for(size_t i = 0; i < ss_rects_eq.size(); i++) + log_app.debug() << " Rects #" << i << ": " << ss_rects_eq[i]; + + // create instances for each of these subspaces + std::vector node_fields, rect_fields; + node_fields.push_back(sizeof(int)); // piece_id + rect_fields.push_back(sizeof(int)); // src_node + rect_fields.push_back(sizeof(Rect<1>)); // dst_node + + ri_nodes.resize(num_pieces); + node_id_field_data.resize(num_pieces); 
+ + for(size_t i = 0; i < ss_nodes_eq.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, + memories[i % memories.size()], + ss_nodes_eq[i], + node_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + ri_nodes[i] = ri; + + node_id_field_data[i].index_space = ss_nodes_eq[i]; + node_id_field_data[i].inst = ri_nodes[i]; + node_id_field_data[i].field_offset = 0; + } + + ri_rects.resize(num_pieces); + rect_id_field_data.resize(num_pieces); + rect_val_field_data.resize(num_pieces); + + for(size_t i = 0; i < ss_rects_eq.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, + memories[i % memories.size()], + ss_rects_eq[i], + rect_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + ri_rects[i] = ri; + + rect_id_field_data[i].index_space = ss_rects_eq[i]; + rect_id_field_data[i].inst = ri_rects[i]; + rect_id_field_data[i].field_offset = 0; + + rect_val_field_data[i].index_space = ss_rects_eq[i]; + rect_val_field_data[i].inst = ri_rects[i]; + rect_val_field_data[i].field_offset = 1 * sizeof(int); + } + + // fire off tasks to initialize data + std::set events; + for(int i = 0; i < num_pieces; i++) { + Processor p = procs[i % procs.size()]; + InitDataArgs args; + args.index = i; + args.ri_nodes = ri_nodes[i]; + args.ri_rects = ri_rects[i]; + Event e = p.spawn(INIT_RANGE_DATA_TASK, &args, sizeof(args)); + events.insert(e); + } + + return Event::merge_events(events); + } + + // the outputs of our partitioning will be: + //p_colored_rects -> all of our rectangles marked with the color given by random_rect_data + //p_rects -> image range by p colored rects into nodes + + std::vector > p_colored_rects, p_rects; + std::vector > p_colored_rects_cpu, p_rects_cpu; + + virtual Event perform_partitioning(void) + { + + std::vector colors(num_pieces); + for(int i = 0; i < num_pieces; i++) + colors[i] = i; + + Memory gpu_memory; + bool found_gpu_memory = false; + Machine machine = Machine::get_machine(); + std::set all_memories; 
+ machine.get_all_memories(all_memories); + for(auto& memory : all_memories) { + if(memory.kind() == Memory::GPU_FB_MEM) { + gpu_memory = memory; + found_gpu_memory = true; + break; + } + } + assert(found_gpu_memory); + std::vector rect_fields; + rect_fields.push_back(sizeof(int)); + rect_fields.push_back(sizeof(Rect<1>)); + std::vector node_fields; + node_fields.push_back(sizeof(int)); + + std::vector, int > > node_id_data_gpu; + std::vector, int > > rect_id_data_gpu; + std::vector, Rect<1>>> rect_val_data_gpu; + node_id_data_gpu.resize(num_pieces); + rect_id_data_gpu.resize(num_pieces); + rect_val_data_gpu.resize(num_pieces); + for (int i = 0; i < num_pieces; i++) { + RegionInstance node_id_instance; + RegionInstance rect_id_instance; + RegionInstance rect_val_instance; + RegionInstance::create_instance(node_id_instance, + gpu_memory, + node_id_field_data[i].index_space, + node_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + RegionInstance::create_instance(rect_id_instance, + gpu_memory, + rect_id_field_data[i].index_space, + rect_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + RegionInstance::create_instance(rect_val_instance, + gpu_memory, + rect_val_field_data[i].index_space, + rect_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + CopySrcDstField node_id_gpu_field, node_id_cpu_field, rect_id_gpu_field, rect_id_cpu_field, rect_val_gpu_field, rect_val_cpu_field; + node_id_gpu_field.inst = node_id_instance; + node_id_gpu_field.size = sizeof(int); + node_id_gpu_field.field_id = 0; + node_id_cpu_field.inst = node_id_field_data[i].inst; + node_id_cpu_field.size = sizeof(int); + node_id_cpu_field.field_id = 0; + rect_id_gpu_field.inst = rect_id_instance; + rect_id_gpu_field.size = sizeof(int); + rect_id_gpu_field.field_id = 0; + rect_id_cpu_field.inst = rect_id_field_data[i].inst; + rect_id_cpu_field.size = sizeof(int); + rect_id_cpu_field.field_id = 0; + rect_val_gpu_field.inst = rect_val_instance; + 
rect_val_gpu_field.size = sizeof(Rect<1>); + rect_val_gpu_field.field_id = sizeof(int); + rect_val_cpu_field.inst = rect_val_field_data[i].inst; + rect_val_cpu_field.size = sizeof(Rect<1>); + rect_val_cpu_field.field_id = sizeof(int); + std::vector node_id_gpu_data, node_id_cpu_data, rect_id_gpu_data, rect_id_cpu_data, rect_val_gpu_data, rect_val_cpu_data; + node_id_gpu_data.push_back(node_id_gpu_field); + node_id_cpu_data.push_back(node_id_cpu_field); + rect_id_gpu_data.push_back(rect_id_gpu_field); + rect_id_cpu_data.push_back(rect_id_cpu_field); + rect_val_gpu_data.push_back(rect_val_gpu_field); + rect_val_cpu_data.push_back(rect_val_cpu_field); + Event copy_event = node_id_field_data[i].index_space.copy(node_id_cpu_data, node_id_gpu_data, Realm::ProfilingRequestSet()); + copy_event.wait(); + Event second_copy_event = rect_id_field_data[i].index_space.copy(rect_id_cpu_data, rect_id_gpu_data, Realm::ProfilingRequestSet()); + second_copy_event.wait(); + Event third_copy_event = rect_val_field_data[i].index_space.copy(rect_val_cpu_data, rect_val_gpu_data, Realm::ProfilingRequestSet()); + third_copy_event.wait(); + node_id_data_gpu[i].inst = node_id_instance; + node_id_data_gpu[i].index_space = node_id_field_data[i].index_space; + node_id_data_gpu[i].field_offset = 0; + rect_id_data_gpu[i].inst = rect_id_instance; + rect_id_data_gpu[i].index_space = rect_id_field_data[i].index_space; + rect_id_data_gpu[i].field_offset = 0; + rect_val_data_gpu[i].inst = rect_val_instance; + rect_val_data_gpu[i].index_space = rect_val_field_data[i].index_space; + rect_val_data_gpu[i].field_offset = sizeof(int); + } + wait_on_events = true; + std::vector> p_garbage_rects, p_garbage_colors; + log_app.info() << "WARMING UP " << "\n"; + + Event e001 = is_rects.create_subspaces_by_field(rect_id_data_gpu, + colors, + p_garbage_colors, + Realm::ProfilingRequestSet()); + if (wait_on_events) e001.wait(); + Event e002 = is_nodes.create_subspaces_by_image(rect_val_data_gpu, + p_garbage_colors, + 
p_garbage_rects, + Realm::ProfilingRequestSet(), + e001); + if(wait_on_events) e002.wait(); + + log_app.info() << "FINISHED WARMING UP " << "\n"; + log_app.info() << "starting GPU partitioning " << Clock::current_time_in_microseconds() << "\n"; + + log_app.info() << "STARTING GPU BY FIELD " << Clock::current_time_in_microseconds() << "\n"; + + Event e01 = is_rects.create_subspaces_by_field(rect_id_data_gpu, + colors, + p_colored_rects, + Realm::ProfilingRequestSet()); + if (wait_on_events) e01.wait(); + + log_app.info() << "FINISHED GPU BY FIELD " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "STARTING GPU BY IMAGE " << Clock::current_time_in_microseconds() << "\n"; + Event e02 = is_nodes.create_subspaces_by_image(rect_val_data_gpu, + p_colored_rects, + p_rects, + Realm::ProfilingRequestSet(), + e01); + if(wait_on_events) e02.wait(); + + log_app.info() << "FINISHED GPU BY IMAGE " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "STARTING CPU partitioning " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "STARTING CPU BY FIELD " << Clock::current_time_in_microseconds() << "\n"; + Event e1 = is_rects.create_subspaces_by_field(rect_id_field_data, + colors, + p_colored_rects_cpu, + Realm::ProfilingRequestSet()); + if (wait_on_events) e1.wait(); + log_app.info() << "FINISHED CPU BY FIELD " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "STARTING CPU BY IMAGE " << Clock::current_time_in_microseconds() << "\n"; + Event e2 = is_nodes.create_subspaces_by_image(rect_val_field_data, + p_colored_rects_cpu, + p_rects_cpu, + Realm::ProfilingRequestSet(), + e1); + if(wait_on_events) e2.wait(); + log_app.info() << "FINISHED CPU BY IMAGE " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "CPU Partitioning complete " << Clock::current_time_in_microseconds() << "\n"; + return e2; + } + + + + virtual int perform_dynamic_checks(void) + { + return 0; + } + + virtual int 
check_partitioning(void) + { + log_app.info() << "Checking correctness of partitioning " << "\n"; + int errors = 0; + + for (int i = 0; i < num_pieces; i++) { + for (IndexSpaceIterator<1> it(p_colored_rects[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_colored_rects_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU has extra colored rect point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_colored_rects_cpu[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if(!p_colored_rects[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing colored rect point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_rects[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_rects_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU has extra rect point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_rects_cpu[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if(!p_rects[i].contains(point.p)) { + log_app.error() << "Mismatch! 
GPU is missing rect point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + } + return errors; + } +}; + class MiniAeroTest : public TestInterface { public: enum ProblemType @@ -625,7 +2194,7 @@ class MiniAeroTest : public TestInterface { AffineAccessor a_cell_blockid(i_args.ri_cells, 0 /* offset */); for(int i = is_cells.bounds.lo[0]; i <= is_cells.bounds.hi[0]; i++) - std::cout << "Z[" << i << "]: blockid=" << a_cell_blockid.read(i) << std::endl; + std::cout << "Z[" << i << "]: blockid=" << a_cell_blockid.read(i) << "\n"; AffineAccessor, 1> a_face_left(i_args.ri_faces, 0 * sizeof(Point<1>) /* offset */); @@ -637,7 +2206,7 @@ class MiniAeroTest : public TestInterface { for(int i = is_faces.bounds.lo[0]; i <= is_faces.bounds.hi[0]; i++) std::cout << "S[" << i << "]:" << " left=" << a_face_left.read(i) << " right=" << a_face_right.read(i) - << " type=" << a_face_type.read(i) << std::endl; + << " type=" << a_face_type.read(i) << "\n"; } } @@ -1006,7 +2575,6 @@ class CircuitTest : public TestInterface { { AffineAccessor a_subckt_id(i_args.ri_nodes, 0 /* offset */); - // std::cout << "a_subckt_id = " << a_subckt_id << "\n"; for(int i = is_nodes.bounds.lo; i <= is_nodes.bounds.hi; i++) { int subckt; @@ -1021,9 +2589,6 @@ class CircuitTest : public TestInterface { AffineAccessor, 1> a_out_node(i_args.ri_edges, 1 * sizeof(Point<1>) /* offset */); - // std::cout << "a_in_node = " << a_in_node << "\n"; - // std::cout << "a_out_node = " << a_out_node << "\n"; - for(int i = is_edges.bounds.lo; i <= is_edges.bounds.hi; i++) { Point<1> in_node, out_node; random_edge_data(i, in_node, out_node); @@ -1036,19 +2601,19 @@ class CircuitTest : public TestInterface { AffineAccessor a_subckt_id(i_args.ri_nodes, 0 /* offset */); for(int i = is_nodes.bounds.lo; i <= is_nodes.bounds.hi; i++) - std::cout << "subckt_id[" << i << "] = " << a_subckt_id.read(i) << std::endl; + std::cout << "subckt_id[" << i << "] = " << a_subckt_id.read(i) << "\n"; AffineAccessor, 1> 
a_in_node(i_args.ri_edges, 0 * sizeof(Point<1>) /* offset */); for(int i = is_edges.bounds.lo; i <= is_edges.bounds.hi; i++) - std::cout << "in_node[" << i << "] = " << a_in_node.read(i) << std::endl; + std::cout << "in_node[" << i << "] = " << a_in_node.read(i) << "\n"; AffineAccessor, 1> a_out_node(i_args.ri_edges, 1 * sizeof(Point<1>) /* offset */); for(int i = is_edges.bounds.lo; i <= is_edges.bounds.hi; i++) - std::cout << "out_node[" << i << "] = " << a_out_node.read(i) << std::endl; + std::cout << "out_node[" << i << "] = " << a_out_node.read(i) << "\n"; } } @@ -1761,7 +3326,7 @@ class PennantTest : public TestInterface { AffineAccessor a_zone_color(i_args.ri_zones, 0 /* offset */); for(int i = is_zones.bounds.lo; i <= is_zones.bounds.hi; i++) - std::cout << "Z[" << i << "]: color=" << a_zone_color.read(i) << std::endl; + std::cout << "Z[" << i << "]: color=" << a_zone_color.read(i) << "\n"; AffineAccessor, 1> a_side_mapsz(i_args.ri_sides, 0 * sizeof(Point<1>) /* offset */); @@ -1777,7 +3342,7 @@ class PennantTest : public TestInterface { << " mapsz=" << a_side_mapsz.read(i) << " mapss3=" << a_side_mapss3.read(i) << " mapsp1=" << a_side_mapsp1.read(i) << " ok=" << a_side_ok.read(i) - << std::endl; + << "\n"; } } @@ -2831,6 +4396,21 @@ int main(int argc, char **argv) break; } + if(!strcmp(argv[i], "basic")) { + testcfg = new BasicTest(argc - i, const_cast(argv + i)); + break; + } + + if(!strcmp(argv[i], "tile")) { + testcfg = new TileTest(argc - i, const_cast(argv + i)); + break; + } + + if (!strcmp(argv[i], "range")) { + testcfg = new RangeTest(argc - i, const_cast(argv + i)); + break; + } + if(!strcmp(argv[i], "pennant")) { testcfg = new PennantTest(argc - i, const_cast(argv + i)); break; @@ -2867,6 +4447,9 @@ int main(int argc, char **argv) rt.register_task(TOP_LEVEL_TASK, top_level_task); rt.register_task(INIT_CIRCUIT_DATA_TASK, CircuitTest::init_data_task_wrapper); rt.register_task(INIT_PENNANT_DATA_TASK, PennantTest::init_data_task_wrapper); + 
rt.register_task(INIT_BASIC_DATA_TASK, BasicTest::init_data_task_wrapper); + rt.register_task(INIT_TILE_DATA_TASK, TileTest::init_data_task_wrapper); + rt.register_task(INIT_RANGE_DATA_TASK, RangeTest::init_data_task_wrapper); rt.register_task(INIT_MINIAERO_DATA_TASK, MiniAeroTest::init_data_task_wrapper); signal(SIGALRM, sigalrm_handler); diff --git a/tests/gpu_deppart_1d.cc b/tests/gpu_deppart_1d.cc new file mode 100644 index 0000000000..250a63f2df --- /dev/null +++ b/tests/gpu_deppart_1d.cc @@ -0,0 +1,327 @@ +/* + * Copyright 2025 Stanford University, NVIDIA + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include +#include +#include +#include +#include "realm.h" +#include "realm/id.h" +#include "realm/machine.h" +#include "realm/cmdline.h" +#include "philox.h" + +using namespace Realm; + +#ifdef REALM_USE_CUDA +#include "realm/cuda/cuda_memcpy.h" +#include "realm/cuda/cuda_module.h" +#endif +#ifdef REALM_USE_HIP +#include "hip_cuda_compat/hip_cuda.h" +#include "realm/hip/hip_module.h" +#endif + +#ifdef REALM_USE_CUDA +using namespace Realm::Cuda; +#endif +#ifdef REALM_USE_HIP +using namespace Realm::Hip; +#endif + +Logger log_app("app"); + +// ---------------- Config (matches transpose_test style) ---------------- +namespace TestConfig { + int num_nodes = 1000; + int num_edges = 5000; + int num_pieces = 4; + int random = 0; // 0 deterministic, 1 random + unsigned long long seed = 123456789ULL; + int show = 0; // print assigned ids + int verify = 1; // do correctness check +}; +static const FieldID FID_SUBGRAPH = 0; +static const FieldID FID_SRC = 0; +static const FieldID FID_DST = sizeof(Point<1, int>); + +// ---------------- Small helpers (same idioms as transpose_test) -------- +template +static void fill_index_space(RegionInstance inst, + FieldID fid, + const IndexSpace& is, + Fn gen) +{ + AffineAccessor acc(inst, fid); + for (IndexSpaceIterator it(is); it.valid; it.step()) { + for (PointInRectIterator p(it.rect); p.valid; p.step()) + acc[p.p] = 
gen(p.p); + } +} + +template +static void copy_field(const IndexSpace& is, + RegionInstance src, RegionInstance dst, FieldID fid) +{ + std::vector srcs(1), dsts(1); + srcs[0].set_field(src, fid, sizeof(DT)); + dsts[0].set_field(dst, fid, sizeof(DT)); + is.copy(srcs, dsts, ProfilingRequestSet()).wait(); +} + +static void choose_cpu_and_gpu_mems(Memory& cpu_mem, Memory& gpu_mem, bool& have_gpu) +{ + have_gpu = false; + for (auto mem : Machine::MemoryQuery(Machine::get_machine())) { + if (!cpu_mem.exists() && (mem.kind() == Memory::SYSTEM_MEM)) + cpu_mem = mem; + if (!gpu_mem.exists() && (mem.kind() == Memory::GPU_FB_MEM)) { + gpu_mem = mem; + have_gpu = true; + } + } +} + +// For brevity, we use the simple vector layout helper (as in many Realm tests) +static Event make_instance(RegionInstance& ri, + Memory mem, + const IndexSpace<1,int>& is, + std::vector fields) +{ + return RegionInstance::create_instance(ri, mem, is, fields, + /*soa=*/0, ProfilingRequestSet()); +} + +// Compare two partitions index-space-by-index-space +static int compare_partitions(const std::vector>& A, + const std::vector>& B) +{ + int errors = 0; + if (A.size() != B.size()) return 1; + for (size_t i = 0; i < A.size(); i++) { + // Check A minus B + for (IndexSpaceIterator<1,int> it(A[i]); it.valid; it.step()) + for (PointInRectIterator<1,int> p(it.rect); p.valid; p.step()) + if (!B[i].contains(p.p)) { errors++; } + // Check B minus A + for (IndexSpaceIterator<1,int> it(B[i]); it.valid; it.step()) + for (PointInRectIterator<1,int> p(it.rect); p.valid; p.step()) + if (!A[i].contains(p.p)) { errors++; } + } + return errors; +} + +// ---------------- Top-level task (like transpose_test_gpu) -------------- +enum { + TOP_LEVEL_TASK = Processor::TASK_ID_FIRST_AVAILABLE + 300, +}; + +static void top_level_task(const void*, size_t, const void*, size_t, Processor) +{ + log_app.print() << "deppart_byfield_itest starting"; + + // Build the 1D node space [0 .. 
N-1] + IndexSpace<1,int> is_nodes(Rect<1,int>(0, TestConfig::num_nodes - 1)); + IndexSpace<1,int> is_edges(Rect<1, int>(0, TestConfig::num_edges - 1)); + + // Choose memories + Memory cpu_mem, gpu_mem; + bool have_gpu = false; + choose_cpu_and_gpu_mems(cpu_mem, gpu_mem, have_gpu); + if (!cpu_mem.exists()) { + log_app.fatal() << "No SYSTEM_MEM found"; + assert(0); + return; + } + if (!have_gpu) { + log_app.warning() << "No GPU_FB_MEM found; running CPU-only check."; + } + + // Create CPU instance holding subgraph ids + RegionInstance cpu_inst_nodes; + make_instance(cpu_inst_nodes, cpu_mem, is_nodes, {sizeof(int)}).wait(); + + RegionInstance cpu_inst_edges; + make_instance(cpu_inst_edges, cpu_mem, is_edges, {sizeof(Point<1, int>), sizeof(Point<1, int>)}).wait(); + + // Fill ids (deterministic or random) + auto gen_id = [&](Point<1,int> p)->int { + if (TestConfig::random) { + return Philox_2x32<>::rand_int(TestConfig::seed, + /*counter=*/p[0], + /*stream=*/0, + /*bound=*/TestConfig::num_pieces); + } else { + // even split + return int((long long)p[0] * TestConfig::num_pieces / TestConfig::num_nodes); + } + }; + fill_index_space<1,int,int>(cpu_inst_nodes, FID_SUBGRAPH, is_nodes, gen_id); + + auto gen_src = [&](Point<1,int> p)->Point<1, int> { + if (TestConfig::random) { + return Point<1, int>(Philox_2x32<>::rand_int(TestConfig::seed, + /*counter=*/p[0], + /*stream=*/0, + /*bound=*/TestConfig::num_nodes)); + } else { + return Point<1, int>(p[0] % TestConfig::num_nodes); + } + }; + + fill_index_space<1,int,Point<1,int>>(cpu_inst_edges, FID_SRC, is_edges, gen_src); + + auto gen_dst = [&](Point<1,int> p)->Point<1, int> { + if (TestConfig::random) { + return Point<1, int>(Philox_2x32<>::rand_int(TestConfig::seed, + /*counter=*/p[0]+TestConfig::num_edges, + /*stream=*/0, + /*bound=*/TestConfig::num_nodes)); + } else { + return Point<1, int>((p[0]+1) % TestConfig::num_nodes); + } + }; + + fill_index_space<1,int,Point<1,int>>(cpu_inst_edges, FID_DST, is_edges, gen_dst); + + if 
(TestConfig::show) { + AffineAccessor acc(cpu_inst_nodes, FID_SUBGRAPH); + for (IndexSpaceIterator<1,int> it(is_nodes); it.valid; it.step()) + for (PointInRectIterator<1,int> p(it.rect); p.valid; p.step()) + log_app.print() << "id[" << p.p << "]=" << acc[p.p]; + + AffineAccessor,1,int> acc_src(cpu_inst_edges, FID_SRC); + AffineAccessor,1,int> acc_dst(cpu_inst_edges, FID_DST); + for (IndexSpaceIterator<1,int> it(is_edges); it.valid; it.step()) + for (PointInRectIterator<1,int> p(it.rect); p.valid; p.step()) + log_app.print() << "edge[" << p.p << "]=" << acc_src[p.p] << "->" << acc_dst[p.p]; + } + + // Describe the field data (CPU) + FieldDataDescriptor, int> cpu_field_nodes; + cpu_field_nodes.index_space = is_nodes; + cpu_field_nodes.inst = cpu_inst_nodes; + cpu_field_nodes.field_offset = 0; + + FieldDataDescriptor, Point<1, int>> cpu_field_src; + cpu_field_src.index_space = is_edges; + cpu_field_src.inst = cpu_inst_edges; + cpu_field_src.field_offset = 0; + + FieldDataDescriptor, Point<1, int>> cpu_field_dst; + cpu_field_dst.index_space = is_edges; + cpu_field_dst.inst = cpu_inst_edges; + cpu_field_dst.field_offset = sizeof(Point<1,int>); + + std::vector, int>> cpu_nodes(1, cpu_field_nodes); + std::vector, Point<1, int>>> cpu_src(1, cpu_field_src); + std::vector, Point<1, int>>> cpu_dst(1, cpu_field_dst); + + + // Colors 0..num_pieces-1 + std::vector colors(TestConfig::num_pieces); + for (int i = 0; i < TestConfig::num_pieces; i++) colors[i] = i; + + // CPU partitioning + std::vector> p_cpu_nodes, p_cpu_edges, p_cpu_rd; + Event e_cpu_byfield = is_nodes.create_subspaces_by_field(cpu_nodes, colors, p_cpu_nodes, ProfilingRequestSet()); + Event e_cpu_bypreimage = is_edges.create_subspaces_by_preimage(cpu_dst, p_cpu_nodes, p_cpu_edges, ProfilingRequestSet(), e_cpu_byfield); + Event e_cpu_image = is_nodes.create_subspaces_by_image(cpu_src, p_cpu_edges, p_cpu_rd, ProfilingRequestSet(), e_cpu_bypreimage); + + // GPU path (optional if GPU exists) + std::vector> p_gpu_nodes, 
p_gpu_edges, p_gpu_rd; + if (have_gpu) { + RegionInstance gpu_inst_nodes, gpu_inst_edges; + make_instance(gpu_inst_nodes, gpu_mem, is_nodes, {sizeof(int)}).wait(); + make_instance(gpu_inst_edges, gpu_mem, is_edges, {sizeof(Point<1, int>), sizeof(Point<1, int>)}).wait(); + + // Copy field data CPU -> GPU + copy_field<1,int,int>(is_nodes, cpu_inst_nodes, gpu_inst_nodes, FID_SUBGRAPH); + copy_field<1,int,Point<1,int>>(is_edges, cpu_inst_edges, gpu_inst_edges, FID_SRC); + copy_field<1,int,Point<1,int>>(is_edges, cpu_inst_edges, gpu_inst_edges, FID_DST); + + // Describe the field data (GPU) + FieldDataDescriptor<IndexSpace<1, int>, int> gpu_field_nodes; + gpu_field_nodes.index_space = is_nodes; + gpu_field_nodes.inst = gpu_inst_nodes; + gpu_field_nodes.field_offset = 0; + + FieldDataDescriptor<IndexSpace<1, int>, Point<1, int>> gpu_field_src; + gpu_field_src.index_space = is_edges; + gpu_field_src.inst = gpu_inst_edges; + gpu_field_src.field_offset = 0; + + FieldDataDescriptor<IndexSpace<1, int>, Point<1, int>> gpu_field_dst; + gpu_field_dst.index_space = is_edges; + gpu_field_dst.inst = gpu_inst_edges; + gpu_field_dst.field_offset = sizeof(Point<1,int>); + + std::vector<FieldDataDescriptor<IndexSpace<1, int>, int>> gpu_nodes(1, gpu_field_nodes); + std::vector<FieldDataDescriptor<IndexSpace<1, int>, Point<1, int>>> gpu_src(1, gpu_field_src); + std::vector<FieldDataDescriptor<IndexSpace<1, int>, Point<1, int>>> gpu_dst(1, gpu_field_dst); + + std::vector<IndexSpace<1, int>> p_gpu_nodes, p_gpu_edges, p_gpu_rd; + Event e_gpu_byfield = is_nodes.create_subspaces_by_field(gpu_nodes, colors, p_gpu_nodes, + ProfilingRequestSet()); + Event e_gpu_bypreimage = is_edges.create_subspaces_by_preimage(gpu_dst, p_gpu_nodes, p_gpu_edges, ProfilingRequestSet(), e_gpu_byfield); + Event e_gpu_image = is_nodes.create_subspaces_by_image(gpu_src, p_gpu_edges, p_gpu_rd, ProfilingRequestSet(), e_gpu_bypreimage); + + e_cpu_image.wait(); + e_gpu_image.wait(); + // Compare CPU vs GPU partitions + if (TestConfig::verify) { + int errs = compare_partitions(p_cpu_nodes, p_gpu_nodes) + + compare_partitions(p_cpu_edges, p_gpu_edges) + + compare_partitions(p_cpu_rd, p_gpu_rd); + if (errs) { + 
log_app.fatal() << "Mismatch between CPU and GPU partitions, errors=" << errs; + assert(0); + } + } + gpu_inst_nodes.destroy(); + gpu_inst_edges.destroy(); + } else { + e_cpu_image.wait(); + } + + // Cleanup + cpu_inst_nodes.destroy(); + cpu_inst_edges.destroy(); + is_nodes.destroy(); + is_edges.destroy(); + + log_app.print() << "deppart_1d_itest: PASS"; +} + +// ---------------- Main (same as transpose_test pattern) ----------------- +int main(int argc, char** argv) +{ + Runtime rt; + rt.init(&argc, &argv); + + // Parse simple flags similar to the example + CommandLineParser cp; + cp.add_option_int("-n", TestConfig::num_nodes) + .add_option_int("-e", TestConfig::num_edges) + .add_option_int("-p", TestConfig::num_pieces) + .add_option_int("-random", TestConfig::random) + .add_option_int("-show", TestConfig::show) + .add_option_int("-verify", TestConfig::verify); + bool ok = cp.parse_command_line(argc, const_cast(argv)); + assert(ok); + + rt.register_task(TOP_LEVEL_TASK, top_level_task); + + Processor p = Machine::ProcessorQuery(Machine::get_machine()) + .only_kind(Processor::LOC_PROC) + .first(); + assert(p.exists()); + + Event e = rt.collective_spawn(p, TOP_LEVEL_TASK, nullptr, 0); + rt.shutdown(e); + rt.wait_for_shutdown(); + return 0; +} \ No newline at end of file From 45d9973b7dbc972f377129efcb8c8ba9975c0147 Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Tue, 20 Jan 2026 17:03:41 -0800 Subject: [PATCH 02/32] Updated api --- src/realm/deppart/image.cc | 147 ++++----------------------- src/realm/deppart/image.h | 34 +------ src/realm/deppart/image_gpu_impl.hpp | 3 +- src/realm/deppart/image_tmpl.cc | 8 +- src/realm/indexspace.h | 27 ++--- src/realm/indexspace.inl | 20 +--- tests/deppart.cc | 19 ++-- 7 files changed, 43 insertions(+), 215 deletions(-) diff --git a/src/realm/deppart/image.cc b/src/realm/deppart/image.cc index 660d0f77ad..c57b86b426 100644 --- a/src/realm/deppart/image.cc +++ b/src/realm/deppart/image.cc @@ -32,15 +32,16 @@ namespace Realm { 
template template - Event IndexSpace::gpu_subspaces_by_image( + Event IndexSpace::create_subspaces_by_image( const DomainTransform &domain_transform, const std::vector> &sources, std::vector> &images, const ProfilingRequestSet &reqs, - std::pair &sizes, RegionInstance buffer, Event wait_on) const { - // output vector should start out empty - assert(images.empty()); + Event wait_on, + RegionInstance buffer, std::pair* buffer_bounds) const { + // output vector should start out empty + assert(images.empty()); - if (buffer==RegionInstance::NO_INST) { + if (buffer_bounds != nullptr || buffer != RegionInstance::NO_INST) { size_t optimal_size = 0; for (size_t i = 0; i < sources.size(); i++) { optimal_size += 5 * sources[i].volume() * sizeof(RectDesc); @@ -73,49 +74,21 @@ namespace Realm { (2 * source_entries * sizeof(uint64_t)) + (source_entries * sizeof(uint64_t)); } - sizes = std::make_pair(minimal_size, minimal_size + optimal_size); - return Event::NO_EVENT; - } - - GenEventImpl *finish_event = GenEventImpl::create_genevent(); - Event e = finish_event->current_event(); - - GPUImageOperation *op = new GPUImageOperation( - *this, domain_transform, reqs, sizes.first, buffer, finish_event, ID(e).event_generation()); - - size_t n = sources.size(); - images.resize(n); - for (size_t i = 0; i < n; i++) { - images[i] = op->add_source(sources[i]); - - if(!images[i].dense()) { - e = Event::merge_events( - {e, SparsityMapRefCounter(images[i].sparsity.id).add_references(1)}); + if (buffer_bounds != nullptr && buffer == RegionInstance::NO_INST) { + *buffer_bounds = std::make_pair(minimal_size, minimal_size + optimal_size); + return Event::NO_EVENT; } - - log_dpops.info() << "image: " << *this << " src=" << sources[i] << " -> " - << images[i] << " (" << e << ")"; + assert(buffer != RegionInstance::NO_INST); + size_t buffer_size = buffer.get_layout()->bytes_used; + assert(buffer_size >= minimal_size); } - op->launch(wait_on); - return e; - } - - template - template - Event 
IndexSpace::create_subspaces_by_image( - const DomainTransform &domain_transform, - const std::vector> &sources, - std::vector> &images, const ProfilingRequestSet &reqs, - Event wait_on) const { - // output vector should start out empty - assert(images.empty()); GenEventImpl *finish_event = GenEventImpl::create_genevent(); Event e = finish_event->current_event(); ImageOperation *op = new ImageOperation( - *this, domain_transform, reqs, finish_event, ID(e).event_generation()); + *this, domain_transform, reqs, finish_event, ID(e).event_generation(), buffer); size_t n = sources.size(); images.resize(n); @@ -507,10 +480,11 @@ namespace Realm { const IndexSpace &_parent, const DomainTransform &_domain_transform, const ProfilingRequestSet &reqs, GenEventImpl *_finish_event, - EventImpl::gen_t _finish_gen) + EventImpl::gen_t _finish_gen, RegionInstance _buffer) : PartitioningOperation(reqs, _finish_event, _finish_gen), parent(_parent), - domain_transform(_domain_transform) {} + domain_transform(_domain_transform), + buffer(_buffer) {} template ImageOperation::~ImageOperation(void) @@ -715,24 +689,9 @@ namespace Realm { if (gpu_data) { std::swap(domain_transform.ptr_data, gpu_ptr_data); std::swap(domain_transform.range_data, gpu_rect_data); - const char* val = std::getenv("TILE_SIZE"); // or any env var - size_t tile_size = 100000000; //default - if (val) { - tile_size = atoi(val); - } - std::vector byte_fields = {sizeof(char)}; - IndexSpace<1> instance_index_space(Rect<1>(0, tile_size-1)); - RegionInstance buffer; - Memory my_mem; - if (domain_transform.ptr_data.size() > 0) { - my_mem = domain_transform.ptr_data[0].inst.get_location(); - } else { - my_mem = domain_transform.range_data[0].inst.get_location(); - } - RegionInstance::create_instance(buffer, my_mem, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); GPUImageMicroOp *micro_op = new GPUImageMicroOp( - parent, domain_transform, !cpu_data, tile_size, buffer); + parent, domain_transform, 
!cpu_data, buffer); for (size_t j = 0; j < sources.size(); j++) { micro_op->add_sparsity_output(sources[j], images[j]); } @@ -818,74 +777,6 @@ namespace Realm { os << "ImageOperation(" << parent << ")"; } - //////////////////////////////////////////////////////////////////////// - // - // class GPUImageOperation - - template - GPUImageOperation::GPUImageOperation( - const IndexSpace &_parent, - const DomainTransform &_domain_transform, - const ProfilingRequestSet &reqs, size_t _buffer_size, RegionInstance _buffer, - GenEventImpl *_finish_event, EventImpl::gen_t _finish_gen) - : PartitioningOperation(reqs, _finish_event, _finish_gen), - parent(_parent), - domain_transform(_domain_transform), - buffer_size(_buffer_size), - buffer(_buffer) {} - - template - GPUImageOperation::~GPUImageOperation(void) - {} - - template - IndexSpace GPUImageOperation::add_source(const IndexSpace& source) - { - // try to filter out obviously empty sources - if(parent.empty() || source.empty()) - return IndexSpace::make_empty(); - - // otherwise it'll be something smaller than the current parent - IndexSpace image; - image.bounds = parent.bounds; - - // if the source has a sparsity map, use the same node - otherwise - // get a sparsity ID by round-robin'ing across the nodes that have field data - int target_node = 0; - if(!source.dense()) - target_node = ID(source.sparsity).sparsity_creator_node(); - else - if(!domain_transform.ptr_data.empty()) - target_node = ID(domain_transform.ptr_data[sources.size() % domain_transform.ptr_data.size()].inst).instance_owner_node(); - else - target_node = ID(domain_transform.range_data[sources.size() % domain_transform.range_data.size()].inst).instance_owner_node(); - - SparsityMap sparsity = get_runtime()->get_available_sparsity_impl(target_node)->me.convert >(); - image.sparsity = sparsity; - - sources.push_back(source); - images.push_back(sparsity); - - return image; - } - - template - void GPUImageOperation::execute(void) { - GPUImageMicroOp 
*micro_op = - new GPUImageMicroOp( - parent, domain_transform, true, buffer_size, buffer); - for (size_t j = 0; j < sources.size(); j++) { - micro_op->add_sparsity_output(sources[j], images[j]); - } - micro_op->dispatch(this, true); - } - - template - void GPUImageOperation::print(std::ostream& os) const - { - os << "ImageOperation(" << parent << ")"; - } - //////////////////////////////////////////////////////////////////////// // // class StructuredImageMicroOp @@ -1015,8 +906,8 @@ namespace Realm { GPUImageMicroOp::GPUImageMicroOp( const IndexSpace &_parent, const DomainTransform &_domain_transform, - bool _exclusive, size_t _fixed_buffer_size, RegionInstance _buffer) - : parent_space(_parent), domain_transform(_domain_transform), fixed_buffer_size(_fixed_buffer_size), buffer(_buffer) + bool _exclusive, RegionInstance _buffer) + : parent_space(_parent), domain_transform(_domain_transform), buffer(_buffer) { this->exclusive = _exclusive; } diff --git a/src/realm/deppart/image.h b/src/realm/deppart/image.h index 82b6393eb7..2f0347c5ff 100644 --- a/src/realm/deppart/image.h +++ b/src/realm/deppart/image.h @@ -97,7 +97,7 @@ namespace Realm { ImageOperation(const IndexSpace &_parent, const DomainTransform &_domain_transform, const ProfilingRequestSet &reqs, GenEventImpl *_finish_event, - EventImpl::gen_t _finish_gen); + EventImpl::gen_t _finish_gen, RegionInstance buffer = RegionInstance::NO_INST); virtual ~ImageOperation(void); @@ -122,6 +122,7 @@ namespace Realm { std::vector > diff_rhss; std::vector > images; bool is_intersection; + RegionInstance buffer; }; template @@ -149,41 +150,13 @@ namespace Realm { std::vector > sparsity_outputs; }; - template - class GPUImageOperation : public PartitioningOperation { - public: - GPUImageOperation(const IndexSpace &_parent, - const DomainTransform &_domain_transform, - const ProfilingRequestSet &reqs, - size_t _buffer_size, - RegionInstance _buffer, - GenEventImpl *_finish_event, - EventImpl::gen_t _finish_gen); - - 
virtual ~GPUImageOperation(void); - - IndexSpace add_source(const IndexSpace &source); - - virtual void execute(void); - - virtual void print(std::ostream &os) const; - - protected: - IndexSpace parent; - DomainTransform domain_transform; - std::vector > sources; - std::vector > images; - size_t buffer_size; - RegionInstance buffer; - }; - template class GPUImageMicroOp : public GPUMicroOp { public: GPUImageMicroOp( const IndexSpace &_parent, const DomainTransform &_domain_transform, - bool _exclusive, size_t fixed_buffer_size = 0, RegionInstance buffer = RegionInstance::NO_INST); + bool _exclusive, RegionInstance buffer = RegionInstance::NO_INST); virtual ~GPUImageMicroOp(void); @@ -203,7 +176,6 @@ namespace Realm { DomainTransform domain_transform; std::vector > sources; std::vector > sparsity_outputs; - size_t fixed_buffer_size; RegionInstance buffer; }; }; // namespace Realm diff --git a/src/realm/deppart/image_gpu_impl.hpp b/src/realm/deppart/image_gpu_impl.hpp index 6abb27c043..ce357436b7 100644 --- a/src/realm/deppart/image_gpu_impl.hpp +++ b/src/realm/deppart/image_gpu_impl.hpp @@ -231,7 +231,8 @@ void GPUImageMicroOp::gpu_populate_ptrs() cudaStream_t stream = Cuda::get_task_cuda_stream(); - size_t tile_size = fixed_buffer_size; + size_t tile_size = buffer.get_layout()->bytes_used; + std::cout << "Using tile size of " << tile_size << " bytes." 
<< std::endl; RegionInstance fixed_buffer = buffer; Arena buffer_arena(reinterpret_cast(AffineAccessor(fixed_buffer, 0).base), tile_size); diff --git a/src/realm/deppart/image_tmpl.cc b/src/realm/deppart/image_tmpl.cc index c12dfdb138..6f4371bae2 100644 --- a/src/realm/deppart/image_tmpl.cc +++ b/src/realm/deppart/image_tmpl.cc @@ -52,12 +52,8 @@ namespace Realm { template ImageMicroOp::ImageMicroOp(NodeID, AsyncMicroOp *, Serialization::FixedBufferDeserializer&); \ template Event IndexSpace::create_subspaces_by_image( \ const DomainTransform &, const std::vector > &, \ - std::vector > &, const ProfilingRequestSet &, Event) \ - const; \ - template Event IndexSpace::gpu_subspaces_by_image( \ - const DomainTransform &, const std::vector > &, \ - std::vector > &, const ProfilingRequestSet &, std::pair &, \ - RegionInstance, Event) const; \ + std::vector > &, const ProfilingRequestSet &, Event, \ + RegionInstance, std::pair*) const; \ template Event IndexSpace::create_subspaces_by_image_with_difference( \ const DomainTransform &, \ const std::vector >&, \ diff --git a/src/realm/indexspace.h b/src/realm/indexspace.h index b61a77d689..448b2815fb 100644 --- a/src/realm/indexspace.h +++ b/src/realm/indexspace.h @@ -780,19 +780,10 @@ namespace Realm { const DomainTransform &domain_transform, const std::vector> &sources, std::vector> &images, const ProfilingRequestSet &reqs, - Event wait_on = Event::NO_EVENT) const; + Event wait_on = Event::NO_EVENT, RegionInstance buffer = RegionInstance::NO_INST, + std::pair* buffer_bounds = nullptr) const; ///@} - ///@{ - /// - - template - REALM_PUBLIC_API Event gpu_subspaces_by_image( - const DomainTransform &domain_transform, - const std::vector> &sources, - std::vector> &images, const ProfilingRequestSet &reqs, - std::pair &sizes, RegionInstance buffer = RegionInstance::NO_INST, Event wait_on = Event::NO_EVENT) const; - ///@} ///@{ /** @@ -823,15 +814,8 @@ namespace Realm { &field_data, const std::vector> &sources, std::vector> 
&images, const ProfilingRequestSet &reqs, - Event wait_on = Event::NO_EVENT) const; - - template - REALM_PUBLIC_API Event gpu_subspaces_by_image( - const std::vector, Point>> - &field_data, - const std::vector> &sources, - std::vector> &images, const ProfilingRequestSet &reqs, - std::pair &sizes, RegionInstance buffer = RegionInstance::NO_INST, Event wait_on = Event::NO_EVENT) const; + Event wait_on = Event::NO_EVENT, RegionInstance buffer = RegionInstance::NO_INST, + std::pair* buffer_bounds = nullptr) const; // range versions template @@ -847,7 +831,8 @@ namespace Realm { &field_data, const std::vector> &sources, std::vector> &images, const ProfilingRequestSet &reqs, - Event wait_on = Event::NO_EVENT) const; + Event wait_on = Event::NO_EVENT, RegionInstance buffer = RegionInstance::NO_INST, + std::pair* buffer_bounds = nullptr) const; ///@} ///@{ diff --git a/src/realm/indexspace.inl b/src/realm/indexspace.inl index c633aa5e46..87cab4ce47 100644 --- a/src/realm/indexspace.inl +++ b/src/realm/indexspace.inl @@ -968,22 +968,10 @@ namespace Realm { const std::vector, Point>> &field_data, const std::vector> &sources, std::vector> &images, const ProfilingRequestSet &reqs, - Event wait_on) const + Event wait_on, RegionInstance buffer, std::pair* buffer_bounds) const { return create_subspaces_by_image(DomainTransform(field_data), sources, - images, reqs, wait_on); - } - - template - template - inline Event IndexSpace::gpu_subspaces_by_image( - const std::vector, Point>> &field_data, - const std::vector> &sources, - std::vector> &images, const ProfilingRequestSet &reqs, - std::pair &sizes, RegionInstance buffer, Event wait_on) const - { - return gpu_subspaces_by_image(DomainTransform(field_data), sources, - images, reqs, sizes, buffer, wait_on); + images, reqs, wait_on, buffer, buffer_bounds); } template @@ -992,10 +980,10 @@ namespace Realm { const std::vector, Rect>> &field_data, const std::vector> &sources, std::vector> &images, const ProfilingRequestSet &reqs, - 
Event wait_on) const + Event wait_on, RegionInstance buffer, std::pair* buffer_bounds) const { return create_subspaces_by_image(DomainTransform(field_data), sources, - images, reqs, wait_on); + images, reqs, wait_on, buffer, buffer_bounds); } template diff --git a/tests/deppart.cc b/tests/deppart.cc index 815f2cb490..18d74c44f4 100644 --- a/tests/deppart.cc +++ b/tests/deppart.cc @@ -514,13 +514,13 @@ class BasicTest : public TestInterface { e01); if(wait_on_events) e02.wait(); std::pair estimate; - Event _e = is_nodes.gpu_subspaces_by_image(src_field_data_gpu, + Event _e = is_nodes.create_subspaces_by_image(src_field_data_gpu, p_garbage_edges, p_garbage_rd, Realm::ProfilingRequestSet(), - estimate, + e02, RegionInstance::NO_INST, - e02); + &estimate); std::cout << "Minimum size: " << estimate.first << " bytes, " << "Maximum size: " << estimate.second << " bytes\n"; @@ -535,14 +535,11 @@ class BasicTest : public TestInterface { IndexSpace<1> instance_index_space(Rect<1>(0, tile_size-1)); RegionInstance buffer; RegionInstance::create_instance(buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); - estimate.first = tile_size; - Event e03 = is_nodes.gpu_subspaces_by_image(src_field_data_gpu, + Event e03 = is_nodes.create_subspaces_by_image(src_field_data_gpu, p_garbage_edges, p_garbage_rd, Realm::ProfilingRequestSet(), - estimate, - buffer, - e02); + e02, buffer); if(wait_on_events) e03.wait(); Event e04 = is_edges.create_subspaces_by_preimage(dst_node_field_data, @@ -573,13 +570,11 @@ class BasicTest : public TestInterface { // an image of p_edges through out_node gives us all the shared nodes, along // with some private nodes - Event e3 = is_nodes.gpu_subspaces_by_image(src_field_data_gpu, + Event e3 = is_nodes.create_subspaces_by_image(src_field_data_gpu, p_edges, p_rd, Realm::ProfilingRequestSet(), - estimate, - buffer, - e2); + e2, buffer); if(wait_on_events) e3.wait(); log_app.info() << "GPU Image complete " << 
Clock::current_time_in_microseconds() << "\n"; log_app.info() << "Starting second GPU preimage " << Clock::current_time_in_microseconds() << "\n"; From fd2fe62316f2b1b6261a90a7f984d8301d9ebe8c Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Thu, 29 Jan 2026 00:00:22 -0800 Subject: [PATCH 03/32] API that builds --- src/realm/deppart/image.cc | 191 +++++++++++++++----------------- src/realm/deppart/image.h | 4 +- src/realm/deppart/image_tmpl.cc | 4 +- src/realm/indexspace.h | 31 +++++- src/realm/indexspace.inl | 8 +- tests/deppart.cc | 16 +-- 6 files changed, 129 insertions(+), 125 deletions(-) diff --git a/src/realm/deppart/image.cc b/src/realm/deppart/image.cc index c57b86b426..19ecf60ea5 100644 --- a/src/realm/deppart/image.cc +++ b/src/realm/deppart/image.cc @@ -30,65 +30,56 @@ namespace Realm { extern Logger log_part; extern Logger log_uop_timing; + template + template + void IndexSpace::estimate_image( + const DeppartInput& input, + DeppartSuggestion& suggestion) { + size_t minimal_size = 0; + size_t source_entries = 0; + bool bvh = false; + for (size_t size : input.source_sizes) { + source_entries += size == 0 ? 
1 : size; + } + minimal_size += sizeof(Rect) * source_entries; + if (this->dense()) { + minimal_size += sizeof(Rect); + } else { + minimal_size += sizeof(Rect) * input.parent_size; + } + if (bvh) { + minimal_size += + (source_entries * sizeof(uint64_t)) + + (source_entries * sizeof(size_t)) + + ((2*source_entries - 1) * sizeof(Rect)) + + (2 * (2*source_entries - 1) * sizeof(int)) + + sizeof(Rect) + + (2 * source_entries * sizeof(uint64_t)) + + (source_entries * sizeof(uint64_t)); + } + for (size_t i = 0; i < input.insts.size(); i++) { + IndexSpace is = input.insts[i].first; + Memory mem = input.insts[i].second; + size_t optimal_size = is.bounds.volume() * sizeof(Rect) * input.source_sizes.size() + minimal_size; + suggestion.suggestions[mem] = std::make_pair(minimal_size, optimal_size); + } + } + template template Event IndexSpace::create_subspaces_by_image( const DomainTransform &domain_transform, const std::vector> &sources, std::vector> &images, const ProfilingRequestSet &reqs, - Event wait_on, - RegionInstance buffer, std::pair* buffer_bounds) const { + Event wait_on, DeppartOutput* buffers) const { // output vector should start out empty assert(images.empty()); - if (buffer_bounds != nullptr || buffer != RegionInstance::NO_INST) { - size_t optimal_size = 0; - for (size_t i = 0; i < sources.size(); i++) { - optimal_size += 5 * sources[i].volume() * sizeof(RectDesc); - } - size_t minimal_size = 0; - size_t source_entries = 0; - bool bvh = false; - for (size_t i = 0; i < sources.size(); ++i) { - IndexSpace my_space = sources[i]; - if (my_space.dense()) { - source_entries += 1; - } else { - bvh = true; - source_entries += my_space.sparsity.impl()->get_entries().size(); - } - } - minimal_size += sizeof(Rect) * source_entries; - if (this->dense()) { - minimal_size += sizeof(Rect); - } else { - minimal_size += sizeof(Rect) * this->sparsity.impl()->get_entries().size(); - } - if (bvh) { - minimal_size += - (source_entries * sizeof(uint64_t)) + - (source_entries * 
sizeof(size_t)) + - ((2*source_entries - 1) * sizeof(Rect)) + - (2 * (2*source_entries - 1) * sizeof(int)) + - sizeof(Rect) + - (2 * source_entries * sizeof(uint64_t)) + - (source_entries * sizeof(uint64_t)); - } - if (buffer_bounds != nullptr && buffer == RegionInstance::NO_INST) { - *buffer_bounds = std::make_pair(minimal_size, minimal_size + optimal_size); - return Event::NO_EVENT; - } - assert(buffer != RegionInstance::NO_INST); - size_t buffer_size = buffer.get_layout()->bytes_used; - assert(buffer_size >= minimal_size); - } - - GenEventImpl *finish_event = GenEventImpl::create_genevent(); Event e = finish_event->current_event(); ImageOperation *op = new ImageOperation( - *this, domain_transform, reqs, finish_event, ID(e).event_generation(), buffer); + *this, domain_transform, reqs, finish_event, ID(e).event_generation(), buffers); size_t n = sources.size(); images.resize(n); @@ -480,11 +471,11 @@ namespace Realm { const IndexSpace &_parent, const DomainTransform &_domain_transform, const ProfilingRequestSet &reqs, GenEventImpl *_finish_event, - EventImpl::gen_t _finish_gen, RegionInstance _buffer) + EventImpl::gen_t _finish_gen, DeppartOutput* _buffers) : PartitioningOperation(reqs, _finish_event, _finish_gen), parent(_parent), domain_transform(_domain_transform), - buffer(_buffer) {} + buffers(_buffers) {} template ImageOperation::~ImageOperation(void) @@ -592,14 +583,14 @@ namespace Realm { template void ImageOperation::execute(void) { - std::vector,Point> > gpu_ptr_data; + std::map,Point> >> gpu_ptr_data; std::vector,Point> > cpu_ptr_data; - std::vector,Rect> > gpu_rect_data; + std::map,Rect> >> gpu_rect_data; std::vector,Rect> > cpu_rect_data; for (size_t i = 0; i < domain_transform.ptr_data.size(); i++) { if (domain_transform.ptr_data[i].inst.get_location().kind() == Memory::GPU_FB_MEM) { - gpu_ptr_data.push_back(domain_transform.ptr_data[i]); + gpu_ptr_data[domain_transform.ptr_data[i].inst.get_location()].push_back(domain_transform.ptr_data[i]); } else 
{ cpu_ptr_data.push_back(domain_transform.ptr_data[i]); } @@ -607,13 +598,12 @@ namespace Realm { for (size_t i = 0; i < domain_transform.range_data.size(); i++) { if (domain_transform.range_data[i].inst.get_location().kind() == Memory::GPU_FB_MEM) { - gpu_rect_data.push_back(domain_transform.range_data[i]); + gpu_rect_data[domain_transform.range_data[i].inst.get_location()].push_back(domain_transform.range_data[i]); } else { cpu_rect_data.push_back(domain_transform.range_data[i]); } } bool gpu_data = !gpu_ptr_data.empty() || !gpu_rect_data.empty(); - bool cpu_data = !cpu_ptr_data.empty() || !cpu_rect_data.empty(); if (domain_transform.type == DomainTransform::DomainTransformType::STRUCTURED && !gpu_data) { @@ -649,54 +639,55 @@ namespace Realm { uop->dispatch(this, true /* ok to run in this thread */); } else { - if (cpu_data) { - // launch full cross-product of image micro ops right away - for (size_t i = 0; i < sources.size(); i++) - SparsityMapImpl::lookup(images[i])->set_contributor_count( - cpu_ptr_data.size() + - cpu_rect_data.size() + (gpu_data ? 
1 : 0)); - - for (size_t i = 0; i < cpu_ptr_data.size(); i++) { - ImageMicroOp *uop = new ImageMicroOp( - parent, cpu_ptr_data[i].index_space, - cpu_ptr_data[i].inst, - cpu_ptr_data[i].field_offset, false /*ptrs*/); - for (size_t j = 0; j < sources.size(); j++) - if (diff_rhss.empty()) - uop->add_sparsity_output(sources[j], images[j]); - else - uop->add_sparsity_output_with_difference(sources[j], diff_rhss[j], - images[j]); - - uop->dispatch(this, true /* ok to run in this thread */); - } - - for (size_t i = 0; i < cpu_rect_data.size(); i++) { - ImageMicroOp *uop = new ImageMicroOp( - parent, cpu_rect_data[i].index_space, - cpu_rect_data[i].inst, - cpu_rect_data[i].field_offset, true /*ranges*/); - for (size_t j = 0; j < sources.size(); j++) - if (diff_rhss.empty()) - uop->add_sparsity_output(sources[j], images[j]); - else - uop->add_sparsity_output_with_difference(sources[j], diff_rhss[j], - images[j]); - - uop->dispatch(this, true /* ok to run in this thread */); - } - } - if (gpu_data) { - std::swap(domain_transform.ptr_data, gpu_ptr_data); - std::swap(domain_transform.range_data, gpu_rect_data); - GPUImageMicroOp *micro_op = - new GPUImageMicroOp( - parent, domain_transform, !cpu_data, buffer); - for (size_t j = 0; j < sources.size(); j++) { - micro_op->add_sparsity_output(sources[j], images[j]); + if (gpu_data) assert(buffers); + bool opcount = cpu_ptr_data.size() + cpu_rect_data.size() + gpu_ptr_data.size() + gpu_rect_data.size(); + bool exclusive = gpu_data && (opcount == 1); + if (!exclusive) { + // launch full cross-product of image micro ops right away + for (size_t i = 0; i < sources.size(); i++) { + SparsityMapImpl::lookup(images[i])->set_contributor_count(opcount); } - micro_op->dispatch(this, true); - } + } + for (size_t i = 0; i < cpu_ptr_data.size(); i++) { + ImageMicroOp *uop = new ImageMicroOp( + parent, cpu_ptr_data[i].index_space, + cpu_ptr_data[i].inst, + cpu_ptr_data[i].field_offset, false /*ptrs*/); + for (size_t j = 0; j < sources.size(); 
j++) + if (diff_rhss.empty()) + uop->add_sparsity_output(sources[j], images[j]); + else + uop->add_sparsity_output_with_difference(sources[j], diff_rhss[j], + images[j]); + uop->dispatch(this, true /* ok to run in this thread */); + } + for (size_t i = 0; i < cpu_rect_data.size(); i++) { + ImageMicroOp *uop = new ImageMicroOp( + parent, cpu_rect_data[i].index_space, + cpu_rect_data[i].inst, + cpu_rect_data[i].field_offset, true /*ranges*/); + for (size_t j = 0; j < sources.size(); j++) + if (diff_rhss.empty()) + uop->add_sparsity_output(sources[j], images[j]); + else + uop->add_sparsity_output_with_difference(sources[j], diff_rhss[j], + images[j]); + uop->dispatch(this, true /* ok to run in this thread */); + } + for (auto it = gpu_ptr_data.begin(); it != gpu_ptr_data.end(); it++) { + // launch full cross-product of image micro ops right away + Memory my_mem = it->first; + domain_transform.ptr_data = it->second; + assert(buffers->buffers.find(my_mem) != buffers->buffers.end()); + RegionInstance buffer = buffers->buffers[my_mem]; + GPUImageMicroOp *micro_op = + new GPUImageMicroOp( + parent, domain_transform, exclusive, buffer); + for (size_t j = 0; j < sources.size(); j++) { + micro_op->add_sparsity_output(sources[j], images[j]); + } + micro_op->dispatch(this, true); + } } } diff --git a/src/realm/deppart/image.h b/src/realm/deppart/image.h index 2f0347c5ff..cafa58b56e 100644 --- a/src/realm/deppart/image.h +++ b/src/realm/deppart/image.h @@ -97,7 +97,7 @@ namespace Realm { ImageOperation(const IndexSpace &_parent, const DomainTransform &_domain_transform, const ProfilingRequestSet &reqs, GenEventImpl *_finish_event, - EventImpl::gen_t _finish_gen, RegionInstance buffer = RegionInstance::NO_INST); + EventImpl::gen_t _finish_gen, DeppartOutput* buffers = nullptr); virtual ~ImageOperation(void); @@ -122,7 +122,7 @@ namespace Realm { std::vector > diff_rhss; std::vector > images; bool is_intersection; - RegionInstance buffer; + DeppartOutput* buffers; }; template diff 
--git a/src/realm/deppart/image_tmpl.cc b/src/realm/deppart/image_tmpl.cc index 6f4371bae2..f07359c745 100644 --- a/src/realm/deppart/image_tmpl.cc +++ b/src/realm/deppart/image_tmpl.cc @@ -50,10 +50,12 @@ namespace Realm { template class GPUImageMicroOp; \ template class ImageOperation; \ template ImageMicroOp::ImageMicroOp(NodeID, AsyncMicroOp *, Serialization::FixedBufferDeserializer&); \ + template void IndexSpace::estimate_image( \ + const DeppartInput&, DeppartSuggestion&); \ template Event IndexSpace::create_subspaces_by_image( \ const DomainTransform &, const std::vector > &, \ std::vector > &, const ProfilingRequestSet &, Event, \ - RegionInstance, std::pair*) const; \ + DeppartOutput*) const; \ template Event IndexSpace::create_subspaces_by_image_with_difference( \ const DomainTransform &, \ const std::vector >&, \ diff --git a/src/realm/indexspace.h b/src/realm/indexspace.h index 448b2815fb..4e56b4e4a8 100644 --- a/src/realm/indexspace.h +++ b/src/realm/indexspace.h @@ -111,6 +111,21 @@ namespace Realm { size_t field_offset; }; + template + struct DeppartInput { + std::vector, Memory>> insts; + std::vector source_sizes; + size_t parent_size; + }; + + struct DeppartSuggestion { + std::map> suggestions; + }; + + struct DeppartOutput { + std::map buffers; + }; + /** * \class TranslationTransform * A translation transform is a special case of an affine transform @@ -780,8 +795,14 @@ namespace Realm { const DomainTransform &domain_transform, const std::vector> &sources, std::vector> &images, const ProfilingRequestSet &reqs, - Event wait_on = Event::NO_EVENT, RegionInstance buffer = RegionInstance::NO_INST, - std::pair* buffer_bounds = nullptr) const; + Event wait_on = Event::NO_EVENT, DeppartOutput *buffers = nullptr) const; + + template + REALM_PUBLIC_API void estimate_image( + const DeppartInput &input, + DeppartSuggestion &suggestion); + + ///@} @@ -814,8 +835,7 @@ namespace Realm { &field_data, const std::vector> &sources, std::vector> &images, const 
ProfilingRequestSet &reqs, - Event wait_on = Event::NO_EVENT, RegionInstance buffer = RegionInstance::NO_INST, - std::pair* buffer_bounds = nullptr) const; + Event wait_on = Event::NO_EVENT, DeppartOutput* buffers = nullptr) const; // range versions template @@ -831,8 +851,7 @@ namespace Realm { &field_data, const std::vector> &sources, std::vector> &images, const ProfilingRequestSet &reqs, - Event wait_on = Event::NO_EVENT, RegionInstance buffer = RegionInstance::NO_INST, - std::pair* buffer_bounds = nullptr) const; + Event wait_on = Event::NO_EVENT, DeppartOutput* buffers = nullptr) const; ///@} ///@{ diff --git a/src/realm/indexspace.inl b/src/realm/indexspace.inl index 87cab4ce47..0627e9799a 100644 --- a/src/realm/indexspace.inl +++ b/src/realm/indexspace.inl @@ -968,10 +968,10 @@ namespace Realm { const std::vector, Point>> &field_data, const std::vector> &sources, std::vector> &images, const ProfilingRequestSet &reqs, - Event wait_on, RegionInstance buffer, std::pair* buffer_bounds) const + Event wait_on, DeppartOutput* buffers) const { return create_subspaces_by_image(DomainTransform(field_data), sources, - images, reqs, wait_on, buffer, buffer_bounds); + images, reqs, wait_on, buffers); } template @@ -980,10 +980,10 @@ namespace Realm { const std::vector, Rect>> &field_data, const std::vector> &sources, std::vector> &images, const ProfilingRequestSet &reqs, - Event wait_on, RegionInstance buffer, std::pair* buffer_bounds) const + Event wait_on, DeppartOutput* buffers) const { return create_subspaces_by_image(DomainTransform(field_data), sources, - images, reqs, wait_on, buffer, buffer_bounds); + images, reqs, wait_on, buffers); } template diff --git a/tests/deppart.cc b/tests/deppart.cc index 18d74c44f4..624bb84a97 100644 --- a/tests/deppart.cc +++ b/tests/deppart.cc @@ -513,16 +513,7 @@ class BasicTest : public TestInterface { Realm::ProfilingRequestSet(), e01); if(wait_on_events) e02.wait(); - std::pair estimate; - Event _e = 
is_nodes.create_subspaces_by_image(src_field_data_gpu, - p_garbage_edges, - p_garbage_rd, - Realm::ProfilingRequestSet(), - e02, - RegionInstance::NO_INST, - &estimate); - std::cout << "Minimum size: " << estimate.first << " bytes, " - << "Maximum size: " << estimate.second << " bytes\n"; + DeppartOutput output; // an image of p_edges through out_node gives us all the shared nodes, along // with some private nodes @@ -535,11 +526,12 @@ class BasicTest : public TestInterface { IndexSpace<1> instance_index_space(Rect<1>(0, tile_size-1)); RegionInstance buffer; RegionInstance::create_instance(buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); + output.buffers[gpu_memory] = buffer; Event e03 = is_nodes.create_subspaces_by_image(src_field_data_gpu, p_garbage_edges, p_garbage_rd, Realm::ProfilingRequestSet(), - e02, buffer); + e02, &output); if(wait_on_events) e03.wait(); Event e04 = is_edges.create_subspaces_by_preimage(dst_node_field_data, @@ -574,7 +566,7 @@ class BasicTest : public TestInterface { p_edges, p_rd, Realm::ProfilingRequestSet(), - e2, buffer); + e2, &output); if(wait_on_events) e3.wait(); log_app.info() << "GPU Image complete " << Clock::current_time_in_microseconds() << "\n"; log_app.info() << "Starting second GPU preimage " << Clock::current_time_in_microseconds() << "\n"; From 0987dc89a4bb9f9db09fd1635900eaf2b9583b88 Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Sun, 1 Feb 2026 20:18:15 -0800 Subject: [PATCH 04/32] Finished image API --- src/realm/deppart/image.cc | 103 ++++++++++++++-------- src/realm/deppart/image.h | 6 +- src/realm/deppart/image_gpu_impl.hpp | 5 +- src/realm/deppart/image_tmpl.cc | 9 +- src/realm/deppart/partitions_gpu_impl.hpp | 2 +- src/realm/indexspace.h | 33 ++++--- src/realm/indexspace.inl | 8 +- tests/deppart.cc | 11 ++- 8 files changed, 103 insertions(+), 74 deletions(-) diff --git a/src/realm/deppart/image.cc b/src/realm/deppart/image.cc index 19ecf60ea5..d2585bbef0 100644 
--- a/src/realm/deppart/image.cc +++ b/src/realm/deppart/image.cc @@ -32,20 +32,21 @@ namespace Realm { template template - void IndexSpace::estimate_image( - const DeppartInput& input, - DeppartSuggestion& suggestion) { + void IndexSpace::suggest_deppart_buffer_size( + const std::vector>& source_spaces, + const std::vector>& inputs, + std::vector& suggestions) const { size_t minimal_size = 0; size_t source_entries = 0; bool bvh = false; - for (size_t size : input.source_sizes) { - source_entries += size == 0 ? 1 : size; + for (auto subspace : source_spaces) { + source_entries += subspace.entries == 0 ? 1 : subspace.entries; } minimal_size += sizeof(Rect) * source_entries; if (this->dense()) { minimal_size += sizeof(Rect); } else { - minimal_size += sizeof(Rect) * input.parent_size; + minimal_size += sizeof(Rect) * this->sparsity.impl()->get_entries().size(); } if (bvh) { minimal_size += @@ -57,11 +58,27 @@ namespace Realm { (2 * source_entries * sizeof(uint64_t)) + (source_entries * sizeof(uint64_t)); } - for (size_t i = 0; i < input.insts.size(); i++) { - IndexSpace is = input.insts[i].first; - Memory mem = input.insts[i].second; - size_t optimal_size = is.bounds.volume() * sizeof(Rect) * input.source_sizes.size() + minimal_size; - suggestion.suggestions[mem] = std::make_pair(minimal_size, optimal_size); + std::vector result(inputs.size()); + for (size_t i = 0; i < inputs.size(); i++) { + IndexSpace is = inputs[i].space; + Memory mem = inputs[i].location; + if (mem.kind() == Memory::GPU_FB_MEM || + mem.kind() == Memory::Z_COPY_MEM) { + const char* val = std::getenv("MIN_SIZE"); // or any env var + size_t device_size = 2000000; //default + if (val) { + device_size = atoi(val); + } + minimal_size = max(minimal_size, device_size); + size_t optimal_size = is.bounds.volume() * sizeof(Rect) * source_spaces.size() + minimal_size; + result[i].suggested = mem; + result[i].lower_bound = minimal_size; + result[i].upper_bound = optimal_size; + } else { + result[i].suggested 
= Memory::NO_MEMORY; + result[i].lower_bound = 0; + result[i].upper_bound = 0; + } } } @@ -71,7 +88,7 @@ namespace Realm { const DomainTransform &domain_transform, const std::vector> &sources, std::vector> &images, const ProfilingRequestSet &reqs, - Event wait_on, DeppartOutput* buffers) const { + Event wait_on) const { // output vector should start out empty assert(images.empty()); @@ -79,7 +96,7 @@ namespace Realm { Event e = finish_event->current_event(); ImageOperation *op = new ImageOperation( - *this, domain_transform, reqs, finish_event, ID(e).event_generation(), buffers); + *this, domain_transform, reqs, finish_event, ID(e).event_generation()); size_t n = sources.size(); images.resize(n); @@ -471,11 +488,10 @@ namespace Realm { const IndexSpace &_parent, const DomainTransform &_domain_transform, const ProfilingRequestSet &reqs, GenEventImpl *_finish_event, - EventImpl::gen_t _finish_gen, DeppartOutput* _buffers) + EventImpl::gen_t _finish_gen) : PartitioningOperation(reqs, _finish_event, _finish_gen), parent(_parent), - domain_transform(_domain_transform), - buffers(_buffers) {} + domain_transform(_domain_transform) {} template ImageOperation::~ImageOperation(void) @@ -583,26 +599,26 @@ namespace Realm { template void ImageOperation::execute(void) { - std::map,Point> >> gpu_ptr_data; + std::vector,Point> > gpu_ptr_data; std::vector,Point> > cpu_ptr_data; - std::map,Rect> >> gpu_rect_data; + std::vector,Rect> > gpu_rect_data; std::vector,Rect> > cpu_rect_data; for (size_t i = 0; i < domain_transform.ptr_data.size(); i++) { if (domain_transform.ptr_data[i].inst.get_location().kind() == - Memory::GPU_FB_MEM) { - gpu_ptr_data[domain_transform.ptr_data[i].inst.get_location()].push_back(domain_transform.ptr_data[i]); + Memory::GPU_FB_MEM || domain_transform.ptr_data[i].inst.get_location().kind() == Memory::Z_COPY_MEM) { + gpu_ptr_data.push_back(domain_transform.ptr_data[i]); } else { cpu_ptr_data.push_back(domain_transform.ptr_data[i]); } } - for (size_t i = 0; i 
< domain_transform.range_data.size(); i++) { - if (domain_transform.range_data[i].inst.get_location().kind() == - Memory::GPU_FB_MEM) { - gpu_rect_data[domain_transform.range_data[i].inst.get_location()].push_back(domain_transform.range_data[i]); - } else { - cpu_rect_data.push_back(domain_transform.range_data[i]); - } - } + for (size_t i = 0; i < domain_transform.range_data.size(); i++) { + if (domain_transform.range_data[i].inst.get_location().kind() == + Memory::GPU_FB_MEM || domain_transform.range_data[i].inst.get_location().kind() == Memory::Z_COPY_MEM) { + gpu_rect_data.push_back(domain_transform.range_data[i]); + } else { + cpu_rect_data.push_back(domain_transform.range_data[i]); + } + } bool gpu_data = !gpu_ptr_data.empty() || !gpu_rect_data.empty(); if (domain_transform.type == DomainTransform::DomainTransformType::STRUCTURED && !gpu_data) { @@ -639,8 +655,7 @@ namespace Realm { uop->dispatch(this, true /* ok to run in this thread */); } else { - if (gpu_data) assert(buffers); - bool opcount = cpu_ptr_data.size() + cpu_rect_data.size() + gpu_ptr_data.size() + gpu_rect_data.size(); + size_t opcount = cpu_ptr_data.size() + cpu_rect_data.size() + gpu_ptr_data.size() + gpu_rect_data.size(); bool exclusive = gpu_data && (opcount == 1); if (!exclusive) { // launch full cross-product of image micro ops right away @@ -674,20 +689,30 @@ namespace Realm { images[j]); uop->dispatch(this, true /* ok to run in this thread */); } - for (auto it = gpu_ptr_data.begin(); it != gpu_ptr_data.end(); it++) { + for (auto ptr_fdd : gpu_ptr_data) { // launch full cross-product of image micro ops right away - Memory my_mem = it->first; - domain_transform.ptr_data = it->second; - assert(buffers->buffers.find(my_mem) != buffers->buffers.end()); - RegionInstance buffer = buffers->buffers[my_mem]; + assert(ptr_fdd.scratch_buffer != RegionInstance::NO_INST); + domain_transform.ptr_data = {ptr_fdd}; GPUImageMicroOp *micro_op = - new GPUImageMicroOp( - parent, domain_transform, 
exclusive, buffer); + new GPUImageMicroOp( + parent, domain_transform, exclusive); for (size_t j = 0; j < sources.size(); j++) { micro_op->add_sparsity_output(sources[j], images[j]); } micro_op->dispatch(this, true); } + for (auto rect_fdd : gpu_rect_data) { + // launch full cross-product of image micro ops right away + assert(rect_fdd.scratch_buffer != RegionInstance::NO_INST); + domain_transform.range_data = {rect_fdd}; + GPUImageMicroOp *micro_op = + new GPUImageMicroOp( + parent, domain_transform, exclusive); + for (size_t j = 0; j < sources.size(); j++) { + micro_op->add_sparsity_output(sources[j], images[j]); + } + micro_op->dispatch(this, true); + } } } @@ -897,8 +922,8 @@ namespace Realm { GPUImageMicroOp::GPUImageMicroOp( const IndexSpace &_parent, const DomainTransform &_domain_transform, - bool _exclusive, RegionInstance _buffer) - : parent_space(_parent), domain_transform(_domain_transform), buffer(_buffer) + bool _exclusive) + : parent_space(_parent), domain_transform(_domain_transform) { this->exclusive = _exclusive; } diff --git a/src/realm/deppart/image.h b/src/realm/deppart/image.h index cafa58b56e..58131338a3 100644 --- a/src/realm/deppart/image.h +++ b/src/realm/deppart/image.h @@ -97,7 +97,7 @@ namespace Realm { ImageOperation(const IndexSpace &_parent, const DomainTransform &_domain_transform, const ProfilingRequestSet &reqs, GenEventImpl *_finish_event, - EventImpl::gen_t _finish_gen, DeppartOutput* buffers = nullptr); + EventImpl::gen_t _finish_gen); virtual ~ImageOperation(void); @@ -122,7 +122,6 @@ namespace Realm { std::vector > diff_rhss; std::vector > images; bool is_intersection; - DeppartOutput* buffers; }; template @@ -156,7 +155,7 @@ namespace Realm { GPUImageMicroOp( const IndexSpace &_parent, const DomainTransform &_domain_transform, - bool _exclusive, RegionInstance buffer = RegionInstance::NO_INST); + bool _exclusive); virtual ~GPUImageMicroOp(void); @@ -176,7 +175,6 @@ namespace Realm { DomainTransform domain_transform; 
std::vector > sources; std::vector > sparsity_outputs; - RegionInstance buffer; }; }; // namespace Realm diff --git a/src/realm/deppart/image_gpu_impl.hpp b/src/realm/deppart/image_gpu_impl.hpp index ce357436b7..b3c38789f5 100644 --- a/src/realm/deppart/image_gpu_impl.hpp +++ b/src/realm/deppart/image_gpu_impl.hpp @@ -224,6 +224,8 @@ void GPUImageMicroOp::gpu_populate_ptrs() return; } + RegionInstance buffer = domain_transform.ptr_data[0].scratch_buffer; + NVTX_DEPPART(gpu_image); Memory sysmem; @@ -233,8 +235,7 @@ void GPUImageMicroOp::gpu_populate_ptrs() size_t tile_size = buffer.get_layout()->bytes_used; std::cout << "Using tile size of " << tile_size << " bytes." << std::endl; - RegionInstance fixed_buffer = buffer; - Arena buffer_arena(reinterpret_cast(AffineAccessor(fixed_buffer, 0).base), tile_size); + Arena buffer_arena(reinterpret_cast(AffineAccessor(buffer, 0).base), tile_size); collapsed_space src_space; src_space.offsets = buffer_arena.alloc(sources.size()+1); diff --git a/src/realm/deppart/image_tmpl.cc b/src/realm/deppart/image_tmpl.cc index f07359c745..288e583758 100644 --- a/src/realm/deppart/image_tmpl.cc +++ b/src/realm/deppart/image_tmpl.cc @@ -50,12 +50,13 @@ namespace Realm { template class GPUImageMicroOp; \ template class ImageOperation; \ template ImageMicroOp::ImageMicroOp(NodeID, AsyncMicroOp *, Serialization::FixedBufferDeserializer&); \ - template void IndexSpace::estimate_image( \ - const DeppartInput&, DeppartSuggestion&); \ + template void IndexSpace::suggest_deppart_buffer_size( \ + const std::vector>&, \ + const std::vector>&, \ + std::vector&) const; \ template Event IndexSpace::create_subspaces_by_image( \ const DomainTransform &, const std::vector > &, \ - std::vector > &, const ProfilingRequestSet &, Event, \ - DeppartOutput*) const; \ + std::vector > &, const ProfilingRequestSet &, Event) const; \ template Event IndexSpace::create_subspaces_by_image_with_difference( \ const DomainTransform &, \ const std::vector >&, \ diff 
--git a/src/realm/deppart/partitions_gpu_impl.hpp b/src/realm/deppart/partitions_gpu_impl.hpp index 678102b56f..b1459f2ede 100644 --- a/src/realm/deppart/partitions_gpu_impl.hpp +++ b/src/realm/deppart/partitions_gpu_impl.hpp @@ -1524,7 +1524,7 @@ namespace Realm { std::vector> h_rects(end - start); CUDA_CHECK(cudaMemcpyAsync(h_rects.data(), final_rects + start, (end - start) * sizeof(Rect), cudaMemcpyDeviceToHost, stream), stream); CUDA_CHECK(cudaStreamSynchronize(stream), stream); - impl->contribute_dense_rect_list(h_rects, true); + impl->contribute_dense_rect_list(h_rects, false); } else { impl->contribute_nothing(); } diff --git a/src/realm/indexspace.h b/src/realm/indexspace.h index 4e56b4e4a8..cf6caf9a26 100644 --- a/src/realm/indexspace.h +++ b/src/realm/indexspace.h @@ -109,21 +109,25 @@ namespace Realm { IS index_space; RegionInstance inst; size_t field_offset; + RegionInstance scratch_buffer = RegionInstance::NO_INST; }; template - struct DeppartInput { - std::vector, Memory>> insts; - std::vector source_sizes; - size_t parent_size; + struct DeppartSubspace { + IndexSpace space; + size_t entries; }; - struct DeppartSuggestion { - std::map> suggestions; + template + struct DeppartEstimateInput { + IndexSpace space; + Memory location; }; - struct DeppartOutput { - std::map buffers; + struct DeppartEstimateSuggestion { + Memory suggested; + size_t lower_bound; + size_t upper_bound; }; /** @@ -795,12 +799,13 @@ namespace Realm { const DomainTransform &domain_transform, const std::vector> &sources, std::vector> &images, const ProfilingRequestSet &reqs, - Event wait_on = Event::NO_EVENT, DeppartOutput *buffers = nullptr) const; + Event wait_on = Event::NO_EVENT) const; template - REALM_PUBLIC_API void estimate_image( - const DeppartInput &input, - DeppartSuggestion &suggestion); + REALM_PUBLIC_API void suggest_deppart_buffer_size( + const std::vector>& source_spaces, + const std::vector>& inputs, + std::vector& suggestions) const; ///@} @@ -835,7 +840,7 @@ 
namespace Realm { &field_data, const std::vector> &sources, std::vector> &images, const ProfilingRequestSet &reqs, - Event wait_on = Event::NO_EVENT, DeppartOutput* buffers = nullptr) const; + Event wait_on = Event::NO_EVENT) const; // range versions template @@ -851,7 +856,7 @@ namespace Realm { &field_data, const std::vector> &sources, std::vector> &images, const ProfilingRequestSet &reqs, - Event wait_on = Event::NO_EVENT, DeppartOutput* buffers = nullptr) const; + Event wait_on = Event::NO_EVENT) const; ///@} ///@{ diff --git a/src/realm/indexspace.inl b/src/realm/indexspace.inl index 0627e9799a..d2c41e4c4e 100644 --- a/src/realm/indexspace.inl +++ b/src/realm/indexspace.inl @@ -968,10 +968,10 @@ namespace Realm { const std::vector, Point>> &field_data, const std::vector> &sources, std::vector> &images, const ProfilingRequestSet &reqs, - Event wait_on, DeppartOutput* buffers) const + Event wait_on) const { return create_subspaces_by_image(DomainTransform(field_data), sources, - images, reqs, wait_on, buffers); + images, reqs, wait_on); } template @@ -980,10 +980,10 @@ namespace Realm { const std::vector, Rect>> &field_data, const std::vector> &sources, std::vector> &images, const ProfilingRequestSet &reqs, - Event wait_on, DeppartOutput* buffers) const + Event wait_on) const { return create_subspaces_by_image(DomainTransform(field_data), sources, - images, reqs, wait_on, buffers); + images, reqs, wait_on); } template diff --git a/tests/deppart.cc b/tests/deppart.cc index 624bb84a97..eaf4a012e8 100644 --- a/tests/deppart.cc +++ b/tests/deppart.cc @@ -513,7 +513,6 @@ class BasicTest : public TestInterface { Realm::ProfilingRequestSet(), e01); if(wait_on_events) e02.wait(); - DeppartOutput output; // an image of p_edges through out_node gives us all the shared nodes, along // with some private nodes @@ -524,14 +523,14 @@ class BasicTest : public TestInterface { } std::vector byte_fields = {sizeof(char)}; IndexSpace<1> instance_index_space(Rect<1>(0, tile_size-1)); 
- RegionInstance buffer; - RegionInstance::create_instance(buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); - output.buffers[gpu_memory] = buffer; + for (size_t i = 0; i < src_field_data_gpu.size(); i++) { + RegionInstance::create_instance(src_field_data_gpu[i].scratch_buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); + } Event e03 = is_nodes.create_subspaces_by_image(src_field_data_gpu, p_garbage_edges, p_garbage_rd, Realm::ProfilingRequestSet(), - e02, &output); + e02); if(wait_on_events) e03.wait(); Event e04 = is_edges.create_subspaces_by_preimage(dst_node_field_data, @@ -566,7 +565,7 @@ class BasicTest : public TestInterface { p_edges, p_rd, Realm::ProfilingRequestSet(), - e2, &output); + e2); if(wait_on_events) e3.wait(); log_app.info() << "GPU Image complete " << Clock::current_time_in_microseconds() << "\n"; log_app.info() << "Starting second GPU preimage " << Clock::current_time_in_microseconds() << "\n"; From 460710d2582a7cafb3bf3444cc4405e0511a1719 Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Sun, 1 Feb 2026 22:06:28 -0800 Subject: [PATCH 05/32] builds with new APIs (ops themselves are slightly broken) --- src/CMakeLists.txt | 2 +- src/realm/deppart/byfield.cc | 62 +- src/realm/deppart/byfield_tmpl.cc | 6 +- src/realm/deppart/image.cc | 16 +- src/realm/deppart/image_tmpl.cc | 2 +- src/realm/deppart/preimage.cc | 1149 +++++++++++--------- src/realm/deppart/preimage.h | 35 +- src/realm/deppart/preimage_gpu_impl.hpp | 468 ++++++++ src/realm/deppart/preimage_gpu_kernels.hpp | 256 +++++ src/realm/deppart/preimage_gpu_tmpl.cu | 69 ++ src/realm/deppart/preimage_tmpl.cc | 43 +- src/realm/indexspace.h | 13 +- 12 files changed, 1572 insertions(+), 549 deletions(-) create mode 100644 src/realm/deppart/preimage_gpu_impl.hpp create mode 100644 src/realm/deppart/preimage_gpu_kernels.hpp create mode 100644 src/realm/deppart/preimage_gpu_tmpl.cu diff --git 
a/src/CMakeLists.txt b/src/CMakeLists.txt index fd0b1fb81a..c277a1b74d 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -158,7 +158,7 @@ endforeach() # Generate per-dimension object files for GPU deppart. foreach(INST_N1 RANGE 1 ${REALM_MAX_DIM}) foreach(INST_N2 RANGE 1 ${REALM_MAX_DIM}) - foreach(SRCFILE realm/deppart/byfield realm/deppart/image) + foreach(SRCFILE realm/deppart/byfield realm/deppart/image realm/deppart/preimage) set(_result_file "${CMAKE_CURRENT_BINARY_DIR}/${SRCFILE}_gpu_${INST_N1}_${INST_N2}.cu") # use cmake's configure_file for a portable way of creating wrapper source files configure_file("${PROJECT_SOURCE_DIR}/cmake/deppart_tmpl.cu.in" "${_result_file}") diff --git a/src/realm/deppart/byfield.cc b/src/realm/deppart/byfield.cc index 51b106f519..c6ccacc6ce 100644 --- a/src/realm/deppart/byfield.cc +++ b/src/realm/deppart/byfield.cc @@ -29,6 +29,34 @@ namespace Realm { extern Logger log_part; extern Logger log_uop_timing; + template + template + void IndexSpace::suggest_byfield_buffer_size( + const std::vector>& inputs, + std::vector& suggestions) const { + suggestions = std::vector(inputs.size()); + for (size_t i = 0; i < inputs.size(); i++) { + IndexSpace is = inputs[i].space; + Memory mem = inputs[i].location; + if (mem.kind() == Memory::GPU_FB_MEM || + mem.kind() == Memory::Z_COPY_MEM) { + const char* val = std::getenv("MIN_SIZE"); // or any env var + size_t device_size = 2000000; //default + if (val) { + device_size = atoi(val); + } + size_t optimal_size = is.bounds.volume() * sizeof(Rect); + suggestions[i].suggested = mem; + suggestions[i].lower_bound = device_size; + suggestions[i].upper_bound = max(device_size, optimal_size); + } else { + suggestions[i].suggested = Memory::NO_MEMORY; + suggestions[i].lower_bound = 0; + suggestions[i].upper_bound = 0; + } + } + } + template template @@ -380,33 +408,35 @@ namespace Realm { std::vector,FT> > gpu_field_data; std::vector,FT> > cpu_field_data; for (size_t i = 0; i < 
field_data.size(); i++) { - if (field_data[i].inst.get_location().kind() == Memory::GPU_FB_MEM) { + if (field_data[i].inst.get_location().kind() == Memory::GPU_FB_MEM + || field_data[i].inst.get_location().kind() == Memory::Z_COPY_MEM) { gpu_field_data.push_back(field_data[i]); } else { cpu_field_data.push_back(field_data[i]); } } - if (!cpu_field_data.empty()) { + bool exclusive = (gpu_field_data.size() == 1) && cpu_field_data.empty(); + if (!exclusive) { for (size_t i = 0; i < subspaces.size(); i++) - SparsityMapImpl::lookup(subspaces[i])->set_contributor_count(cpu_field_data.size() + (gpu_field_data.empty() ? 0 : 1)); - for (size_t i = 0; i < cpu_field_data.size(); i++) { - ByFieldMicroOp *uop = new ByFieldMicroOp(parent, - cpu_field_data[i].index_space, - cpu_field_data[i].inst, - cpu_field_data[i].field_offset); - for (size_t j = 0; j < colors.size(); j++) - uop->add_sparsity_output(colors[j], subspaces[j]); - - uop->dispatch(this, true /* ok to run in this thread */); - } + SparsityMapImpl::lookup(subspaces[i])->set_contributor_count(cpu_field_data.size() + gpu_field_data.size()); } - if (!gpu_field_data.empty()) { - GPUByFieldMicroOp *uop = new GPUByFieldMicroOp(parent, gpu_field_data, cpu_field_data.empty()); + for (size_t i = 0; i < cpu_field_data.size(); i++) { + ByFieldMicroOp *uop = new ByFieldMicroOp(parent, + cpu_field_data[i].index_space, + cpu_field_data[i].inst, + cpu_field_data[i].field_offset); + for (size_t j = 0; j < colors.size(); j++) + uop->add_sparsity_output(colors[j], subspaces[j]); + + uop->dispatch(this, true /* ok to run in this thread */); + } + for (auto fdd : gpu_field_data) { + std::vector,FT> > single_gpu_field_data = {fdd}; + GPUByFieldMicroOp *uop = new GPUByFieldMicroOp(parent, single_gpu_field_data, exclusive); for (size_t i = 0; i < colors.size(); i++) { uop->add_sparsity_output(colors[i], subspaces[i]); } uop->dispatch(this, false); - } } diff --git a/src/realm/deppart/byfield_tmpl.cc b/src/realm/deppart/byfield_tmpl.cc 
index 38a95a040d..7575607ea2 100644 --- a/src/realm/deppart/byfield_tmpl.cc +++ b/src/realm/deppart/byfield_tmpl.cc @@ -52,7 +52,11 @@ namespace Realm { const std::vector&, \ std::vector >&, \ const ProfilingRequestSet &, \ - Event) const; + Event) const; \ + template void IndexSpace::suggest_byfield_buffer_size( \ + const std::vector>&, \ + std::vector&) const; + FOREACH_NTF(DOIT) diff --git a/src/realm/deppart/image.cc b/src/realm/deppart/image.cc index d2585bbef0..d207161b22 100644 --- a/src/realm/deppart/image.cc +++ b/src/realm/deppart/image.cc @@ -32,7 +32,7 @@ namespace Realm { template template - void IndexSpace::suggest_deppart_buffer_size( + void IndexSpace::suggest_image_buffer_size( const std::vector>& source_spaces, const std::vector>& inputs, std::vector& suggestions) const { @@ -58,7 +58,7 @@ namespace Realm { (2 * source_entries * sizeof(uint64_t)) + (source_entries * sizeof(uint64_t)); } - std::vector result(inputs.size()); + suggestions = std::vector(inputs.size()); for (size_t i = 0; i < inputs.size(); i++) { IndexSpace is = inputs[i].space; Memory mem = inputs[i].location; @@ -71,13 +71,13 @@ namespace Realm { } minimal_size = max(minimal_size, device_size); size_t optimal_size = is.bounds.volume() * sizeof(Rect) * source_spaces.size() + minimal_size; - result[i].suggested = mem; - result[i].lower_bound = minimal_size; - result[i].upper_bound = optimal_size; + suggestions[i].suggested = mem; + suggestions[i].lower_bound = minimal_size; + suggestions[i].upper_bound = optimal_size; } else { - result[i].suggested = Memory::NO_MEMORY; - result[i].lower_bound = 0; - result[i].upper_bound = 0; + suggestions[i].suggested = Memory::NO_MEMORY; + suggestions[i].lower_bound = 0; + suggestions[i].upper_bound = 0; } } } diff --git a/src/realm/deppart/image_tmpl.cc b/src/realm/deppart/image_tmpl.cc index 288e583758..8a0e686f22 100644 --- a/src/realm/deppart/image_tmpl.cc +++ b/src/realm/deppart/image_tmpl.cc @@ -50,7 +50,7 @@ namespace Realm { template class 
GPUImageMicroOp; \ template class ImageOperation; \ template ImageMicroOp::ImageMicroOp(NodeID, AsyncMicroOp *, Serialization::FixedBufferDeserializer&); \ - template void IndexSpace::suggest_deppart_buffer_size( \ + template void IndexSpace::suggest_image_buffer_size( \ const std::vector>&, \ const std::vector>&, \ std::vector&) const; \ diff --git a/src/realm/deppart/preimage.cc b/src/realm/deppart/preimage.cc index 0e43956865..5df628f2f6 100644 --- a/src/realm/deppart/preimage.cc +++ b/src/realm/deppart/preimage.cc @@ -17,13 +17,14 @@ // preimage operations for Realm dependent partitioning -#include "realm/deppart/preimage.h" - -#include "realm/deppart/deppart_config.h" -#include "realm/deppart/rectlist.h" -#include "realm/deppart/inst_helper.h" -#include "realm/deppart/image.h" -#include "realm/logging.h" +#include "preimage.h" + +#include "deppart_config.h" +#include "rectlist.h" +#include "inst_helper.h" +#include "image.h" +#include "../logging.h" +#include #include namespace Realm { @@ -31,6 +32,58 @@ namespace Realm { extern Logger log_part; extern Logger log_uop_timing; + template + template + void IndexSpace::suggest_preimage_buffer_size( + const std::vector>& target_spaces, + const std::vector>& inputs, + std::vector& suggestions) const { + size_t minimal_size = 0; + size_t source_entries = 0; + bool bvh = false; + for (auto subspace : target_spaces) { + source_entries += subspace.entries == 0 ? 
1 : subspace.entries; + } + minimal_size += sizeof(Rect) * source_entries; + if (this->dense()) { + minimal_size += sizeof(Rect); + } else { + minimal_size += sizeof(Rect) * this->sparsity.impl()->get_entries().size(); + } + if (bvh) { + minimal_size += + (source_entries * sizeof(uint64_t)) + + (source_entries * sizeof(size_t)) + + ((2*source_entries - 1) * sizeof(Rect)) + + (2 * (2*source_entries - 1) * sizeof(int)) + + sizeof(Rect) + + (2 * source_entries * sizeof(uint64_t)) + + (source_entries * sizeof(uint64_t)); + } + suggestions = std::vector(inputs.size()); + for (size_t i = 0; i < inputs.size(); i++) { + IndexSpace is = inputs[i].space; + Memory mem = inputs[i].location; + if (mem.kind() == Memory::GPU_FB_MEM || + mem.kind() == Memory::Z_COPY_MEM) { + const char* val = std::getenv("MIN_SIZE"); // or any env var + size_t device_size = 2000000; //default + if (val) { + device_size = atoi(val); + } + minimal_size = max(minimal_size, device_size); + size_t optimal_size = is.bounds.volume() * sizeof(Rect) * target_spaces.size() + minimal_size; + suggestions[i].suggested = mem; + suggestions[i].lower_bound = minimal_size; + suggestions[i].upper_bound = optimal_size; + } else { + suggestions[i].suggested = Memory::NO_MEMORY; + suggestions[i].lower_bound = 0; + suggestions[i].upper_bound = 0; + } + } + } + template template Event IndexSpace::create_subspaces_by_preimage( @@ -165,529 +218,625 @@ namespace Realm { std::cout << " " << targets[it->first] << " = " << it->second->rects.size() << " rectangles" << std::endl; #endif - // iterate over sparsity outputs and contribute to all (even if we didn't have any - // points found for it) - int empty_count = 0; - for(size_t i = 0; i < sparsity_outputs.size(); i++) { - SparsityMapImpl *impl = SparsityMapImpl::lookup(sparsity_outputs[i]); - typename std::map *>::const_iterator it2 = rect_map.find(i); - if(it2 != rect_map.end()) { - impl->contribute_dense_rect_list(it2->second->rects, true /*disjoint*/); - delete 
it2->second; - } else { - impl->contribute_nothing(); - empty_count++; - } - } - if(empty_count > 0) - log_part.info() << empty_count << " empty preimages (out of " << sparsity_outputs.size() << ")"; - } - - template - void PreimageMicroOp::dispatch(PartitioningOperation *op, bool inline_ok) - { - // a PreimageMicroOp should always be executed on whichever node the field data lives - NodeID exec_node = ID(inst).instance_owner_node(); - - if(exec_node != Network::my_node_id) { - forward_microop >(exec_node, op, this); - return; - } - - // Need valid data for the instance space - if (!inst_space.dense()) { - // it's safe to add the count after the registration only because we initialized - // the count to 2 instead of 1 - bool registered = SparsityMapImpl::lookup(inst_space.sparsity)->add_waiter(this, true /*precise*/); - if(registered) - wait_count.fetch_add(1); - } - - // need valid data for each target - for(size_t i = 0; i < targets.size(); i++) { - if(!targets[i].dense()) { - // it's safe to add the count after the registration only because we initialized - // the count to 2 instead of 1 - bool registered = SparsityMapImpl::lookup(targets[i].sparsity)->add_waiter(this, true /*precise*/); - if(registered) - wait_count.fetch_add(1); - } - } - - // need valid data for the parent space too - if(!parent_space.dense()) { - // it's safe to add the count after the registration only because we initialized - // the count to 2 instead of 1 - bool registered = SparsityMapImpl::lookup(parent_space.sparsity)->add_waiter(this, true /*precise*/); - if(registered) - wait_count.fetch_add(1); - } - - finish_dispatch(op, inline_ok); - } - - template - template - bool PreimageMicroOp::serialize_params(S& s) const - { - return((s << parent_space) && - (s << inst_space) && - (s << inst) && - (s << field_offset) && - (s << is_ranged) && - (s << targets) && - (s << sparsity_outputs)); - } - - template - template - PreimageMicroOp::PreimageMicroOp(NodeID _requestor, - AsyncMicroOp 
*_async_microop, S& s) - : PartitioningMicroOp(_requestor, _async_microop) - { - bool ok = ((s >> parent_space) && - (s >> inst_space) && - (s >> inst) && - (s >> field_offset) && - (s >> is_ranged) && - (s >> targets) && - (s >> sparsity_outputs)); - assert(ok); - (void)ok; - } - - template - ActiveMessageHandlerReg > > PreimageMicroOp::areg; - - - //////////////////////////////////////////////////////////////////////// - // - // class PreimageOperation - - template - PreimageOperation::PreimageOperation( - const IndexSpace &_parent, - const DomainTransform &_domain_transform, - const ProfilingRequestSet &reqs, GenEventImpl *_finish_event, - EventImpl::gen_t _finish_gen) - : PartitioningOperation(reqs, _finish_event, _finish_gen), - parent(_parent), - domain_transform(_domain_transform), - overlap_tester(0), - dummy_overlap_uop(0) { - areg.force_instantiation(); - } - - template - PreimageOperation::~PreimageOperation(void) - { - if(overlap_tester) - delete overlap_tester; - } - - template - IndexSpace PreimageOperation::add_target(const IndexSpace& target) - { - // try to filter out obviously empty targets - if(parent.empty() || target.empty()) - return IndexSpace::make_empty(); - - // otherwise it'll be something smaller than the current parent - IndexSpace preimage; - preimage.bounds = parent.bounds; - - // if the target has a sparsity map, use the same node - otherwise - // get a sparsity ID by round-robin'ing across the nodes that have field data - int target_node; - if(!target.dense()) - target_node = ID(target.sparsity).sparsity_creator_node(); - else if (!domain_transform.ptr_data.empty()) - target_node = - ID(domain_transform - .ptr_data[targets.size() % domain_transform.ptr_data.size()] - .inst) - .instance_owner_node(); - else - target_node = - ID(domain_transform - .range_data[targets.size() % domain_transform.range_data.size()] - .inst) - .instance_owner_node(); - SparsityMap sparsity = 
get_runtime()->get_available_sparsity_impl(target_node)->me.convert >(); - preimage.sparsity = sparsity; - - targets.push_back(target); - preimages.push_back(sparsity); - - return preimage; - } - - template - void PreimageOperation::execute(void) { - if (domain_transform.type == - DomainTransform::DomainTransformType::STRUCTURED) { - for (size_t i = 0; i < preimages.size(); i++) { - SparsityMapImpl::lookup(preimages[i])->set_contributor_count(1); - } - - StructuredPreimageMicroOp *micro_op = - new StructuredPreimageMicroOp( - domain_transform.structured_transform, parent); - - for (size_t j = 0; j < targets.size(); j++) { - micro_op->add_sparsity_output(targets[j], preimages[j]); - } - micro_op->dispatch(this, true); - } else { - if (!DeppartConfig::cfg_disable_intersection_optimization) { - // build the overlap tester based on the targets, since they're at least - // known - ComputeOverlapMicroOp *uop = - new ComputeOverlapMicroOp(this); - - remaining_sparse_images.store(domain_transform.ptr_data.size() + - domain_transform.range_data.size()); - contrib_counts.resize(preimages.size(), atomic(0)); - - // create a dummy async microop that lives until we've received all the - // sparse images - dummy_overlap_uop = new AsyncMicroOp(this, 0); - add_async_work_item(dummy_overlap_uop); - - // add each target, but also generate a bounding box for all of them - Rect target_bbox; - for (size_t i = 0; i < targets.size(); i++) { - uop->add_input_space(targets[i]); - if (i == 0) - target_bbox = targets[i].bounds; - else - target_bbox = target_bbox.union_bbox(targets[i].bounds); - } - - for (size_t i = 0; i < domain_transform.ptr_data.size(); i++) { - // in parallel, we will request the approximate images of each instance's - // data (ideally limited to the target_bbox) - ImageMicroOp *img = new ImageMicroOp( - target_bbox, domain_transform.ptr_data[i].index_space, - domain_transform.ptr_data[i].inst, - domain_transform.ptr_data[i].field_offset, false /*ptrs*/); - 
img->add_approx_output(i, this); - img->dispatch(this, false /* do not run in this thread */); - } - - for (size_t i = 0; i < domain_transform.range_data.size(); i++) { - // in parallel, we will request the approximate images of each instance's - // data (ideally limited to the target_bbox) - ImageMicroOp *img = new ImageMicroOp( - target_bbox, domain_transform.range_data[i].index_space, - domain_transform.range_data[i].inst, - domain_transform.range_data[i].field_offset, true /*ranges*/); - img->add_approx_output(i + domain_transform.ptr_data.size(), this); - img->dispatch(this, false /* do not run in this thread */); - } + // iterate over sparsity outputs and contribute to all (even if we didn't have any + // points found for it) + int empty_count = 0; + for (size_t i = 0; i < sparsity_outputs.size(); i++) { + SparsityMapImpl *impl = SparsityMapImpl::lookup(sparsity_outputs[i]); + typename std::map *>::const_iterator it2 = rect_map.find(i); + if (it2 != rect_map.end()) { + impl->contribute_dense_rect_list(it2->second->rects, true /*disjoint*/); + delete it2->second; + } else { + impl->contribute_nothing(); + empty_count++; + } + } + if (empty_count > 0) { + log_part.info() << empty_count << " empty preimages (out of " << sparsity_outputs.size() << ")"; + } + } - uop->dispatch(this, true /* ok to run in this thread */); - } else { - for (size_t i = 0; i < preimages.size(); i++) - SparsityMapImpl::lookup(preimages[i]) - ->set_contributor_count(domain_transform.ptr_data.size() + - domain_transform.range_data.size()); - - for (size_t i = 0; i < domain_transform.ptr_data.size(); i++) { - PreimageMicroOp *uop = new PreimageMicroOp( - parent, domain_transform.ptr_data[i].index_space, - domain_transform.ptr_data[i].inst, - domain_transform.ptr_data[i].field_offset, false /*ptrs*/); - for (size_t j = 0; j < targets.size(); j++) - uop->add_sparsity_output(targets[j], preimages[j]); - uop->dispatch(this, true /* ok to run in this thread */); - } + template + void 
PreimageMicroOp::dispatch(PartitioningOperation *op, bool inline_ok) { + // a PreimageMicroOp should always be executed on whichever node the field data lives + NodeID exec_node = ID(inst).instance_owner_node(); + + if (exec_node != Network::my_node_id) { + forward_microop >(exec_node, op, this); + return; + } + + // Need valid data for the instance space + if (!inst_space.dense()) { + // it's safe to add the count after the registration only because we initialized + // the count to 2 instead of 1 + bool registered = SparsityMapImpl::lookup(inst_space.sparsity)->add_waiter(this, true /*precise*/); + if (registered) + wait_count.fetch_add(1); + } + + // need valid data for each target + for (size_t i = 0; i < targets.size(); i++) { + if (!targets[i].dense()) { + // it's safe to add the count after the registration only because we initialized + // the count to 2 instead of 1 + bool registered = SparsityMapImpl::lookup(targets[i].sparsity)->add_waiter( + this, true /*precise*/); + if (registered) + wait_count.fetch_add(1); + } + } + + // need valid data for the parent space too + if (!parent_space.dense()) { + // it's safe to add the count after the registration only because we initialized + // the count to 2 instead of 1 + bool registered = SparsityMapImpl::lookup(parent_space.sparsity)->add_waiter(this, true /*precise*/); + if (registered) + wait_count.fetch_add(1); + } + + finish_dispatch(op, inline_ok); + } - for (size_t i = 0; i < domain_transform.range_data.size(); i++) { - PreimageMicroOp *uop = new PreimageMicroOp( - parent, domain_transform.range_data[i].index_space, - domain_transform.range_data[i].inst, - domain_transform.range_data[i].field_offset, true /*ranges*/); - for (size_t j = 0; j < targets.size(); j++) - uop->add_sparsity_output(targets[j], preimages[j]); - uop->dispatch(this, true /* ok to run in this thread */); - } - } - } - } + template + template + bool PreimageMicroOp::serialize_params(S &s) const { + return ((s << parent_space) && + (s << 
inst_space) && + (s << inst) && + (s << field_offset) && + (s << is_ranged) && + (s << targets) && + (s << sparsity_outputs)); + } - template - void PreimageOperation::provide_sparse_image(int index, const Rect *rects, size_t count) - { - // atomically check the overlap tester's readiness and queue us if not - bool tester_ready = false; - { - AutoLock<> al(mutex); - if(overlap_tester != 0) { - tester_ready = true; - } else { - std::vector >& r = pending_sparse_images[index]; - r.insert(r.end(), rects, rects + count); - } - } + template + template + PreimageMicroOp::PreimageMicroOp(NodeID _requestor, + AsyncMicroOp *_async_microop, S &s) + : PartitioningMicroOp(_requestor, _async_microop) { + bool ok = ((s >> parent_space) && + (s >> inst_space) && + (s >> inst) && + (s >> field_offset) && + (s >> is_ranged) && + (s >> targets) && + (s >> sparsity_outputs)); + assert(ok); + (void) ok; + } - if(tester_ready) { - // see which of the targets this image overlaps - std::set overlaps; - overlap_tester->test_overlap(rects, count, overlaps); - if((size_t)index < domain_transform.ptr_data.size()) { - log_part.info() << "image of ptr_data[" << index << "] overlaps " << overlaps.size() << " targets"; - PreimageMicroOp *uop = new PreimageMicroOp( - parent, domain_transform.ptr_data[index].index_space, - domain_transform.ptr_data[index].inst, - domain_transform.ptr_data[index].field_offset, false /*ptrs*/); - for(std::set::const_iterator it2 = overlaps.begin(); - it2 != overlaps.end(); - it2++) { - int j = *it2; - contrib_counts[j].fetch_add(1); - uop->add_sparsity_output(targets[j], preimages[j]); + template + ActiveMessageHandlerReg > > PreimageMicroOp::areg; + + + //////////////////////////////////////////////////////////////////////// + // + // class PreimageOperation + + template + PreimageOperation::PreimageOperation( + const IndexSpace &_parent, + const DomainTransform &_domain_transform, + const ProfilingRequestSet &reqs, GenEventImpl *_finish_event, + EventImpl::gen_t 
_finish_gen) + : PartitioningOperation(reqs, _finish_event, _finish_gen), + parent(_parent), + domain_transform(_domain_transform), + overlap_tester(0), + dummy_overlap_uop(0) { + areg.force_instantiation(); } - uop->dispatch(this, false /* do not run in this thread */); - } else { - size_t rel_index = index - domain_transform.ptr_data.size(); - assert(rel_index < domain_transform.range_data.size()); - log_part.info() << "image of range_data[" << rel_index << "] overlaps " << overlaps.size() << " targets"; - PreimageMicroOp *uop = new PreimageMicroOp( - parent, domain_transform.range_data[rel_index].index_space, - domain_transform.range_data[rel_index].inst, - domain_transform.range_data[rel_index].field_offset, - true /*ranges*/); - for(std::set::const_iterator it2 = overlaps.begin(); - it2 != overlaps.end(); - it2++) { - int j = *it2; - contrib_counts[j].fetch_add(1); - uop->add_sparsity_output(targets[j], preimages[j]); + + template + PreimageOperation::~PreimageOperation(void) { + if (overlap_tester) + delete overlap_tester; } - uop->dispatch(this, false /* do not run in this thread */); - } - // if these were the last sparse images, we can now set the contributor counts - int v = remaining_sparse_images.fetch_sub(1) - 1; - if(v == 0) { - for(size_t j = 0; j < preimages.size(); j++) { - log_part.info() << contrib_counts[j].load() << " total contributors to preimage " << j; - SparsityMapImpl::lookup(preimages[j])->set_contributor_count(contrib_counts[j].load()); + template + IndexSpace PreimageOperation::add_target(const IndexSpace &target) { + // try to filter out obviously empty targets + if (parent.empty() || target.empty()) + return IndexSpace::make_empty(); + + // otherwise it'll be something smaller than the current parent + IndexSpace preimage; + preimage.bounds = parent.bounds; + + // if the target has a sparsity map, use the same node - otherwise + // get a sparsity ID by round-robin'ing across the nodes that have field data + int target_node; + if 
(!target.dense()) + target_node = ID(target.sparsity).sparsity_creator_node(); + else if (!domain_transform.ptr_data.empty()) + target_node = + ID(domain_transform + .ptr_data[targets.size() % domain_transform.ptr_data.size()] + .inst) + .instance_owner_node(); + else + target_node = + ID(domain_transform + .range_data[targets.size() % domain_transform.range_data.size()] + .inst) + .instance_owner_node(); + SparsityMap sparsity = get_runtime()->get_available_sparsity_impl(target_node)->me.convert >(); + preimage.sparsity = sparsity; + + targets.push_back(target); + preimages.push_back(sparsity); + + return preimage; } - dummy_overlap_uop->mark_finished(true /*successful*/); - } - } - } - template - void PreimageOperation::set_overlap_tester(void *tester) - { - // atomically set the overlap tester and see if there are any pending entries - std::map > > pending; - { - AutoLock<> al(mutex); - assert(overlap_tester == 0); - overlap_tester = static_cast *>(tester); - pending.swap(pending_sparse_images); - } + template + void PreimageOperation::execute(void) { + std::vector,Point> > gpu_ptr_data; + std::vector,Point> > cpu_ptr_data; + std::vector,Rect> > gpu_rect_data; + std::vector,Rect> > cpu_rect_data; + for (size_t i = 0; i < domain_transform.ptr_data.size(); i++) { + if (domain_transform.ptr_data[i].inst.get_location().kind() == + Memory::GPU_FB_MEM || domain_transform.ptr_data[i].inst.get_location().kind() == Memory::Z_COPY_MEM) { + gpu_ptr_data.push_back(domain_transform.ptr_data[i]); + } else { + cpu_ptr_data.push_back(domain_transform.ptr_data[i]); + } + } + for (size_t i = 0; i < domain_transform.range_data.size(); i++) { + if (domain_transform.range_data[i].inst.get_location().kind() == + Memory::GPU_FB_MEM || domain_transform.range_data[i].inst.get_location().kind() == Memory::Z_COPY_MEM) { + gpu_rect_data.push_back(domain_transform.range_data[i]); + } else { + cpu_rect_data.push_back(domain_transform.range_data[i]); + } + } + bool gpu_data = 
!gpu_ptr_data.empty() || !gpu_rect_data.empty(); + bool opcount = cpu_ptr_data.size() + cpu_rect_data.size() + gpu_ptr_data.size() + gpu_rect_data.size(); + bool exclusive = (gpu_data && (opcount == 1)); + if (domain_transform.type == + DomainTransform::DomainTransformType::STRUCTURED && !gpu_data) { + for (size_t i = 0; i < preimages.size(); i++) { + SparsityMapImpl::lookup(preimages[i])->set_contributor_count(1); + } + + StructuredPreimageMicroOp *micro_op = + new StructuredPreimageMicroOp( + domain_transform.structured_transform, parent); + + for (size_t j = 0; j < targets.size(); j++) { + micro_op->add_sparsity_output(targets[j], preimages[j]); + } + micro_op->dispatch(this, true); + } else if (!DeppartConfig::cfg_disable_intersection_optimization && !gpu_data) { + // build the overlap tester based on the targets, since they're at least + // known + ComputeOverlapMicroOp *uop = + new ComputeOverlapMicroOp(this); + + remaining_sparse_images.store(domain_transform.ptr_data.size() + + domain_transform.range_data.size()); + contrib_counts.resize(preimages.size(), atomic(0)); + + // create a dummy async microop that lives until we've received all the + // sparse images + dummy_overlap_uop = new AsyncMicroOp(this, 0); + add_async_work_item(dummy_overlap_uop); + + // add each target, but also generate a bounding box for all of them + Rect target_bbox; + for (size_t i = 0; i < targets.size(); i++) { + uop->add_input_space(targets[i]); + if (i == 0) + target_bbox = targets[i].bounds; + else + target_bbox = target_bbox.union_bbox(targets[i].bounds); + } + + for (size_t i = 0; i < domain_transform.ptr_data.size(); i++) { + // in parallel, we will request the approximate images of each instance's + // data (ideally limited to the target_bbox) + ImageMicroOp *img = new ImageMicroOp( + target_bbox, domain_transform.ptr_data[i].index_space, + domain_transform.ptr_data[i].inst, + domain_transform.ptr_data[i].field_offset, false /*ptrs*/); + img->add_approx_output(i, this); + 
img->dispatch(this, false /* do not run in this thread */); + } + + for (size_t i = 0; i < domain_transform.range_data.size(); i++) { + // in parallel, we will request the approximate images of each instance's + // data (ideally limited to the target_bbox) + ImageMicroOp *img = new ImageMicroOp( + target_bbox, domain_transform.range_data[i].index_space, + domain_transform.range_data[i].inst, + domain_transform.range_data[i].field_offset, true /*ranges*/); + img->add_approx_output(i + domain_transform.ptr_data.size(), this); + img->dispatch(this, false /* do not run in this thread */); + } + + uop->dispatch(this, true /* ok to run in this thread */); + } else { + if (!exclusive) { + for (size_t i = 0; i < preimages.size(); i++) + SparsityMapImpl::lookup(preimages[i]) + ->set_contributor_count(opcount); + } + for (size_t i = 0; i < cpu_ptr_data.size(); i++) { + PreimageMicroOp *uop = new PreimageMicroOp( + parent, cpu_ptr_data[i].index_space, + cpu_ptr_data[i].inst, + cpu_ptr_data[i].field_offset, false /*ptrs*/); + for (size_t j = 0; j < targets.size(); j++) + uop->add_sparsity_output(targets[j], preimages[j]); + uop->dispatch(this, true /* ok to run in this thread */); + } + for (size_t i = 0; i < cpu_rect_data.size(); i++) { + PreimageMicroOp *uop = new PreimageMicroOp( + parent, cpu_rect_data[i].index_space, + cpu_rect_data[i].inst, + cpu_rect_data[i].field_offset, true /*ranges*/); + for (size_t j = 0; j < targets.size(); j++) + uop->add_sparsity_output(targets[j], preimages[j]); + uop->dispatch(this, true /* ok to run in this thread */); + } + for (auto ptr_fdd : gpu_ptr_data) { + domain_transform.ptr_data = {ptr_fdd}; + GPUPreimageMicroOp *micro_op = + new GPUPreimageMicroOp( + domain_transform, parent, exclusive); + for (size_t j = 0; j < targets.size(); j++) { + micro_op->add_sparsity_output(targets[j], preimages[j]); + } + micro_op->dispatch(this, true); + } + for (auto range_fdd : gpu_rect_data) { + domain_transform.range_data = {range_fdd}; + 
GPUPreimageMicroOp *micro_op = + new GPUPreimageMicroOp( + domain_transform, parent, exclusive); + for (size_t j = 0; j < targets.size(); j++) { + micro_op->add_sparsity_output(targets[j], preimages[j]); + } + micro_op->dispatch(this, true); + } + } + } - // now issue work for any sparse images we got before the tester was ready - if(!pending.empty()) { - for(typename std::map > >::const_iterator it = pending.begin(); - it != pending.end(); - it++) { - // see which instance this is an image from - size_t idx = it->first; - // see which of the targets that image overlaps - std::set overlaps; - overlap_tester->test_overlap(&it->second[0], it->second.size(), overlaps); - if(idx < domain_transform.ptr_data.size()) { - log_part.info() << "image of ptr_data[" << idx << "] overlaps " << overlaps.size() << " targets"; - PreimageMicroOp *uop = - new PreimageMicroOp( - parent, domain_transform.ptr_data[idx].index_space, - domain_transform.ptr_data[idx].inst, - domain_transform.ptr_data[idx].field_offset, false /*ptrs*/); - for(std::set::const_iterator it2 = overlaps.begin(); - it2 != overlaps.end(); - it2++) { - int j = *it2; - contrib_counts[j].fetch_add(1); - uop->add_sparsity_output(targets[j], preimages[j]); - } - uop->dispatch(this, true /* ok to run in this thread */); - } else { - size_t rel_index = idx - domain_transform.ptr_data.size(); - assert(rel_index < domain_transform.range_data.size()); - log_part.info() << "image of range_data[" << rel_index << "] overlaps " << overlaps.size() << " targets"; - PreimageMicroOp *uop = - new PreimageMicroOp( - parent, domain_transform.range_data[rel_index].index_space, - domain_transform.range_data[rel_index].inst, - domain_transform.range_data[rel_index].field_offset, - true /*ranges*/); - for(std::set::const_iterator it2 = overlaps.begin(); - it2 != overlaps.end(); - it2++) { - int j = *it2; - contrib_counts[j].fetch_add(1); - uop->add_sparsity_output(targets[j], preimages[j]); - } - uop->dispatch(this, true /* ok to run in 
this thread */); + template + void PreimageOperation::provide_sparse_image(int index, const Rect *rects, size_t count) { + // atomically check the overlap tester's readiness and queue us if not + bool tester_ready = false; + { + AutoLock<> al(mutex); + if (overlap_tester != 0) { + tester_ready = true; + } else { + std::vector > &r = pending_sparse_images[index]; + r.insert(r.end(), rects, rects + count); + } + } + + if (tester_ready) { + // see which of the targets this image overlaps + std::set overlaps; + overlap_tester->test_overlap(rects, count, overlaps); + if ((size_t) index < domain_transform.ptr_data.size()) { + log_part.info() << "image of ptr_data[" << index << "] overlaps " << overlaps.size() << " targets"; + PreimageMicroOp *uop = new PreimageMicroOp( + parent, domain_transform.ptr_data[index].index_space, + domain_transform.ptr_data[index].inst, + domain_transform.ptr_data[index].field_offset, false /*ptrs*/); + for (std::set::const_iterator it2 = overlaps.begin(); + it2 != overlaps.end(); + it2++) { + int j = *it2; + contrib_counts[j].fetch_add(1); + uop->add_sparsity_output(targets[j], preimages[j]); + } + uop->dispatch(this, false /* do not run in this thread */); + } else { + size_t rel_index = index - domain_transform.ptr_data.size(); + assert(rel_index < domain_transform.range_data.size()); + log_part.info() << "image of range_data[" << rel_index << "] overlaps " << overlaps.size() << + " targets"; + PreimageMicroOp *uop = new PreimageMicroOp( + parent, domain_transform.range_data[rel_index].index_space, + domain_transform.range_data[rel_index].inst, + domain_transform.range_data[rel_index].field_offset, + true /*ranges*/); + for (std::set::const_iterator it2 = overlaps.begin(); + it2 != overlaps.end(); + it2++) { + int j = *it2; + contrib_counts[j].fetch_add(1); + uop->add_sparsity_output(targets[j], preimages[j]); + } + uop->dispatch(this, false /* do not run in this thread */); + } + + // if these were the last sparse images, we can now set 
the contributor counts + int v = remaining_sparse_images.fetch_sub(1) - 1; + if (v == 0) { + for (size_t j = 0; j < preimages.size(); j++) { + log_part.info() << contrib_counts[j].load() << " total contributors to preimage " << j; + SparsityMapImpl::lookup(preimages[j])->set_contributor_count(contrib_counts[j].load()); + } + dummy_overlap_uop->mark_finished(true /*successful*/); + } + } } - } - // if these were the last sparse images, we can now set the contributor counts - int v = remaining_sparse_images.fetch_sub(pending.size()) - pending.size(); - if(v == 0) { - for(size_t j = 0; j < preimages.size(); j++) { - log_part.info() << contrib_counts[j].load() << " total contributors to preimage " << j; - SparsityMapImpl::lookup(preimages[j])->set_contributor_count(contrib_counts[j].load()); + template + void PreimageOperation::set_overlap_tester(void *tester) { + // atomically set the overlap tester and see if there are any pending entries + std::map > > pending; + { + AutoLock<> al(mutex); + assert(overlap_tester == 0); + overlap_tester = static_cast *>(tester); + pending.swap(pending_sparse_images); + } + + // now issue work for any sparse images we got before the tester was ready + if (!pending.empty()) { + for (typename std::map > >::const_iterator it = pending.begin(); + it != pending.end(); + it++) { + // see which instance this is an image from + size_t idx = it->first; + // see which of the targets that image overlaps + std::set overlaps; + overlap_tester->test_overlap(&it->second[0], it->second.size(), overlaps); + if (idx < domain_transform.ptr_data.size()) { + log_part.info() << "image of ptr_data[" << idx << "] overlaps " << overlaps.size() << " targets"; + PreimageMicroOp *uop = + new PreimageMicroOp( + parent, domain_transform.ptr_data[idx].index_space, + domain_transform.ptr_data[idx].inst, + domain_transform.ptr_data[idx].field_offset, false /*ptrs*/); + for (std::set::const_iterator it2 = overlaps.begin(); + it2 != overlaps.end(); + it2++) { + int j = 
*it2; + contrib_counts[j].fetch_add(1); + uop->add_sparsity_output(targets[j], preimages[j]); + } + uop->dispatch(this, true /* ok to run in this thread */); + } else { + size_t rel_index = idx - domain_transform.ptr_data.size(); + assert(rel_index < domain_transform.range_data.size()); + log_part.info() << "image of range_data[" << rel_index << "] overlaps " << overlaps.size() << + " targets"; + PreimageMicroOp *uop = + new PreimageMicroOp( + parent, domain_transform.range_data[rel_index].index_space, + domain_transform.range_data[rel_index].inst, + domain_transform.range_data[rel_index].field_offset, + true /*ranges*/); + for (std::set::const_iterator it2 = overlaps.begin(); + it2 != overlaps.end(); + it2++) { + int j = *it2; + contrib_counts[j].fetch_add(1); + uop->add_sparsity_output(targets[j], preimages[j]); + } + uop->dispatch(this, true /* ok to run in this thread */); + } + } + + // if these were the last sparse images, we can now set the contributor counts + int v = remaining_sparse_images.fetch_sub(pending.size()) - pending.size(); + if (v == 0) { + for (size_t j = 0; j < preimages.size(); j++) { + log_part.info() << contrib_counts[j].load() << " total contributors to preimage " << j; + SparsityMapImpl::lookup(preimages[j])->set_contributor_count(contrib_counts[j].load()); + } + dummy_overlap_uop->mark_finished(true /*successful*/); + } + } } - dummy_overlap_uop->mark_finished(true /*successful*/); - } - } - } - template - void PreimageOperation::print(std::ostream& os) const - { - os << "PreimageOperation(" << parent << ")"; - } + template + void PreimageOperation::print(std::ostream &os) const { + os << "PreimageOperation(" << parent << ")"; + } - template - ActiveMessageHandlerReg > > PreimageOperation::areg; + template + ActiveMessageHandlerReg > > PreimageOperation::areg; - //////////////////////////////////////////////////////////////////////// - // - // class ApproxImageResponseMessage + 
//////////////////////////////////////////////////////////////////////// + // + // class ApproxImageResponseMessage - template - /*static*/ void ApproxImageResponseMessage::handle_message(NodeID sender, - const ApproxImageResponseMessage &msg, - const void *data, size_t datalen) - { - T *op = reinterpret_cast(msg.approx_output_op); - op->provide_sparse_image(msg.approx_output_index, - static_cast *>(data), - datalen / sizeof(Rect)); - } + template + /*static*/ void ApproxImageResponseMessage::handle_message(NodeID sender, + const ApproxImageResponseMessage &msg, + const void *data, size_t datalen) { + T *op = reinterpret_cast(msg.approx_output_op); + op->provide_sparse_image(msg.approx_output_index, + static_cast *>(data), + datalen / sizeof(Rect)); + } - //////////////////////////////////////////////////////////////////////// - // - // class StructuredPreimageMicroOp + //////////////////////////////////////////////////////////////////////// + // + // class StructuredPreimageMicroOp - template - StructuredPreimageMicroOp::StructuredPreimageMicroOp( - const StructuredTransform &_transform, - IndexSpace _parent_space) - : transform(_transform), parent_space(_parent_space) {} + template + StructuredPreimageMicroOp::StructuredPreimageMicroOp( + const StructuredTransform &_transform, + IndexSpace _parent_space) + : transform(_transform), parent_space(_parent_space) { + } - template - StructuredPreimageMicroOp::~StructuredPreimageMicroOp(void) {} + template + StructuredPreimageMicroOp::~StructuredPreimageMicroOp(void) { + } - template - void StructuredPreimageMicroOp::add_sparsity_output( - IndexSpace _target, SparsityMap _sparsity) { - targets.push_back(_target); - sparsity_outputs.push_back(_sparsity); - } + template + void StructuredPreimageMicroOp::add_sparsity_output( + IndexSpace _target, SparsityMap _sparsity) { + targets.push_back(_target); + sparsity_outputs.push_back(_sparsity); + } - template - template - void StructuredPreimageMicroOp::populate_bitmasks( - 
std::map &bitmasks) { - Rect target_bbox = targets[0].bounds; - for (size_t i = 1; i < targets.size(); i++) { - target_bbox = target_bbox.union_bbox(targets[i].bounds); - } - for (IndexSpaceIterator it2(parent_space); it2.valid; it2.step()) { - Rect parent_bbox; - parent_bbox.lo = transform[it2.rect.lo]; - parent_bbox.hi = transform[it2.rect.hi]; - - if (target_bbox.intersection(parent_bbox).empty()) continue; - - for (PointInRectIterator pir(it2.rect); pir.valid; pir.step()) { - Point target_point = transform[pir.p]; - for (size_t i = 0; i < targets.size(); i++) { - if (targets[i].contains(target_point)) { - BM *&bmp = bitmasks[i]; - if (!bmp) bmp = new BM; - bmp->add_point(pir.p); - } - } - } - } - } + template + template + void StructuredPreimageMicroOp::populate_bitmasks( + std::map &bitmasks) { + Rect target_bbox = targets[0].bounds; + for (size_t i = 1; i < targets.size(); i++) { + target_bbox = target_bbox.union_bbox(targets[i].bounds); + } + for (IndexSpaceIterator it2(parent_space); it2.valid; it2.step()) { + Rect parent_bbox; + parent_bbox.lo = transform[it2.rect.lo]; + parent_bbox.hi = transform[it2.rect.hi]; + + if (target_bbox.intersection(parent_bbox).empty()) continue; + + for (PointInRectIterator pir(it2.rect); pir.valid; pir.step()) { + Point target_point = transform[pir.p]; + for (size_t i = 0; i < targets.size(); i++) { + if (targets[i].contains(target_point)) { + BM *&bmp = bitmasks[i]; + if (!bmp) bmp = new BM; + bmp->add_point(pir.p); + } + } + } + } + } - template - void StructuredPreimageMicroOp::execute(void) - { - TimeStamp ts("PreimageMicroOp::execute", true, &log_uop_timing); - std::map *> rect_map; + template + void StructuredPreimageMicroOp::execute(void) { + TimeStamp ts("PreimageMicroOp::execute", true, &log_uop_timing); + std::map *> rect_map; - populate_bitmasks(rect_map); + populate_bitmasks(rect_map); #ifdef DEBUG_PARTITIONING - std::cout << rect_map.size() << " non-empty preimages present in instance " - << inst << std::endl; - 
for (typename std::map *>::const_iterator it = - rect_map.begin(); - it != rect_map.end(); it++) - std::cout << " " << targets[it->first] << " = " - << it->second->rects.size() << " rectangles" << std::endl; + std::cout << rect_map.size() << " non-empty preimages present in instance " + << inst << std::endl; + for (typename std::map *>::const_iterator it = + rect_map.begin(); + it != rect_map.end(); it++) + std::cout << " " << targets[it->first] << " = " + << it->second->rects.size() << " rectangles" << std::endl; #endif - // iterate over sparsity outputs and contribute to all (even if we - // didn't have any points found for it) - int empty_count = 0; - for (size_t i = 0; i < sparsity_outputs.size(); i++) { - SparsityMapImpl *impl = - SparsityMapImpl::lookup(sparsity_outputs[i]); - typename std::map *>::const_iterator it2 = - rect_map.find(i); - if (it2 != rect_map.end()) { - impl->contribute_dense_rect_list(it2->second->rects, true /*disjoint*/); - delete it2->second; - } else { - impl->contribute_nothing(); - empty_count++; - } - } + // iterate over sparsity outputs and contribute to all (even if we + // didn't have any points found for it) + int empty_count = 0; + for (size_t i = 0; i < sparsity_outputs.size(); i++) { + SparsityMapImpl *impl = + SparsityMapImpl::lookup(sparsity_outputs[i]); + typename std::map *>::const_iterator it2 = + rect_map.find(i); + if (it2 != rect_map.end()) { + impl->contribute_dense_rect_list(it2->second->rects, true /*disjoint*/); + delete it2->second; + } else { + impl->contribute_nothing(); + empty_count++; + } + } + + if (empty_count > 0) { + log_part.info() << empty_count << " empty preimages (out of " + << sparsity_outputs.size() << ")"; + } + } - if (empty_count > 0) { - log_part.info() << empty_count << " empty preimages (out of " - << sparsity_outputs.size() << ")"; - } - } + template + void StructuredPreimageMicroOp::dispatch( + PartitioningOperation *op, bool inline_ok) { + // need valid data for each target + for (size_t i 
= 0; i < targets.size(); i++) { + if (!targets[i].dense()) { + // it's safe to add the count after the registration only because we + // initialized the count to 2 instead of 1 + bool registered = SparsityMapImpl::lookup(targets[i].sparsity) + ->add_waiter(this, true /*precise*/); + if (registered) wait_count.fetch_add(1); + } + } + + // need valid data for the parent space too + if (!parent_space.dense()) { + // it's safe to add the count after the registration only because we + // initialized the count to 2 instead of 1 + bool registered = SparsityMapImpl::lookup(parent_space.sparsity) + ->add_waiter(this, true /*precise*/); + if (registered) wait_count.fetch_add(1); + } + + this->finish_dispatch(op, inline_ok); + } - template - void StructuredPreimageMicroOp::dispatch( - PartitioningOperation *op, bool inline_ok) { - // need valid data for each target - for (size_t i = 0; i < targets.size(); i++) { - if (!targets[i].dense()) { - // it's safe to add the count after the registration only because we - // initialized the count to 2 instead of 1 - bool registered = SparsityMapImpl::lookup(targets[i].sparsity) - ->add_waiter(this, true /*precise*/); - if (registered) wait_count.fetch_add(1); - } - } + //////////////////////////////////////////////////////////////////////// + // + // class GPUPreimageMicroOp - // need valid data for the parent space too - if (!parent_space.dense()) { - // it's safe to add the count after the registration only because we - // initialized the count to 2 instead of 1 - bool registered = SparsityMapImpl::lookup(parent_space.sparsity) - ->add_waiter(this, true /*precise*/); - if (registered) wait_count.fetch_add(1); - } + template + GPUPreimageMicroOp::GPUPreimageMicroOp( + const DomainTransform &_domain_transform, + IndexSpace _parent_space, bool _exclusive) + : domain_transform(_domain_transform), parent_space(_parent_space) { + this->exclusive = _exclusive; + } - finish_dispatch(op, inline_ok); - } + template + 
GPUPreimageMicroOp::~GPUPreimageMicroOp(void) { + } + + template + void GPUPreimageMicroOp::add_sparsity_output( + IndexSpace _target, SparsityMap _sparsity) { + targets.push_back(_target); + sparsity_outputs.push_back(_sparsity); + } + + template + void GPUPreimageMicroOp::execute(void) { + TimeStamp ts("GPUPreimageMicroOp::execute", true, &log_uop_timing); + if (domain_transform.ptr_data.size() > 0) { + gpu_populate_bitmasks(); + } else if (domain_transform.range_data.size() > 0) { + gpu_populate_ranges(); + } + } - // instantiations of templates handled in preimage_tmpl.cc + template + void GPUPreimageMicroOp::dispatch( + PartitioningOperation *op, bool inline_ok) { + // need valid data for each target + for (size_t i = 0; i < targets.size(); i++) { + if (!targets[i].dense()) { + // it's safe to add the count after the registration only because we + // initialized the count to 2 instead of 1 + bool registered = SparsityMapImpl::lookup(targets[i].sparsity) + ->add_waiter(this, true /*precise*/); + if (registered) this->wait_count.fetch_add(1); + } + } + + // need valid data for the parent space too + if (!parent_space.dense()) { + // it's safe to add the count after the registration only because we + // initialized the count to 2 instead of 1 + bool registered = SparsityMapImpl::lookup(parent_space.sparsity) + ->add_waiter(this, true /*precise*/); + if (registered) this->wait_count.fetch_add(1); + } + + this->finish_dispatch(op, inline_ok); + } -}; // namespace Realm + // instantiations of templates handled in preimage_tmpl.cc +}; // namespace Realm \ No newline at end of file diff --git a/src/realm/deppart/preimage.h b/src/realm/deppart/preimage.h index c08c0dfd30..1a67c12aee 100644 --- a/src/realm/deppart/preimage.h +++ b/src/realm/deppart/preimage.h @@ -20,7 +20,8 @@ #ifndef REALM_DEPPART_PREIMAGE_H #define REALM_DEPPART_PREIMAGE_H -#include "realm/deppart/partitions.h" +#include "partitions.h" +#include "realm/deppart/rectlist.h" namespace Realm { @@ -152,6 
+153,36 @@ namespace Realm { std::vector > sparsity_outputs; }; + template + class GPUPreimageMicroOp : public GPUMicroOp { + public: + static const int DIM = N; + typedef T IDXTYPE; + static const int DIM2 = N2; + typedef T2 IDXTYPE2; + + GPUPreimageMicroOp(const DomainTransform &_domain_transform, + IndexSpace _parent_space, bool _exclusive); + + virtual ~GPUPreimageMicroOp(void); + + void add_sparsity_output(IndexSpace _target, SparsityMap _sparsity); + + virtual void execute(void); + + void dispatch(PartitioningOperation *op, bool inline_ok); + + protected: + + void gpu_populate_ranges(); + void gpu_populate_bitmasks(); + + DomainTransform domain_transform; + IndexSpace parent_space; + std::vector > targets; + std::vector > sparsity_outputs; + }; + }; // namespace Realm -#endif // REALM_DEPPART_PREIMAGE_H +#endif // REALM_DEPPART_PREIMAGE_H \ No newline at end of file diff --git a/src/realm/deppart/preimage_gpu_impl.hpp b/src/realm/deppart/preimage_gpu_impl.hpp new file mode 100644 index 0000000000..3793b32458 --- /dev/null +++ b/src/realm/deppart/preimage_gpu_impl.hpp @@ -0,0 +1,468 @@ +#pragma once +#include "realm/deppart/preimage.h" +#include "realm/deppart/preimage_gpu_kernels.hpp" +#include "realm/deppart/byfield_gpu_kernels.hpp" +#include "realm/deppart/partitions_gpu_impl.hpp" +#include +#include +#include "realm/nvtx.h" + +namespace Realm { + + template + void GPUPreimageMicroOp::gpu_populate_ranges() { + if (targets.size() == 0) { + return; + } + + Memory my_mem = domain_transform.range_data[0].inst.get_location(); + + const char* val = std::getenv("TILE_SIZE"); // or any env var + size_t tile_size = 100000000; //default + if (val) { + tile_size = atoi(val); + } + + RegionInstance fixed_buffer = this->realm_malloc(tile_size, my_mem); + Arena buffer_arena(reinterpret_cast(AffineAccessor(fixed_buffer, 0).base), tile_size); + + NVTX_DEPPART(gpu_preimage); + + cudaStream_t stream = Cuda::get_task_cuda_stream(); + + collapsed_space inst_space; + + // We 
combine all of our instances into one to batch work, tracking the offsets between instances. + RegionInstance inst_offsets_instance = this->realm_malloc((domain_transform.range_data.size() + 1) * sizeof(size_t), my_mem); + inst_space.offsets = reinterpret_cast(AffineAccessor(inst_offsets_instance, 0).base); + inst_space.num_children = domain_transform.range_data.size(); + + RegionInstance inst_entries_instance; + + GPUMicroOp::collapse_multi_space(domain_transform.range_data, inst_space, buffer_arena, stream); + + RegionInstance parent_entries_instance; + collapsed_space collapsed_parent; + + // We collapse the parent space to undifferentiate between dense and sparse and match downstream APIs. + GPUMicroOp::collapse_parent_space(parent_space, collapsed_parent, buffer_arena, stream); + + + // This is used for count + emit: first pass counts how many rectangles survive intersection, second pass uses the counter + // to figure out where to write each rectangle. + RegionInstance inst_counters_instance = this->realm_malloc((2*domain_transform.range_data.size() + 1) * sizeof(uint32_t), my_mem); + uint32_t* d_inst_counters = reinterpret_cast(AffineAccessor(inst_counters_instance, 0).base); + + // This will be a prefix sum over the counters, used first to figure out where to write in the emit phase, and second + // to track which instance each rectangle came from in the populate phase. + uint32_t* d_inst_prefix = d_inst_counters + domain_transform.range_data.size(); + RegionInstance out_instance; + size_t num_valid_rects; + + Rect* d_valid_rects; + + // Here we intersect the instance spaces with the parent space, and make sure we know which instance each resulting rectangle came from. 
+ GPUMicroOp::template construct_input_rectlist>(inst_space, collapsed_parent, d_valid_rects, num_valid_rects, d_inst_counters, d_inst_prefix, buffer_arena, stream); + inst_entries_instance.destroy(); + parent_entries_instance.destroy(); + inst_offsets_instance.destroy(); + + if (num_valid_rects == 0) { + for (auto it : sparsity_outputs) { + SparsityMapImpl *impl = SparsityMapImpl::lookup(it); + if (this->exclusive) { + impl->gpu_finalize(); + } else { + impl->contribute_nothing(); + } + } + out_instance.destroy(); + inst_counters_instance.destroy(); + return; + } + + // Prefix sum the valid rectangles by volume. + RegionInstance prefix_rects_instance; + size_t total_pts; + + size_t* d_prefix_rects; + GPUMicroOp::volume_prefix_sum(d_valid_rects, num_valid_rects, d_prefix_rects, total_pts, buffer_arena, stream); + + nvtx_range_push("cuda", "build target entries"); + + collapsed_space target_space; + RegionInstance offsets_instance = this->realm_malloc((targets.size()+1) * sizeof(size_t), my_mem); + target_space.offsets = reinterpret_cast(AffineAccessor(offsets_instance, 0).base); + target_space.num_children = targets.size(); + + RegionInstance targets_entries_instance; + + GPUMicroOp::collapse_multi_space(targets, target_space, buffer_arena, stream); + + Memory zcpy_mem; + assert(find_memory(zcpy_mem, Memory::Z_COPY_MEM)); + RegionInstance accessors_instance = this->realm_malloc(domain_transform.range_data.size() * sizeof(AffineAccessor,N,T>), zcpy_mem); + AffineAccessor,N,T>* d_accessors = reinterpret_cast,N,T>*>(AffineAccessor(accessors_instance, 0).base); + for (size_t i = 0; i < domain_transform.range_data.size(); ++i) { + d_accessors[i] = AffineAccessor,N,T>(domain_transform.range_data[i].inst, domain_transform.range_data[i].field_offset); + } + + RegionInstance points_instance; + PointDesc* d_points; + size_t num_valid_points; + + RegionInstance target_counters_instance = this->realm_malloc((2*targets.size()+1) * sizeof(uint32_t), my_mem); + uint32_t* 
d_target_counters = reinterpret_cast(AffineAccessor(target_counters_instance, 0).base); + uint32_t* d_targets_prefix = d_target_counters + targets.size(); + CUDA_CHECK(cudaMemsetAsync(d_target_counters, 0, targets.size() * sizeof(uint32_t), stream), stream); + + if (target_space.num_entries > targets.size()) { + BVH preimage_bvh; + RegionInstance bvh_instance; + GPUMicroOp::build_bvh(target_space, preimage_bvh, buffer_arena, stream); + + preimage_gpuPopulateBitmasksPtrsKernel < N, T, N2, T2 ><<>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, preimage_bvh.root, preimage_bvh.childLeft, preimage_bvh.childRight, preimage_bvh.indices, + preimage_bvh.labels, preimage_bvh.boxes, total_pts, num_valid_rects, domain_transform.range_data.size(), preimage_bvh.num_leaves, nullptr, d_target_counters, nullptr); + KERNEL_CHECK(stream); + + std::vector h_target_counters(targets.size()+1); + h_target_counters[0] = 0; // prefix sum starts at 0 + CUDA_CHECK(cudaMemcpyAsync(h_target_counters.data()+1, d_target_counters, targets.size() * sizeof(uint32_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + for (size_t i = 0; i < targets.size(); ++i) { + h_target_counters[i+1] += h_target_counters[i]; + } + + num_valid_points = h_target_counters[targets.size()]; + + if (num_valid_points == 0) { + for (auto it : sparsity_outputs) { + SparsityMapImpl *impl = SparsityMapImpl::lookup(it); + if (this->exclusive) { + impl->gpu_finalize(); + } else { + impl->contribute_nothing(); + } + } + target_counters_instance.destroy(); + accessors_instance.destroy(); + targets_entries_instance.destroy(); + offsets_instance.destroy(); + prefix_rects_instance.destroy(); + out_instance.destroy(); + inst_counters_instance.destroy(); + bvh_instance.destroy(); + return; + } + + CUDA_CHECK(cudaMemcpyAsync(d_targets_prefix, h_target_counters.data(), (targets.size() + 1) * sizeof(uint32_t), cudaMemcpyHostToDevice, stream), stream); + + points_instance = 
this->realm_malloc(num_valid_points * sizeof(PointDesc), my_mem); + d_points = reinterpret_cast*>(AffineAccessor(points_instance, 0).base); + + CUDA_CHECK(cudaMemsetAsync(d_target_counters, 0, (targets.size()) * sizeof(uint32_t), stream), stream); + + preimage_gpuPopulateBitmasksPtrsKernel < N, T, N2, T2 ><<>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, preimage_bvh.root, preimage_bvh.childLeft, preimage_bvh.childRight, preimage_bvh.indices, + preimage_bvh.labels, preimage_bvh.boxes, total_pts, num_valid_rects, domain_transform.range_data.size(), preimage_bvh.num_leaves, d_targets_prefix, d_target_counters, d_points); + KERNEL_CHECK(stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + bvh_instance.destroy(); + } else { + preimage_dense_populate_bitmasks_kernel < N, T, N2, T2 ><<< COMPUTE_GRID(total_pts), THREADS_PER_BLOCK, 0, stream>>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, target_space.entries_buffer, target_space.offsets, total_pts, + num_valid_rects, domain_transform.range_data.size(), targets.size(), nullptr, d_target_counters, nullptr); + KERNEL_CHECK(stream); + + std::vector h_target_counters(targets.size()+1); + h_target_counters[0] = 0; // prefix sum starts at 0 + CUDA_CHECK(cudaMemcpyAsync(h_target_counters.data()+1, d_target_counters, targets.size() * sizeof(uint32_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + for (size_t i = 0; i < targets.size(); ++i) { + h_target_counters[i+1] += h_target_counters[i]; + } + + num_valid_points = h_target_counters[targets.size()]; + + if (num_valid_points == 0) { + for (auto it : sparsity_outputs) { + SparsityMapImpl *impl = SparsityMapImpl::lookup(it); + if (this->exclusive) { + impl->gpu_finalize(); + } else { + impl->contribute_nothing(); + } + } + target_counters_instance.destroy(); + accessors_instance.destroy(); + targets_entries_instance.destroy(); + offsets_instance.destroy(); + prefix_rects_instance.destroy(); + 
out_instance.destroy(); + inst_counters_instance.destroy(); + return; + } + + CUDA_CHECK(cudaMemcpyAsync(d_targets_prefix, h_target_counters.data(), (targets.size() + 1) * sizeof(uint32_t), cudaMemcpyHostToDevice, stream), stream); + + points_instance = this->realm_malloc(num_valid_points * sizeof(PointDesc), my_mem); + d_points = reinterpret_cast*>(AffineAccessor(points_instance, 0).base); + + CUDA_CHECK(cudaMemsetAsync(d_target_counters, 0, (targets.size()) * sizeof(uint32_t), stream), stream); + + preimage_dense_populate_bitmasks_kernel < N, T, N2, T2 ><<< COMPUTE_GRID(total_pts), THREADS_PER_BLOCK, 0, stream>>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, target_space.entries_buffer, target_space.offsets, total_pts, + num_valid_rects, domain_transform.range_data.size(), targets.size(), d_targets_prefix, d_target_counters, d_points); + KERNEL_CHECK(stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + } + + target_counters_instance.destroy(); + accessors_instance.destroy(); + targets_entries_instance.destroy(); + offsets_instance.destroy(); + prefix_rects_instance.destroy(); + out_instance.destroy(); + inst_counters_instance.destroy(); + + size_t out_rects = 0; + RectDesc* trash; + this->complete_pipeline(d_points, num_valid_points, trash, out_rects, buffer_arena, + /* the Container: */ sparsity_outputs, + /* getIndex: */ [&](auto const& elem){ + // elem is a SparsityMap from the vector + return size_t(&elem - sparsity_outputs.data()); + }, + /* getMap: */ [&](auto const& elem){ + // return the SparsityMap key itself + return elem; + }); + + points_instance.destroy(); + } + + template + void GPUPreimageMicroOp::gpu_populate_bitmasks() { + if (targets.size() == 0) { + return; + } + + Memory my_mem = domain_transform.ptr_data[0].inst.get_location(); + + const char* val = std::getenv("TILE_SIZE"); // or any env var + size_t tile_size = 100000000; //default + if (val) { + tile_size = atoi(val); + } + + RegionInstance fixed_buffer = 
this->realm_malloc(tile_size, my_mem); + Arena buffer_arena(reinterpret_cast(AffineAccessor(fixed_buffer, 0).base), tile_size); + + NVTX_DEPPART(gpu_preimage); + + cudaStream_t stream = Cuda::get_task_cuda_stream(); + + collapsed_space inst_space; + + // We combine all of our instances into one to batch work, tracking the offsets between instances. + RegionInstance inst_offsets_instance = this->realm_malloc((domain_transform.ptr_data.size() + 1) * sizeof(size_t), my_mem); + inst_space.offsets = reinterpret_cast(AffineAccessor(inst_offsets_instance, 0).base); + inst_space.num_children = domain_transform.ptr_data.size(); + + RegionInstance inst_entries_instance; + + GPUMicroOp::collapse_multi_space(domain_transform.ptr_data, inst_space, buffer_arena, stream); + + RegionInstance parent_entries_instance; + collapsed_space collapsed_parent; + + // We collapse the parent space to undifferentiate between dense and sparse and match downstream APIs. + GPUMicroOp::collapse_parent_space(parent_space, collapsed_parent, buffer_arena, stream); + + + // This is used for count + emit: first pass counts how many rectangles survive intersection, second pass uses the counter + // to figure out where to write each rectangle. + RegionInstance inst_counters_instance = this->realm_malloc((2*domain_transform.ptr_data.size() + 1) * sizeof(uint32_t), my_mem); + uint32_t* d_inst_counters = reinterpret_cast(AffineAccessor(inst_counters_instance, 0).base); + + // This will be a prefix sum over the counters, used first to figure out where to write in the emit phase, and second + // to track which instance each rectangle came from in the populate phase. + uint32_t* d_inst_prefix = d_inst_counters + domain_transform.ptr_data.size(); + RegionInstance out_instance; + size_t num_valid_rects; + + Rect* d_valid_rects; + // Here we intersect the instance spaces with the parent space, and make sure we know which instance each resulting rectangle came from. 
+ GPUMicroOp::template construct_input_rectlist>(inst_space, collapsed_parent, d_valid_rects, num_valid_rects, d_inst_counters, d_inst_prefix, buffer_arena, stream); + inst_entries_instance.destroy(); + parent_entries_instance.destroy(); + inst_offsets_instance.destroy(); + + if (num_valid_rects == 0) { + for (auto it : sparsity_outputs) { + SparsityMapImpl *impl = SparsityMapImpl::lookup(it); + if (this->exclusive) { + impl->gpu_finalize(); + } else { + impl->contribute_nothing(); + } + } + out_instance.destroy(); + inst_counters_instance.destroy(); + return; + } + + // Prefix sum the valid rectangles by volume. + RegionInstance prefix_rects_instance; + size_t total_pts; + + size_t* d_prefix_rects; + GPUMicroOp::volume_prefix_sum(d_valid_rects, num_valid_rects, d_prefix_rects, total_pts, buffer_arena, stream); + + nvtx_range_push("cuda", "build target entries"); + + collapsed_space target_space; + RegionInstance offsets_instance = this->realm_malloc((targets.size()+1) * sizeof(size_t), my_mem); + target_space.offsets = reinterpret_cast(AffineAccessor(offsets_instance, 0).base); + target_space.num_children = targets.size(); + + RegionInstance targets_entries_instance; + + GPUMicroOp::collapse_multi_space(targets, target_space, buffer_arena, stream); + + Memory zcpy_mem; + assert(find_memory(zcpy_mem, Memory::Z_COPY_MEM)); + RegionInstance accessors_instance = this->realm_malloc(domain_transform.ptr_data.size() * sizeof(AffineAccessor,N,T>), zcpy_mem); + AffineAccessor,N,T>* d_accessors = reinterpret_cast,N,T>*>(AffineAccessor(accessors_instance, 0).base); + for (size_t i = 0; i < domain_transform.ptr_data.size(); ++i) { + d_accessors[i] = AffineAccessor,N,T>(domain_transform.ptr_data[i].inst, domain_transform.ptr_data[i].field_offset); + } + + RegionInstance points_instance; + PointDesc* d_points; + size_t num_valid_points; + + RegionInstance target_counters_instance = this->realm_malloc((2*targets.size()+1) * sizeof(uint32_t), my_mem); + uint32_t* 
d_target_counters = reinterpret_cast(AffineAccessor(target_counters_instance, 0).base); + uint32_t* d_targets_prefix = d_target_counters + targets.size(); + CUDA_CHECK(cudaMemsetAsync(d_target_counters, 0, targets.size() * sizeof(uint32_t), stream), stream); + + if (target_space.num_entries > targets.size()) { + BVH preimage_bvh; + RegionInstance bvh_instance; + GPUMicroOp::build_bvh(target_space, preimage_bvh, buffer_arena, stream); + + preimage_gpuPopulateBitmasksPtrsKernel < N, T, N2, T2 ><<>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, preimage_bvh.root, preimage_bvh.childLeft, preimage_bvh.childRight, preimage_bvh.indices, + preimage_bvh.labels, preimage_bvh.boxes, total_pts, num_valid_rects, domain_transform.ptr_data.size(), preimage_bvh.num_leaves, nullptr, d_target_counters, nullptr); + KERNEL_CHECK(stream); + + std::vector h_target_counters(targets.size()+1); + h_target_counters[0] = 0; // prefix sum starts at 0 + CUDA_CHECK(cudaMemcpyAsync(h_target_counters.data()+1, d_target_counters, targets.size() * sizeof(uint32_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + for (size_t i = 0; i < targets.size(); ++i) { + h_target_counters[i+1] += h_target_counters[i]; + } + + num_valid_points = h_target_counters[targets.size()]; + + if (num_valid_points == 0) { + for (auto it : sparsity_outputs) { + SparsityMapImpl *impl = SparsityMapImpl::lookup(it); + if (this->exclusive) { + impl->gpu_finalize(); + } else { + impl->contribute_nothing(); + } + } + target_counters_instance.destroy(); + accessors_instance.destroy(); + targets_entries_instance.destroy(); + offsets_instance.destroy(); + prefix_rects_instance.destroy(); + out_instance.destroy(); + inst_counters_instance.destroy(); + bvh_instance.destroy(); + return; + } + + CUDA_CHECK(cudaMemcpyAsync(d_targets_prefix, h_target_counters.data(), (targets.size() + 1) * sizeof(uint32_t), cudaMemcpyHostToDevice, stream), stream); + + points_instance = 
this->realm_malloc(num_valid_points * sizeof(PointDesc), my_mem); + d_points = reinterpret_cast*>(AffineAccessor(points_instance, 0).base); + + CUDA_CHECK(cudaMemsetAsync(d_target_counters, 0, (targets.size()) * sizeof(uint32_t), stream), stream); + + preimage_gpuPopulateBitmasksPtrsKernel < N, T, N2, T2 ><<>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, preimage_bvh.root, preimage_bvh.childLeft, preimage_bvh.childRight, preimage_bvh.indices, + preimage_bvh.labels, preimage_bvh.boxes, total_pts, num_valid_rects, domain_transform.ptr_data.size(), preimage_bvh.num_leaves, d_targets_prefix, d_target_counters, d_points); + KERNEL_CHECK(stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + bvh_instance.destroy(); + } else { + preimage_dense_populate_bitmasks_kernel< N, T, N2, T2 ><<< COMPUTE_GRID(total_pts), THREADS_PER_BLOCK, 0, stream>>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, target_space.entries_buffer, target_space.offsets, total_pts, + num_valid_rects, domain_transform.ptr_data.size(), targets.size(), nullptr, d_target_counters, nullptr); + KERNEL_CHECK(stream); + + std::vector h_target_counters(targets.size()+1); + h_target_counters[0] = 0; // prefix sum starts at 0 + CUDA_CHECK(cudaMemcpyAsync(h_target_counters.data()+1, d_target_counters, targets.size() * sizeof(uint32_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + for (size_t i = 0; i < targets.size(); ++i) { + h_target_counters[i+1] += h_target_counters[i]; + } + + num_valid_points = h_target_counters[targets.size()]; + + if (num_valid_points == 0) { + for (auto it : sparsity_outputs) { + SparsityMapImpl *impl = SparsityMapImpl::lookup(it); + if (this->exclusive) { + impl->gpu_finalize(); + } else { + impl->contribute_nothing(); + } + } + target_counters_instance.destroy(); + accessors_instance.destroy(); + targets_entries_instance.destroy(); + offsets_instance.destroy(); + prefix_rects_instance.destroy(); + 
out_instance.destroy(); + inst_counters_instance.destroy(); + return; + } + + CUDA_CHECK(cudaMemcpyAsync(d_targets_prefix, h_target_counters.data(), (targets.size() + 1) * sizeof(uint32_t), cudaMemcpyHostToDevice, stream), stream); + + points_instance = this->realm_malloc(num_valid_points * sizeof(PointDesc), my_mem); + d_points = reinterpret_cast*>(AffineAccessor(points_instance, 0).base); + + CUDA_CHECK(cudaMemsetAsync(d_target_counters, 0, (targets.size()) * sizeof(uint32_t), stream), stream); + + preimage_dense_populate_bitmasks_kernel< N, T, N2, T2 ><<< COMPUTE_GRID(total_pts), THREADS_PER_BLOCK, 0, stream>>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, target_space.entries_buffer, target_space.offsets, total_pts, + num_valid_rects, domain_transform.ptr_data.size(), targets.size(), d_targets_prefix, d_target_counters, d_points); + KERNEL_CHECK(stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + } + + target_counters_instance.destroy(); + accessors_instance.destroy(); + targets_entries_instance.destroy(); + offsets_instance.destroy(); + prefix_rects_instance.destroy(); + out_instance.destroy(); + inst_counters_instance.destroy(); + + size_t out_rects = 0; + RectDesc* trash; + this->complete_pipeline(d_points, num_valid_points, trash, out_rects, buffer_arena, + /* the Container: */ sparsity_outputs, + /* getIndex: */ [&](auto const& elem){ + // elem is a SparsityMap from the vector + return size_t(&elem - sparsity_outputs.data()); + }, + /* getMap: */ [&](auto const& elem){ + // return the SparsityMap key itself + return elem; + }); + + points_instance.destroy(); + } +} \ No newline at end of file diff --git a/src/realm/deppart/preimage_gpu_kernels.hpp b/src/realm/deppart/preimage_gpu_kernels.hpp new file mode 100644 index 0000000000..10d9c5225c --- /dev/null +++ b/src/realm/deppart/preimage_gpu_kernels.hpp @@ -0,0 +1,256 @@ +#pragma once +#include "realm/deppart/preimage.h" + +namespace Realm { + + +template +__global__ void 
preimage_build_morton_codes( + const SparsityMapEntry* d_targets_entries, + const size_t* d_offsets_rects, + const Rect* d_global_bounds, + size_t total_rects, + size_t num_targets, + uint64_t* d_morton_codes, + uint64_t* d_indices, + uint64_t* d_targets_indices) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= total_rects) return; + const auto &entry = d_targets_entries[idx]; + d_morton_codes[idx] = bvh_morton_code(entry.bounds, *d_global_bounds); + d_indices[idx] = idx; + size_t low = 0, high = num_targets; + while (low < high) { + size_t mid = (low + high) >> 1; + if (d_offsets_rects[mid+1] <= idx) low = mid + 1; + else high = mid; + } + d_targets_indices[idx] = low; +} + +// +// 2) Initialize leaf boxes +// +template +__global__ +void preimage_init_leaf_boxes_kernel( + const SparsityMapEntry *rects, // [G] all flattened Rects + const uint64_t *leafIdx, // [n] maps leaf→orig Rect index + size_t total_rects, + Rect *boxes) // [(2n−1)] +{ + int k = blockIdx.x*blockDim.x + threadIdx.x; + if (k >= total_rects) return; + + size_t orig = leafIdx[k]; + boxes[k + total_rects - 1] = rects[orig].bounds; +} + + template +__device__ void preimage_queryBVH( + const Rect *boxes, + const int* childLeft, + const int* childRight, + const uint64_t* leafIdx, + const size_t* targets_indices, + int root, + size_t numTargetRects, + const Q& in_query, + Point out_point, + uint32_t* d_targets_prefix, + uint32_t* d_target_counters, + PointDesc *d_points) +{ + constexpr int MAX_STACK = 64; // max stack size for BVH traversal + int stack[MAX_STACK]; + int sp = 0; + + // start at the root + stack[sp++] = -1; + int node = root; + do + { + + int left = childLeft[node]; + int right = childRight[node]; + + bool overlapL; + bool overlapR; + + if constexpr (std::is_same_v>) { + overlapL = boxes[left].overlaps(in_query); + overlapR = boxes[right].overlaps(in_query); + } else { + static_assert(std::is_same_v>, + "Q must be Rect or Point"); + overlapL = 
boxes[left].contains(in_query); + overlapR = boxes[right].contains(in_query); + } + + + if (overlapL && left >= numTargetRects - 1) { + // left child is a leaf + uint64_t rect_idx = leafIdx[left - (numTargetRects - 1)]; + size_t target_idx = targets_indices[rect_idx]; + uint32_t local = atomicAdd(&d_target_counters[target_idx], 1); + if (d_points != nullptr) { + PointDesc point_desc; + point_desc.src_idx = target_idx; + point_desc.point = out_point; + uint32_t out_idx = d_targets_prefix[target_idx] + local; + d_points[out_idx] = point_desc; + } + } + if (overlapR && right >= numTargetRects - 1) { + uint64_t rect_idx = leafIdx[right - (numTargetRects - 1)]; + size_t target_idx = targets_indices[rect_idx]; + uint32_t local = atomicAdd(&d_target_counters[target_idx], 1); + if (d_points != nullptr) { + PointDesc point_desc; + point_desc.src_idx = target_idx; + point_desc.point = out_point; + uint32_t out_idx = d_targets_prefix[target_idx] + local; + d_points[out_idx] = point_desc; + } + } + + bool traverseL = overlapL && left < numTargetRects - 1; + bool traverseR = overlapR && right < numTargetRects - 1; + + if (!traverseL && !traverseR) { + node = stack[--sp]; + } else { + node = (traverseL ? 
left : right); + if (traverseL && traverseR) { + stack[sp++] = right; + } + } + } while (node != -1); +} + +template < + int N, typename T, + int N2, typename T2, typename Q +> +__global__ +void preimage_gpuPopulateBitmasksPtrsKernel( + AffineAccessor *accessors, + Rect* rects, + size_t* prefix, + uint32_t* inst_offsets, + int root, + int *childLeft, + int *childRight, + uint64_t *indices, + uint64_t *targets_indices, + Rect *boxes, + size_t numPoints, + size_t numRects, + size_t numInsts, + size_t numTargetRects, + uint32_t* d_targets_prefix, + uint32_t* d_target_counters, + PointDesc *d_points +) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= numPoints) return; + size_t low = 0, high = numRects; + while (low < high) { + size_t mid = (low + high) >> 1; + if (prefix[mid+1] <= idx) low = mid + 1; + else high = mid; + } + size_t r = low; + low = 0, high = numInsts; + while (low < high) { + size_t mid = (low + high) >> 1; + if (inst_offsets[mid+1] <= r) low = mid + 1; + else high = mid; + } + size_t inst_idx = low; + size_t offset = idx - prefix[r]; + Point p; + for (int k = N-1; k >= 0; --k) { + size_t dim = rects[r].hi[k] + 1 - rects[r].lo[k]; + p[k] = rects[r].lo[k] + (offset % dim); + offset /= dim; + } + Q ptr = accessors[inst_idx].read(p); + preimage_queryBVH(boxes, childLeft, childRight, indices, targets_indices, root, numTargetRects, ptr, p, d_targets_prefix, d_target_counters, d_points); +} + +template < + int N, typename T, + int N2, typename T2, typename Q +> +__global__ +void preimage_dense_populate_bitmasks_kernel( + AffineAccessor* accessors, + Rect* rects, + size_t* prefix, + uint32_t* inst_offsets, + SparsityMapEntry* targets_entries, + size_t* target_offsets, + size_t numPoints, + size_t numRects, + size_t numInsts, + size_t numTargets, + uint32_t *d_targets_prefix, + uint32_t *d_target_counters, + PointDesc *d_points +) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= numPoints) return; + size_t low = 0, high 
= numRects; + while (low < high) { + size_t mid = (low + high) >> 1; + if (prefix[mid+1] <= idx) low = mid + 1; + else high = mid; + } + size_t r = low; + low = 0, high = numInsts; + while (low < high) { + size_t mid = (low + high) >> 1; + if (inst_offsets[mid+1] <= r) low = mid + 1; + else high = mid; + } + size_t inst_idx = low; + size_t offset = idx - prefix[r]; + Point p; + for (int k = N-1; k >= 0; --k) { + size_t dim = rects[r].hi[k] + 1 - rects[r].lo[k]; + p[k] = rects[r].lo[k] + (offset % dim); + offset /= dim; + } + Q ptr = accessors[inst_idx].read(p); + for (size_t i = 0; i < numTargets; i++) { + bool inside = false; + for (size_t j = target_offsets[i]; j < target_offsets[i+1]; j++) { + if constexpr (std::is_same_v>) { + if (targets_entries[j].bounds.overlaps(ptr)) { + inside = true; + break; + } + } else { + static_assert(std::is_same_v>, + "Q must be Rect or Point"); + if (targets_entries[j].bounds.contains(ptr)) { + inside = true; + break; + } + } + } + if (inside) { + uint32_t local = atomicAdd(&d_target_counters[i], 1); + if (d_points != nullptr) { + PointDesc point_desc; + point_desc.src_idx = i; + point_desc.point = p; + uint32_t out_idx = d_targets_prefix[i] + local; + d_points[out_idx] = point_desc; + } + } + } +} + +} // namespace Realm \ No newline at end of file diff --git a/src/realm/deppart/preimage_gpu_tmpl.cu b/src/realm/deppart/preimage_gpu_tmpl.cu new file mode 100644 index 0000000000..eb532a5a1d --- /dev/null +++ b/src/realm/deppart/preimage_gpu_tmpl.cu @@ -0,0 +1,69 @@ +/* Copyright 2024 Stanford University, NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// per‐dimension instantiator for the GPU version of +// ImageMicroOp<…>::gpu_populate_bitmasks_ptrs + +#define REALM_TEMPLATES_ONLY +#include "realm/deppart/preimage_gpu_kernels.hpp" +#include "realm/deppart/preimage_gpu_impl.hpp" + +#ifndef INST_N1 + #error "INST_N1 must be defined before including preimage_gpu_tmpl.cu" +#endif +#ifndef INST_N2 + #error "INST_N2 must be defined before including preimage_gpu_tmpl.cu" +#endif + +// same set of T1,T2 pairs you use on the CPU side: +#define FOREACH_TT(__func__) \ + __func__(int, int) \ + __func__(int, unsigned) \ + __func__(int, long long) \ + __func__(unsigned,int) \ + __func__(unsigned,unsigned) \ + __func__(unsigned,long long) \ + __func__(long long, int) \ + __func__(long long, unsigned) \ + __func__(long long, long long) + +#define FOREACH_T(__func__) \ + __func__(int) \ + __func__(unsigned) \ + __func__(long long) + +namespace Realm { + #define N1 INST_N1 + #define N2 INST_N2 + + // Replace MyBitmask with whatever bitmask‐type you actually use + // (it must have an `as_vector.rects` member that your code touches). 
+ // + // This explicitly instantiates: + // template void + // ImageMicroOp::gpu_populate_bitmasks_ptrs( + // std::map&); + // + #define DO_DOUBLE(T1,T2) \ + template class GPUPreimageMicroOp; \ + template class PreimageMicroOp; + + FOREACH_TT(DO_DOUBLE) + + #undef DO_DOUBLE + #undef N1 + #undef N2 + +} // namespace Realm \ No newline at end of file diff --git a/src/realm/deppart/preimage_tmpl.cc b/src/realm/deppart/preimage_tmpl.cc index 50bc3a1ba8..2d3d73e5b2 100644 --- a/src/realm/deppart/preimage_tmpl.cc +++ b/src/realm/deppart/preimage_tmpl.cc @@ -1,5 +1,5 @@ /* - * Copyright 2025 Stanford University, NVIDIA Corporation +* Copyright 2025 Stanford University, NVIDIA Corporation * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -28,15 +28,15 @@ #endif #define FOREACH_TT(__func__) \ - __func__(int,int) \ - __func__(int,unsigned) \ - __func__(int,long long) \ - __func__(unsigned,int) \ - __func__(unsigned,unsigned) \ - __func__(unsigned,long long) \ - __func__(long long,int) \ - __func__(long long,unsigned) \ - __func__(long long,long long) +__func__(int,int) \ +__func__(int,unsigned) \ +__func__(int,long long) \ +__func__(unsigned,int) \ +__func__(unsigned,unsigned) \ +__func__(unsigned,long long) \ +__func__(long long,int) \ +__func__(long long,unsigned) \ +__func__(long long,long long) namespace Realm { @@ -44,16 +44,21 @@ namespace Realm { #define N2 INST_N2 #define DOIT(T1,T2) \ - template class PreimageMicroOp; \ - template class StructuredPreimageMicroOp; \ - template class PreimageOperation; \ - template PreimageMicroOp::PreimageMicroOp(NodeID, AsyncMicroOp *, Serialization::FixedBufferDeserializer&); \ - template Event IndexSpace::create_subspaces_by_preimage( \ - const DomainTransform &, const std::vector > &, \ - std::vector > &, const ProfilingRequestSet &, Event) \ - const; +template class PreimageMicroOp; \ +template class GPUPreimageMicroOp; \ +template class StructuredPreimageMicroOp; \ 
+template class PreimageOperation; \ +template PreimageMicroOp::PreimageMicroOp(NodeID, AsyncMicroOp *, Serialization::FixedBufferDeserializer&); \ +template void IndexSpace::suggest_preimage_buffer_size( \ + const std::vector>&, \ + const std::vector>&, \ + std::vector&) const; \ +template Event IndexSpace::create_subspaces_by_preimage( \ +const DomainTransform &, const std::vector > &, \ +std::vector > &, const ProfilingRequestSet &, Event) \ +const; FOREACH_TT(DOIT) -}; +}; \ No newline at end of file diff --git a/src/realm/indexspace.h b/src/realm/indexspace.h index cf6caf9a26..9ea593b392 100644 --- a/src/realm/indexspace.h +++ b/src/realm/indexspace.h @@ -736,6 +736,11 @@ namespace Realm { const std::vector &colors, std::vector> &subspaces, const ProfilingRequestSet &reqs, Event wait_on = Event::NO_EVENT) const; + template + REALM_PUBLIC_API void suggest_byfield_buffer_size( + const std::vector>& inputs, + std::vector& suggestions) const; + ///@{ /** * Allows the "function" described by the field to be composed with a @@ -802,7 +807,7 @@ namespace Realm { Event wait_on = Event::NO_EVENT) const; template - REALM_PUBLIC_API void suggest_deppart_buffer_size( + REALM_PUBLIC_API void suggest_image_buffer_size( const std::vector>& source_spaces, const std::vector>& inputs, std::vector& suggestions) const; @@ -927,6 +932,12 @@ namespace Realm { const std::vector> &targets, std::vector> &preimages, const ProfilingRequestSet &reqs, Event wait_on = Event::NO_EVENT) const; + + template + REALM_PUBLIC_API void suggest_preimage_buffer_size( + const std::vector>& target_spaces, + const std::vector>& inputs, + std::vector& suggestions) const; ///@} ///@{ From d7e9e478a5588b12f23c59bd8e3267cc834ec631 Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Mon, 2 Feb 2026 17:53:29 -0800 Subject: [PATCH 06/32] Added ifdef REALM_USE_CUDA guards to gpu deppart --- src/realm/deppart/byfield.cc | 7 ++++++ src/realm/deppart/byfield.h | 4 ++++ src/realm/deppart/byfield_tmpl.cc | 8 ++++++- 
src/realm/deppart/image.cc | 10 ++++++++- src/realm/deppart/image.h | 2 ++ src/realm/deppart/image_tmpl.cc | 8 ++++++- src/realm/deppart/partitions.h | 35 ++++++------------------------ src/realm/deppart/preimage.cc | 8 +++++++ src/realm/deppart/preimage.h | 4 ++++ src/realm/deppart/preimage_tmpl.cc | 8 ++++++- 10 files changed, 62 insertions(+), 32 deletions(-) diff --git a/src/realm/deppart/byfield.cc b/src/realm/deppart/byfield.cc index c6ccacc6ce..9c9d5a4ad1 100644 --- a/src/realm/deppart/byfield.cc +++ b/src/realm/deppart/byfield.cc @@ -309,6 +309,7 @@ namespace Realm { ActiveMessageHandlerReg > > ByFieldMicroOp::areg; +#ifdef REALM_USE_CUDA //////////////////////////////////////////////////////////////////////// // // class GPUByFieldMicroOp @@ -355,6 +356,8 @@ namespace Realm { sparsity_outputs[_val] = _sparsity; } +#endif + //////////////////////////////////////////////////////////////////////// // @@ -430,6 +433,7 @@ namespace Realm { uop->dispatch(this, true /* ok to run in this thread */); } +#ifdef REALM_USE_CUDA for (auto fdd : gpu_field_data) { std::vector,FT> > single_gpu_field_data = {fdd}; GPUByFieldMicroOp *uop = new GPUByFieldMicroOp(parent, single_gpu_field_data, exclusive); @@ -438,6 +442,9 @@ namespace Realm { } uop->dispatch(this, false); } +#else + assert(gpu_field_data.empty()); +#endif } template diff --git a/src/realm/deppart/byfield.h b/src/realm/deppart/byfield.h index 92902efbd1..cc21234f32 100644 --- a/src/realm/deppart/byfield.h +++ b/src/realm/deppart/byfield.h @@ -68,6 +68,8 @@ namespace Realm { std::map > sparsity_outputs; }; +#ifdef REALM_USE_CUDA + template class GPUByFieldMicroOp : public GPUMicroOp { public: @@ -91,6 +93,8 @@ namespace Realm { std::map > sparsity_outputs; }; +#endif + template class ByFieldOperation : public PartitioningOperation { public: diff --git a/src/realm/deppart/byfield_tmpl.cc b/src/realm/deppart/byfield_tmpl.cc index 7575607ea2..c8e6db0bcd 100644 --- a/src/realm/deppart/byfield_tmpl.cc +++ 
b/src/realm/deppart/byfield_tmpl.cc @@ -43,9 +43,15 @@ namespace Realm { #define N1 INST_N1 #define N2 INST_N2 +#ifdef REALM_USE_CUDA + #define GPU_BYFIELD_LINE(N, T, ...) template class GPUByFieldMicroOp; +#else + #define GPU_BYFIELD_LINE(N, T, ...) /* no CUDA */ +#endif + #define DOIT(N,T,F) \ template class ByFieldMicroOp; \ - template class GPUByFieldMicroOp; \ + GPU_BYFIELD_LINE(N, T, F) \ template class ByFieldOperation; \ template ByFieldMicroOp::ByFieldMicroOp(NodeID, AsyncMicroOp *, Serialization::FixedBufferDeserializer&); \ template Event IndexSpace::create_subspaces_by_field(const std::vector,F> >&, \ diff --git a/src/realm/deppart/image.cc b/src/realm/deppart/image.cc index d207161b22..d0251687b4 100644 --- a/src/realm/deppart/image.cc +++ b/src/realm/deppart/image.cc @@ -689,6 +689,7 @@ namespace Realm { images[j]); uop->dispatch(this, true /* ok to run in this thread */); } +#ifdef REALM_USE_CUDA for (auto ptr_fdd : gpu_ptr_data) { // launch full cross-product of image micro ops right away assert(ptr_fdd.scratch_buffer != RegionInstance::NO_INST); @@ -713,6 +714,9 @@ namespace Realm { } micro_op->dispatch(this, true); } +#else + assert(!gpu_data); +#endif } } @@ -916,7 +920,9 @@ namespace Realm { //////////////////////////////////////////////////////////////////////// // - // class StructuredImageMicroOp + // class GPUImageMicroOp + +#ifdef REALM_USE_CUDA template GPUImageMicroOp::GPUImageMicroOp( @@ -979,6 +985,8 @@ namespace Realm { gpu_populate_rngs(); } } +#endif + //////////////////////////////////////////////////////////////////////// diff --git a/src/realm/deppart/image.h b/src/realm/deppart/image.h index 58131338a3..ab81ecafae 100644 --- a/src/realm/deppart/image.h +++ b/src/realm/deppart/image.h @@ -148,6 +148,7 @@ namespace Realm { std::vector > sources; std::vector > sparsity_outputs; }; +#ifdef REALM_USE_CUDA template class GPUImageMicroOp : public GPUMicroOp { @@ -176,6 +177,7 @@ namespace Realm { std::vector > sources; std::vector > 
sparsity_outputs; }; +#endif }; // namespace Realm #endif // REALM_DEPPART_IMAGE_H diff --git a/src/realm/deppart/image_tmpl.cc b/src/realm/deppart/image_tmpl.cc index 8a0e686f22..19242fa9ca 100644 --- a/src/realm/deppart/image_tmpl.cc +++ b/src/realm/deppart/image_tmpl.cc @@ -44,10 +44,16 @@ namespace Realm { #define N1 INST_N1 #define N2 INST_N2 +#ifdef REALM_USE_CUDA + #define GPU_IMAGE_LINE(N1,T1,N2,T2) template class GPUImageMicroOp; +#else + #define GPU_IMAGE_LINE(N1,T1,N2,T2) /* no CUDA */ +#endif + #define DOIT(T1,T2) \ template class StructuredImageMicroOp; \ template class ImageMicroOp; \ - template class GPUImageMicroOp; \ + GPU_IMAGE_LINE(N1, T1, N2, T2) \ template class ImageOperation; \ template ImageMicroOp::ImageMicroOp(NodeID, AsyncMicroOp *, Serialization::FixedBufferDeserializer&); \ template void IndexSpace::suggest_image_buffer_size( \ diff --git a/src/realm/deppart/partitions.h b/src/realm/deppart/partitions.h index 4ec4560984..051717d803 100644 --- a/src/realm/deppart/partitions.h +++ b/src/realm/deppart/partitions.h @@ -43,21 +43,7 @@ namespace Realm { class PartitioningMicroOp; class PartitioningOperation; - template - constexpr std::string_view type_name() { - #if defined(__clang__) - std::string_view p = __PRETTY_FUNCTION__; - return {p.data() + 34, p.size() - 34 - 1}; - #elif defined(__GNUC__) - std::string_view p = __PRETTY_FUNCTION__; - return {p.data() + 49, p.size() - 49 - 1}; - #elif defined(_MSC_VER) - std::string_view p = __FUNCSIG__; - return {p.data() + 84, p.size() - 84 - 7}; - #else - return "unknown"; - #endif - } +#ifdef REALM_USE_CUDA template struct HiFlag { @@ -139,19 +125,7 @@ namespace Realm { template T* alloc(size_t count = 1) { - try { - if (parity_) { - return alloc_right(count); - } else { - return alloc_left(count); - } - } catch (arena_oom&) { - std::cout << "Arena OOM: requested " << count << " of " << type_name() - << " capacity " << cap_ << " bytes, " - << " used " << used() << " bytes, " - << " left " << 
(cap_ - left_ - right_) << " bytes.\n"; - throw arena_oom{}; - } + return parity_ ? alloc_right(count) : alloc_left(count); } void flip_parity(void) noexcept { @@ -241,6 +215,9 @@ namespace Realm { size_t base_right_; }; + +#endif + template class OverlapTester { public: @@ -349,6 +326,7 @@ namespace Realm { std::vector *> extra_deps; }; +#ifdef REALM_USE_CUDA //The parent class for all GPU partitioning micro-ops. Provides output utility functions template @@ -387,6 +365,7 @@ namespace Realm { bool exclusive = false; }; +#endif //////////////////////////////////////// // diff --git a/src/realm/deppart/preimage.cc b/src/realm/deppart/preimage.cc index 5df628f2f6..63131916bc 100644 --- a/src/realm/deppart/preimage.cc +++ b/src/realm/deppart/preimage.cc @@ -484,6 +484,7 @@ namespace Realm { uop->add_sparsity_output(targets[j], preimages[j]); uop->dispatch(this, true /* ok to run in this thread */); } +#ifdef REALM_USE_CUDA for (auto ptr_fdd : gpu_ptr_data) { domain_transform.ptr_data = {ptr_fdd}; GPUPreimageMicroOp *micro_op = @@ -504,6 +505,10 @@ namespace Realm { } micro_op->dispatch(this, true); } +#else + assert(!gpu_data); +#endif + } } @@ -782,6 +787,7 @@ namespace Realm { //////////////////////////////////////////////////////////////////////// // // class GPUPreimageMicroOp +#ifdef REALM_USE_CUDA template GPUPreimageMicroOp::GPUPreimageMicroOp( @@ -837,6 +843,8 @@ namespace Realm { this->finish_dispatch(op, inline_ok); } +#endif + // instantiations of templates handled in preimage_tmpl.cc }; // namespace Realm \ No newline at end of file diff --git a/src/realm/deppart/preimage.h b/src/realm/deppart/preimage.h index 1a67c12aee..ed301ad51e 100644 --- a/src/realm/deppart/preimage.h +++ b/src/realm/deppart/preimage.h @@ -153,6 +153,8 @@ namespace Realm { std::vector > sparsity_outputs; }; + #ifdef REALM_USE_CUDA + template class GPUPreimageMicroOp : public GPUMicroOp { public: @@ -183,6 +185,8 @@ namespace Realm { std::vector > sparsity_outputs; }; +#endif + }; // 
namespace Realm #endif // REALM_DEPPART_PREIMAGE_H \ No newline at end of file diff --git a/src/realm/deppart/preimage_tmpl.cc b/src/realm/deppart/preimage_tmpl.cc index 2d3d73e5b2..2df0d80502 100644 --- a/src/realm/deppart/preimage_tmpl.cc +++ b/src/realm/deppart/preimage_tmpl.cc @@ -43,9 +43,15 @@ namespace Realm { #define N1 INST_N1 #define N2 INST_N2 +#ifdef REALM_USE_CUDA + #define GPU_PREIMAGE_LINE(N1,T1,N2,T2) template class GPUPreimageMicroOp; +#else + #define GPU_PREIMAGE_LINE(N1,T1,N2,T2) /* no CUDA */ +#endif + #define DOIT(T1,T2) \ template class PreimageMicroOp; \ -template class GPUPreimageMicroOp; \ +GPU_PREIMAGE_LINE(N1,T1,N2,T2) \ template class StructuredPreimageMicroOp; \ template class PreimageOperation; \ template PreimageMicroOp::PreimageMicroOp(NodeID, AsyncMicroOp *, Serialization::FixedBufferDeserializer&); \ From d82566d10092e95be1eba0ada606aa18705decfe Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Mon, 9 Feb 2026 15:02:30 -0800 Subject: [PATCH 07/32] renamed suggested to required and provided target proc instead of mem in buffer descriptor --- src/realm/deppart/byfield.cc | 42 +++++++++++++++++++----------- src/realm/deppart/byfield_tmpl.cc | 4 +-- src/realm/deppart/image.cc | 29 ++++++++++++++------- src/realm/deppart/image_tmpl.cc | 4 +-- src/realm/deppart/preimage.cc | 30 ++++++++++++++------- src/realm/deppart/preimage_tmpl.cc | 4 +-- src/realm/indexspace.h | 17 ++++++------ 7 files changed, 83 insertions(+), 47 deletions(-) diff --git a/src/realm/deppart/byfield.cc b/src/realm/deppart/byfield.cc index 9c9d5a4ad1..203843d81d 100644 --- a/src/realm/deppart/byfield.cc +++ b/src/realm/deppart/byfield.cc @@ -31,28 +31,40 @@ namespace Realm { template template - void IndexSpace::suggest_byfield_buffer_size( + void IndexSpace::required_byfield_buffer_size( const std::vector>& inputs, - std::vector& suggestions) const { - suggestions = std::vector(inputs.size()); + std::vector& requirements) const { + requirements = 
std::vector(inputs.size()); for (size_t i = 0; i < inputs.size(); i++) { IndexSpace is = inputs[i].space; Memory mem = inputs[i].location; if (mem.kind() == Memory::GPU_FB_MEM || mem.kind() == Memory::Z_COPY_MEM) { - const char* val = std::getenv("MIN_SIZE"); // or any env var - size_t device_size = 2000000; //default - if (val) { - device_size = atoi(val); - } - size_t optimal_size = is.bounds.volume() * sizeof(Rect); - suggestions[i].suggested = mem; - suggestions[i].lower_bound = device_size; - suggestions[i].upper_bound = max(device_size, optimal_size); + const char* val = std::getenv("MIN_SIZE"); // or any env var + size_t device_size = 2000000; //default + if (val) { + device_size = atoi(val); + } + size_t optimal_size = is.bounds.volume() * sizeof(Rect); + std::vector affinities; + unsigned best_bandwidth = 0; + Processor best_proc = Processor::NO_PROC; + Machine::get_machine().get_proc_mem_affinity(affinities, Processor::NO_PROC, mem); + for (auto affinity : affinities) { + if (affinity.bandwidth > best_bandwidth) { + best_bandwidth = affinity.bandwidth; + best_proc = affinity.p; + } + } + requirements[i].target_proc = best_proc; + requirements[i].lower_bound = device_size; + requirements[i].upper_bound = max(device_size, optimal_size); + requirements[i].minimum_alignment = 128; } else { - suggestions[i].suggested = Memory::NO_MEMORY; - suggestions[i].lower_bound = 0; - suggestions[i].upper_bound = 0; + requirements[i].target_proc = Processor::NO_PROC; + requirements[i].lower_bound = 0; + requirements[i].upper_bound = 0; + requirements[i].minimum_alignment = 0; } } } diff --git a/src/realm/deppart/byfield_tmpl.cc b/src/realm/deppart/byfield_tmpl.cc index c8e6db0bcd..fc15f5b94a 100644 --- a/src/realm/deppart/byfield_tmpl.cc +++ b/src/realm/deppart/byfield_tmpl.cc @@ -59,9 +59,9 @@ namespace Realm { std::vector >&, \ const ProfilingRequestSet &, \ Event) const; \ - template void IndexSpace::suggest_byfield_buffer_size( \ + template void 
IndexSpace::required_byfield_buffer_size( \ const std::vector>&, \ - std::vector&) const; + std::vector&) const; FOREACH_NTF(DOIT) diff --git a/src/realm/deppart/image.cc b/src/realm/deppart/image.cc index d0251687b4..437167a95b 100644 --- a/src/realm/deppart/image.cc +++ b/src/realm/deppart/image.cc @@ -32,10 +32,10 @@ namespace Realm { template template - void IndexSpace::suggest_image_buffer_size( + void IndexSpace::required_image_buffer_size( const std::vector>& source_spaces, const std::vector>& inputs, - std::vector& suggestions) const { + std::vector& requirements) const { size_t minimal_size = 0; size_t source_entries = 0; bool bvh = false; @@ -58,7 +58,7 @@ namespace Realm { (2 * source_entries * sizeof(uint64_t)) + (source_entries * sizeof(uint64_t)); } - suggestions = std::vector(inputs.size()); + requirements = std::vector(inputs.size()); for (size_t i = 0; i < inputs.size(); i++) { IndexSpace is = inputs[i].space; Memory mem = inputs[i].location; @@ -71,13 +71,24 @@ namespace Realm { } minimal_size = max(minimal_size, device_size); size_t optimal_size = is.bounds.volume() * sizeof(Rect) * source_spaces.size() + minimal_size; - suggestions[i].suggested = mem; - suggestions[i].lower_bound = minimal_size; - suggestions[i].upper_bound = optimal_size; + std::vector affinities; + unsigned best_bandwidth = 0; + Processor best_proc = Processor::NO_PROC; + Machine::get_machine().get_proc_mem_affinity(affinities, Processor::NO_PROC, mem); + for (auto affinity : affinities) { + if (affinity.bandwidth > best_bandwidth) { + best_bandwidth = affinity.bandwidth; + best_proc = affinity.p; + } + } + requirements[i].target_proc = best_proc; + requirements[i].lower_bound = minimal_size; + requirements[i].upper_bound = optimal_size; + requirements[i].minimum_alignment = 128; } else { - suggestions[i].suggested = Memory::NO_MEMORY; - suggestions[i].lower_bound = 0; - suggestions[i].upper_bound = 0; + requirements[i].target_proc = Processor::NO_PROC; + 
requirements[i].lower_bound = 0; + requirements[i].upper_bound = 0; } } } diff --git a/src/realm/deppart/image_tmpl.cc b/src/realm/deppart/image_tmpl.cc index 19242fa9ca..a2cb2cb9e6 100644 --- a/src/realm/deppart/image_tmpl.cc +++ b/src/realm/deppart/image_tmpl.cc @@ -56,10 +56,10 @@ namespace Realm { GPU_IMAGE_LINE(N1, T1, N2, T2) \ template class ImageOperation; \ template ImageMicroOp::ImageMicroOp(NodeID, AsyncMicroOp *, Serialization::FixedBufferDeserializer&); \ - template void IndexSpace::suggest_image_buffer_size( \ + template void IndexSpace::required_image_buffer_size( \ const std::vector>&, \ const std::vector>&, \ - std::vector&) const; \ + std::vector&) const; \ template Event IndexSpace::create_subspaces_by_image( \ const DomainTransform &, const std::vector > &, \ std::vector > &, const ProfilingRequestSet &, Event) const; \ diff --git a/src/realm/deppart/preimage.cc b/src/realm/deppart/preimage.cc index 63131916bc..d327df1c74 100644 --- a/src/realm/deppart/preimage.cc +++ b/src/realm/deppart/preimage.cc @@ -34,10 +34,10 @@ namespace Realm { template template - void IndexSpace::suggest_preimage_buffer_size( + void IndexSpace::required_preimage_buffer_size( const std::vector>& target_spaces, const std::vector>& inputs, - std::vector& suggestions) const { + std::vector& requirements) const { size_t minimal_size = 0; size_t source_entries = 0; bool bvh = false; @@ -60,7 +60,7 @@ namespace Realm { (2 * source_entries * sizeof(uint64_t)) + (source_entries * sizeof(uint64_t)); } - suggestions = std::vector(inputs.size()); + requirements = std::vector(inputs.size()); for (size_t i = 0; i < inputs.size(); i++) { IndexSpace is = inputs[i].space; Memory mem = inputs[i].location; @@ -73,13 +73,25 @@ namespace Realm { } minimal_size = max(minimal_size, device_size); size_t optimal_size = is.bounds.volume() * sizeof(Rect) * target_spaces.size() + minimal_size; - suggestions[i].suggested = mem; - suggestions[i].lower_bound = minimal_size; - 
suggestions[i].upper_bound = optimal_size; + std::vector affinities; + unsigned best_bandwidth = 0; + Processor best_proc = Processor::NO_PROC; + Machine::get_machine().get_proc_mem_affinity(affinities, Processor::NO_PROC, mem); + for (auto affinity : affinities) { + if (affinity.bandwidth > best_bandwidth) { + best_bandwidth = affinity.bandwidth; + best_proc = affinity.p; + } + } + requirements[i].target_proc = best_proc; + requirements[i].lower_bound = minimal_size; + requirements[i].upper_bound = optimal_size; + requirements[i].minimum_alignment = 128; } else { - suggestions[i].suggested = Memory::NO_MEMORY; - suggestions[i].lower_bound = 0; - suggestions[i].upper_bound = 0; + requirements[i].target_proc = Processor::NO_PROC; + requirements[i].lower_bound = 0; + requirements[i].upper_bound = 0; + requirements[i].minimum_alignment = 0; } } } diff --git a/src/realm/deppart/preimage_tmpl.cc b/src/realm/deppart/preimage_tmpl.cc index 2df0d80502..ef6725f567 100644 --- a/src/realm/deppart/preimage_tmpl.cc +++ b/src/realm/deppart/preimage_tmpl.cc @@ -55,10 +55,10 @@ GPU_PREIMAGE_LINE(N1,T1,N2,T2) \ template class StructuredPreimageMicroOp; \ template class PreimageOperation; \ template PreimageMicroOp::PreimageMicroOp(NodeID, AsyncMicroOp *, Serialization::FixedBufferDeserializer&); \ -template void IndexSpace::suggest_preimage_buffer_size( \ +template void IndexSpace::required_preimage_buffer_size( \ const std::vector>&, \ const std::vector>&, \ - std::vector&) const; \ + std::vector&) const; \ template Event IndexSpace::create_subspaces_by_preimage( \ const DomainTransform &, const std::vector > &, \ std::vector > &, const ProfilingRequestSet &, Event) \ diff --git a/src/realm/indexspace.h b/src/realm/indexspace.h index 9ea593b392..61109181fc 100644 --- a/src/realm/indexspace.h +++ b/src/realm/indexspace.h @@ -124,10 +124,11 @@ namespace Realm { Memory location; }; - struct DeppartEstimateSuggestion { - Memory suggested; + struct DeppartBufferRequirements { size_t 
lower_bound; size_t upper_bound; + size_t minimum_alignment = 128; + Processor target_proc; }; /** @@ -737,9 +738,9 @@ namespace Realm { const ProfilingRequestSet &reqs, Event wait_on = Event::NO_EVENT) const; template - REALM_PUBLIC_API void suggest_byfield_buffer_size( + REALM_PUBLIC_API void required_byfield_buffer_size( const std::vector>& inputs, - std::vector& suggestions) const; + std::vector& suggestions) const; ///@{ /** @@ -807,10 +808,10 @@ namespace Realm { Event wait_on = Event::NO_EVENT) const; template - REALM_PUBLIC_API void suggest_image_buffer_size( + REALM_PUBLIC_API void required_image_buffer_size( const std::vector>& source_spaces, const std::vector>& inputs, - std::vector& suggestions) const; + std::vector& suggestions) const; ///@} @@ -934,10 +935,10 @@ namespace Realm { Event wait_on = Event::NO_EVENT) const; template - REALM_PUBLIC_API void suggest_preimage_buffer_size( + REALM_PUBLIC_API void required_preimage_buffer_size( const std::vector>& target_spaces, const std::vector>& inputs, - std::vector& suggestions) const; + std::vector& suggestions) const; ///@} ///@{ From 0d921066e718b8f1b6a5611ecec72d08d0263f34 Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Tue, 10 Feb 2026 10:00:30 -0800 Subject: [PATCH 08/32] deleted default alignment --- src/realm/indexspace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/realm/indexspace.h b/src/realm/indexspace.h index 61109181fc..61ff97da55 100644 --- a/src/realm/indexspace.h +++ b/src/realm/indexspace.h @@ -127,7 +127,7 @@ namespace Realm { struct DeppartBufferRequirements { size_t lower_bound; size_t upper_bound; - size_t minimum_alignment = 128; + size_t minimum_alignment; Processor target_proc; }; From 59ad8780b83a171949893542ccfa8b91870e256b Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Tue, 10 Feb 2026 23:30:34 -0800 Subject: [PATCH 09/32] removed ft from byfield estimate template --- src/realm/deppart/byfield.cc | 1 - src/realm/deppart/byfield_tmpl.cc | 14 
++++++++++---- src/realm/indexspace.h | 1 - tests/deppart.cc | 3 +++ 4 files changed, 13 insertions(+), 6 deletions(-) diff --git a/src/realm/deppart/byfield.cc b/src/realm/deppart/byfield.cc index 203843d81d..9af275eb3d 100644 --- a/src/realm/deppart/byfield.cc +++ b/src/realm/deppart/byfield.cc @@ -30,7 +30,6 @@ namespace Realm { extern Logger log_uop_timing; template - template void IndexSpace::required_byfield_buffer_size( const std::vector>& inputs, std::vector& requirements) const { diff --git a/src/realm/deppart/byfield_tmpl.cc b/src/realm/deppart/byfield_tmpl.cc index fc15f5b94a..b9896c5c53 100644 --- a/src/realm/deppart/byfield_tmpl.cc +++ b/src/realm/deppart/byfield_tmpl.cc @@ -45,6 +45,13 @@ namespace Realm { #ifdef REALM_USE_CUDA #define GPU_BYFIELD_LINE(N, T, ...) template class GPUByFieldMicroOp; + #define DOIT_NT(N, T) \ + template void IndexSpace::required_byfield_buffer_size( \ + const std::vector>&, \ + std::vector&) const; + +FOREACH_NT(DOIT_NT) + #else #define GPU_BYFIELD_LINE(N, T, ...) 
/* no CUDA */ #endif @@ -58,10 +65,9 @@ namespace Realm { const std::vector&, \ std::vector >&, \ const ProfilingRequestSet &, \ - Event) const; \ - template void IndexSpace::required_byfield_buffer_size( \ - const std::vector>&, \ - std::vector&) const; + Event) const; + + FOREACH_NTF(DOIT) diff --git a/src/realm/indexspace.h b/src/realm/indexspace.h index 61ff97da55..14c8561e20 100644 --- a/src/realm/indexspace.h +++ b/src/realm/indexspace.h @@ -737,7 +737,6 @@ namespace Realm { const std::vector &colors, std::vector> &subspaces, const ProfilingRequestSet &reqs, Event wait_on = Event::NO_EVENT) const; - template REALM_PUBLIC_API void required_byfield_buffer_size( const std::vector>& inputs, std::vector& suggestions) const; diff --git a/tests/deppart.cc b/tests/deppart.cc index eaf4a012e8..742c9d9c8b 100644 --- a/tests/deppart.cc +++ b/tests/deppart.cc @@ -559,6 +559,9 @@ class BasicTest : public TestInterface { log_app.info() << "GPU Preimage complete " << Clock::current_time_in_microseconds() << "\n"; log_app.info() << "Starting GPU Image " << Clock::current_time_in_microseconds() << "\n"; + std::vector> spaces = {}; + std::vector requirements; + is_nodes.required_byfield_buffer_size(spaces, requirements); // an image of p_edges through out_node gives us all the shared nodes, along // with some private nodes Event e3 = is_nodes.create_subspaces_by_image(src_field_data_gpu, From b2f64a9c6f4392194123c2534cd299ffabcee371 Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Thu, 12 Feb 2026 11:48:05 -0800 Subject: [PATCH 10/32] renamed gpu deppart requirement functions --- src/realm/deppart/byfield.cc | 6 +++--- src/realm/deppart/byfield_tmpl.cc | 12 +++++------- src/realm/deppart/image.cc | 6 +++--- src/realm/deppart/image_tmpl.cc | 2 +- src/realm/deppart/preimage.cc | 6 +++--- src/realm/deppart/preimage_tmpl.cc | 2 +- src/realm/indexspace.h | 14 +++++++------- tests/deppart.cc | 2 +- 8 files changed, 24 insertions(+), 26 deletions(-) diff --git 
a/src/realm/deppart/byfield.cc b/src/realm/deppart/byfield.cc index 9af275eb3d..cfd2927589 100644 --- a/src/realm/deppart/byfield.cc +++ b/src/realm/deppart/byfield.cc @@ -30,7 +30,7 @@ namespace Realm { extern Logger log_uop_timing; template - void IndexSpace::required_byfield_buffer_size( + void IndexSpace::by_field_buffer_requirements( const std::vector>& inputs, std::vector& requirements) const { requirements = std::vector(inputs.size()); @@ -55,12 +55,12 @@ namespace Realm { best_proc = affinity.p; } } - requirements[i].target_proc = best_proc; + requirements[i].affinity_processor = best_proc; requirements[i].lower_bound = device_size; requirements[i].upper_bound = max(device_size, optimal_size); requirements[i].minimum_alignment = 128; } else { - requirements[i].target_proc = Processor::NO_PROC; + requirements[i].affinity_processor = Processor::NO_PROC; requirements[i].lower_bound = 0; requirements[i].upper_bound = 0; requirements[i].minimum_alignment = 0; diff --git a/src/realm/deppart/byfield_tmpl.cc b/src/realm/deppart/byfield_tmpl.cc index b9896c5c53..3da5121f04 100644 --- a/src/realm/deppart/byfield_tmpl.cc +++ b/src/realm/deppart/byfield_tmpl.cc @@ -45,12 +45,6 @@ namespace Realm { #ifdef REALM_USE_CUDA #define GPU_BYFIELD_LINE(N, T, ...) template class GPUByFieldMicroOp; - #define DOIT_NT(N, T) \ - template void IndexSpace::required_byfield_buffer_size( \ - const std::vector>&, \ - std::vector&) const; - -FOREACH_NT(DOIT_NT) #else #define GPU_BYFIELD_LINE(N, T, ...) 
/* no CUDA */ @@ -67,9 +61,13 @@ FOREACH_NT(DOIT_NT) const ProfilingRequestSet &, \ Event) const; +#define DOIT_NT(N, T) \ + template void IndexSpace::by_field_buffer_requirements( \ + const std::vector>&, \ + std::vector&) const; - +FOREACH_NT(DOIT_NT) FOREACH_NTF(DOIT) #define ZP(N,T) Point diff --git a/src/realm/deppart/image.cc b/src/realm/deppart/image.cc index 437167a95b..ff1122d820 100644 --- a/src/realm/deppart/image.cc +++ b/src/realm/deppart/image.cc @@ -32,7 +32,7 @@ namespace Realm { template template - void IndexSpace::required_image_buffer_size( + void IndexSpace::by_image_buffer_requirements( const std::vector>& source_spaces, const std::vector>& inputs, std::vector& requirements) const { @@ -81,12 +81,12 @@ namespace Realm { best_proc = affinity.p; } } - requirements[i].target_proc = best_proc; + requirements[i].affinity_processor = best_proc; requirements[i].lower_bound = minimal_size; requirements[i].upper_bound = optimal_size; requirements[i].minimum_alignment = 128; } else { - requirements[i].target_proc = Processor::NO_PROC; + requirements[i].affinity_processor = Processor::NO_PROC; requirements[i].lower_bound = 0; requirements[i].upper_bound = 0; } diff --git a/src/realm/deppart/image_tmpl.cc b/src/realm/deppart/image_tmpl.cc index a2cb2cb9e6..a0d3d7319a 100644 --- a/src/realm/deppart/image_tmpl.cc +++ b/src/realm/deppart/image_tmpl.cc @@ -56,7 +56,7 @@ namespace Realm { GPU_IMAGE_LINE(N1, T1, N2, T2) \ template class ImageOperation; \ template ImageMicroOp::ImageMicroOp(NodeID, AsyncMicroOp *, Serialization::FixedBufferDeserializer&); \ - template void IndexSpace::required_image_buffer_size( \ + template void IndexSpace::by_image_buffer_requirements( \ const std::vector>&, \ const std::vector>&, \ std::vector&) const; \ diff --git a/src/realm/deppart/preimage.cc b/src/realm/deppart/preimage.cc index d327df1c74..4ae8cd4ddc 100644 --- a/src/realm/deppart/preimage.cc +++ b/src/realm/deppart/preimage.cc @@ -34,7 +34,7 @@ namespace Realm { 
template template - void IndexSpace::required_preimage_buffer_size( + void IndexSpace::by_preimage_buffer_requirements( const std::vector>& target_spaces, const std::vector>& inputs, std::vector& requirements) const { @@ -83,12 +83,12 @@ namespace Realm { best_proc = affinity.p; } } - requirements[i].target_proc = best_proc; + requirements[i].affinity_processor = best_proc; requirements[i].lower_bound = minimal_size; requirements[i].upper_bound = optimal_size; requirements[i].minimum_alignment = 128; } else { - requirements[i].target_proc = Processor::NO_PROC; + requirements[i].affinity_processor = Processor::NO_PROC; requirements[i].lower_bound = 0; requirements[i].upper_bound = 0; requirements[i].minimum_alignment = 0; diff --git a/src/realm/deppart/preimage_tmpl.cc b/src/realm/deppart/preimage_tmpl.cc index ef6725f567..dadf4b8aa6 100644 --- a/src/realm/deppart/preimage_tmpl.cc +++ b/src/realm/deppart/preimage_tmpl.cc @@ -55,7 +55,7 @@ GPU_PREIMAGE_LINE(N1,T1,N2,T2) \ template class StructuredPreimageMicroOp; \ template class PreimageOperation; \ template PreimageMicroOp::PreimageMicroOp(NodeID, AsyncMicroOp *, Serialization::FixedBufferDeserializer&); \ -template void IndexSpace::required_preimage_buffer_size( \ +template void IndexSpace::by_preimage_buffer_requirements( \ const std::vector>&, \ const std::vector>&, \ std::vector&) const; \ diff --git a/src/realm/indexspace.h b/src/realm/indexspace.h index 14c8561e20..82071fd6ae 100644 --- a/src/realm/indexspace.h +++ b/src/realm/indexspace.h @@ -128,7 +128,7 @@ namespace Realm { size_t lower_bound; size_t upper_bound; size_t minimum_alignment; - Processor target_proc; + Processor affinity_processor; }; /** @@ -737,9 +737,9 @@ namespace Realm { const std::vector &colors, std::vector> &subspaces, const ProfilingRequestSet &reqs, Event wait_on = Event::NO_EVENT) const; - REALM_PUBLIC_API void required_byfield_buffer_size( + REALM_PUBLIC_API void by_field_buffer_requirements( const std::vector>& inputs, - 
std::vector& suggestions) const; + std::vector& requirements) const; ///@{ /** @@ -807,10 +807,10 @@ namespace Realm { Event wait_on = Event::NO_EVENT) const; template - REALM_PUBLIC_API void required_image_buffer_size( + REALM_PUBLIC_API void by_image_buffer_requirements( const std::vector>& source_spaces, const std::vector>& inputs, - std::vector& suggestions) const; + std::vector& requirements) const; ///@} @@ -934,10 +934,10 @@ namespace Realm { Event wait_on = Event::NO_EVENT) const; template - REALM_PUBLIC_API void required_preimage_buffer_size( + REALM_PUBLIC_API void by_preimage_buffer_requirements( const std::vector>& target_spaces, const std::vector>& inputs, - std::vector& suggestions) const; + std::vector& requirements) const; ///@} ///@{ diff --git a/tests/deppart.cc b/tests/deppart.cc index 742c9d9c8b..448d3a60d0 100644 --- a/tests/deppart.cc +++ b/tests/deppart.cc @@ -561,7 +561,7 @@ class BasicTest : public TestInterface { std::vector> spaces = {}; std::vector requirements; - is_nodes.required_byfield_buffer_size(spaces, requirements); + is_nodes.by_field_buffer_requirements(spaces, requirements); // an image of p_edges through out_node gives us all the shared nodes, along // with some private nodes Event e3 = is_nodes.create_subspaces_by_image(src_field_data_gpu, From 9f7be25d0d207dfa037c71d1114957114f4428b8 Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Thu, 12 Feb 2026 12:25:40 -0800 Subject: [PATCH 11/32] Added default initializations to DeppartBufferRequirements --- src/realm/indexspace.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/realm/indexspace.h b/src/realm/indexspace.h index 82071fd6ae..c1a61b21cb 100644 --- a/src/realm/indexspace.h +++ b/src/realm/indexspace.h @@ -125,9 +125,9 @@ namespace Realm { }; struct DeppartBufferRequirements { - size_t lower_bound; - size_t upper_bound; - size_t minimum_alignment; + size_t lower_bound = 0; + size_t upper_bound = 0; + size_t minimum_alignment = 0; Processor 
affinity_processor; }; From a72be3ed5522d2eaf50231a5d79b120b3dc00369 Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Tue, 17 Feb 2026 19:15:56 -0800 Subject: [PATCH 12/32] updated 1d image range --- src/realm/deppart/image.cc | 15 +- src/realm/deppart/image_gpu_impl.hpp | 283 +++++++++++-------- src/realm/deppart/partitions_gpu_impl.hpp | 25 +- src/realm/deppart/partitions_gpu_kernels.hpp | 1 + src/realm/deppart/rectlist.inl | 6 +- tests/deppart.cc | 30 ++ 6 files changed, 231 insertions(+), 129 deletions(-) diff --git a/src/realm/deppart/image.cc b/src/realm/deppart/image.cc index ff1122d820..b0dcd4383a 100644 --- a/src/realm/deppart/image.cc +++ b/src/realm/deppart/image.cc @@ -38,7 +38,7 @@ namespace Realm { std::vector& requirements) const { size_t minimal_size = 0; size_t source_entries = 0; - bool bvh = false; + bool bvh = true; for (auto subspace : source_spaces) { source_entries += subspace.entries == 0 ? 1 : subspace.entries; } @@ -70,7 +70,7 @@ namespace Realm { device_size = atoi(val); } minimal_size = max(minimal_size, device_size); - size_t optimal_size = is.bounds.volume() * sizeof(Rect) * source_spaces.size() + minimal_size; + size_t optimal_size = is.bounds.volume() * sizeof(Rect) * source_spaces.size() * 10 + minimal_size; std::vector affinities; unsigned best_bandwidth = 0; Processor best_proc = Processor::NO_PROC; @@ -285,6 +285,7 @@ namespace Realm { if(!bmpp) bmpp = &bitmasks[i]; if(!*bmpp) *bmpp = new BM; (*bmpp)->add_rect(it3.rect); + } } } @@ -704,10 +705,11 @@ namespace Realm { for (auto ptr_fdd : gpu_ptr_data) { // launch full cross-product of image micro ops right away assert(ptr_fdd.scratch_buffer != RegionInstance::NO_INST); - domain_transform.ptr_data = {ptr_fdd}; + DomainTransform domain_transform_copy = domain_transform; + domain_transform_copy.ptr_data = {ptr_fdd}; GPUImageMicroOp *micro_op = new GPUImageMicroOp( - parent, domain_transform, exclusive); + parent, domain_transform_copy, exclusive); for (size_t j = 0; j < 
sources.size(); j++) { micro_op->add_sparsity_output(sources[j], images[j]); } @@ -716,10 +718,11 @@ namespace Realm { for (auto rect_fdd : gpu_rect_data) { // launch full cross-product of image micro ops right away assert(rect_fdd.scratch_buffer != RegionInstance::NO_INST); - domain_transform.range_data = {rect_fdd}; + DomainTransform domain_transform_copy = domain_transform; + domain_transform_copy.range_data = {rect_fdd}; GPUImageMicroOp *micro_op = new GPUImageMicroOp( - parent, domain_transform, exclusive); + parent, domain_transform_copy, exclusive); for (size_t j = 0; j < sources.size(); j++) { micro_op->add_sparsity_output(sources[j], images[j]); } diff --git a/src/realm/deppart/image_gpu_impl.hpp b/src/realm/deppart/image_gpu_impl.hpp index b3c38789f5..ce83e03639 100644 --- a/src/realm/deppart/image_gpu_impl.hpp +++ b/src/realm/deppart/image_gpu_impl.hpp @@ -46,74 +46,40 @@ void GPUImageMicroOp::gpu_populate_rngs() NVTX_DEPPART(gpu_image); - Memory my_mem = domain_transform.range_data[0].inst.get_location(); + RegionInstance buffer = domain_transform.range_data[0].scratch_buffer; + size_t tile_size = buffer.get_layout()->bytes_used; + std::cout << "Using tile size of " << tile_size << " bytes." 
<< std::endl; + Arena buffer_arena(reinterpret_cast(AffineAccessor(buffer, 0).base), tile_size); cudaStream_t stream = Cuda::get_task_cuda_stream(); - const char* val = std::getenv("TILE_SIZE"); // or any env var - size_t tile_size = 100000000; //default - if (val) { - tile_size = atoi(val); - } - - RegionInstance fixed_buffer = this->realm_malloc(tile_size, my_mem); - Arena buffer_arena(reinterpret_cast(AffineAccessor(fixed_buffer, 0).base), tile_size); - collapsed_space src_space; - RegionInstance offsets_instance = this->realm_malloc((sources.size()+1) * sizeof(size_t), my_mem); - src_space.offsets = reinterpret_cast(AffineAccessor(offsets_instance, 0).base); + src_space.offsets = buffer_arena.alloc(sources.size()+1); src_space.num_children = sources.size(); - GPUMicroOp::collapse_multi_space(sources, src_space, buffer_arena, stream); collapsed_space inst_space; // We combine all of our instances into one to batch work, tracking the offsets between instances. - RegionInstance inst_offsets_instance = this->realm_malloc((domain_transform.range_data.size() + 1) * sizeof(size_t), my_mem); - inst_space.offsets = reinterpret_cast(AffineAccessor(inst_offsets_instance, 0).base); + inst_space.offsets = buffer_arena.alloc(domain_transform.range_data.size() + 1); inst_space.num_children = domain_transform.range_data.size(); - GPUMicroOp::collapse_multi_space(domain_transform.range_data, inst_space, buffer_arena, stream); + Arena sys_arena; + GPUMicroOp::collapse_multi_space(domain_transform.range_data, inst_space, sys_arena, stream); // This is used for count + emit: first pass counts how many rectangles survive intersection, second pass uses the counter // to figure out where to write each rectangle. 
- RegionInstance inst_counters_instance = this->realm_malloc((2*domain_transform.range_data.size() + 1) * sizeof(uint32_t), my_mem); - uint32_t* d_inst_counters = reinterpret_cast(AffineAccessor(inst_counters_instance, 0).base); + uint32_t* d_inst_counters = buffer_arena.alloc(2 * domain_transform.range_data.size()+1); // This will be a prefix sum over the counters, used first to figure out where to write in the emit phase, and second // to track which instance each rectangle came from in the populate phase. uint32_t* d_inst_prefix = d_inst_counters + domain_transform.range_data.size(); - RegionInstance valid_rects_instance; - size_t num_valid_rects; - RectDesc* d_valid_rects; - - // Here we intersect the instance spaces with the parent space, and make sure we know which instance each resulting rectangle came from. - GPUMicroOp::template construct_input_rectlist>(inst_space, src_space, d_valid_rects, num_valid_rects, d_inst_counters, d_inst_prefix, buffer_arena, stream); - inst_offsets_instance.destroy(); - - if (num_valid_rects == 0) { - for (SparsityMap it : sparsity_outputs) { - SparsityMapImpl *impl = SparsityMapImpl::lookup(it); - if (this->exclusive) { - impl->gpu_finalize(); - } else { - impl->contribute_nothing(); - } - } - valid_rects_instance.destroy(); - inst_counters_instance.destroy(); - return; - } - - // Prefix sum the valid rectangles by volume. - size_t* d_prefix_rects; - size_t total_pts; - - GPUMicroOp::volume_prefix_sum(d_valid_rects, num_valid_rects, d_prefix_rects, total_pts, buffer_arena, stream); + size_t num_valid_rects = tile_size; - RegionInstance rngs_instance = this->realm_malloc(total_pts * sizeof(RectDesc), my_mem); - RectDesc* d_rngs = reinterpret_cast*>(AffineAccessor(rngs_instance, 0).base); + collapsed_space collapsed_parent; + // We collapse the parent space to undifferentiate between dense and sparse and match downstream APIs. 
+ GPUMicroOp::collapse_parent_space(parent_space, collapsed_parent, buffer_arena, stream); Memory zcpy_mem; assert(find_memory(zcpy_mem, Memory::Z_COPY_MEM)); @@ -123,88 +89,174 @@ void GPUImageMicroOp::gpu_populate_rngs() d_accessors[i] = AffineAccessor,N2,T2>(domain_transform.range_data[i].inst, domain_transform.range_data[i].field_offset); } - image_gpuPopulateBitmasksRngsKernel<<>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, total_pts, num_valid_rects, domain_transform.range_data.size(), d_rngs); - KERNEL_CHECK(stream); + uint32_t* d_src_counters = buffer_arena.alloc(2 * sources.size() + 1); + uint32_t* d_src_prefix = d_src_counters + sources.size(); - RegionInstance parent_entries_instance; - collapsed_space collapsed_parent; + buffer_arena.commit(false); + size_t left = buffer_arena.used(); - // We collapse the parent space to undifferentiate between dense and sparse and match downstream APIs. - GPUMicroOp::collapse_parent_space(parent_space, collapsed_parent, buffer_arena, stream); - + size_t num_output = 0; + RectDesc* output_start = nullptr; + size_t num_completed = 0; + size_t curr_tile = tile_size / 2; + int count = 0; + while (num_completed < inst_space.num_entries) { + try { + std::cout << "Tile iteration " << count++ << ", completed " << num_completed << " / " << inst_space.num_entries << " entries." 
<< std::endl; + buffer_arena.start(); + buffer_arena.flip_parity(); + if (num_completed + curr_tile > inst_space.num_entries) { + curr_tile = inst_space.num_entries - num_completed; + } + collapsed_space inst_space_tile = inst_space; + inst_space_tile.num_entries = curr_tile; + inst_space_tile.entries_buffer = buffer_arena.alloc>(curr_tile); + CUDA_CHECK(cudaMemcpyAsync(inst_space_tile.entries_buffer, inst_space.entries_buffer + num_completed, curr_tile * sizeof(SparsityMapEntry), cudaMemcpyHostToDevice, stream), stream); + + RectDesc* d_valid_rects; + // Here we intersect the instance spaces with the parent space, and make sure we know which instance each resulting rectangle came from. + GPUMicroOp::template construct_input_rectlist>(inst_space_tile, src_space, d_valid_rects, num_valid_rects, d_inst_counters, d_inst_prefix, buffer_arena, stream); + + if (num_valid_rects == 0) { + num_completed += curr_tile; + curr_tile = tile_size / 2; + subtract_const<<>>(inst_space.offsets, domain_transform.range_data.size()+1, curr_tile); + KERNEL_CHECK(stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + continue; + } + + // Prefix sum the valid rectangles by volume. 
+ size_t* d_prefix_rects; + size_t total_pts; - RegionInstance src_counters_instance = this->realm_malloc(sources.size() * sizeof(uint32_t), my_mem); - uint32_t* d_src_counters = reinterpret_cast(AffineAccessor(src_counters_instance, 0).base); - CUDA_CHECK(cudaMemsetAsync(d_src_counters, 0, sources.size() * sizeof(uint32_t), stream), stream); + GPUMicroOp::volume_prefix_sum(d_valid_rects, num_valid_rects, d_prefix_rects, total_pts, buffer_arena, stream); + buffer_arena.flip_parity(); + RectDesc* d_rngs = buffer_arena.alloc>(total_pts); - //Finally, we do another two pass count + emit to intersect with the parent rectangles - image_intersect_output<<>>(collapsed_parent.entries_buffer, d_rngs, nullptr, collapsed_parent.num_entries, total_pts, d_src_counters, nullptr); - KERNEL_CHECK(stream); + image_gpuPopulateBitmasksRngsKernel<<>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, total_pts, num_valid_rects, domain_transform.range_data.size(), d_rngs); + KERNEL_CHECK(stream); - std::vector h_src_counters(sources.size()+1); - h_src_counters[0] = 0; // prefix sum starts at 0 - CUDA_CHECK(cudaMemcpyAsync(h_src_counters.data()+1, d_src_counters, sources.size() * sizeof(uint32_t), cudaMemcpyDeviceToHost, stream), stream); - CUDA_CHECK(cudaStreamSynchronize(stream), stream); - valid_rects_instance.destroy(); - accessors_instance.destroy(); + CUDA_CHECK(cudaMemsetAsync(d_src_counters, 0, sources.size() * sizeof(uint32_t), stream), stream); - for (size_t i = 0; i < sources.size(); ++i) { - h_src_counters[i+1] += h_src_counters[i]; - } - size_t num_valid_output = h_src_counters[sources.size()]; + //Finally, we do another two pass count + emit to intersect with the parent rectangles + image_intersect_output<<>>(collapsed_parent.entries_buffer, d_rngs, nullptr, collapsed_parent.num_entries, total_pts, d_src_counters, nullptr); + KERNEL_CHECK(stream); - if (num_valid_output == 0) { - for (SparsityMap it : sparsity_outputs) { - SparsityMapImpl *impl = 
SparsityMapImpl::lookup(it); - if (this->exclusive) { - impl->gpu_finalize(); - } else { - impl->contribute_nothing(); + std::vector h_src_counters(sources.size()+1); + h_src_counters[0] = 0; // prefix sum starts at 0 + CUDA_CHECK(cudaMemcpyAsync(h_src_counters.data()+1, d_src_counters, sources.size() * sizeof(uint32_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + for (size_t i = 0; i < sources.size(); ++i) { + h_src_counters[i+1] += h_src_counters[i]; } - } - parent_entries_instance.destroy(); - src_counters_instance.destroy(); - rngs_instance.destroy(); - return; - } + size_t num_valid_output = h_src_counters[sources.size()]; - RegionInstance valid_intersect_instance = this->realm_malloc(num_valid_output * sizeof(RectDesc), my_mem); - RectDesc* d_valid_intersect = reinterpret_cast*>(AffineAccessor(valid_intersect_instance, 0).base); + if (num_valid_output == 0) { + num_completed += curr_tile; + curr_tile = tile_size / 2; + subtract_const<<>>(inst_space.offsets, domain_transform.range_data.size()+1, curr_tile); + KERNEL_CHECK(stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + continue; + } - RegionInstance src_prefix_instance = this->realm_malloc((sources.size() + 1) * sizeof(uint32_t), my_mem); - uint32_t* d_src_prefix = reinterpret_cast(AffineAccessor(src_prefix_instance, 0).base); - CUDA_CHECK(cudaMemcpyAsync(d_src_prefix, h_src_counters.data(), (sources.size() + 1) * sizeof(size_t), cudaMemcpyHostToDevice, stream), stream); + buffer_arena.flip_parity(); + RectDesc* d_valid_intersect = buffer_arena.alloc>(num_valid_output); - CUDA_CHECK(cudaMemsetAsync(d_src_counters, 0, sources.size() * sizeof(uint32_t), stream), stream); + CUDA_CHECK(cudaMemcpyAsync(d_src_prefix, h_src_counters.data(), (sources.size() + 1) * sizeof(uint32_t), cudaMemcpyHostToDevice, stream), stream); + CUDA_CHECK(cudaMemsetAsync(d_src_counters, 0, sources.size() * sizeof(uint32_t), stream), stream); - 
image_intersect_output<<>>(collapsed_parent.entries_buffer, d_rngs, d_src_prefix, collapsed_parent.num_entries, total_pts, d_src_counters, d_valid_intersect); - KERNEL_CHECK(stream); + image_intersect_output<<>>(collapsed_parent.entries_buffer, d_rngs, d_src_prefix, collapsed_parent.num_entries, total_pts, d_src_counters, d_valid_intersect); + KERNEL_CHECK(stream); - CUDA_CHECK(cudaStreamSynchronize(stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); - src_prefix_instance.destroy(); - parent_entries_instance.destroy(); - src_counters_instance.destroy(); - rngs_instance.destroy(); + size_t num_new_rects = 2; + assert(!buffer_arena.get_parity()); + RectDesc* d_new_rects; - size_t out_rects = 0; - RectDesc* trash; - this->complete_rect_pipeline(d_valid_intersect, num_valid_output, trash, out_rects, buffer_arena, - /* the Container: */ sparsity_outputs, - /* getIndex: */ [&](auto const& elem){ - // elem is a SparsityMap from the vector - return size_t(&elem - sparsity_outputs.data()); - }, - /* getMap: */ [&](auto const& elem){ - // return the SparsityMap key itself - return elem; - }); + //Send it off for processing + this->complete_rect_pipeline(d_valid_intersect, num_valid_output, d_new_rects, num_new_rects, buffer_arena, + /* the Container: */ sparsity_outputs, + /* getIndex: */ [&](auto const& elem){ + // elem is a SparsityMap from the vector + return size_t(&elem - sparsity_outputs.data()); + }, + /* getMap: */ [&](auto const& elem){ + // return the SparsityMap key itself + return elem; + }); + + //Set our first set of output rectangles + if (num_output==0) { + + //We need to place the new output at the rightmost end of the buffer + buffer_arena.flip_parity(); + buffer_arena.reset(true); + output_start = buffer_arena.alloc>(num_new_rects); + buffer_arena.commit(true); + CUDA_CHECK(cudaMemcpyAsync(output_start, d_new_rects, num_new_rects * sizeof(RectDesc), cudaMemcpyDeviceToDevice, stream), stream); + num_output = num_new_rects; + num_completed 
+= curr_tile; + subtract_const<<>>(inst_space.offsets, domain_transform.range_data.size()+1, curr_tile); + KERNEL_CHECK(stream); + curr_tile = tile_size / 2; + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + continue; + } + + //Otherwise we merge with existing rectangles + RectDesc* d_old_rects = buffer_arena.alloc>(num_output); + assert(d_old_rects == d_new_rects + num_new_rects); + CUDA_CHECK(cudaMemcpyAsync(d_old_rects, output_start, num_output * sizeof(RectDesc), cudaMemcpyDeviceToDevice, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); - valid_intersect_instance.destroy(); + size_t num_final_rects = 1; + + //Send it off for processing + this->complete_rect_pipeline(d_new_rects, num_output + num_new_rects, output_start, num_final_rects, buffer_arena, + /* the Container: */ sparsity_outputs, + /* getIndex: */ [&](auto const& elem){ + // elem is a SparsityMap from the vector + return size_t(&elem - sparsity_outputs.data()); + }, + /* getMap: */ [&](auto const& elem){ + // return the SparsityMap key itself + return elem; + }); + num_completed += curr_tile; + num_output = num_final_rects; + subtract_const<<>>(inst_space.offsets, domain_transform.range_data.size()+1, curr_tile); + KERNEL_CHECK(stream); + curr_tile = tile_size / 2; + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + } + catch (arena_oom&) { + std::cout << "Caught arena_oom, reducing tile size from " << curr_tile << " to " << curr_tile / 2 << std::endl; + std::cout << buffer_arena.used() << " bytes used in arena." 
<< std::endl; + curr_tile /= 2; + if (curr_tile == 0) { + throw; + } + } + } + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + KERNEL_CHECK(stream); + this->send_output(output_start, num_output, buffer_arena, sparsity_outputs, + /* getIndex: */ [&](auto const& elem){ + // elem is a SparsityMap from the vector + return size_t(&elem - sparsity_outputs.data()); + }, + /* getMap: */ [&](auto const& elem){ + // return the SparsityMap key itself + return elem; + }); } @@ -249,8 +301,8 @@ void GPUImageMicroOp::gpu_populate_ptrs() inst_space.offsets = buffer_arena.alloc(domain_transform.ptr_data.size()+1); inst_space.num_children = domain_transform.ptr_data.size(); - Arena no; - GPUMicroOp::collapse_multi_space(domain_transform.ptr_data, inst_space, no, stream); + Arena sys_arena; + GPUMicroOp::collapse_multi_space(domain_transform.ptr_data, inst_space, sys_arena, stream); // This is used for count + emit: first pass counts how many rectangles survive intersection, second pass uses the counter // to figure out where to write each rectangle. 
@@ -305,15 +357,11 @@ void GPUImageMicroOp::gpu_populate_ptrs() RectDesc* d_valid_rects; GPUMicroOp::template construct_input_rectlist>(inst_space_tile, src_space, d_valid_rects, num_valid_rects, d_inst_counters, d_inst_prefix, buffer_arena, stream); - if (num_valid_rects == std::numeric_limits::max()) { - curr_tile /= 2; - continue; - } - if (num_valid_rects == 0) { num_completed += curr_tile; curr_tile = tile_size / 2; subtract_const<<>>(inst_space.offsets, domain_transform.ptr_data.size()+1, curr_tile); + KERNEL_CHECK(stream); CUDA_CHECK(cudaStreamSynchronize(stream), stream); continue; } @@ -344,6 +392,7 @@ void GPUImageMicroOp::gpu_populate_ptrs() num_completed += curr_tile; curr_tile = tile_size / 2; subtract_const<<>>(inst_space.offsets, domain_transform.ptr_data.size()+1, curr_tile); + KERNEL_CHECK(stream); CUDA_CHECK(cudaStreamSynchronize(stream), stream); continue; } @@ -385,6 +434,7 @@ void GPUImageMicroOp::gpu_populate_ptrs() num_output = num_new_rects; num_completed += curr_tile; subtract_const<<>>(inst_space.offsets, domain_transform.ptr_data.size()+1, curr_tile); + KERNEL_CHECK(stream); curr_tile = tile_size / 2; CUDA_CHECK(cudaStreamSynchronize(stream), stream); continue; @@ -411,6 +461,7 @@ void GPUImageMicroOp::gpu_populate_ptrs() num_completed += curr_tile; num_output = num_final_rects; subtract_const<<>>(inst_space.offsets, domain_transform.ptr_data.size()+1, curr_tile); + KERNEL_CHECK(stream); curr_tile = tile_size / 2; CUDA_CHECK(cudaStreamSynchronize(stream), stream); } diff --git a/src/realm/deppart/partitions_gpu_impl.hpp b/src/realm/deppart/partitions_gpu_impl.hpp index b1459f2ede..42b640660b 100644 --- a/src/realm/deppart/partitions_gpu_impl.hpp +++ b/src/realm/deppart/partitions_gpu_impl.hpp @@ -104,6 +104,8 @@ namespace Realm { void GPUMicroOp::collapse_multi_space(const std::vector& spaces, collapsed_space &out_space, Arena &my_arena, cudaStream_t stream) { + out_space.bounds = Rect::make_empty(); + char *val = 
std::getenv("SHATTER_SIZE"); // or any env var int shatter_size = 1; //default if (val) { @@ -123,6 +125,7 @@ namespace Realm { } else { my_space = spaces[i].index_space; } + out_space.bounds = out_space.bounds.union_bbox(my_space.bounds); if (my_space.dense()) { if constexpr (std::is_same_v>) { out_space.num_entries += 1; @@ -208,6 +211,7 @@ namespace Realm { entry.bounds = parent_space.bounds; out_space.entries_buffer = my_arena.alloc>(1); out_space.num_entries = 1; + out_space.bounds = parent_space.bounds; CUDA_CHECK(cudaMemcpyAsync(out_space.entries_buffer, &entry, sizeof(SparsityMapEntry), cudaMemcpyHostToDevice, stream), stream); } else { span> tmp = parent_space.sparsity.impl()->get_entries(); @@ -225,7 +229,6 @@ namespace Realm { template void GPUMicroOp::build_bvh(const collapsed_space &space, BVH &result, Arena &my_arena, cudaStream_t stream) { - //We want to keep the entire BVH that we return in one instance for convenience. size_t indices_instance_size = space.num_entries * sizeof(uint64_t); size_t labels_instance_size = space.offsets == nullptr ? 
0 : space.num_entries * sizeof(size_t); @@ -1316,18 +1319,28 @@ namespace Realm { CUDA_CHECK(cudaMemcpyAsync(&last_grp, &group_ids[num_intermediate-1], sizeof(size_t), cudaMemcpyDeviceToHost, stream), stream); CUDA_CHECK(cudaStreamSynchronize(stream), stream); - my_arena.rollback(prev); my_arena.flip_parity(); assert(my_arena.get_parity()); - my_arena.reset(true); + + if (out_rects == 1) { + my_arena.reset(true); + } d_rects_out = my_arena.alloc>(last_grp); - my_arena.commit(true); + if (out_rects == 1) { + my_arena.commit(true); + } init_rects_dim<<>>(d_rects_in, d_hi_flags_out, break_points, group_ids, d_rects_out, num_intermediate, 0); KERNEL_CHECK(stream); num_intermediate = last_grp; - std::swap(d_rects_in, d_rects_out); + if (out_rects == 2) { + my_arena.flip_parity(); + d_rects_in = my_arena.alloc>(num_intermediate); + CUDA_CHECK(cudaMemcpyAsync(d_rects_in, d_rects_out, num_intermediate * sizeof(RectDesc), cudaMemcpyDeviceToDevice, stream), stream); + } else { + std::swap(d_rects_in, d_rects_out); + } CUDA_CHECK(cudaStreamSynchronize(stream), stream); } @@ -1496,6 +1509,8 @@ namespace Realm { CUDA_CHECK(cudaMemsetAsync(d_ends, 0, ctr.size()*sizeof(size_t),stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + //Convert RectDesc to SparsityMapEntry and determine where each src's rectangles start and end. 
build_final_output<<>>(d_rects, final_entries, final_rects, d_starts, d_ends, total_rects); KERNEL_CHECK(stream); diff --git a/src/realm/deppart/partitions_gpu_kernels.hpp b/src/realm/deppart/partitions_gpu_kernels.hpp index f3c1dd514e..2f607930d9 100644 --- a/src/realm/deppart/partitions_gpu_kernels.hpp +++ b/src/realm/deppart/partitions_gpu_kernels.hpp @@ -674,6 +674,7 @@ void mark_breaks_dim(const RectDesc* in, int d) { size_t i = blockIdx.x*blockDim.x + threadIdx.x; + if(i >= M) return; if(i == 0) { brk[0] = 1; return; } diff --git a/src/realm/deppart/rectlist.inl b/src/realm/deppart/rectlist.inl index 621476e511..233d14c5c2 100644 --- a/src/realm/deppart/rectlist.inl +++ b/src/realm/deppart/rectlist.inl @@ -647,8 +647,10 @@ namespace Realm { // as_map.rbegin()->second << "\n"; // bigger than everything - see if we can merge with the last guy T &last = as_map.rbegin()->second; - if(last == (r.lo[0] - 1)) - last = r.hi[0]; + if(last >= (r.lo[0] - 1)) { + if (last < r.hi[0]) + last = r.hi[0]; + } else if(last < (r.lo[0] - 1)) as_map[r.lo[0]] = r.hi[0]; } else { diff --git a/tests/deppart.cc b/tests/deppart.cc index 448d3a60d0..70c6e9dfc1 100644 --- a/tests/deppart.cc +++ b/tests/deppart.cc @@ -1612,11 +1612,41 @@ class RangeTest : public TestInterface { std::vector> p_garbage_rects, p_garbage_colors; log_app.info() << "WARMING UP " << "\n"; + std::vector> field_estimate_input(rect_id_data_gpu.size()); + std::vector field_estimate_output(rect_id_data_gpu.size()); + std::vector> image_estimate_input(rect_val_data_gpu.size()); + std::vector image_estimate_output(rect_val_data_gpu.size()); + std::vector> subspace_input(colors.size()); + for (size_t i = 0; i < rect_id_data_gpu.size(); i++) { + field_estimate_input[i].location = rect_id_data_gpu[i].inst.get_location(); + field_estimate_input[i].space = rect_id_data_gpu[i].index_space; + } + for (size_t i = 0; i < rect_val_data_gpu.size(); i++) { + image_estimate_input[i].location = 
rect_val_data_gpu[i].inst.get_location(); + image_estimate_input[i].space = rect_val_data_gpu[i].index_space; + } + + is_rects.by_field_buffer_requirements(field_estimate_input, field_estimate_output); + std::vector byte_fields = {sizeof(char)}; + for (size_t i = 0; i < rect_id_data_gpu.size(); i++) { + IndexSpace<1> instance_index_space(Rect<1>(0, field_estimate_output[i].upper_bound-1)); + RegionInstance::create_instance(rect_id_data_gpu[i].scratch_buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); + } + Event e001 = is_rects.create_subspaces_by_field(rect_id_data_gpu, colors, p_garbage_colors, Realm::ProfilingRequestSet()); if (wait_on_events) e001.wait(); + for (size_t i = 0; i < colors.size(); i++) { + subspace_input[i].space = p_garbage_colors[i]; + subspace_input[i].entries = p_garbage_colors[i].sparsity.impl()->get_entries().size(); + } + is_nodes.by_image_buffer_requirements(subspace_input, image_estimate_input, image_estimate_output); + for (size_t i = 0; i < rect_val_data_gpu.size(); i++) { + IndexSpace<1> instance_index_space(Rect<1>(0, (image_estimate_output[i].upper_bound)/4-1)); + RegionInstance::create_instance(rect_val_data_gpu[i].scratch_buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); + } Event e002 = is_nodes.create_subspaces_by_image(rect_val_data_gpu, p_garbage_colors, p_garbage_rects, From c9325aed3803e157d0866df1ebb4464368c1896f Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Wed, 18 Feb 2026 22:00:54 -0800 Subject: [PATCH 13/32] working multidimensional, no fixed buffer --- src/realm/deppart/byfield.cc | 3 +- src/realm/deppart/byfield_gpu_impl.hpp | 88 ++-- src/realm/deppart/partitions.h | 24 +- src/realm/deppart/partitions_gpu_impl.hpp | 23 +- tests/deppart.cc | 543 +++++++++++++++++++++- 5 files changed, 594 insertions(+), 87 deletions(-) diff --git a/src/realm/deppart/byfield.cc b/src/realm/deppart/byfield.cc index cfd2927589..b9d4bf5e43 100644 
--- a/src/realm/deppart/byfield.cc +++ b/src/realm/deppart/byfield.cc @@ -44,7 +44,7 @@ namespace Realm { if (val) { device_size = atoi(val); } - size_t optimal_size = is.bounds.volume() * sizeof(Rect); + size_t optimal_size = is.bounds.volume() * sizeof(Rect) * 100; std::vector affinities; unsigned best_bandwidth = 0; Processor best_proc = Processor::NO_PROC; @@ -446,6 +446,7 @@ namespace Realm { } #ifdef REALM_USE_CUDA for (auto fdd : gpu_field_data) { + assert(fdd.scratch_buffer != RegionInstance::NO_INST); std::vector,FT> > single_gpu_field_data = {fdd}; GPUByFieldMicroOp *uop = new GPUByFieldMicroOp(parent, single_gpu_field_data, exclusive); for (size_t i = 0; i < colors.size(); i++) { diff --git a/src/realm/deppart/byfield_gpu_impl.hpp b/src/realm/deppart/byfield_gpu_impl.hpp index f2aa8c3288..c7e619e06d 100644 --- a/src/realm/deppart/byfield_gpu_impl.hpp +++ b/src/realm/deppart/byfield_gpu_impl.hpp @@ -23,24 +23,15 @@ void GPUByFieldMicroOp::execute() cudaStream_t stream = Cuda::get_task_cuda_stream(); - Memory my_mem = field_data[0].inst.get_location(); - collapsed_space inst_space; - const char* val = std::getenv("TILE_SIZE"); // or any env var - size_t tile_size = 100000000; //default - if (val) { - tile_size = atoi(val); - } + size_t tile_size = field_data[0].scratch_buffer.get_layout()->bytes_used; - RegionInstance fixed_buffer = this->realm_malloc(tile_size, my_mem); - Arena buffer_arena(reinterpret_cast(AffineAccessor(fixed_buffer, 0).base), tile_size); + Arena buffer_arena(reinterpret_cast(AffineAccessor(field_data[0].scratch_buffer, 0).base), tile_size); inst_space.offsets = buffer_arena.alloc(field_data.size() + 1); inst_space.num_children = field_data.size(); - GPUMicroOp::collapse_multi_space(field_data, inst_space, buffer_arena, stream); - collapsed_space collapsed_parent; // We collapse the parent space to undifferentiate between dense and sparse and match downstream APIs. 
@@ -49,8 +40,7 @@ void GPUByFieldMicroOp::execute() // This is used for count + emit: first pass counts how many rectangles survive intersection, second pass uses the counter // to figure out where to write each rectangle. - RegionInstance inst_counters_instance = this->realm_malloc((2*field_data.size() + 1) * sizeof(uint32_t), my_mem); - uint32_t* d_inst_counters = reinterpret_cast(AffineAccessor(inst_counters_instance, 0).base); + uint32_t* d_inst_counters = buffer_arena.alloc(2*field_data.size() + 1); // This will be a prefix sum over the counters, used first to figure out where to write in the emit phase, and second // to track which instance each rectangle came from in the populate phase. @@ -58,37 +48,7 @@ void GPUByFieldMicroOp::execute() size_t num_valid_rects = 0; Rect* d_valid_rects; - // Here we intersect the instance spaces with the parent space, and make sure we know which instance each resulting rectangle came from. - GPUMicroOp::template construct_input_rectlist>(inst_space, collapsed_parent, d_valid_rects, num_valid_rects, d_inst_counters, d_inst_prefix, buffer_arena, stream); - - - // Early out if we don't have any rectangles. - if (num_valid_rects == 0) { - for (std::pair> it : sparsity_outputs) { - SparsityMapImpl *impl = SparsityMapImpl::lookup(it.second); - if (this->exclusive) { - impl->gpu_finalize(); - } else { - impl->contribute_nothing(); - } - } - inst_counters_instance.destroy(); - return; - } - - - // Prefix sum the valid rectangles by volume. - size_t total_pts; - - size_t* d_prefix_rects; - GPUMicroOp::volume_prefix_sum(d_valid_rects, num_valid_rects, d_prefix_rects, total_pts, buffer_arena, stream); - - // Now we have everything we need to actually populate our outputs. 
- RegionInstance points_instance = this->realm_malloc(total_pts * sizeof(PointDesc), my_mem); - PointDesc* d_points = reinterpret_cast*>(AffineAccessor(points_instance, 0).base); - FT* d_colors; - RegionInstance colors_instance; // Memcpying a boolean vector breaks things for some reason so we have this disgusting workaround. @@ -97,13 +57,11 @@ void GPUByFieldMicroOp::execute() for (size_t i = 0; i < colors.size(); i++) { flat_colors[i] = colors[i] ? 1 : 0; } - colors_instance = this->realm_malloc(total_pts * sizeof(PointDesc), my_mem); - uint8_t* d_flat_colors = reinterpret_cast(AffineAccessor(colors_instance, 0).base); + uint8_t* d_flat_colors = buffer_arena.alloc(colors.size()); CUDA_CHECK(cudaMemcpyAsync(d_flat_colors, flat_colors.data(), colors.size() * sizeof(uint8_t), cudaMemcpyHostToDevice, stream), stream); d_colors = reinterpret_cast(d_flat_colors); } else { - colors_instance = this->realm_malloc(colors.size() * sizeof(FT), my_mem); - d_colors = reinterpret_cast(AffineAccessor(colors_instance, 0).base); + d_colors = buffer_arena.alloc(colors.size()); CUDA_CHECK(cudaMemcpyAsync(d_colors, colors.data(), colors.size() * sizeof(FT), cudaMemcpyHostToDevice, stream), stream); } @@ -118,8 +76,39 @@ void GPUByFieldMicroOp::execute() d_accessors[i] = AffineAccessor(field_data[i].inst, field_data[i].field_offset); } + buffer_arena.commit(false); + + GPUMicroOp::collapse_multi_space(field_data, inst_space, buffer_arena, stream); + + // Here we intersect the instance spaces with the parent space, and make sure we know which instance each resulting rectangle came from. + GPUMicroOp::template construct_input_rectlist>(inst_space, collapsed_parent, d_valid_rects, num_valid_rects, d_inst_counters, d_inst_prefix, buffer_arena, stream); + // Early out if we don't have any rectangles. 
+ if (num_valid_rects == 0) { + for (std::pair> it : sparsity_outputs) { + SparsityMapImpl *impl = SparsityMapImpl::lookup(it.second); + if (this->exclusive) { + impl->gpu_finalize(); + } else { + impl->contribute_nothing(); + } + } + return; + } + + + // Prefix sum the valid rectangles by volume. + size_t total_pts; + + size_t* d_prefix_rects; + GPUMicroOp::volume_prefix_sum(d_valid_rects, num_valid_rects, d_prefix_rects, total_pts, buffer_arena, stream); + + // Now we have everything we need to actually populate our outputs. + buffer_arena.flip_parity(); + assert(!buffer_arena.get_parity()); + PointDesc* d_points = buffer_arena.alloc>(total_pts); + // This is where the work is actually done - each thread figures out which points to read, reads it, marks a PointDesc with its color, and writes it out. byfield_gpuPopulateBitmasksKernel<<>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, d_colors, total_pts, colors.size(), num_valid_rects, field_data.size(), d_points); KERNEL_CHECK(stream); @@ -132,9 +121,6 @@ void GPUByFieldMicroOp::execute() } CUDA_CHECK(cudaStreamSynchronize(stream), stream); - colors_instance.destroy(); - accessors_instance.destroy(); - inst_counters_instance.destroy(); // Ship off the points for final processing. size_t out_rects = 0; @@ -149,7 +135,5 @@ void GPUByFieldMicroOp::execute() // return the SparsityMap key itself return kv.second; }); - - points_instance.destroy(); } } diff --git a/src/realm/deppart/partitions.h b/src/realm/deppart/partitions.h index 051717d803..222e553ee5 100644 --- a/src/realm/deppart/partitions.h +++ b/src/realm/deppart/partitions.h @@ -123,6 +123,18 @@ namespace Realm { } } + size_t mark(bool dir) const noexcept { + return dir ? right_ : left_; + } + + void rollback(size_t mark, bool dir) noexcept { + if (dir) { + right_ = mark; + } else { + left_ = mark; + } + } + template T* alloc(size_t count = 1) { return parity_ ? 
alloc_right(count) : alloc_left(count); @@ -171,16 +183,22 @@ namespace Realm { void* alloc_left_bytes(size_t bytes, size_t align = alignof(std::max_align_t)) { const size_t aligned = align_up(left_, align); - if (aligned + bytes + right_ > cap_) throw arena_oom{}; + if (aligned + bytes + right_ > cap_) { + throw arena_oom{}; + } void* p = base_ + aligned; left_ = aligned + bytes; return p; } void* alloc_right_bytes(size_t bytes, size_t align = alignof(std::max_align_t)) { - if (bytes + right_ > cap_) throw arena_oom{}; + if (bytes + right_ > cap_) { + throw arena_oom{}; + } const size_t aligned = align_down(cap_ - right_ - bytes, align); - if (aligned < left_) throw arena_oom{}; + if (aligned < left_) { + throw arena_oom{}; + } void *p = base_ + aligned; right_ = cap_ - aligned; return p; diff --git a/src/realm/deppart/partitions_gpu_impl.hpp b/src/realm/deppart/partitions_gpu_impl.hpp index 42b640660b..de21b7fc99 100644 --- a/src/realm/deppart/partitions_gpu_impl.hpp +++ b/src/realm/deppart/partitions_gpu_impl.hpp @@ -679,8 +679,6 @@ namespace Realm { RegionInstance exc_sum_instance = this->realm_malloc(num_corners * total_rects * sizeof(size_t), my_mem); - size_t per_elem_size = 2*alloc_size_1 + sizeof(uint8_t) + sizeof(size_t); - size_t* d_src_keys_in = reinterpret_cast(AffineAccessor(shared_instance, 0).base); size_t* d_src_keys_out = reinterpret_cast(AffineAccessor(shared_instance, 0).base) + num_corners * total_rects; T* d_coord_keys_in = reinterpret_cast(AffineAccessor(shared_instance, 0).base); @@ -962,17 +960,6 @@ namespace Realm { CUDA_CHECK(cudaMemcpyAsync(&last_count, &d_seg_counters[num_segments-1], sizeof(uint32_t), cudaMemcpyDeviceToHost, stream), stream); CUDA_CHECK(cudaStreamSynchronize(stream), stream); next_round += last_count; - if (out_rects > 0 && (next_round + last_count) * per_elem_size > out_rects) { - shared_instance.destroy(); - flags_instance.destroy(); - exc_sum_instance.destroy(); - seg_bound_instance.destroy(); - 
seg_counters.destroy(); - seg_counters_out.destroy(); - corners_instance.destroy(); - out_rects = std::numeric_limits::max(); - return; - } num_intermediate = next_round; @@ -1190,10 +1177,6 @@ namespace Realm { CUDA_CHECK(cudaStreamSynchronize(stream), stream); } - heads_instance.destroy(); - shared_instance.destroy(); - tmp_instance.destroy(); - //And... we're done if (out_rects > 0) { d_out_rects = d_rects_in; @@ -1369,7 +1352,8 @@ namespace Realm { NVTX_DEPPART(complete_pipeline); - size_t prev = my_arena.mark(); + my_arena.flip_parity(); + cudaStream_t stream = Cuda::get_task_cuda_stream(); @@ -1466,7 +1450,7 @@ namespace Realm { num_intermediate = last_grp; std::swap(d_rects_in, d_rects_out); } - my_arena.rollback(prev); + my_arena.flip_parity(); d_out_rects = my_arena.alloc>(num_intermediate); CUDA_CHECK(cudaMemcpyAsync(d_out_rects, d_rects_in, num_intermediate * sizeof(RectDesc), cudaMemcpyDeviceToDevice, stream), stream); CUDA_CHECK(cudaStreamSynchronize(stream), stream); @@ -1476,7 +1460,6 @@ namespace Realm { out_rects = num_intermediate; } else { this->send_output(d_rects_in, num_intermediate, my_arena, ctr, getIndex, getMap); - my_arena.rollback(prev); } } diff --git a/tests/deppart.cc b/tests/deppart.cc index 70c6e9dfc1..8fde66845d 100644 --- a/tests/deppart.cc +++ b/tests/deppart.cc @@ -44,7 +44,7 @@ enum INIT_BASIC_DATA_TASK, INIT_TILE_DATA_TASK, INIT_RANGE_DATA_TASK, - INIT_2D_DATA_TASK, + INIT_RANGE2D_DATA_TASK, INIT_PENNANT_DATA_TASK, INIT_MINIAERO_DATA_TASK, }; @@ -501,6 +501,19 @@ class BasicTest : public TestInterface { } wait_on_events = true; log_app.info() << "warming up" << Clock::current_time_in_microseconds() << "\n"; + const char* val = std::getenv("TILE_SIZE"); // or any env var + size_t tile_size = 100000000; //default + if (val) { + tile_size = atoi(val); + } + std::vector byte_fields = {sizeof(char)}; + IndexSpace<1> instance_index_space(Rect<1>(0, tile_size-1)); + for (size_t i = 0; i < piece_field_data_gpu.size(); i++) { + 
RegionInstance::create_instance(piece_field_data_gpu[i].scratch_buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); + } + for (size_t i = 0; i < src_field_data_gpu.size(); i++) { + RegionInstance::create_instance(src_field_data_gpu[i].scratch_buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); + } std::vector > p_garbage_nodes, p_garbage_edges, p_garbage_rd, p_garbage_preimage_edges; Event e01 = is_nodes.create_subspaces_by_field(piece_field_data_gpu, colors, @@ -516,16 +529,6 @@ class BasicTest : public TestInterface { // an image of p_edges through out_node gives us all the shared nodes, along // with some private nodes - const char* val = std::getenv("TILE_SIZE"); // or any env var - size_t tile_size = 100000000; //default - if (val) { - tile_size = atoi(val); - } - std::vector byte_fields = {sizeof(char)}; - IndexSpace<1> instance_index_space(Rect<1>(0, tile_size-1)); - for (size_t i = 0; i < src_field_data_gpu.size(); i++) { - RegionInstance::create_instance(src_field_data_gpu[i].scratch_buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); - } Event e03 = is_nodes.create_subspaces_by_image(src_field_data_gpu, p_garbage_edges, p_garbage_rd, @@ -1749,6 +1752,518 @@ class RangeTest : public TestInterface { } }; +class Range2DTest : public TestInterface { +public: + // graph config parameters + int num_nodes = 1000; + int num_rects = 1000; + int max_rect_size = 10; + int num_pieces = 4; + + Range2DTest(int argc, const char *argv[]) + { + for(int i = 1; i < argc; i++) { + + if(!strcmp(argv[i], "-p")) { + num_pieces = atoi(argv[++i]); + continue; + } + + if(!strcmp(argv[i], "-n")) { + num_nodes = atoi(argv[++i]); + continue; + } + + if (!strcmp(argv[i], "-r")) { + num_rects = atoi(argv[++i]); + continue; + } + + if (!strcmp(argv[i], "-m")) { + max_rect_size = atoi(argv[++i]); + continue; + } + } + + if (num_nodes <= 0 || num_rects <= 0) 
{ + log_app.error() << "Invalid graph dimensions in input file: rects=" << num_rects << " nodes=" << num_nodes; + exit(1); + } + + } + + + + struct InitDataArgs { + int index; + RegionInstance ri_nodes; + RegionInstance ri_rects; + }; + + enum PRNGStreams { + NODE_SUBGRAPH_STREAM, + }; + + void random_rect_data(int idx, int& subgraph) + { + if(random_colors) + subgraph = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_pieces); + else + subgraph = idx * num_pieces / num_rects; + } + + void random_node_data(int idx, int& subgraph) + { + if(true) + subgraph = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_pieces); + else + subgraph = idx * num_pieces / num_nodes; + } + + void initialize_rect_data(int idx, Rect<2> &rect, int max_rect_size = 10) + { + + int x = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_nodes); + int y = Philox_2x32<>::rand_int(random_seed, idx + 1, NODE_SUBGRAPH_STREAM, num_nodes); + int length = Philox_2x32<>::rand_int(random_seed, idx + 2, NODE_SUBGRAPH_STREAM, max_rect_size); + int height = Philox_2x32<>::rand_int(random_seed, idx + 3, NODE_SUBGRAPH_STREAM, max_rect_size); + rect.lo[0] = x; + rect.hi[0] = x + length; + rect.lo[1] = y; + rect.hi[1] = y + height; + } + + + static void init_data_task_wrapper(const void *args, size_t arglen, + const void *userdata, size_t userlen, Processor p) + { + Range2DTest *me = (Range2DTest *)testcfg; + me->init_data_task(args, arglen, p); + } + + void init_data_task(const void *args, size_t arglen, Processor p) + { + const InitDataArgs& i_args = *(const InitDataArgs *)args; + + log_app.info() << "init task #" << i_args.index << " (ri_nodes=" << i_args.ri_nodes << ", ri_rects=" << i_args.ri_rects << ")"; + + i_args.ri_nodes.fetch_metadata(p).wait(); + i_args.ri_rects.fetch_metadata(p).wait(); + + IndexSpace<2> is_nodes = i_args.ri_nodes.get_indexspace<2>(); + IndexSpace<1> is_rects = i_args.ri_rects.get_indexspace<1>(); + + log_app.debug() << "N: " 
<< is_nodes; + log_app.debug() << "E: " << is_rects; + + { + AffineAccessor a_piece_id(i_args.ri_rects, 0 /* offset */); + + for(int i = is_rects.bounds.lo; i <= is_rects.bounds.hi; i++) { + int subgraph; + random_rect_data(i, subgraph); + a_piece_id.write(i, subgraph); + } + } + { + AffineAccessor a_piece_id(i_args.ri_nodes, 0 /* offset */); + + for(int i = is_nodes.bounds.lo[0]; i <= is_nodes.bounds.hi[0]; i++) { + for (int j = is_nodes.bounds.lo[1]; j <= is_nodes.bounds.hi[1]; j++) { + int idx = i * (is_nodes.bounds.hi[1] - is_nodes.bounds.lo[1] + 1) + j; + int subgraph; + random_node_data(idx, subgraph); + a_piece_id.write(Point<2>(i, j), subgraph); + } + } + } + + + { + + AffineAccessor, 1> a_rect(i_args.ri_rects, 1 * sizeof(int) /* offset */); + + // Read edges line by line + for(int i = is_rects.bounds.lo; i <= is_rects.bounds.hi; i++) { + Rect<2> rect; + initialize_rect_data(i, rect, max_rect_size); + a_rect.write(i, rect); + } + } + + if(show_graph) { + AffineAccessor a_piece_id(i_args.ri_nodes, 0 /* offset */); + + for(int i = is_nodes.bounds.lo[0]; i <= is_nodes.bounds.hi[1]; i++) { + for (int j = is_nodes.bounds.lo[1]; j <= is_nodes.bounds.hi[1]; j++) { + Point<2> p(i, j); + log_app.info() << "node_id[" << p << "] = " << a_piece_id.read(p) << "\n"; + } + } + + AffineAccessor a_rect_id(i_args.ri_rects, 0 * sizeof(Point<1>) /* offset */); + + for(int i = is_rects.bounds.lo; i <= is_rects.bounds.hi; i++) + log_app.info() << "rect_id[" << i << "] = " << a_rect_id.read(i) << "\n"; + + AffineAccessor,1> a_rect_val(i_args.ri_rects, 1 * sizeof(int) /* offset */); + + for(int i = is_rects.bounds.lo; i <= is_rects.bounds.hi; i++) + log_app.info() << "rect_val[" << i << "] = " << a_rect_val.read(i) << "\n"; + } + } + + IndexSpace<1> is_rects; + IndexSpace<2> is_nodes; + std::vector ri_nodes; + std::vector, int> > node_id_field_data; + std::vector ri_rects; + std::vector, int> > rect_id_field_data; + std::vector, Rect<2> > > rect_val_field_data; + + virtual void 
print_info(void) + { + printf("Realm dependent partitioning test - 2D ranges: %d nodes, %d rects, %d pieces\n", + (int)num_nodes, (int)num_rects, (int)num_pieces); + } + + virtual Event initialize_data(const std::vector& memories, + const std::vector& procs) + { + // now create index spaces for nodes and edges + is_nodes = Rect<2>(Point<2>(0, 0), Point<2>(num_nodes - 1, num_nodes - 1)); + is_rects = Rect<1>(0, num_rects - 1); + + // equal partition is used to do initial population of edges and nodes + std::vector > ss_nodes_eq; + std::vector > ss_rects_eq; + + log_app.info() << "Creating equal subspaces" << "\n"; + + is_nodes.create_equal_subspaces(num_pieces, 1, ss_nodes_eq, Realm::ProfilingRequestSet()).wait(); + is_rects.create_equal_subspaces(num_pieces, 1, ss_rects_eq, Realm::ProfilingRequestSet()).wait(); + + log_app.debug() << "Initial partitions:\n"; + for(size_t i = 0; i < ss_nodes_eq.size(); i++) + log_app.debug() << " Nodes #" << i << ": " << ss_nodes_eq[i]; + for(size_t i = 0; i < ss_rects_eq.size(); i++) + log_app.debug() << " Rects #" << i << ": " << ss_rects_eq[i]; + + // create instances for each of these subspaces + std::vector node_fields, rect_fields; + node_fields.push_back(sizeof(int)); // piece_id + rect_fields.push_back(sizeof(int)); // src_node + rect_fields.push_back(sizeof(Rect<2>)); // dst_node + + ri_nodes.resize(num_pieces); + node_id_field_data.resize(num_pieces); + + for(size_t i = 0; i < ss_nodes_eq.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, + memories[i % memories.size()], + ss_nodes_eq[i], + node_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + ri_nodes[i] = ri; + + node_id_field_data[i].index_space = ss_nodes_eq[i]; + node_id_field_data[i].inst = ri_nodes[i]; + node_id_field_data[i].field_offset = 0; + } + + ri_rects.resize(num_pieces); + rect_id_field_data.resize(num_pieces); + rect_val_field_data.resize(num_pieces); + + for(size_t i = 0; i < ss_rects_eq.size(); i++) { + RegionInstance 
ri; + RegionInstance::create_instance(ri, + memories[i % memories.size()], + ss_rects_eq[i], + rect_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + ri_rects[i] = ri; + + rect_id_field_data[i].index_space = ss_rects_eq[i]; + rect_id_field_data[i].inst = ri_rects[i]; + rect_id_field_data[i].field_offset = 0; + + rect_val_field_data[i].index_space = ss_rects_eq[i]; + rect_val_field_data[i].inst = ri_rects[i]; + rect_val_field_data[i].field_offset = 1 * sizeof(int); + } + + // fire off tasks to initialize data + std::set events; + for(int i = 0; i < num_pieces; i++) { + Processor p = procs[i % procs.size()]; + InitDataArgs args; + args.index = i; + args.ri_nodes = ri_nodes[i]; + args.ri_rects = ri_rects[i]; + Event e = p.spawn(INIT_RANGE2D_DATA_TASK, &args, sizeof(args)); + events.insert(e); + } + + return Event::merge_events(events); + } + + // the outputs of our partitioning will be: + // is_private, is_shared - subsets of is_nodes based on private/shared + // p_rd, p_wr, p_ghost - subsets of the above split by subckt + // p_edges - subsets of is_edges for each subckt + + std::vector > p_colored_rects; + std::vector> p_rects, p_intersect, p_diff; + std::vector> p_colored_rects_cpu; + std::vector> p_rects_cpu, p_intersect_cpu, p_diff_cpu; + + IndexSpace<2> cpu_union, gpu_union, garbage_union; + + virtual Event perform_partitioning(void) + { + // first partition nodes by subckt id (this is the independent partition, + // but not actually used by the app) + + std::vector colors(num_pieces); + for(int i = 0; i < num_pieces; i++) + colors[i] = i; + + Memory gpu_memory; + bool found_gpu_memory = false; + Machine machine = Machine::get_machine(); + std::set all_memories; + machine.get_all_memories(all_memories); + for(auto& memory : all_memories) { + if(memory.kind() == Memory::GPU_FB_MEM) { + gpu_memory = memory; + found_gpu_memory = true; + break; + } + } + assert(found_gpu_memory); + std::vector rect_fields; + rect_fields.push_back(sizeof(int)); + 
rect_fields.push_back(sizeof(Rect<2>)); + std::vector node_fields; + node_fields.push_back(sizeof(int)); + + std::vector, int > > node_id_data_gpu; + std::vector, int > > rect_id_data_gpu; + std::vector, Rect<2>>> rect_val_data_gpu; + node_id_data_gpu.resize(num_pieces); + rect_id_data_gpu.resize(num_pieces); + rect_val_data_gpu.resize(num_pieces); + for (int i = 0; i < num_pieces; i++) { + RegionInstance node_id_instance; + RegionInstance rect_id_instance; + RegionInstance rect_val_instance; + RegionInstance::create_instance(node_id_instance, + gpu_memory, + node_id_field_data[i].index_space, + node_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + RegionInstance::create_instance(rect_id_instance, + gpu_memory, + rect_id_field_data[i].index_space, + rect_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + RegionInstance::create_instance(rect_val_instance, + gpu_memory, + rect_val_field_data[i].index_space, + rect_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + CopySrcDstField node_id_gpu_field, node_id_cpu_field, rect_id_gpu_field, rect_id_cpu_field, rect_val_gpu_field, rect_val_cpu_field; + node_id_gpu_field.inst = node_id_instance; + node_id_gpu_field.size = sizeof(int); + node_id_gpu_field.field_id = 0; + node_id_cpu_field.inst = node_id_field_data[i].inst; + node_id_cpu_field.size = sizeof(int); + node_id_cpu_field.field_id = 0; + rect_id_gpu_field.inst = rect_id_instance; + rect_id_gpu_field.size = sizeof(int); + rect_id_gpu_field.field_id = 0; + rect_id_cpu_field.inst = rect_id_field_data[i].inst; + rect_id_cpu_field.size = sizeof(int); + rect_id_cpu_field.field_id = 0; + rect_val_gpu_field.inst = rect_val_instance; + rect_val_gpu_field.size = sizeof(Rect<2>); + rect_val_gpu_field.field_id = sizeof(int); + rect_val_cpu_field.inst = rect_val_field_data[i].inst; + rect_val_cpu_field.size = sizeof(Rect<2>); + rect_val_cpu_field.field_id = sizeof(int); + std::vector node_id_gpu_data, node_id_cpu_data, rect_id_gpu_data, 
rect_id_cpu_data, rect_val_gpu_data, rect_val_cpu_data; + node_id_gpu_data.push_back(node_id_gpu_field); + node_id_cpu_data.push_back(node_id_cpu_field); + rect_id_gpu_data.push_back(rect_id_gpu_field); + rect_id_cpu_data.push_back(rect_id_cpu_field); + rect_val_gpu_data.push_back(rect_val_gpu_field); + rect_val_cpu_data.push_back(rect_val_cpu_field); + Event copy_event = node_id_field_data[i].index_space.copy(node_id_cpu_data, node_id_gpu_data, Realm::ProfilingRequestSet()); + copy_event.wait(); + Event second_copy_event = rect_id_field_data[i].index_space.copy(rect_id_cpu_data, rect_id_gpu_data, Realm::ProfilingRequestSet()); + second_copy_event.wait(); + Event third_copy_event = rect_val_field_data[i].index_space.copy(rect_val_cpu_data, rect_val_gpu_data, Realm::ProfilingRequestSet()); + third_copy_event.wait(); + node_id_data_gpu[i].inst = node_id_instance; + node_id_data_gpu[i].index_space = node_id_field_data[i].index_space; + node_id_data_gpu[i].field_offset = 0; + rect_id_data_gpu[i].inst = rect_id_instance; + rect_id_data_gpu[i].index_space = rect_id_field_data[i].index_space; + rect_id_data_gpu[i].field_offset = 0; + rect_val_data_gpu[i].inst = rect_val_instance; + rect_val_data_gpu[i].index_space = rect_val_field_data[i].index_space; + rect_val_data_gpu[i].field_offset = sizeof(int); + } + wait_on_events = true; + std::vector> p_garbage_colors; + std::vector> p_garbage_rects; + log_app.info() << "WARMING UP " << "\n"; + + std::vector> field_estimate_input(rect_id_data_gpu.size()); + std::vector field_estimate_output(rect_id_data_gpu.size()); + std::vector> image_estimate_input(rect_val_data_gpu.size()); + std::vector image_estimate_output(rect_val_data_gpu.size()); + std::vector> subspace_input(colors.size()); + for (size_t i = 0; i < rect_id_data_gpu.size(); i++) { + field_estimate_input[i].location = rect_id_data_gpu[i].inst.get_location(); + field_estimate_input[i].space = rect_id_data_gpu[i].index_space; + } + for (size_t i = 0; i < 
rect_val_data_gpu.size(); i++) { + image_estimate_input[i].location = rect_val_data_gpu[i].inst.get_location(); + image_estimate_input[i].space = rect_val_data_gpu[i].index_space; + } + + is_rects.by_field_buffer_requirements(field_estimate_input, field_estimate_output); + std::vector byte_fields = {sizeof(char)}; + for (size_t i = 0; i < rect_id_data_gpu.size(); i++) { + IndexSpace<1> instance_index_space(Rect<1>(0, field_estimate_output[i].upper_bound-1)); + RegionInstance::create_instance(rect_id_data_gpu[i].scratch_buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); + } + + Event e001 = is_rects.create_subspaces_by_field(rect_id_data_gpu, + colors, + p_garbage_colors, + Realm::ProfilingRequestSet()); + if (wait_on_events) e001.wait(); + for (size_t i = 0; i < colors.size(); i++) { + subspace_input[i].space = p_garbage_colors[i]; + subspace_input[i].entries = p_garbage_colors[i].sparsity.impl()->get_entries().size(); + } + is_nodes.by_image_buffer_requirements(subspace_input, image_estimate_input, image_estimate_output); + for (size_t i = 0; i < rect_val_data_gpu.size(); i++) { + IndexSpace<1> instance_index_space(Rect<1>(0, (image_estimate_output[i].upper_bound*5)-1)); + RegionInstance::create_instance(rect_val_data_gpu[i].scratch_buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); + } + Event e002 = is_nodes.create_subspaces_by_image(rect_val_data_gpu, + p_garbage_colors, + p_garbage_rects, + Realm::ProfilingRequestSet(), + e001); + if(wait_on_events) e002.wait(); + + log_app.info() << "FINISHED WARMING UP " << "\n"; + log_app.info() << "starting GPU partitioning " << Clock::current_time_in_microseconds() << "\n"; + + log_app.info() << "STARTING GPU BY FIELD " << Clock::current_time_in_microseconds() << "\n"; + + Event e01 = is_rects.create_subspaces_by_field(rect_id_data_gpu, + colors, + p_colored_rects, + Realm::ProfilingRequestSet()); + if (wait_on_events) e01.wait(); + 
+ log_app.info() << "FINISHED GPU BY FIELD " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "STARTING GPU BY IMAGE " << Clock::current_time_in_microseconds() << "\n"; + Event e02 = is_nodes.create_subspaces_by_image(rect_val_data_gpu, + p_colored_rects, + p_rects, + Realm::ProfilingRequestSet(), + e01); + if(wait_on_events) e02.wait(); + log_app.info() << "FINISHED GPU BY IMAGE " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "GPU Partitioning complete " << Clock::current_time_in_microseconds() << "\n"; + + log_app.info() << "STARTING CPU partitioning " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "STARTING CPU BY FIELD " << Clock::current_time_in_microseconds() << "\n"; + Event e1 = is_rects.create_subspaces_by_field(rect_id_field_data, + colors, + p_colored_rects_cpu, + Realm::ProfilingRequestSet()); + if (wait_on_events) e1.wait(); + log_app.info() << "FINISHED CPU BY FIELD " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "STARTING CPU BY IMAGE " << Clock::current_time_in_microseconds() << "\n"; + Event e2 = is_nodes.create_subspaces_by_image(rect_val_field_data, + p_colored_rects_cpu, + p_rects_cpu, + Realm::ProfilingRequestSet(), + e1); + if(wait_on_events) e2.wait(); + log_app.info() << "FINISHED CPU BY IMAGE " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "CPU Partitioning complete " << Clock::current_time_in_microseconds() << "\n"; + return e2; + } + + + + virtual int perform_dynamic_checks(void) + { + return 0; + } + + virtual int check_partitioning(void) + { + log_app.info() << "Checking correctness of partitioning " << "\n"; + int errors = 0; + + for (int i = 0; i < num_pieces; i++) { + for (IndexSpaceIterator<1> it(p_colored_rects[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if(!p_colored_rects_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! 
GPU has extra colored rect point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_colored_rects_cpu[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_colored_rects[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing colored rect point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<2> it(p_rects[i]); it.valid; it.step()) { + for (PointInRectIterator<2> point(it.rect); point.valid; point.step()) { + if (!p_rects_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU has extra rect point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<2> it(p_rects_cpu[i]); it.valid; it.step()) { + for (PointInRectIterator<2> point(it.rect); point.valid; point.step()) { + if (!p_rects[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing rect point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + } + return errors; + } +}; + class MiniAeroTest : public TestInterface { public: enum ProblemType @@ -4430,6 +4945,11 @@ int main(int argc, char **argv) break; } + if (!strcmp(argv[i], "multi")) { + testcfg = new Range2DTest(argc - i, const_cast(argv + i)); + break; + } + if(!strcmp(argv[i], "pennant")) { testcfg = new PennantTest(argc - i, const_cast(argv + i)); break; @@ -4469,6 +4989,7 @@ int main(int argc, char **argv) rt.register_task(INIT_BASIC_DATA_TASK, BasicTest::init_data_task_wrapper); rt.register_task(INIT_TILE_DATA_TASK, TileTest::init_data_task_wrapper); rt.register_task(INIT_RANGE_DATA_TASK, RangeTest::init_data_task_wrapper); + rt.register_task(INIT_RANGE2D_DATA_TASK, Range2DTest::init_data_task_wrapper); rt.register_task(INIT_MINIAERO_DATA_TASK, MiniAeroTest::init_data_task_wrapper); signal(SIGALRM, sigalrm_handler); From 761cd1b32bce41c459d090aa33dbab596e454b45 Mon Sep 17 00:00:00 2001 From: Rohan 
Chanani Date: Thu, 19 Feb 2026 13:36:56 -0800 Subject: [PATCH 14/32] working multidimensional --- src/realm/deppart/byfield_gpu_impl.hpp | 10 +- src/realm/deppart/partitions.h | 19 +- src/realm/deppart/partitions_gpu_impl.hpp | 424 +++++++++++----------- 3 files changed, 225 insertions(+), 228 deletions(-) diff --git a/src/realm/deppart/byfield_gpu_impl.hpp b/src/realm/deppart/byfield_gpu_impl.hpp index c7e619e06d..8765a57f11 100644 --- a/src/realm/deppart/byfield_gpu_impl.hpp +++ b/src/realm/deppart/byfield_gpu_impl.hpp @@ -33,10 +33,12 @@ void GPUByFieldMicroOp::execute() inst_space.num_children = field_data.size(); collapsed_space collapsed_parent; + collapsed_parent.offsets = buffer_arena.alloc(2); + collapsed_parent.num_children = 1; + std::vector> parent_spaces = {parent_space}; // We collapse the parent space to undifferentiate between dense and sparse and match downstream APIs. - GPUMicroOp::collapse_parent_space(parent_space, collapsed_parent, buffer_arena, stream); - + GPUMicroOp::collapse_multi_space(parent_spaces, collapsed_parent, buffer_arena, stream); // This is used for count + emit: first pass counts how many rectangles survive intersection, second pass uses the counter // to figure out where to write each rectangle. @@ -107,7 +109,9 @@ void GPUByFieldMicroOp::execute() // Now we have everything we need to actually populate our outputs. buffer_arena.flip_parity(); assert(!buffer_arena.get_parity()); - PointDesc* d_points = buffer_arena.alloc>(total_pts); + + RegionInstance points_instance = this->realm_malloc(total_pts * sizeof(PointDesc), zcpy_mem); + PointDesc* d_points = reinterpret_cast*>(AffineAccessor(points_instance, 0).base); // This is where the work is actually done - each thread figures out which points to read, reads it, marks a PointDesc with its color, and writes it out. 
byfield_gpuPopulateBitmasksKernel<<>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, d_colors, total_pts, colors.size(), num_valid_rects, field_data.size(), d_points); diff --git a/src/realm/deppart/partitions.h b/src/realm/deppart/partitions.h index 222e553ee5..4a8899e251 100644 --- a/src/realm/deppart/partitions.h +++ b/src/realm/deppart/partitions.h @@ -137,7 +137,12 @@ namespace Realm { template T* alloc(size_t count = 1) { - return parity_ ? alloc_right(count) : alloc_left(count); + static_assert(!std::is_void_v, "alloc is invalid"); + return reinterpret_cast(alloc_bytes(count * sizeof(T), alignof(T))); + } + + void* alloc_bytes(size_t bytes, size_t align = alignof(std::max_align_t)) { + return parity_ ? alloc_right_bytes(bytes, align) : alloc_left_bytes(bytes, align); } void flip_parity(void) noexcept { @@ -204,18 +209,6 @@ namespace Realm { return p; } - template - T* alloc_left(size_t count = 1) { - static_assert(!std::is_void_v, "alloc is invalid"); - return reinterpret_cast(alloc_left_bytes(sizeof(T) * count, alignof(T))); - } - - template - T* alloc_right(size_t count = 1) { - static_assert(!std::is_void_v, "alloc is invalid"); - return reinterpret_cast(alloc_right_bytes(sizeof(T) * count, alignof(T))); - } - static size_t align_up(size_t x, size_t a) noexcept { return (x + (a - 1)) & ~(a - 1); } diff --git a/src/realm/deppart/partitions_gpu_impl.hpp b/src/realm/deppart/partitions_gpu_impl.hpp index de21b7fc99..565a413fa0 100644 --- a/src/realm/deppart/partitions_gpu_impl.hpp +++ b/src/realm/deppart/partitions_gpu_impl.hpp @@ -515,35 +515,30 @@ namespace Realm { cudaStream_t stream = Cuda::get_task_cuda_stream(); Memory my_mem; - bool found = find_memory(my_mem, Memory::GPU_FB_MEM); - assert(found); + assert(find_memory(my_mem, Memory::GPU_FB_MEM)); - RegionInstance srcs_instance = this->realm_malloc(4*total_rects*sizeof(int32_t), my_mem); - RegionInstance crds_instance = this->realm_malloc(4*total_rects*sizeof(T), my_mem); - RegionInstance 
heads_instance = this->realm_malloc(2*total_rects * sizeof(uint8_t), my_mem); - RegionInstance sum_instance = this->realm_malloc(2*total_rects * sizeof(size_t), my_mem); + assert(!my_arena.get_parity()); + size_t beginning = my_arena.mark(); - RegionInstance B_src_inst[N]; - RegionInstance B_coord_inst[N]; + uint32_t* srcs_ptr = my_arena.alloc(4 * total_rects); + T* crds_ptr = my_arena.alloc(4 * total_rects); + uint8_t* heads_ptr = my_arena.alloc(2 * total_rects); + size_t* sums_ptr = my_arena.alloc(2 * total_rects); + + size_t left_restore = my_arena.mark(); + size_t right_restore = my_arena.mark(true); size_t *B_starts[N]; size_t *B_ends[N]; T* B_coord[N]; size_t B_size[N]; - - RegionInstance B_ptrs_instance = this->realm_malloc(2 * N * sizeof(size_t*), my_mem); - size_t** B_start_ptrs = reinterpret_cast(AffineAccessor(B_ptrs_instance, 0).base); - size_t** B_end_ptrs = reinterpret_cast(AffineAccessor(B_ptrs_instance, 0).base) + N; - - RegionInstance B_coord_ptrs_instance = this->realm_malloc(N * sizeof(T*), my_mem); - T** B_coord_ptrs = reinterpret_cast(AffineAccessor(B_coord_ptrs_instance, 0).base); int threads_per_block = 256; size_t grid_size = (total_rects + threads_per_block - 1) / threads_per_block; - RegionInstance tmp_instance; size_t orig_tmp = 0; + size_t temp_restore = my_arena.mark(); void *tmp_storage = nullptr; //Our first step is to find all the unique "boundaries" in each dimension (lo coord or hi+1 coord) @@ -553,10 +548,10 @@ namespace Realm { //We need the coordinates to be sorted by our curent dim and separated by src idx grid_size = (total_rects + threads_per_block - 1) / threads_per_block; - uint32_t* d_srcs_in = reinterpret_cast(AffineAccessor(srcs_instance, 0).base); - uint32_t* d_srcs_out = reinterpret_cast(AffineAccessor(srcs_instance, 0).base) + 2* total_rects; - T* d_coord_keys_in = reinterpret_cast(AffineAccessor(crds_instance,0).base); - T* d_coord_keys_out = reinterpret_cast(AffineAccessor(crds_instance,0).base) + 2 * total_rects; + 
uint32_t* d_srcs_in = srcs_ptr; + uint32_t* d_srcs_out = srcs_ptr + 2* total_rects; + T* d_coord_keys_in = crds_ptr; + T* d_coord_keys_out = crds_ptr + 2 * total_rects; mark_endpoints<<>>(d_rects, total_rects, d, d_srcs_in, d_coord_keys_in); KERNEL_CHECK(stream); size_t temp_bytes; @@ -566,11 +561,10 @@ namespace Realm { 2 * total_rects, 0, 8*sizeof(T), stream); if (temp_bytes > orig_tmp) { if (orig_tmp > 0) { - tmp_instance.destroy(); + my_arena.rollback(temp_restore); } - tmp_instance = this->realm_malloc(temp_bytes, my_mem); orig_tmp = temp_bytes; - tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); + tmp_storage = reinterpret_cast(my_arena.alloc(temp_bytes)); } cub::DeviceRadixSort::SortPairs(tmp_storage, temp_bytes, d_coord_keys_in, d_coord_keys_out, @@ -584,11 +578,10 @@ namespace Realm { 2 * total_rects, 0, 8*sizeof(uint32_t), stream); if (temp_bytes > orig_tmp) { if (orig_tmp > 0) { - tmp_instance.destroy(); + my_arena.rollback(temp_restore); } - tmp_instance = this->realm_malloc(temp_bytes, my_mem); orig_tmp = temp_bytes; - tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); + tmp_storage = reinterpret_cast(my_arena.alloc(temp_bytes)); } cub::DeviceRadixSort::SortPairs(tmp_storage, temp_bytes, d_srcs_in, d_srcs_out, @@ -597,19 +590,18 @@ namespace Realm { //Now mark the unique keys grid_size = (2*total_rects + threads_per_block - 1) / threads_per_block; - uint8_t * d_heads = reinterpret_cast(AffineAccessor(heads_instance, 0).base); - size_t *d_output = reinterpret_cast(AffineAccessor(sum_instance, 0).base); + uint8_t * d_heads = heads_ptr; + size_t *d_output = sums_ptr; mark_heads<<>>(d_srcs_out, d_coord_keys_out, 2 * total_rects, d_heads); KERNEL_CHECK(stream); cub::DeviceScan::ExclusiveSum(nullptr, temp_bytes, d_heads, d_output, 2 * total_rects, stream); if (temp_bytes > orig_tmp) { if (orig_tmp > 0) { - tmp_instance.destroy(); + my_arena.rollback(temp_restore); } - tmp_instance = this->realm_malloc(temp_bytes, 
my_mem); orig_tmp = temp_bytes; - tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); + tmp_storage = reinterpret_cast(my_arena.alloc(temp_bytes)); } cub::DeviceScan::ExclusiveSum(tmp_storage, temp_bytes, d_heads, d_output, 2 * total_rects, stream); @@ -620,13 +612,21 @@ namespace Realm { CUDA_CHECK(cudaStreamSynchronize(stream), stream); num_unique += last_bit; + my_arena.flip_parity(); + assert(my_arena.get_parity()); + my_arena.rollback(right_restore); + //Collect all the data we'll need later for this dimension - starts/ends by src, unique boundaries, unique boundaries count - B_coord_inst[d] = this->realm_malloc(num_unique * sizeof(T), my_mem); - B_src_inst[d] = this->realm_malloc(2*ctr.size() * sizeof(size_t), my_mem); - B_starts[d] = reinterpret_cast(AffineAccessor(B_src_inst[d], 0).base); - B_ends[d] = reinterpret_cast(AffineAccessor(B_src_inst[d], 0).base) + ctr.size(); - B_coord[d] = reinterpret_cast(AffineAccessor(B_coord_inst[d], 0).base); + B_starts[d] = my_arena.alloc(2 *ctr.size()); + B_ends[d] = B_starts[d] + ctr.size(); + B_coord[d] = my_arena.alloc(num_unique); B_size[d] = num_unique; + + right_restore = my_arena.mark(); + my_arena.flip_parity(); + assert(!my_arena.get_parity()); + my_arena.rollback(left_restore); + CUDA_CHECK(cudaMemsetAsync(B_starts[d], 0, ctr.size() * sizeof(size_t), stream), stream); CUDA_CHECK(cudaMemsetAsync(B_ends[d], 0, ctr.size() * sizeof(size_t), stream), stream); scatter_unique<<>>(d_srcs_out, d_coord_keys_out, d_output, d_heads, 2 * total_rects, B_starts[d], B_ends[d], B_coord[d]); @@ -645,13 +645,24 @@ namespace Realm { CUDA_CHECK(cudaMemcpyAsync(B_ends[d], d_ends_host.data(), ctr.size() * sizeof(size_t), cudaMemcpyHostToDevice, stream), stream); } + assert(!my_arena.get_parity()); + my_arena.rollback(beginning); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); } - srcs_instance.destroy(); - crds_instance.destroy(); - heads_instance.destroy(); - sum_instance.destroy(); + orig_tmp = 0; + + 
my_arena.flip_parity(); + assert(my_arena.get_parity()); + my_arena.rollback(right_restore); + + size_t** B_start_ptrs = my_arena.alloc(2 * N); + size_t** B_end_ptrs = B_start_ptrs + N; + + T** B_coord_ptrs = my_arena.alloc(N); + + right_restore = my_arena.mark(); //We need the arrays themselves on the device CUDA_CHECK(cudaMemcpyAsync(B_coord_ptrs, B_coord, N * sizeof(T*), cudaMemcpyHostToDevice, stream), stream); @@ -660,50 +671,54 @@ namespace Realm { //Next up, we generate all the corners of all the rectangles and mark them by parity size_t num_corners = (1 << N); - RegionInstance corners_instance = this->realm_malloc(2 * num_corners * total_rects * sizeof(CornerDesc), my_mem); - CornerDesc* d_corners_in = reinterpret_cast*>(AffineAccessor(corners_instance, 0).base); - CornerDesc* d_corners_out = reinterpret_cast*>(AffineAccessor(corners_instance, 0).base) + num_corners * total_rects; + CornerDesc* d_corners_in = my_arena.alloc>(2 * num_corners * total_rects); + CornerDesc* d_corners_out = d_corners_in + num_corners * total_rects; + + size_t corner_restore = my_arena.mark(); + + my_arena.flip_parity(); + assert(!my_arena.get_parity()); + my_arena.flip_parity(); + my_arena.rollback(corner_restore); populate_corners<<>>(d_rects, total_rects, d_corners_in); KERNEL_CHECK(stream); // We have a LOT of bookkeeping to do - std::set RLE_alloc_events; size_t alloc_size_1 = std::max({sizeof(size_t), sizeof(T), sizeof(int32_t), sizeof(DeltaFlag)}); + size_t align_1 = std::max({alignof(size_t), alignof(T), alignof(int32_t), alignof(DeltaFlag)}); - RegionInstance shared_instance = this->realm_malloc(2 * num_corners * total_rects * alloc_size_1, my_mem); - - RegionInstance flags_instance = this->realm_malloc(num_corners * total_rects * sizeof(uint8_t), my_mem); - - RegionInstance exc_sum_instance = this->realm_malloc(num_corners * total_rects * sizeof(size_t), my_mem); + char* shared_ptr = reinterpret_cast(my_arena.alloc_bytes(2 * num_corners * total_rects * alloc_size_1, 
align_1)); + uint8_t* d_flags = my_arena.alloc(num_corners * total_rects); + size_t* d_exc_sum = my_arena.alloc(num_corners * total_rects); - size_t* d_src_keys_in = reinterpret_cast(AffineAccessor(shared_instance, 0).base); - size_t* d_src_keys_out = reinterpret_cast(AffineAccessor(shared_instance, 0).base) + num_corners * total_rects; - T* d_coord_keys_in = reinterpret_cast(AffineAccessor(shared_instance, 0).base); - T* d_coord_keys_out = reinterpret_cast(AffineAccessor(shared_instance, 0).base) + num_corners * total_rects; - int32_t* d_deltas = reinterpret_cast(AffineAccessor(shared_instance, 0).base); - int32_t* d_deltas_out = reinterpret_cast(AffineAccessor(shared_instance, 0).base) + num_corners * total_rects; - DeltaFlag* d_delta_flags_in = reinterpret_cast(AffineAccessor(shared_instance, 0).base); - DeltaFlag* d_delta_flags_out = reinterpret_cast(AffineAccessor(shared_instance, 0).base) + num_corners * total_rects; - uint8_t* d_flags = reinterpret_cast(AffineAccessor(flags_instance, 0).base); - size_t* d_exc_sum = reinterpret_cast(AffineAccessor(exc_sum_instance, 0).base); + size_t* d_src_keys_in = reinterpret_cast(shared_ptr); + size_t* d_src_keys_out = d_src_keys_in + num_corners * total_rects; + T* d_coord_keys_in = reinterpret_cast(shared_ptr); + T* d_coord_keys_out = d_coord_keys_in + num_corners * total_rects; + int32_t* d_deltas = reinterpret_cast(shared_ptr); + int32_t* d_deltas_out = d_deltas + num_corners * total_rects; + DeltaFlag* d_delta_flags_in = reinterpret_cast(shared_ptr); + DeltaFlag* d_delta_flags_out = d_delta_flags_in + num_corners * total_rects; - RegionInstance seg_bound_instance; size_t* seg_starts; size_t* seg_ends; - RegionInstance seg_counters; uint32_t* d_seg_counters; - RegionInstance seg_counters_out; uint32_t* d_seg_counters_out; grid_size = (num_corners * total_rects + threads_per_block - 1) / threads_per_block; + orig_tmp = 0; + temp_restore = my_arena.mark(); + tmp_storage = nullptr; + //We need to reduce duplicate corners 
by their parity, so we sort to get duplicates next to each other and then reduce by key { + NVTX_DEPPART(sort_corners); for (int dim = 0; dim < N; dim++) { build_coord_key<<>>(d_coord_keys_in, d_corners_in, num_corners * total_rects, dim); @@ -714,10 +729,11 @@ namespace Realm { d_corners_in, d_corners_out, num_corners * total_rects, 0, 8*sizeof(T), stream); if (temp_bytes > orig_tmp) { - tmp_instance.destroy(); - tmp_instance = this->realm_malloc(temp_bytes, my_mem); + if (orig_tmp > 0) { + my_arena.rollback(temp_restore); + } orig_tmp = temp_bytes; - tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); + tmp_storage = reinterpret_cast(my_arena.alloc(temp_bytes)); } cub::DeviceRadixSort::SortPairs(tmp_storage, temp_bytes, d_coord_keys_in, d_coord_keys_out, @@ -737,10 +753,11 @@ namespace Realm { d_corners_in, d_corners_out, num_corners * total_rects, 0, 8*sizeof(size_t), stream); if (temp_bytes > orig_tmp) { - tmp_instance.destroy(); - tmp_instance = this->realm_malloc(temp_bytes, my_mem); + if (orig_tmp > 0) { + my_arena.rollback(temp_restore); + } orig_tmp = temp_bytes; - tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); + tmp_storage = reinterpret_cast(my_arena.alloc(temp_bytes)); } cub::DeviceRadixSort::SortPairs(tmp_storage, temp_bytes, d_src_keys_in, d_src_keys_out, @@ -751,8 +768,8 @@ namespace Realm { get_delta<<>>(d_deltas, d_corners_in, num_corners * total_rects); KERNEL_CHECK(stream); - RegionInstance num_runs_instance = this->realm_malloc(sizeof(int), my_mem); - int* d_num_runs = reinterpret_cast(AffineAccessor(num_runs_instance, 0).base); + my_arena.rollback(temp_restore); + int* d_num_runs = my_arena.alloc(1); //See above, we have custom equality and reduction operators for CornerDesc CustomSum red_op; @@ -765,12 +782,7 @@ namespace Realm { /*num_items=*/(int) (num_corners * total_rects), /*stream=*/stream); - if (temp_bytes > orig_tmp) { - tmp_instance.destroy(); - tmp_instance = this->realm_malloc(temp_bytes, 
my_mem); - orig_tmp = temp_bytes; - tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); - } + tmp_storage = reinterpret_cast(my_arena.alloc(temp_bytes)); cub::DeviceReduce::ReduceByKey( tmp_storage, temp_bytes, d_corners_in, d_corners_out, @@ -784,7 +796,7 @@ namespace Realm { CUDA_CHECK(cudaMemcpyAsync(&num_unique_corners, d_num_runs, sizeof(int), cudaMemcpyDeviceToHost, stream), stream); CUDA_CHECK(cudaStreamSynchronize(stream), stream); - num_runs_instance.destroy(); + my_arena.rollback(temp_restore); grid_size = (num_unique_corners + threads_per_block - 1) / threads_per_block; set_delta<<>>(d_deltas_out, d_corners_out, num_unique_corners); @@ -813,12 +825,10 @@ namespace Realm { d_coord_keys_in, d_coord_keys_out, d_corners_in, d_corners_out, num_intermediate, 0, 8*sizeof(T), stream); - if (temp_bytes > orig_tmp) { - tmp_instance.destroy(); - tmp_instance = this->realm_malloc(temp_bytes, my_mem); - orig_tmp = temp_bytes; - tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); - } + + my_arena.rollback(temp_restore); + tmp_storage = reinterpret_cast(my_arena.alloc(temp_bytes)); + cub::DeviceRadixSort::SortPairs(tmp_storage, temp_bytes, d_coord_keys_in, d_coord_keys_out, d_corners_in, d_corners_out, @@ -838,12 +848,10 @@ namespace Realm { d_coord_keys_in, d_coord_keys_out, d_corners_in, d_corners_out, num_intermediate, 0, 8*sizeof(T), stream); - if (temp_bytes > orig_tmp) { - tmp_instance.destroy(); - tmp_instance = this->realm_malloc(temp_bytes, my_mem); - orig_tmp = temp_bytes; - tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); - } + + my_arena.rollback(temp_restore); + tmp_storage = reinterpret_cast(my_arena.alloc(temp_bytes)); + cub::DeviceRadixSort::SortPairs(tmp_storage, temp_bytes, d_coord_keys_in, d_coord_keys_out, d_corners_in, d_corners_out, @@ -859,12 +867,10 @@ namespace Realm { d_src_keys_in, d_src_keys_out, d_corners_in, d_corners_out, num_intermediate, 0, 8*sizeof(size_t), stream); - if 
(temp_bytes > orig_tmp) { - tmp_instance.destroy(); - tmp_instance = this->realm_malloc(temp_bytes, my_mem); - orig_tmp = temp_bytes; - tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); - } + + my_arena.rollback(temp_restore); + tmp_storage = reinterpret_cast(my_arena.alloc(temp_bytes)); + cub::DeviceRadixSort::SortPairs(tmp_storage, temp_bytes, d_src_keys_in, d_src_keys_out, d_corners_in, d_corners_out, @@ -879,23 +885,20 @@ namespace Realm { KERNEL_CHECK(stream); cub::DeviceScan::InclusiveSum(nullptr, temp_bytes, d_flags, d_exc_sum, num_intermediate, stream); - if (temp_bytes > orig_tmp) { - if (orig_tmp > 0) { - tmp_instance.destroy(); - } - tmp_instance = this->realm_malloc(temp_bytes, my_mem); - orig_tmp = temp_bytes; - tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); - } + + my_arena.rollback(temp_restore); + tmp_storage = reinterpret_cast(my_arena.alloc(temp_bytes)); + cub::DeviceScan::InclusiveSum(tmp_storage, temp_bytes, d_flags, d_exc_sum, num_intermediate, stream); CUDA_CHECK(cudaMemcpyAsync(&num_segments, &d_exc_sum[num_intermediate-1], sizeof(size_t), cudaMemcpyDeviceToHost, stream), stream); CUDA_CHECK(cudaStreamSynchronize(stream), stream); //Mark the beginning and end of each segment for our kernel to use in binary search - seg_bound_instance = this->realm_malloc(2 * num_segments * sizeof(size_t), my_mem); - seg_starts = reinterpret_cast(AffineAccessor(seg_bound_instance, 0).base); - seg_ends = reinterpret_cast(AffineAccessor(seg_bound_instance, 0).base) + num_segments; + seg_starts = my_arena.alloc(2 * num_segments); + seg_ends = seg_starts + num_segments; + + temp_restore = my_arena.mark(); seg_boundaries<<>>(d_flags, d_exc_sum, num_intermediate, seg_starts, seg_ends); KERNEL_CHECK(stream); @@ -911,12 +914,8 @@ namespace Realm { /*stream=*/ stream ); - if (temp_bytes > orig_tmp) { - tmp_instance.destroy(); - tmp_instance = this->realm_malloc(temp_bytes, my_mem); - orig_tmp = temp_bytes; - tmp_storage = 
reinterpret_cast(AffineAccessor(tmp_instance, 0).base); - } + my_arena.rollback(temp_restore); + tmp_storage = reinterpret_cast(my_arena.alloc(temp_bytes)); cub::DeviceScan::InclusiveScan( /*d_temp=*/ tmp_storage, @@ -932,26 +931,21 @@ namespace Realm { //Per usual, we do a count + emit pass to track active segments and limit memory usage. If the evaluated prefix sum for a boundary within a segment //is 0, we can skip it because it won't contribute anything to future sums and also won't be emitted. - seg_counters = this->realm_malloc(num_segments * sizeof(uint32_t), my_mem); - d_seg_counters = reinterpret_cast(AffineAccessor(seg_counters, 0).base); + d_seg_counters = my_arena.alloc(2 * num_segments); + d_seg_counters_out = d_seg_counters + num_segments; CUDA_CHECK(cudaMemsetAsync(d_seg_counters, 0, num_segments * sizeof(uint32_t), stream), stream); + temp_restore = my_arena.mark(); + grid_size = ((num_segments*B_size[d]) + threads_per_block - 1) / threads_per_block; count_segments<<>>(d_delta_flags_out, seg_starts, seg_ends, B_starts[d], B_ends[d], d_corners_in, B_coord[d], B_size[d], num_segments, d, d_seg_counters); KERNEL_CHECK(stream); - seg_counters_out = this->realm_malloc(num_segments * sizeof(uint32_t), my_mem); - d_seg_counters_out = reinterpret_cast(AffineAccessor(seg_counters_out, 0).base); - cub::DeviceScan::ExclusiveSum(nullptr, temp_bytes, d_seg_counters, d_seg_counters_out, num_segments, stream); - if (temp_bytes > orig_tmp) { - if (orig_tmp > 0) { - tmp_instance.destroy(); - } - tmp_instance = this->realm_malloc(temp_bytes, my_mem); - orig_tmp = temp_bytes; - tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); - } + + my_arena.rollback(temp_restore); + tmp_storage = reinterpret_cast(my_arena.alloc(temp_bytes)); + cub::DeviceScan::ExclusiveSum(tmp_storage, temp_bytes, d_seg_counters, d_seg_counters_out, num_segments, stream); uint32_t next_round; @@ -968,52 +962,57 @@ namespace Realm { break; } - RegionInstance next_corners_instance 
= this->realm_malloc(2 * next_round * sizeof(CornerDesc), my_mem); - CornerDesc* d_next_corners = reinterpret_cast*>(AffineAccessor(next_corners_instance, 0).base); + my_arena.flip_parity(); + if (my_arena.get_parity()) { + my_arena.rollback(right_restore); + } + + CornerDesc* d_next_corners = my_arena.alloc>(2 * next_round); CUDA_CHECK(cudaMemsetAsync(d_seg_counters, 0, num_segments*sizeof(uint32_t), stream), stream); + corner_restore = my_arena.mark(); + my_arena.flip_parity(); + my_arena.flip_parity(); + my_arena.rollback(corner_restore); + write_segments<<>>(d_delta_flags_out, seg_starts, seg_ends, B_starts[d], B_ends[d], d_corners_in, B_coord[d], d_seg_counters_out, B_size[d], num_segments, d, d_seg_counters, d_next_corners); KERNEL_CHECK(stream); CUDA_CHECK(cudaStreamSynchronize(stream), stream); - corners_instance.destroy(); - corners_instance = next_corners_instance; d_corners_in = d_next_corners; d_corners_out = d_next_corners + next_round; //The segment count in each iter is not monotonic, so we have to realloc each time + shared_ptr = reinterpret_cast(my_arena.alloc_bytes(2 * num_intermediate * alloc_size_1, align_1)); + d_flags = my_arena.alloc(num_intermediate); + d_exc_sum = my_arena.alloc(num_intermediate); + + temp_restore = my_arena.mark(); + + d_src_keys_in = reinterpret_cast(shared_ptr); + d_src_keys_out = reinterpret_cast(shared_ptr) + num_intermediate; + + d_coord_keys_in = reinterpret_cast(shared_ptr); + d_coord_keys_out = reinterpret_cast(shared_ptr) + num_intermediate; - shared_instance.destroy(); - flags_instance.destroy(); - exc_sum_instance.destroy(); - seg_bound_instance.destroy(); - seg_counters.destroy(); - seg_counters_out.destroy(); - - shared_instance = this->realm_malloc(2 * num_intermediate * alloc_size_1, my_mem); - flags_instance = this->realm_malloc(num_intermediate * sizeof(uint8_t), my_mem); - exc_sum_instance = this->realm_malloc(num_intermediate * sizeof(size_t), my_mem); - - d_src_keys_in = 
reinterpret_cast(AffineAccessor(shared_instance, 0).base); - d_src_keys_out = reinterpret_cast(AffineAccessor(shared_instance, 0).base) + num_intermediate; - d_coord_keys_in = reinterpret_cast(AffineAccessor(shared_instance, 0).base); - d_coord_keys_out = reinterpret_cast(AffineAccessor(shared_instance, 0).base) + num_intermediate; - d_deltas = reinterpret_cast(AffineAccessor(shared_instance, 0).base); - d_deltas_out = reinterpret_cast(AffineAccessor(shared_instance, 0).base) + num_intermediate; - - d_flags = reinterpret_cast(AffineAccessor(flags_instance, 0).base); - d_exc_sum = reinterpret_cast(AffineAccessor(exc_sum_instance, 0).base); - d_delta_flags_in = reinterpret_cast(AffineAccessor(shared_instance, 0).base); - d_delta_flags_out = reinterpret_cast(AffineAccessor(shared_instance, 0).base) + num_intermediate; + d_deltas = reinterpret_cast(shared_ptr); + d_deltas_out = reinterpret_cast(shared_ptr) + num_intermediate; + + d_delta_flags_in = reinterpret_cast(shared_ptr); + d_delta_flags_out = reinterpret_cast(shared_ptr) + num_intermediate; } } + //Get to a known state + my_arena.flip_parity(); + if (my_arena.get_parity()) { + my_arena.rollback(right_restore); + } + //For our last dim, we emit rectangles rather than segments. These rectangles are a disjoint, precise covering of the original set. 
- RegionInstance rects_out_instance = this->realm_malloc(2 * num_intermediate * sizeof(RectDesc), my_mem); - RectDesc* d_rects_out = reinterpret_cast*>(AffineAccessor(rects_out_instance, 0).base); - RectDesc* d_rects_in = reinterpret_cast*>(AffineAccessor(rects_out_instance, 0).base) + num_intermediate; + RectDesc* d_rects_out = my_arena.alloc>(num_intermediate); CUDA_CHECK(cudaMemsetAsync(d_seg_counters, 0, num_segments*sizeof(uint32_t), stream), stream); write_segments<<>>(d_delta_flags_out, seg_starts, seg_ends, B_start_ptrs, B_end_ptrs, d_corners_in, B_coord_ptrs, d_seg_counters_out, B_size[0], num_segments, d_seg_counters, d_rects_out); @@ -1021,36 +1020,37 @@ namespace Realm { CUDA_CHECK(cudaStreamSynchronize(stream), stream); - //Don't need these anymore - flags_instance.destroy(); - exc_sum_instance.destroy(); - seg_bound_instance.destroy(); - seg_counters.destroy(); - seg_counters_out.destroy(); - corners_instance.destroy(); - for (int d = 0; d < N; d++) { - B_coord_inst[d].destroy(); - B_src_inst[d].destroy(); + //Force the rectangles to the left side of the buffer + if (my_arena.get_parity()) { + my_arena.flip_parity(); + RectDesc* tmp_out = my_arena.alloc>(num_intermediate); + CUDA_CHECK(cudaMemcpyAsync(tmp_out, d_rects_out, num_intermediate * sizeof(RectDesc), cudaMemcpyDeviceToDevice, stream), stream); } - B_ptrs_instance.destroy(); - B_coord_ptrs_instance.destroy(); - std::swap(d_rects_out, d_rects_in); + //Clear everything out, we should be on the left + my_arena.flip_parity(); + my_arena.flip_parity(); + assert(!my_arena.get_parity()); + + RectDesc* d_rects_in = my_arena.alloc>(2 * num_intermediate); + d_rects_out = d_rects_in + num_intermediate; - shared_instance.destroy(); size_t alloc_size_2 = max(sizeof(size_t), sizeof(T)); + size_t align_2 = max(alignof(size_t), alignof(T)); + + + shared_ptr = reinterpret_cast(my_arena.alloc_bytes(2 * num_intermediate * alloc_size_2, align_2)); - shared_instance = this->realm_malloc(2 * num_intermediate * 
alloc_size_2, my_mem); + d_src_keys_in = reinterpret_cast(shared_ptr); + d_src_keys_out = reinterpret_cast(shared_ptr) + num_intermediate; + d_coord_keys_in = reinterpret_cast(shared_ptr); + d_coord_keys_out = reinterpret_cast(shared_ptr) + num_intermediate; - d_src_keys_in = reinterpret_cast(AffineAccessor(shared_instance, 0).base); - d_src_keys_out = reinterpret_cast(AffineAccessor(shared_instance, 0).base) + num_intermediate; - d_coord_keys_in = reinterpret_cast(AffineAccessor(shared_instance, 0).base); - d_coord_keys_out = reinterpret_cast(AffineAccessor(shared_instance, 0).base) + num_intermediate; + size_t* group_ids = reinterpret_cast(shared_ptr); - RegionInstance break_points_instance = this->realm_malloc(num_intermediate * sizeof(uint8_t), my_mem); - uint8_t* break_points = reinterpret_cast(AffineAccessor(break_points_instance, 0).base); + uint8_t* break_points = my_arena.alloc(num_intermediate); - size_t* group_ids = reinterpret_cast(AffineAccessor(shared_instance, 0).base); + temp_restore = my_arena.mark(); //Now that we have disjoint rectangles, we can do our usual sort and coalesce pass size_t last = INT_MAX; @@ -1074,12 +1074,10 @@ namespace Realm { d_coord_keys_in, d_coord_keys_out, d_rects_in, d_rects_out, num_intermediate, 0, 8*sizeof(T), stream); - if (temp_bytes > orig_tmp) { - tmp_instance.destroy(); - tmp_instance = this->realm_malloc(temp_bytes, my_mem); - orig_tmp = temp_bytes; - tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); - } + + my_arena.rollback(temp_restore); + tmp_storage = reinterpret_cast(my_arena.alloc(temp_bytes)); + cub::DeviceRadixSort::SortPairs(tmp_storage, temp_bytes, d_coord_keys_in, d_coord_keys_out, d_rects_in, d_rects_out, @@ -1097,12 +1095,10 @@ namespace Realm { d_coord_keys_in, d_coord_keys_out, d_rects_in, d_rects_out, num_intermediate, 0, 8*sizeof(T), stream); - if (temp_bytes > orig_tmp) { - tmp_instance.destroy(); - tmp_instance = this->realm_malloc(temp_bytes, my_mem); - orig_tmp = 
temp_bytes; - tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); - } + + my_arena.rollback(temp_restore); + tmp_storage = reinterpret_cast(my_arena.alloc(temp_bytes)); + cub::DeviceRadixSort::SortPairs(tmp_storage, temp_bytes, d_coord_keys_in, d_coord_keys_out, d_rects_in, d_rects_out, @@ -1115,12 +1111,10 @@ namespace Realm { d_coord_keys_in, d_coord_keys_out, d_rects_in, d_rects_out, num_intermediate, 0, 8*sizeof(T), stream); - if (temp_bytes > orig_tmp) { - tmp_instance.destroy(); - tmp_instance = this->realm_malloc(temp_bytes, my_mem); - orig_tmp = temp_bytes; - tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); - } + + my_arena.rollback(temp_restore); + tmp_storage = reinterpret_cast(my_arena.alloc(temp_bytes)); + cub::DeviceRadixSort::SortPairs(tmp_storage, temp_bytes, d_coord_keys_in, d_coord_keys_out, d_rects_in, d_rects_out, @@ -1136,12 +1130,10 @@ namespace Realm { d_src_keys_in, d_src_keys_out, d_rects_in, d_rects_out, num_intermediate, 0, 8*sizeof(size_t), stream); - if (temp_bytes > orig_tmp) { - tmp_instance.destroy(); - tmp_instance = this->realm_malloc(temp_bytes, my_mem); - orig_tmp = temp_bytes; - tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); - } + + my_arena.rollback(temp_restore); + tmp_storage = reinterpret_cast(my_arena.alloc(temp_bytes)); + cub::DeviceRadixSort::SortPairs(tmp_storage, temp_bytes, d_src_keys_in, d_src_keys_out, d_rects_in, d_rects_out, @@ -1154,12 +1146,8 @@ namespace Realm { cub::DeviceScan::InclusiveSum(nullptr, temp_bytes, break_points, group_ids, num_intermediate, stream); - if (temp_bytes > orig_tmp) { - tmp_instance.destroy(); - tmp_instance = this->realm_malloc(temp_bytes, my_mem); - orig_tmp = temp_bytes; - tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); - } + my_arena.rollback(temp_restore); + tmp_storage = reinterpret_cast(my_arena.alloc(temp_bytes)); cub::DeviceScan::InclusiveSum(tmp_storage, temp_bytes, break_points, group_ids, 
num_intermediate, stream); @@ -1177,15 +1165,26 @@ namespace Realm { CUDA_CHECK(cudaStreamSynchronize(stream), stream); } - //And... we're done - if (out_rects > 0) { - d_out_rects = d_rects_in; + if (out_rects == 2) { + d_out_rects = d_rects; + if (d_out_rects != d_rects_in) { + CUDA_CHECK(cudaMemcpyAsync(d_out_rects, d_rects_in, num_intermediate * sizeof(RectDesc), cudaMemcpyDeviceToDevice, stream), stream); + } + out_rects = num_intermediate; + } else if (out_rects == 1) { + my_arena.reset(true); + d_out_rects = my_arena.alloc>(num_intermediate); + my_arena.commit(true); + if (d_rects_in + num_intermediate >= d_out_rects) { + assert(d_rects_out < d_rects_in); + CUDA_CHECK(cudaMemcpyAsync(d_rects_out, d_rects_in, num_intermediate * sizeof(RectDesc), cudaMemcpyDeviceToDevice, stream), stream); + std::swap(d_rects_in, d_rects_out); + } + CUDA_CHECK(cudaMemcpyAsync(d_out_rects, d_rects_in, num_intermediate * sizeof(RectDesc), cudaMemcpyDeviceToDevice, stream), stream); out_rects = num_intermediate; } else { this->send_output(d_rects_in, num_intermediate, my_arena, ctr, getIndex, getMap); - rects_out_instance.destroy(); } - } /* @@ -1212,8 +1211,9 @@ namespace Realm { size_t bytes_S = total_rects * sizeof(size_t); size_t bytes_HF = total_rects * sizeof(HiFlag); size_t max_bytes = std::max({bytes_T, bytes_HF, bytes_S}); + size_t max_align = std::max({alignof(T), alignof(HiFlag), alignof(size_t)}); - char* aux_ptr = my_arena.alloc(2 * max_bytes); + char* aux_ptr = reinterpret_cast(my_arena.alloc_bytes(2 * max_bytes, max_align)); uint8_t* break_points = my_arena.alloc(total_rects); size_t* group_ids = my_arena.alloc(total_rects); From c05776f5f9ecf12c89b47b2a6a5f4a5a297d9248 Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Thu, 19 Feb 2026 18:03:37 -0800 Subject: [PATCH 15/32] byfield tiled --- src/realm/deppart/byfield.cc | 2 +- src/realm/deppart/byfield_gpu_impl.hpp | 172 ++++++++++++++++------ src/realm/deppart/image_gpu_impl.hpp | 21 +-- 
src/realm/deppart/partitions_gpu_impl.hpp | 15 +- 4 files changed, 148 insertions(+), 62 deletions(-) diff --git a/src/realm/deppart/byfield.cc b/src/realm/deppart/byfield.cc index b9d4bf5e43..ce543e1b44 100644 --- a/src/realm/deppart/byfield.cc +++ b/src/realm/deppart/byfield.cc @@ -44,7 +44,7 @@ namespace Realm { if (val) { device_size = atoi(val); } - size_t optimal_size = is.bounds.volume() * sizeof(Rect) * 100; + size_t optimal_size = is.bounds.volume() * sizeof(RectDesc); std::vector affinities; unsigned best_bandwidth = 0; Processor best_proc = Processor::NO_PROC; diff --git a/src/realm/deppart/byfield_gpu_impl.hpp b/src/realm/deppart/byfield_gpu_impl.hpp index 8765a57f11..56ab0258a2 100644 --- a/src/realm/deppart/byfield_gpu_impl.hpp +++ b/src/realm/deppart/byfield_gpu_impl.hpp @@ -32,6 +32,9 @@ void GPUByFieldMicroOp::execute() inst_space.offsets = buffer_arena.alloc(field_data.size() + 1); inst_space.num_children = field_data.size(); + Arena sys_arena; + GPUMicroOp::collapse_multi_space(field_data, inst_space, sys_arena, stream); + collapsed_space collapsed_parent; collapsed_parent.offsets = buffer_arena.alloc(2); collapsed_parent.num_children = 1; @@ -80,14 +83,125 @@ void GPUByFieldMicroOp::execute() buffer_arena.commit(false); - GPUMicroOp::collapse_multi_space(field_data, inst_space, buffer_arena, stream); + // Map colors to their output index to match send output iterator. + std::map color_indices; + for (size_t i = 0; i < colors.size(); i++) { + color_indices[colors[i]] = i; + } + + size_t num_output = 0; + RectDesc* output_start = nullptr; + size_t num_completed = 0; + size_t curr_tile = tile_size / 2; + int count = 0; + while (num_completed < inst_space.num_entries) { + try { + std::cout << "Byfield iteration " << count++ << ", completed " << num_completed << " / " << inst_space.num_entries << " entries." 
<< std::endl; + buffer_arena.start(); + if (num_completed + curr_tile > inst_space.num_entries) { + curr_tile = inst_space.num_entries - num_completed; + } - // Here we intersect the instance spaces with the parent space, and make sure we know which instance each resulting rectangle came from. - GPUMicroOp::template construct_input_rectlist>(inst_space, collapsed_parent, d_valid_rects, num_valid_rects, d_inst_counters, d_inst_prefix, buffer_arena, stream); + collapsed_space inst_space_tile = inst_space; + inst_space_tile.num_entries = curr_tile; + inst_space_tile.entries_buffer = buffer_arena.alloc>(curr_tile); + CUDA_CHECK(cudaMemcpyAsync(inst_space_tile.entries_buffer, inst_space.entries_buffer + num_completed, curr_tile * sizeof(SparsityMapEntry), cudaMemcpyHostToDevice, stream), stream); + // Here we intersect the instance spaces with the parent space, and make sure we know which instance each resulting rectangle came from. + GPUMicroOp::template construct_input_rectlist>(inst_space_tile, collapsed_parent, d_valid_rects, num_valid_rects, d_inst_counters, d_inst_prefix, buffer_arena, stream); - // Early out if we don't have any rectangles. - if (num_valid_rects == 0) { + + // Early out if we don't have any rectangles. + if (num_valid_rects == 0) { + num_completed += curr_tile; + curr_tile = tile_size / 2; + subtract_const<<>>(inst_space.offsets, field_data.size()+1, curr_tile); + KERNEL_CHECK(stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + continue; + } + + + // Prefix sum the valid rectangles by volume. + size_t total_pts; + size_t* d_prefix_rects; + + GPUMicroOp::volume_prefix_sum(d_valid_rects, num_valid_rects, d_prefix_rects, total_pts, buffer_arena, stream); + + // Now we have everything we need to actually populate our outputs. 
+ buffer_arena.flip_parity(); + assert(!buffer_arena.get_parity()); + + PointDesc* d_points = buffer_arena.alloc>(total_pts); + + // This is where the work is actually done - each thread figures out which points to read, reads it, marks a PointDesc with its color, and writes it out. + byfield_gpuPopulateBitmasksKernel<<>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, d_colors, total_pts, colors.size(), num_valid_rects, field_data.size(), d_points); + KERNEL_CHECK(stream); + + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + // Ship off the points for final processing. + size_t num_new_rects = (num_output == 0) ? 1 : 2; + assert(!buffer_arena.get_parity()); + RectDesc* d_new_rects; + this->complete_pipeline(d_points, total_pts, d_new_rects, num_new_rects, buffer_arena, + /* the Container: */ sparsity_outputs, + /* getIndex: */ [&](auto const& kv){ + // elem is a SparsityMap from the vector + return color_indices.at(kv.first); + }, + /* getMap: */ [&](auto const& kv){ + // return the SparsityMap key itself + return kv.second; + }); + + if (num_output==0) { + num_output = num_new_rects; + output_start = d_new_rects; + num_completed += curr_tile; + subtract_const<<>>(inst_space.offsets, field_data.size()+1, curr_tile); + KERNEL_CHECK(stream); + curr_tile = tile_size / 2; + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + continue; + } + + //Otherwise we merge with existing rectangles + RectDesc* d_old_rects = buffer_arena.alloc>(num_output); + assert(d_old_rects == d_new_rects + num_new_rects); + CUDA_CHECK(cudaMemcpyAsync(d_old_rects, output_start, num_output * sizeof(RectDesc), cudaMemcpyDeviceToDevice, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + size_t num_final_rects = 1; + + //Send it off for processing + this->complete_rect_pipeline(d_new_rects, num_output + num_new_rects, output_start, num_final_rects, buffer_arena, + /* the Container: */ sparsity_outputs, + /* getIndex: */ [&](auto const& kv){ + // elem is a 
SparsityMap from the vector + return color_indices.at(kv.first); + }, + /* getMap: */ [&](auto const& kv){ + // return the SparsityMap key itself + return kv.second; + }); + num_completed += curr_tile; + num_output = num_final_rects; + subtract_const<<>>(inst_space.offsets, field_data.size()+1, curr_tile); + KERNEL_CHECK(stream); + curr_tile = tile_size / 2; + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + } catch (arena_oom&) { + std::cout << "Caught arena_oom, reducing tile size from " << curr_tile << " to " << curr_tile / 2 << std::endl; + std::cout << buffer_arena.used() << " bytes used in arena." << std::endl; + curr_tile /= 2; + if (curr_tile == 0) { + throw; + } + } + } + + if (num_output == 0) { for (std::pair> it : sparsity_outputs) { SparsityMapImpl *impl = SparsityMapImpl::lookup(it.second); if (this->exclusive) { @@ -99,45 +213,15 @@ void GPUByFieldMicroOp::execute() return; } + this->send_output(output_start, num_output, buffer_arena, sparsity_outputs, + /* getIndex: */ [&](auto const& kv){ + // elem is a SparsityMap from the vector + return color_indices.at(kv.first); + }, + /* getMap: */ [&](auto const& kv){ + // return the SparsityMap key itself + return kv.second; + }); - // Prefix sum the valid rectangles by volume. - size_t total_pts; - - size_t* d_prefix_rects; - GPUMicroOp::volume_prefix_sum(d_valid_rects, num_valid_rects, d_prefix_rects, total_pts, buffer_arena, stream); - - // Now we have everything we need to actually populate our outputs. - buffer_arena.flip_parity(); - assert(!buffer_arena.get_parity()); - - RegionInstance points_instance = this->realm_malloc(total_pts * sizeof(PointDesc), zcpy_mem); - PointDesc* d_points = reinterpret_cast*>(AffineAccessor(points_instance, 0).base); - - // This is where the work is actually done - each thread figures out which points to read, reads it, marks a PointDesc with its color, and writes it out. 
- byfield_gpuPopulateBitmasksKernel<<>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, d_colors, total_pts, colors.size(), num_valid_rects, field_data.size(), d_points); - KERNEL_CHECK(stream); - - - // Map colors to their output index to match send output iterator. - std::map color_indices; - for (size_t i = 0; i < colors.size(); i++) { - color_indices[colors[i]] = i; - } - - CUDA_CHECK(cudaStreamSynchronize(stream), stream); - - // Ship off the points for final processing. - size_t out_rects = 0; - RectDesc* trash; - this->complete_pipeline(d_points, total_pts, trash, out_rects, buffer_arena, - /* the Container: */ sparsity_outputs, - /* getIndex: */ [&](auto const& kv){ - // elem is a SparsityMap from the vector - return color_indices.at(kv.first); - }, - /* getMap: */ [&](auto const& kv){ - // return the SparsityMap key itself - return kv.second; - }); } } diff --git a/src/realm/deppart/image_gpu_impl.hpp b/src/realm/deppart/image_gpu_impl.hpp index ce83e03639..b22812a00c 100644 --- a/src/realm/deppart/image_gpu_impl.hpp +++ b/src/realm/deppart/image_gpu_impl.hpp @@ -93,7 +93,6 @@ void GPUImageMicroOp::gpu_populate_rngs() uint32_t* d_src_prefix = d_src_counters + sources.size(); buffer_arena.commit(false); - size_t left = buffer_arena.used(); size_t num_output = 0; RectDesc* output_start = nullptr; @@ -102,7 +101,7 @@ void GPUImageMicroOp::gpu_populate_rngs() int count = 0; while (num_completed < inst_space.num_entries) { try { - std::cout << "Tile iteration " << count++ << ", completed " << num_completed << " / " << inst_space.num_entries << " entries." << std::endl; + std::cout << "Image Range iteration " << count++ << ", completed " << num_completed << " / " << inst_space.num_entries << " entries." 
<< std::endl; buffer_arena.start(); buffer_arena.flip_parity(); if (num_completed + curr_tile > inst_space.num_entries) { @@ -177,7 +176,7 @@ void GPUImageMicroOp::gpu_populate_rngs() CUDA_CHECK(cudaStreamSynchronize(stream), stream); - size_t num_new_rects = 2; + size_t num_new_rects = (num_output == 0) ? 1 : 2; assert(!buffer_arena.get_parity()); RectDesc* d_new_rects; @@ -197,13 +196,9 @@ void GPUImageMicroOp::gpu_populate_rngs() if (num_output==0) { //We need to place the new output at the rightmost end of the buffer - buffer_arena.flip_parity(); - buffer_arena.reset(true); - output_start = buffer_arena.alloc>(num_new_rects); - buffer_arena.commit(true); - CUDA_CHECK(cudaMemcpyAsync(output_start, d_new_rects, num_new_rects * sizeof(RectDesc), cudaMemcpyDeviceToDevice, stream), stream); num_output = num_new_rects; num_completed += curr_tile; + output_start = d_new_rects; subtract_const<<>>(inst_space.offsets, domain_transform.range_data.size()+1, curr_tile); KERNEL_CHECK(stream); curr_tile = tile_size / 2; @@ -342,7 +337,7 @@ void GPUImageMicroOp::gpu_populate_ptrs() int count = 0; while (num_completed < inst_space.num_entries) { try { - std::cout << "Tile iteration " << count++ << ", completed " << num_completed << " / " << inst_space.num_entries << " entries." << std::endl; + std::cout << "Image iteration " << count++ << ", completed " << num_completed << " / " << inst_space.num_entries << " entries." << std::endl; buffer_arena.start(); std::cout << "Amount Used: " << buffer_arena.used() << std::endl; std::cout << "Expected Amount Used: " << left + num_output * sizeof(RectDesc) << std::endl; @@ -409,7 +404,7 @@ void GPUImageMicroOp::gpu_populate_ptrs() CUDA_CHECK(cudaStreamSynchronize(stream), stream); - size_t num_new_rects = 1; + size_t num_new_rects = num_output == 0 ? 
1 : 2; assert(!buffer_arena.get_parity()); RectDesc* d_new_rects; @@ -426,13 +421,9 @@ void GPUImageMicroOp::gpu_populate_ptrs() }); if (num_output==0) { - buffer_arena.flip_parity(); - buffer_arena.reset(true); - output_start = buffer_arena.alloc>(num_new_rects); - buffer_arena.commit(true); - CUDA_CHECK(cudaMemcpyAsync(output_start, d_new_rects, num_new_rects * sizeof(RectDesc), cudaMemcpyDeviceToDevice, stream), stream); num_output = num_new_rects; num_completed += curr_tile; + output_start = d_new_rects; subtract_const<<>>(inst_space.offsets, domain_transform.ptr_data.size()+1, curr_tile); KERNEL_CHECK(stream); curr_tile = tile_size / 2; diff --git a/src/realm/deppart/partitions_gpu_impl.hpp b/src/realm/deppart/partitions_gpu_impl.hpp index 565a413fa0..6f50175ec9 100644 --- a/src/realm/deppart/partitions_gpu_impl.hpp +++ b/src/realm/deppart/partitions_gpu_impl.hpp @@ -1352,7 +1352,9 @@ namespace Realm { NVTX_DEPPART(complete_pipeline); - my_arena.flip_parity(); + if (out_rects == 2) { + my_arena.flip_parity(); + } cudaStream_t stream = Cuda::get_task_cuda_stream(); @@ -1451,12 +1453,21 @@ namespace Realm { std::swap(d_rects_in, d_rects_out); } my_arena.flip_parity(); + if (out_rects == 2) { + assert(!my_arena.get_parity()); + } else if (out_rects == 1) { + assert(my_arena.get_parity()); + my_arena.reset(true); + } d_out_rects = my_arena.alloc>(num_intermediate); + if (out_rects == 1) { + my_arena.commit(true); + } CUDA_CHECK(cudaMemcpyAsync(d_out_rects, d_rects_in, num_intermediate * sizeof(RectDesc), cudaMemcpyDeviceToDevice, stream), stream); CUDA_CHECK(cudaStreamSynchronize(stream), stream); } - if (out_rects==1) { + if (out_rects > 0) { out_rects = num_intermediate; } else { this->send_output(d_rects_in, num_intermediate, my_arena, ctr, getIndex, getMap); From 2182a0400332744c67ee495ceb3da2715c27ce50 Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Sun, 22 Feb 2026 21:46:14 -0800 Subject: [PATCH 16/32] Added host fallback --- 
src/realm/deppart/byfield_gpu_impl.hpp | 101 +- src/realm/deppart/image_gpu_impl.hpp | 113 +- src/realm/deppart/partitions.h | 2 + src/realm/deppart/partitions_gpu_impl.hpp | 112 +- src/realm/deppart/partitions_gpu_kernels.hpp | 8 +- src/realm/deppart/preimage.cc | 2 +- src/realm/deppart/preimage_gpu_impl.hpp | 819 +-- src/realm/deppart/sparsity_impl.cc | 13 + src/realm/deppart/sparsity_impl.h | 1 + tests/CMakeLists.txt | 1 + tests/benchmark.cc | 5019 ++++++++++++++++++ tests/deppart.cc | 16 +- 12 files changed, 5811 insertions(+), 396 deletions(-) create mode 100644 tests/benchmark.cc diff --git a/src/realm/deppart/byfield_gpu_impl.hpp b/src/realm/deppart/byfield_gpu_impl.hpp index 56ab0258a2..e309cf7609 100644 --- a/src/realm/deppart/byfield_gpu_impl.hpp +++ b/src/realm/deppart/byfield_gpu_impl.hpp @@ -89,11 +89,17 @@ void GPUByFieldMicroOp::execute() color_indices[colors[i]] = i; } + Memory sysmem; + assert(find_memory(sysmem, Memory::SYSTEM_MEM)); + size_t num_output = 0; RectDesc* output_start = nullptr; size_t num_completed = 0; size_t curr_tile = tile_size / 2; int count = 0; + bool host_fallback = false; + std::vector h_instances(colors.size(), RegionInstance::NO_INST); + std::vector entry_counts(colors.size(), 0); while (num_completed < inst_space.num_entries) { try { std::cout << "Byfield iteration " << count++ << ", completed " << num_completed << " / " << inst_space.num_entries << " entries." 
<< std::endl; @@ -155,7 +161,11 @@ void GPUByFieldMicroOp::execute() return kv.second; }); - if (num_output==0) { + if (host_fallback) { + this->split_output(d_new_rects, num_new_rects, h_instances, entry_counts, buffer_arena); + } + + if (num_output==0 || host_fallback) { num_output = num_new_rects; output_start = d_new_rects; num_completed += curr_tile; @@ -166,40 +176,44 @@ void GPUByFieldMicroOp::execute() continue; } - //Otherwise we merge with existing rectangles - RectDesc* d_old_rects = buffer_arena.alloc>(num_output); - assert(d_old_rects == d_new_rects + num_new_rects); - CUDA_CHECK(cudaMemcpyAsync(d_old_rects, output_start, num_output * sizeof(RectDesc), cudaMemcpyDeviceToDevice, stream), stream); - CUDA_CHECK(cudaStreamSynchronize(stream), stream); + //Otherwise we merge with existing rectangles + RectDesc* d_old_rects = buffer_arena.alloc>(num_output); + assert(d_old_rects == d_new_rects + num_new_rects); + CUDA_CHECK(cudaMemcpyAsync(d_old_rects, output_start, num_output * sizeof(RectDesc), cudaMemcpyDeviceToDevice, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); - size_t num_final_rects = 1; + size_t num_final_rects = 1; + //Send it off for processing + this->complete_rect_pipeline(d_new_rects, num_output + num_new_rects, output_start, num_final_rects, buffer_arena, + /* the Container: */ sparsity_outputs, + /* getIndex: */ [&](auto const& kv){ + // elem is a SparsityMap from the vector + return color_indices.at(kv.first); + }, + /* getMap: */ [&](auto const& kv){ + // return the SparsityMap key itself + return kv.second; + }); + num_completed += curr_tile; + num_output = num_final_rects; + subtract_const<<>>(inst_space.offsets, field_data.size()+1, curr_tile); + KERNEL_CHECK(stream); + curr_tile = tile_size / 2; + CUDA_CHECK(cudaStreamSynchronize(stream), stream); - //Send it off for processing - this->complete_rect_pipeline(d_new_rects, num_output + num_new_rects, output_start, num_final_rects, buffer_arena, - /* the 
Container: */ sparsity_outputs, - /* getIndex: */ [&](auto const& kv){ - // elem is a SparsityMap from the vector - return color_indices.at(kv.first); - }, - /* getMap: */ [&](auto const& kv){ - // return the SparsityMap key itself - return kv.second; - }); - num_completed += curr_tile; - num_output = num_final_rects; - subtract_const<<>>(inst_space.offsets, field_data.size()+1, curr_tile); - KERNEL_CHECK(stream); - curr_tile = tile_size / 2; - CUDA_CHECK(cudaStreamSynchronize(stream), stream); - } catch (arena_oom&) { - std::cout << "Caught arena_oom, reducing tile size from " << curr_tile << " to " << curr_tile / 2 << std::endl; - std::cout << buffer_arena.used() << " bytes used in arena." << std::endl; - curr_tile /= 2; - if (curr_tile == 0) { - throw; + } catch (arena_oom&) { + std::cout << "Caught arena_oom, reducing tile size from " << curr_tile << " to " << curr_tile / 2 << std::endl; + std::cout << buffer_arena.used() << " bytes used in arena." << std::endl; + curr_tile /= 2; + if (curr_tile == 0) { + host_fallback = true; + if (num_output > 0) { + this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); } + curr_tile = tile_size / 2; } } + } if (num_output == 0) { for (std::pair> it : sparsity_outputs) { @@ -213,7 +227,9 @@ void GPUByFieldMicroOp::execute() return; } - this->send_output(output_start, num_output, buffer_arena, sparsity_outputs, + if (!host_fallback) { + try { + this->send_output(output_start, num_output, buffer_arena, sparsity_outputs, /* getIndex: */ [&](auto const& kv){ // elem is a SparsityMap from the vector return color_indices.at(kv.first); @@ -222,6 +238,29 @@ void GPUByFieldMicroOp::execute() // return the SparsityMap key itself return kv.second; }); + } catch (arena_oom&) { + this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); + host_fallback = true; + } + } + + if (host_fallback) { + for (std::pair> it : sparsity_outputs) { + SparsityMapImpl *impl = 
SparsityMapImpl::lookup(it.second); + if (this->exclusive) { + impl->set_contributor_count(1); + } + size_t idx = color_indices.at(it.first); + if (entry_counts[idx] > 0) { + Rect* h_rects = reinterpret_cast *>(AffineAccessor(h_instances[idx], 0).base); + span> h_rects_span(h_rects, entry_counts[idx]); + impl->contribute_dense_rect_list(h_rects_span, false); + h_instances[idx].destroy(); + } else { + impl->contribute_nothing(); + } + } + } } } diff --git a/src/realm/deppart/image_gpu_impl.hpp b/src/realm/deppart/image_gpu_impl.hpp index b22812a00c..643845296d 100644 --- a/src/realm/deppart/image_gpu_impl.hpp +++ b/src/realm/deppart/image_gpu_impl.hpp @@ -7,6 +7,8 @@ #include #include "realm/nvtx.h" +#include + namespace Realm { //TODO: INTERSECTING INPUT/OUTPUT RECTS CAN BE DONE WITH BVH IF BECOME EXPENSIVE @@ -74,7 +76,6 @@ void GPUImageMicroOp::gpu_populate_rngs() // This will be a prefix sum over the counters, used first to figure out where to write in the emit phase, and second // to track which instance each rectangle came from in the populate phase. uint32_t* d_inst_prefix = d_inst_counters + domain_transform.range_data.size(); - size_t num_valid_rects = tile_size; collapsed_space collapsed_parent; @@ -99,6 +100,10 @@ void GPUImageMicroOp::gpu_populate_rngs() size_t num_completed = 0; size_t curr_tile = tile_size / 2; int count = 0; + + bool host_fallback = false; + std::vector h_instances(sources.size(), RegionInstance::NO_INST); + std::vector entry_counts(sources.size(), 0); while (num_completed < inst_space.num_entries) { try { std::cout << "Image Range iteration " << count++ << ", completed " << num_completed << " / " << inst_space.num_entries << " entries." 
<< std::endl; @@ -112,16 +117,17 @@ void GPUImageMicroOp::gpu_populate_rngs() inst_space_tile.entries_buffer = buffer_arena.alloc>(curr_tile); CUDA_CHECK(cudaMemcpyAsync(inst_space_tile.entries_buffer, inst_space.entries_buffer + num_completed, curr_tile * sizeof(SparsityMapEntry), cudaMemcpyHostToDevice, stream), stream); + size_t num_valid_rects; RectDesc* d_valid_rects; // Here we intersect the instance spaces with the parent space, and make sure we know which instance each resulting rectangle came from. GPUMicroOp::template construct_input_rectlist>(inst_space_tile, src_space, d_valid_rects, num_valid_rects, d_inst_counters, d_inst_prefix, buffer_arena, stream); if (num_valid_rects == 0) { num_completed += curr_tile; - curr_tile = tile_size / 2; subtract_const<<>>(inst_space.offsets, domain_transform.range_data.size()+1, curr_tile); KERNEL_CHECK(stream); CUDA_CHECK(cudaStreamSynchronize(stream), stream); + curr_tile = tile_size / 2; continue; } @@ -158,10 +164,10 @@ void GPUImageMicroOp::gpu_populate_rngs() if (num_valid_output == 0) { num_completed += curr_tile; - curr_tile = tile_size / 2; subtract_const<<>>(inst_space.offsets, domain_transform.range_data.size()+1, curr_tile); KERNEL_CHECK(stream); CUDA_CHECK(cudaStreamSynchronize(stream), stream); + curr_tile = tile_size / 2; continue; } @@ -192,8 +198,12 @@ void GPUImageMicroOp::gpu_populate_rngs() return elem; }); + if (host_fallback) { + this->split_output(d_new_rects, num_new_rects, h_instances, entry_counts, buffer_arena); + } + //Set our first set of output rectangles - if (num_output==0) { + if (num_output==0 || host_fallback) { //We need to place the new output at the rightmost end of the buffer num_output = num_new_rects; @@ -237,13 +247,30 @@ void GPUImageMicroOp::gpu_populate_rngs() std::cout << buffer_arena.used() << " bytes used in arena." 
<< std::endl; curr_tile /= 2; if (curr_tile == 0) { - throw; + host_fallback = true; + if (num_output > 0) { + this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); + } + curr_tile = tile_size / 2; } } } - CUDA_CHECK(cudaStreamSynchronize(stream), stream); - KERNEL_CHECK(stream); - this->send_output(output_start, num_output, buffer_arena, sparsity_outputs, + + if (num_output == 0) { + for (SparsityMap it : sparsity_outputs) { + SparsityMapImpl *impl = SparsityMapImpl::lookup(it); + if (this->exclusive) { + impl->gpu_finalize(); + } else { + impl->contribute_nothing(); + } + } + return; + } + + if (!host_fallback) { + try { + this->send_output(output_start, num_output, buffer_arena, sparsity_outputs, /* getIndex: */ [&](auto const& elem){ // elem is a SparsityMap from the vector return size_t(&elem - sparsity_outputs.data()); @@ -252,6 +279,28 @@ void GPUImageMicroOp::gpu_populate_rngs() // return the SparsityMap key itself return elem; }); + } catch (arena_oom&) { + this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); + host_fallback = true; + } + } + + if (host_fallback) { + for (size_t idx = 0; idx < sparsity_outputs.size(); ++idx) { + SparsityMapImpl *impl = SparsityMapImpl::lookup(sparsity_outputs[idx]); + if (this->exclusive) { + impl->set_contributor_count(1); + } + if (entry_counts[idx] > 0) { + Rect* h_rects = reinterpret_cast *>(AffineAccessor(h_instances[idx], 0).base); + span> h_rects_span(h_rects, entry_counts[idx]); + impl->contribute_dense_rect_list(h_rects_span, false); + h_instances[idx].destroy(); + } else { + impl->contribute_nothing(); + } + } + } } @@ -307,7 +356,6 @@ void GPUImageMicroOp::gpu_populate_ptrs() // This will be a prefix sum over the counters, used first to figure out where to write in the emit phase, and second // to track which instance each rectangle came from in the populate phase. 
uint32_t* d_inst_prefix = d_inst_counters + domain_transform.ptr_data.size(); - size_t num_valid_rects = tile_size; //Uniform for all tiles collapsed_space collapsed_parent; @@ -335,6 +383,9 @@ void GPUImageMicroOp::gpu_populate_ptrs() size_t num_completed = 0; size_t curr_tile = tile_size / 2; int count = 0; + bool host_fallback = false; + std::vector h_instances(sources.size(), RegionInstance::NO_INST); + std::vector entry_counts(sources.size(), 0); while (num_completed < inst_space.num_entries) { try { std::cout << "Image iteration " << count++ << ", completed " << num_completed << " / " << inst_space.num_entries << " entries." << std::endl; @@ -349,15 +400,16 @@ void GPUImageMicroOp::gpu_populate_ptrs() inst_space_tile.entries_buffer = buffer_arena.alloc>(curr_tile); CUDA_CHECK(cudaMemcpyAsync(inst_space_tile.entries_buffer, inst_space.entries_buffer + num_completed, curr_tile * sizeof(SparsityMapEntry), cudaMemcpyHostToDevice, stream), stream); + size_t num_valid_rects; RectDesc* d_valid_rects; GPUMicroOp::template construct_input_rectlist>(inst_space_tile, src_space, d_valid_rects, num_valid_rects, d_inst_counters, d_inst_prefix, buffer_arena, stream); if (num_valid_rects == 0) { num_completed += curr_tile; - curr_tile = tile_size / 2; subtract_const<<>>(inst_space.offsets, domain_transform.ptr_data.size()+1, curr_tile); KERNEL_CHECK(stream); CUDA_CHECK(cudaStreamSynchronize(stream), stream); + curr_tile = tile_size / 2; continue; } @@ -385,10 +437,10 @@ void GPUImageMicroOp::gpu_populate_ptrs() if (num_valid_points == 0) { num_completed += curr_tile; - curr_tile = tile_size / 2; subtract_const<<>>(inst_space.offsets, domain_transform.ptr_data.size()+1, curr_tile); KERNEL_CHECK(stream); CUDA_CHECK(cudaStreamSynchronize(stream), stream); + curr_tile = tile_size / 2; continue; } @@ -420,7 +472,11 @@ void GPUImageMicroOp::gpu_populate_ptrs() return elem; }); - if (num_output==0) { + if (host_fallback) { + this->split_output(d_new_rects, num_new_rects, 
h_instances, entry_counts, buffer_arena); + } + + if (num_output==0 || host_fallback) { num_output = num_new_rects; num_completed += curr_tile; output_start = d_new_rects; @@ -461,7 +517,11 @@ void GPUImageMicroOp::gpu_populate_ptrs() std::cout << buffer_arena.used() << " bytes used in arena." << std::endl; curr_tile /= 2; if (curr_tile == 0) { - throw; + host_fallback = true; + if (num_output > 0) { + this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); + } + curr_tile = tile_size / 2; } } } @@ -477,7 +537,10 @@ void GPUImageMicroOp::gpu_populate_ptrs() } return; } - this->send_output(output_start, num_output, buffer_arena, sparsity_outputs, + + if (!host_fallback) { + try { + this->send_output(output_start, num_output, buffer_arena, sparsity_outputs, /* getIndex: */ [&](auto const& elem){ // elem is a SparsityMap from the vector return size_t(&elem - sparsity_outputs.data()); @@ -486,5 +549,27 @@ void GPUImageMicroOp::gpu_populate_ptrs() // return the SparsityMap key itself return elem; }); + } catch (arena_oom&) { + this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); + host_fallback = true; + } + } + + if (host_fallback) { + for (size_t idx = 0; idx < sparsity_outputs.size(); ++idx) { + SparsityMapImpl *impl = SparsityMapImpl::lookup(sparsity_outputs[idx]); + if (this->exclusive) { + impl->set_contributor_count(1); + } + if (entry_counts[idx] > 0) { + Rect* h_rects = reinterpret_cast *>(AffineAccessor(h_instances[idx], 0).base); + span> h_rects_span(h_rects, entry_counts[idx]); + impl->contribute_dense_rect_list(h_rects_span, false); + h_instances[idx].destroy(); + } else { + impl->contribute_nothing(); + } + } + } } } \ No newline at end of file diff --git a/src/realm/deppart/partitions.h b/src/realm/deppart/partitions.h index 4a8899e251..8b67e5e642 100644 --- a/src/realm/deppart/partitions.h +++ b/src/realm/deppart/partitions.h @@ -370,6 +370,8 @@ namespace Realm { template void 
complete1d_pipeline(RectDesc* d_rects, size_t total_rects, RectDesc* &d_out_rects, size_t &out_rects, Arena &my_arena, const Container& ctr, IndexFn getIndex, MapFn getMap); + void split_output(RectDesc* d_rects, size_t total_rects, std::vector &output_instances, std::vector &output_counts, Arena &my_arena); + template void send_output(RectDesc* d_rects, size_t total_rects, Arena &my_arena, const Container& ctr, IndexFn getIndex, MapFn getMap); diff --git a/src/realm/deppart/partitions_gpu_impl.hpp b/src/realm/deppart/partitions_gpu_impl.hpp index 6f50175ec9..82abfd57d9 100644 --- a/src/realm/deppart/partitions_gpu_impl.hpp +++ b/src/realm/deppart/partitions_gpu_impl.hpp @@ -1474,6 +1474,100 @@ namespace Realm { } } + template + void GPUMicroOp::split_output(RectDesc* d_rects, size_t total_rects, std::vector &output_instances, std::vector &output_counts, Arena &my_arena) + { + NVTX_DEPPART(send_output); + + cudaStream_t stream = Cuda::get_task_cuda_stream(); + bool use_sysmem = false; + RegionInstance sys_instance = RegionInstance::NO_INST; + + Memory sysmem; + assert(find_memory(sysmem, Memory::SYSTEM_MEM)); + + Rect* final_rects; + std::vector d_starts_host(output_instances.size()), d_ends_host(output_instances.size()); + + try { + final_rects = my_arena.alloc>(total_rects); + + size_t* d_starts = my_arena.alloc(2 * output_instances.size()); + size_t* d_ends = d_starts + output_instances.size(); + + CUDA_CHECK(cudaMemsetAsync(d_starts, 0, output_instances.size()*sizeof(size_t),stream), stream); + CUDA_CHECK(cudaMemsetAsync(d_ends, 0, output_instances.size()*sizeof(size_t),stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + //Convert RectDesc to SparsityMapEntry and determine where each src's rectangles start and end. 
+ build_final_output<<>>(d_rects, nullptr, final_rects, d_starts, d_ends, total_rects); + KERNEL_CHECK(stream); + + + //Copy starts and ends back to host and handle empty partitions + + CUDA_CHECK(cudaMemcpyAsync(d_starts_host.data(), d_starts, output_instances.size() * sizeof(size_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaMemcpyAsync(d_ends_host.data(), d_ends, output_instances.size() * sizeof(size_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + } catch (arena_oom&) { + use_sysmem = true; + RegionInstance tmp_instance = this->realm_malloc(total_rects * sizeof(RectDesc), sysmem); + sys_instance = this->realm_malloc(total_rects * sizeof(Rect), sysmem); + RectDesc* h_tmp_rects = reinterpret_cast*>(tmp_instance.pointer_untyped(0, total_rects * sizeof(RectDesc))); + final_rects = reinterpret_cast*>(sys_instance.pointer_untyped(0, total_rects * sizeof(Rect))); + CUDA_CHECK(cudaMemcpyAsync(h_tmp_rects, d_rects, total_rects * sizeof(RectDesc), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + for (size_t idx = 0; idx < total_rects; idx++ ) { + final_rects[idx] = h_tmp_rects[idx].rect; + + //Checks if we're the first value for a given src + if (idx == 0 || h_tmp_rects[idx].src_idx != h_tmp_rects[idx-1].src_idx) { + d_starts_host[h_tmp_rects[idx].src_idx] = idx; + } + + //Checks if we're the last value for a given src + if (idx == total_rects-1 || h_tmp_rects[idx].src_idx != h_tmp_rects[idx+1].src_idx) { + d_ends_host[h_tmp_rects[idx].src_idx] = idx+1; + } + } + tmp_instance.destroy(); + } + + for (size_t i = 1; i < output_instances.size(); i++) { + if (d_starts_host[i] < d_ends_host[i-1]) { + d_starts_host[i] = d_ends_host[i-1]; + d_ends_host[i] = d_ends_host[i-1]; + } + } + + for (size_t i = 0; i < output_instances.size(); i++) { + if (d_ends_host[i] > d_starts_host[i]) { + size_t end = d_ends_host[i]; + size_t start = d_starts_host[i]; + if (end - 
start > 0) { + RegionInstance new_instance = this->realm_malloc(((end - start) + output_counts[i]) * sizeof(Rect), sysmem); + Rect* h_new_rects = reinterpret_cast*>(new_instance.pointer_untyped(0, ((end - start) + output_counts[i]) * sizeof(Rect))); + if (output_counts[i] > 0) { + Rect* h_old_rects = reinterpret_cast*>(output_instances[i].pointer_untyped(0, output_counts[i] * sizeof(Rect))); + std::memcpy(h_new_rects, h_old_rects, output_counts[i] * sizeof(Rect)); + output_instances[i].destroy(); + } + if (use_sysmem) { + std::memcpy(h_new_rects + output_counts[i], final_rects + start, (end - start) * sizeof(Rect)); + } else { + CUDA_CHECK(cudaMemcpyAsync(h_new_rects + output_counts[i], final_rects + start, (end - start) * sizeof(Rect), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + } + output_instances[i] = new_instance; + output_counts[i] += end - start; + } + } + } + if (use_sysmem) { + sys_instance.destroy(); + } + } + /* * Input: An array of disjoint rectangles sorted by src idx. * Output: Fills the sparsity output for each src with a host region instance @@ -1491,8 +1585,6 @@ namespace Realm { cudaStream_t stream = Cuda::get_task_cuda_stream(); - std::set output_allocs; - SparsityMapEntry* final_entries = my_arena.alloc>(total_rects); Rect* final_rects = my_arena.alloc>(total_rects); @@ -1502,9 +1594,6 @@ namespace Realm { CUDA_CHECK(cudaMemsetAsync(d_starts, 0, ctr.size()*sizeof(size_t),stream), stream); CUDA_CHECK(cudaMemsetAsync(d_ends, 0, ctr.size()*sizeof(size_t),stream), stream); - - CUDA_CHECK(cudaStreamSynchronize(stream), stream); - //Convert RectDesc to SparsityMapEntry and determine where each src's rectangles start and end. 
build_final_output<<>>(d_rects, final_entries, final_rects, d_starts, d_ends, total_rects); KERNEL_CHECK(stream); @@ -1522,6 +1611,8 @@ namespace Realm { } } + Memory sysmem; + assert(find_memory(sysmem, Memory::SYSTEM_MEM)); if (!this->exclusive) { for (auto const& elem : ctr) { size_t idx = getIndex(elem); @@ -1530,17 +1621,18 @@ namespace Realm { if (d_ends_host[idx] > d_starts_host[idx]) { size_t end = d_ends_host[idx]; size_t start = d_starts_host[idx]; - std::vector> h_rects(end - start); - CUDA_CHECK(cudaMemcpyAsync(h_rects.data(), final_rects + start, (end - start) * sizeof(Rect), cudaMemcpyDeviceToHost, stream), stream); + RegionInstance h_rects_instance = this->realm_malloc((end - start) * sizeof(Rect), sysmem); + Rect *h_rects = reinterpret_cast *>(AffineAccessor(h_rects_instance, 0).base); + CUDA_CHECK(cudaMemcpyAsync(h_rects, final_rects + start, (end - start) * sizeof(Rect), cudaMemcpyDeviceToHost, stream), stream); CUDA_CHECK(cudaStreamSynchronize(stream), stream); - impl->contribute_dense_rect_list(h_rects, false); + span> h_rects_span(h_rects, end - start); + impl->contribute_dense_rect_list(h_rects_span, false); + h_rects_instance.destroy(); } else { impl->contribute_nothing(); } } } else { - Memory sysmem; - assert(find_memory(sysmem, Memory::SYSTEM_MEM)); //Use provided lambdas to iterate over sparsity output container (map or vector) for (auto const& elem : ctr) { diff --git a/src/realm/deppart/partitions_gpu_kernels.hpp b/src/realm/deppart/partitions_gpu_kernels.hpp index 2f607930d9..b3bd280be4 100644 --- a/src/realm/deppart/partitions_gpu_kernels.hpp +++ b/src/realm/deppart/partitions_gpu_kernels.hpp @@ -794,9 +794,11 @@ void build_final_output(const RectDesc* d_rects, size_t idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= numRects) return; d_rects_out[idx] = d_rects[idx].rect; - d_entries_out[idx].bounds = d_rects[idx].rect; - d_entries_out[idx].sparsity.id = 0; - d_entries_out[idx].bitmap = 0; + if (d_entries_out != nullptr) { + 
d_entries_out[idx].bounds = d_rects[idx].rect; + d_entries_out[idx].sparsity.id = 0; + d_entries_out[idx].bitmap = 0; + } //Checks if we're the first value for a given src if (idx == 0 || d_rects[idx].src_idx != d_rects[idx-1].src_idx) { diff --git a/src/realm/deppart/preimage.cc b/src/realm/deppart/preimage.cc index 4ae8cd4ddc..b25c8b2c41 100644 --- a/src/realm/deppart/preimage.cc +++ b/src/realm/deppart/preimage.cc @@ -408,7 +408,7 @@ namespace Realm { } } bool gpu_data = !gpu_ptr_data.empty() || !gpu_rect_data.empty(); - bool opcount = cpu_ptr_data.size() + cpu_rect_data.size() + gpu_ptr_data.size() + gpu_rect_data.size(); + size_t opcount = cpu_ptr_data.size() + cpu_rect_data.size() + gpu_ptr_data.size() + gpu_rect_data.size(); bool exclusive = (gpu_data && (opcount == 1)); if (domain_transform.type == DomainTransform::DomainTransformType::STRUCTURED && !gpu_data) { diff --git a/src/realm/deppart/preimage_gpu_impl.hpp b/src/realm/deppart/preimage_gpu_impl.hpp index 3793b32458..3e464c582f 100644 --- a/src/realm/deppart/preimage_gpu_impl.hpp +++ b/src/realm/deppart/preimage_gpu_impl.hpp @@ -15,33 +15,28 @@ namespace Realm { return; } - Memory my_mem = domain_transform.range_data[0].inst.get_location(); + RegionInstance buffer = domain_transform.range_data[0].scratch_buffer; - const char* val = std::getenv("TILE_SIZE"); // or any env var - size_t tile_size = 100000000; //default - if (val) { - tile_size = atoi(val); - } - - RegionInstance fixed_buffer = this->realm_malloc(tile_size, my_mem); - Arena buffer_arena(reinterpret_cast(AffineAccessor(fixed_buffer, 0).base), tile_size); + size_t tile_size = buffer.get_layout()->bytes_used; + std::cout << "Using tile size of " << tile_size << " bytes." 
<< std::endl; + Arena buffer_arena(reinterpret_cast(AffineAccessor(buffer, 0).base), tile_size); NVTX_DEPPART(gpu_preimage); + Memory sysmem; + find_memory(sysmem, Memory::SYSTEM_MEM); + cudaStream_t stream = Cuda::get_task_cuda_stream(); collapsed_space inst_space; // We combine all of our instances into one to batch work, tracking the offsets between instances. - RegionInstance inst_offsets_instance = this->realm_malloc((domain_transform.range_data.size() + 1) * sizeof(size_t), my_mem); - inst_space.offsets = reinterpret_cast(AffineAccessor(inst_offsets_instance, 0).base); + inst_space.offsets = buffer_arena.alloc(domain_transform.range_data.size() + 1); inst_space.num_children = domain_transform.range_data.size(); - RegionInstance inst_entries_instance; + Arena sys_arena; + GPUMicroOp::collapse_multi_space(domain_transform.range_data, inst_space, sys_arena, stream); - GPUMicroOp::collapse_multi_space(domain_transform.range_data, inst_space, buffer_arena, stream); - - RegionInstance parent_entries_instance; collapsed_space collapsed_parent; // We collapse the parent space to undifferentiate between dense and sparse and match downstream APIs. @@ -50,53 +45,16 @@ namespace Realm { // This is used for count + emit: first pass counts how many rectangles survive intersection, second pass uses the counter // to figure out where to write each rectangle. - RegionInstance inst_counters_instance = this->realm_malloc((2*domain_transform.range_data.size() + 1) * sizeof(uint32_t), my_mem); - uint32_t* d_inst_counters = reinterpret_cast(AffineAccessor(inst_counters_instance, 0).base); + uint32_t* d_inst_counters = buffer_arena.alloc(2 * domain_transform.range_data.size() + 1); // This will be a prefix sum over the counters, used first to figure out where to write in the emit phase, and second // to track which instance each rectangle came from in the populate phase. 
uint32_t* d_inst_prefix = d_inst_counters + domain_transform.range_data.size(); - RegionInstance out_instance; - size_t num_valid_rects; - - Rect* d_valid_rects; - - // Here we intersect the instance spaces with the parent space, and make sure we know which instance each resulting rectangle came from. - GPUMicroOp::template construct_input_rectlist>(inst_space, collapsed_parent, d_valid_rects, num_valid_rects, d_inst_counters, d_inst_prefix, buffer_arena, stream); - inst_entries_instance.destroy(); - parent_entries_instance.destroy(); - inst_offsets_instance.destroy(); - - if (num_valid_rects == 0) { - for (auto it : sparsity_outputs) { - SparsityMapImpl *impl = SparsityMapImpl::lookup(it); - if (this->exclusive) { - impl->gpu_finalize(); - } else { - impl->contribute_nothing(); - } - } - out_instance.destroy(); - inst_counters_instance.destroy(); - return; - } - - // Prefix sum the valid rectangles by volume. - RegionInstance prefix_rects_instance; - size_t total_pts; - - size_t* d_prefix_rects; - GPUMicroOp::volume_prefix_sum(d_valid_rects, num_valid_rects, d_prefix_rects, total_pts, buffer_arena, stream); - - nvtx_range_push("cuda", "build target entries"); collapsed_space target_space; - RegionInstance offsets_instance = this->realm_malloc((targets.size()+1) * sizeof(size_t), my_mem); - target_space.offsets = reinterpret_cast(AffineAccessor(offsets_instance, 0).base); + target_space.offsets = buffer_arena.alloc(targets.size() + 1); target_space.num_children = targets.size(); - RegionInstance targets_entries_instance; - GPUMicroOp::collapse_multi_space(targets, target_space, buffer_arena, stream); Memory zcpy_mem; @@ -107,135 +65,255 @@ namespace Realm { d_accessors[i] = AffineAccessor,N,T>(domain_transform.range_data[i].inst, domain_transform.range_data[i].field_offset); } - RegionInstance points_instance; - PointDesc* d_points; - size_t num_valid_points; - - RegionInstance target_counters_instance = this->realm_malloc((2*targets.size()+1) * sizeof(uint32_t), 
my_mem); - uint32_t* d_target_counters = reinterpret_cast(AffineAccessor(target_counters_instance, 0).base); + uint32_t* d_target_counters = buffer_arena.alloc(2*targets.size() + 1); uint32_t* d_targets_prefix = d_target_counters + targets.size(); CUDA_CHECK(cudaMemsetAsync(d_target_counters, 0, targets.size() * sizeof(uint32_t), stream), stream); - if (target_space.num_entries > targets.size()) { - BVH preimage_bvh; - RegionInstance bvh_instance; - GPUMicroOp::build_bvh(target_space, preimage_bvh, buffer_arena, stream); - - preimage_gpuPopulateBitmasksPtrsKernel < N, T, N2, T2 ><<>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, preimage_bvh.root, preimage_bvh.childLeft, preimage_bvh.childRight, preimage_bvh.indices, - preimage_bvh.labels, preimage_bvh.boxes, total_pts, num_valid_rects, domain_transform.range_data.size(), preimage_bvh.num_leaves, nullptr, d_target_counters, nullptr); - KERNEL_CHECK(stream); - - std::vector h_target_counters(targets.size()+1); - h_target_counters[0] = 0; // prefix sum starts at 0 - CUDA_CHECK(cudaMemcpyAsync(h_target_counters.data()+1, d_target_counters, targets.size() * sizeof(uint32_t), cudaMemcpyDeviceToHost, stream), stream); - CUDA_CHECK(cudaStreamSynchronize(stream), stream); - for (size_t i = 0; i < targets.size(); ++i) { - h_target_counters[i+1] += h_target_counters[i]; - } + buffer_arena.commit(false); + + size_t num_output = 0; + RectDesc* output_start = nullptr; + size_t num_completed = 0; + size_t curr_tile = tile_size / 2; + int count = 0; + + bool host_fallback = false; + std::vector h_instances(targets.size(), RegionInstance::NO_INST); + std::vector entry_counts(targets.size(), 0); + while (num_completed < inst_space.num_entries) { + try { + + std::cout << "Preimage iteration " << count++ << ", completed " << num_completed << " / " << inst_space.num_entries << " entries." 
<< std::endl; + buffer_arena.start(); + std::cout << "Amount Used: " << buffer_arena.used() << std::endl; + if (num_completed + curr_tile > inst_space.num_entries) { + curr_tile = inst_space.num_entries - num_completed; + } + + collapsed_space inst_space_tile = inst_space; + inst_space_tile.num_entries = curr_tile; + inst_space_tile.entries_buffer = buffer_arena.alloc>(curr_tile); + CUDA_CHECK(cudaMemcpyAsync(inst_space_tile.entries_buffer, inst_space.entries_buffer + num_completed, curr_tile * sizeof(SparsityMapEntry), cudaMemcpyHostToDevice, stream), stream); + + size_t num_valid_rects; + Rect* d_valid_rects; + // Here we intersect the instance spaces with the parent space, and make sure we know which instance each resulting rectangle came from. + GPUMicroOp::template construct_input_rectlist>(inst_space_tile, collapsed_parent, d_valid_rects, num_valid_rects, d_inst_counters, d_inst_prefix, buffer_arena, stream); + + if (num_valid_rects == 0) { + num_completed += curr_tile; + subtract_const<<>>(inst_space.offsets, domain_transform.range_data.size()+1, curr_tile); + KERNEL_CHECK(stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + curr_tile = tile_size / 2; + continue; + } + + // Prefix sum the valid rectangles by volume. 
+ size_t total_pts; + size_t* d_prefix_rects; + GPUMicroOp::volume_prefix_sum(d_valid_rects, num_valid_rects, d_prefix_rects, total_pts, buffer_arena, stream); + + nvtx_range_push("cuda", "build target entries"); - num_valid_points = h_target_counters[targets.size()]; + PointDesc* d_points; + size_t num_valid_points; - if (num_valid_points == 0) { - for (auto it : sparsity_outputs) { - SparsityMapImpl *impl = SparsityMapImpl::lookup(it); - if (this->exclusive) { - impl->gpu_finalize(); - } else { - impl->contribute_nothing(); + CUDA_CHECK(cudaMemsetAsync(d_target_counters, 0, targets.size() * sizeof(uint32_t), stream), stream); + + if (target_space.num_entries > targets.size()) { + + BVH preimage_bvh; + GPUMicroOp::build_bvh(target_space, preimage_bvh, buffer_arena, stream); + + preimage_gpuPopulateBitmasksPtrsKernel < N, T, N2, T2 ><<>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, preimage_bvh.root, preimage_bvh.childLeft, preimage_bvh.childRight, preimage_bvh.indices, + preimage_bvh.labels, preimage_bvh.boxes, total_pts, num_valid_rects, domain_transform.range_data.size(), preimage_bvh.num_leaves, nullptr, d_target_counters, nullptr); + KERNEL_CHECK(stream); + + std::vector h_target_counters(targets.size()+1); + h_target_counters[0] = 0; // prefix sum starts at 0 + CUDA_CHECK(cudaMemcpyAsync(h_target_counters.data()+1, d_target_counters, targets.size() * sizeof(uint32_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + for (size_t i = 0; i < targets.size(); ++i) { + h_target_counters[i+1] += h_target_counters[i]; } - } - target_counters_instance.destroy(); - accessors_instance.destroy(); - targets_entries_instance.destroy(); - offsets_instance.destroy(); - prefix_rects_instance.destroy(); - out_instance.destroy(); - inst_counters_instance.destroy(); - bvh_instance.destroy(); - return; - } - CUDA_CHECK(cudaMemcpyAsync(d_targets_prefix, h_target_counters.data(), (targets.size() + 1) * sizeof(uint32_t), 
cudaMemcpyHostToDevice, stream), stream); - - points_instance = this->realm_malloc(num_valid_points * sizeof(PointDesc), my_mem); - d_points = reinterpret_cast*>(AffineAccessor(points_instance, 0).base); - - CUDA_CHECK(cudaMemsetAsync(d_target_counters, 0, (targets.size()) * sizeof(uint32_t), stream), stream); - - preimage_gpuPopulateBitmasksPtrsKernel < N, T, N2, T2 ><<>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, preimage_bvh.root, preimage_bvh.childLeft, preimage_bvh.childRight, preimage_bvh.indices, - preimage_bvh.labels, preimage_bvh.boxes, total_pts, num_valid_rects, domain_transform.range_data.size(), preimage_bvh.num_leaves, d_targets_prefix, d_target_counters, d_points); - KERNEL_CHECK(stream); - CUDA_CHECK(cudaStreamSynchronize(stream), stream); - bvh_instance.destroy(); - } else { - preimage_dense_populate_bitmasks_kernel < N, T, N2, T2 ><<< COMPUTE_GRID(total_pts), THREADS_PER_BLOCK, 0, stream>>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, target_space.entries_buffer, target_space.offsets, total_pts, - num_valid_rects, domain_transform.range_data.size(), targets.size(), nullptr, d_target_counters, nullptr); - KERNEL_CHECK(stream); - - std::vector h_target_counters(targets.size()+1); - h_target_counters[0] = 0; // prefix sum starts at 0 - CUDA_CHECK(cudaMemcpyAsync(h_target_counters.data()+1, d_target_counters, targets.size() * sizeof(uint32_t), cudaMemcpyDeviceToHost, stream), stream); - CUDA_CHECK(cudaStreamSynchronize(stream), stream); - for (size_t i = 0; i < targets.size(); ++i) { - h_target_counters[i+1] += h_target_counters[i]; - } + num_valid_points = h_target_counters[targets.size()]; - num_valid_points = h_target_counters[targets.size()]; + if (num_valid_points == 0) { + num_completed += curr_tile; + subtract_const<<>>(inst_space.offsets, domain_transform.range_data.size()+1, curr_tile); + KERNEL_CHECK(stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + curr_tile = tile_size / 2; + continue; + } - if 
(num_valid_points == 0) { - for (auto it : sparsity_outputs) { - SparsityMapImpl *impl = SparsityMapImpl::lookup(it); - if (this->exclusive) { - impl->gpu_finalize(); - } else { - impl->contribute_nothing(); + CUDA_CHECK(cudaMemcpyAsync(d_targets_prefix, h_target_counters.data(), (targets.size() + 1) * sizeof(uint32_t), cudaMemcpyHostToDevice, stream), stream); + + buffer_arena.flip_parity(); + d_points = buffer_arena.alloc>(num_valid_points); + + CUDA_CHECK(cudaMemsetAsync(d_target_counters, 0, (targets.size()) * sizeof(uint32_t), stream), stream); + + preimage_gpuPopulateBitmasksPtrsKernel < N, T, N2, T2 ><<>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, preimage_bvh.root, preimage_bvh.childLeft, preimage_bvh.childRight, preimage_bvh.indices, + preimage_bvh.labels, preimage_bvh.boxes, total_pts, num_valid_rects, domain_transform.range_data.size(), preimage_bvh.num_leaves, d_targets_prefix, d_target_counters, d_points); + KERNEL_CHECK(stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + } else { + preimage_dense_populate_bitmasks_kernel< N, T, N2, T2 ><<< COMPUTE_GRID(total_pts), THREADS_PER_BLOCK, 0, stream>>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, target_space.entries_buffer, target_space.offsets, total_pts, + num_valid_rects, domain_transform.range_data.size(), targets.size(), nullptr, d_target_counters, nullptr); + KERNEL_CHECK(stream); + + std::vector h_target_counters(targets.size()+1); + h_target_counters[0] = 0; // prefix sum starts at 0 + CUDA_CHECK(cudaMemcpyAsync(h_target_counters.data()+1, d_target_counters, targets.size() * sizeof(uint32_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + for (size_t i = 0; i < targets.size(); ++i) { + h_target_counters[i+1] += h_target_counters[i]; } + + num_valid_points = h_target_counters[targets.size()]; + + if (num_valid_points == 0) { + num_completed += curr_tile; + subtract_const<<>>(inst_space.offsets, 
domain_transform.range_data.size()+1, curr_tile); + KERNEL_CHECK(stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + curr_tile = tile_size / 2; + continue; + } + + CUDA_CHECK(cudaMemcpyAsync(d_targets_prefix, h_target_counters.data(), (targets.size() + 1) * sizeof(uint32_t), cudaMemcpyHostToDevice, stream), stream); + + buffer_arena.flip_parity(); + d_points = buffer_arena.alloc>(num_valid_points); + + CUDA_CHECK(cudaMemsetAsync(d_target_counters, 0, (targets.size()) * sizeof(uint32_t), stream), stream); + + preimage_dense_populate_bitmasks_kernel< N, T, N2, T2 ><<< COMPUTE_GRID(total_pts), THREADS_PER_BLOCK, 0, stream>>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, target_space.entries_buffer, target_space.offsets, total_pts, + num_valid_rects, domain_transform.range_data.size(), targets.size(), d_targets_prefix, d_target_counters, d_points); + KERNEL_CHECK(stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); } - target_counters_instance.destroy(); - accessors_instance.destroy(); - targets_entries_instance.destroy(); - offsets_instance.destroy(); - prefix_rects_instance.destroy(); - out_instance.destroy(); - inst_counters_instance.destroy(); - return; - } - CUDA_CHECK(cudaMemcpyAsync(d_targets_prefix, h_target_counters.data(), (targets.size() + 1) * sizeof(uint32_t), cudaMemcpyHostToDevice, stream), stream); + buffer_arena.flip_parity(); + buffer_arena.flip_parity(); + d_points = buffer_arena.alloc>(num_valid_points); + + size_t num_new_rects = num_output == 0 ? 
1 : 2; + assert(!buffer_arena.get_parity()); + RectDesc* d_new_rects; + + this->complete_pipeline(d_points, num_valid_points, d_new_rects, num_new_rects, buffer_arena, + /* the Container: */ sparsity_outputs, + /* getIndex: */ [&](auto const& elem){ + // elem is a SparsityMap from the vector + return size_t(&elem - sparsity_outputs.data()); + }, + /* getMap: */ [&](auto const& elem){ + // return the SparsityMap key itself + return elem; + }); + + if (host_fallback) { + this->split_output(d_new_rects, num_new_rects, h_instances, entry_counts, buffer_arena); + } - points_instance = this->realm_malloc(num_valid_points * sizeof(PointDesc), my_mem); - d_points = reinterpret_cast*>(AffineAccessor(points_instance, 0).base); + if (num_output==0 || host_fallback) { + num_output = num_new_rects; + num_completed += curr_tile; + output_start = d_new_rects; + subtract_const<<>>(inst_space.offsets, domain_transform.range_data.size()+1, curr_tile); + KERNEL_CHECK(stream); + curr_tile = tile_size / 2; + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + continue; + } - CUDA_CHECK(cudaMemsetAsync(d_target_counters, 0, (targets.size()) * sizeof(uint32_t), stream), stream); + RectDesc* d_old_rects = buffer_arena.alloc>(num_output); + assert(d_old_rects == d_new_rects + num_new_rects); + CUDA_CHECK(cudaMemcpyAsync(d_old_rects, output_start, num_output * sizeof(RectDesc), cudaMemcpyDeviceToDevice, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + size_t num_final_rects = 1; + + //Send it off for processing + this->complete_rect_pipeline(d_new_rects, num_output + num_new_rects, output_start, num_final_rects, buffer_arena, + /* the Container: */ sparsity_outputs, + /* getIndex: */ [&](auto const& elem){ + // elem is a SparsityMap from the vector + return size_t(&elem - sparsity_outputs.data()); + }, + /* getMap: */ [&](auto const& elem){ + // return the SparsityMap key itself + return elem; + }); + num_completed += curr_tile; + num_output = num_final_rects; + 
subtract_const<<>>(inst_space.offsets, domain_transform.range_data.size()+1, curr_tile); + KERNEL_CHECK(stream); + curr_tile = tile_size / 2; + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + } catch (arena_oom&) { + std::cout << "Caught arena_oom, reducing tile size from " << curr_tile << " to " << curr_tile / 2 << std::endl; + std::cout << buffer_arena.used() << " bytes used in arena." << std::endl; + curr_tile /= 2; + if (curr_tile == 0) { + host_fallback = true; + if (num_output > 0) { + this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); + } + curr_tile = tile_size / 2; + } + } + } + if (num_output == 0) { + for (SparsityMap it : sparsity_outputs) { + SparsityMapImpl *impl = SparsityMapImpl::lookup(it); + if (this->exclusive) { + impl->gpu_finalize(); + } else { + impl->contribute_nothing(); + } + } + return; + } - preimage_dense_populate_bitmasks_kernel < N, T, N2, T2 ><<< COMPUTE_GRID(total_pts), THREADS_PER_BLOCK, 0, stream>>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, target_space.entries_buffer, target_space.offsets, total_pts, - num_valid_rects, domain_transform.range_data.size(), targets.size(), d_targets_prefix, d_target_counters, d_points); - KERNEL_CHECK(stream); - CUDA_CHECK(cudaStreamSynchronize(stream), stream); + if (!host_fallback) { + try { + this->send_output(output_start, num_output, buffer_arena, sparsity_outputs, + /* getIndex: */ [&](auto const& elem){ + // elem is a SparsityMap from the vector + return size_t(&elem - sparsity_outputs.data()); + }, + /* getMap: */ [&](auto const& elem){ + // return the SparsityMap key itself + return elem; + }); + } catch (arena_oom&) { + this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); + host_fallback = true; + } } - target_counters_instance.destroy(); - accessors_instance.destroy(); - targets_entries_instance.destroy(); - offsets_instance.destroy(); - prefix_rects_instance.destroy(); - out_instance.destroy(); 
- inst_counters_instance.destroy(); - - size_t out_rects = 0; - RectDesc* trash; - this->complete_pipeline(d_points, num_valid_points, trash, out_rects, buffer_arena, - /* the Container: */ sparsity_outputs, - /* getIndex: */ [&](auto const& elem){ - // elem is a SparsityMap from the vector - return size_t(&elem - sparsity_outputs.data()); - }, - /* getMap: */ [&](auto const& elem){ - // return the SparsityMap key itself - return elem; - }); - - points_instance.destroy(); + if (host_fallback) { + for (size_t idx = 0; idx < sparsity_outputs.size(); ++idx) { + SparsityMapImpl *impl = SparsityMapImpl::lookup(sparsity_outputs[idx]); + if (this->exclusive) { + impl->set_contributor_count(1); + } + if (entry_counts[idx] > 0) { + Rect* h_rects = reinterpret_cast *>(AffineAccessor(h_instances[idx], 0).base); + span> h_rects_span(h_rects, entry_counts[idx]); + impl->contribute_dense_rect_list(h_rects_span, false); + h_instances[idx].destroy(); + } else { + impl->contribute_nothing(); + } + } + } } template @@ -244,33 +322,28 @@ namespace Realm { return; } - Memory my_mem = domain_transform.ptr_data[0].inst.get_location(); - - const char* val = std::getenv("TILE_SIZE"); // or any env var - size_t tile_size = 100000000; //default - if (val) { - tile_size = atoi(val); - } + RegionInstance buffer = domain_transform.ptr_data[0].scratch_buffer; - RegionInstance fixed_buffer = this->realm_malloc(tile_size, my_mem); - Arena buffer_arena(reinterpret_cast(AffineAccessor(fixed_buffer, 0).base), tile_size); + size_t tile_size = buffer.get_layout()->bytes_used; + std::cout << "Using tile size of " << tile_size << " bytes." << std::endl; + Arena buffer_arena(reinterpret_cast(AffineAccessor(buffer, 0).base), tile_size); NVTX_DEPPART(gpu_preimage); + Memory sysmem; + find_memory(sysmem, Memory::SYSTEM_MEM); + cudaStream_t stream = Cuda::get_task_cuda_stream(); collapsed_space inst_space; // We combine all of our instances into one to batch work, tracking the offsets between instances. 
- RegionInstance inst_offsets_instance = this->realm_malloc((domain_transform.ptr_data.size() + 1) * sizeof(size_t), my_mem); - inst_space.offsets = reinterpret_cast(AffineAccessor(inst_offsets_instance, 0).base); + inst_space.offsets = buffer_arena.alloc(domain_transform.ptr_data.size() + 1); inst_space.num_children = domain_transform.ptr_data.size(); - RegionInstance inst_entries_instance; - - GPUMicroOp::collapse_multi_space(domain_transform.ptr_data, inst_space, buffer_arena, stream); + Arena sys_arena; + GPUMicroOp::collapse_multi_space(domain_transform.ptr_data, inst_space, sys_arena, stream); - RegionInstance parent_entries_instance; collapsed_space collapsed_parent; // We collapse the parent space to undifferentiate between dense and sparse and match downstream APIs. @@ -279,52 +352,16 @@ namespace Realm { // This is used for count + emit: first pass counts how many rectangles survive intersection, second pass uses the counter // to figure out where to write each rectangle. - RegionInstance inst_counters_instance = this->realm_malloc((2*domain_transform.ptr_data.size() + 1) * sizeof(uint32_t), my_mem); - uint32_t* d_inst_counters = reinterpret_cast(AffineAccessor(inst_counters_instance, 0).base); + uint32_t* d_inst_counters = buffer_arena.alloc(2 * domain_transform.ptr_data.size() + 1); // This will be a prefix sum over the counters, used first to figure out where to write in the emit phase, and second // to track which instance each rectangle came from in the populate phase. uint32_t* d_inst_prefix = d_inst_counters + domain_transform.ptr_data.size(); - RegionInstance out_instance; - size_t num_valid_rects; - - Rect* d_valid_rects; - // Here we intersect the instance spaces with the parent space, and make sure we know which instance each resulting rectangle came from. 
- GPUMicroOp::template construct_input_rectlist>(inst_space, collapsed_parent, d_valid_rects, num_valid_rects, d_inst_counters, d_inst_prefix, buffer_arena, stream); - inst_entries_instance.destroy(); - parent_entries_instance.destroy(); - inst_offsets_instance.destroy(); - - if (num_valid_rects == 0) { - for (auto it : sparsity_outputs) { - SparsityMapImpl *impl = SparsityMapImpl::lookup(it); - if (this->exclusive) { - impl->gpu_finalize(); - } else { - impl->contribute_nothing(); - } - } - out_instance.destroy(); - inst_counters_instance.destroy(); - return; - } - - // Prefix sum the valid rectangles by volume. - RegionInstance prefix_rects_instance; - size_t total_pts; - - size_t* d_prefix_rects; - GPUMicroOp::volume_prefix_sum(d_valid_rects, num_valid_rects, d_prefix_rects, total_pts, buffer_arena, stream); - - nvtx_range_push("cuda", "build target entries"); collapsed_space target_space; - RegionInstance offsets_instance = this->realm_malloc((targets.size()+1) * sizeof(size_t), my_mem); - target_space.offsets = reinterpret_cast(AffineAccessor(offsets_instance, 0).base); + target_space.offsets = buffer_arena.alloc(targets.size() + 1); target_space.num_children = targets.size(); - RegionInstance targets_entries_instance; - GPUMicroOp::collapse_multi_space(targets, target_space, buffer_arena, stream); Memory zcpy_mem; @@ -335,134 +372,254 @@ namespace Realm { d_accessors[i] = AffineAccessor,N,T>(domain_transform.ptr_data[i].inst, domain_transform.ptr_data[i].field_offset); } - RegionInstance points_instance; - PointDesc* d_points; - size_t num_valid_points; - - RegionInstance target_counters_instance = this->realm_malloc((2*targets.size()+1) * sizeof(uint32_t), my_mem); - uint32_t* d_target_counters = reinterpret_cast(AffineAccessor(target_counters_instance, 0).base); + uint32_t* d_target_counters = buffer_arena.alloc(2*targets.size() + 1); uint32_t* d_targets_prefix = d_target_counters + targets.size(); CUDA_CHECK(cudaMemsetAsync(d_target_counters, 0, 
targets.size() * sizeof(uint32_t), stream), stream); - if (target_space.num_entries > targets.size()) { - BVH preimage_bvh; - RegionInstance bvh_instance; - GPUMicroOp::build_bvh(target_space, preimage_bvh, buffer_arena, stream); - - preimage_gpuPopulateBitmasksPtrsKernel < N, T, N2, T2 ><<>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, preimage_bvh.root, preimage_bvh.childLeft, preimage_bvh.childRight, preimage_bvh.indices, - preimage_bvh.labels, preimage_bvh.boxes, total_pts, num_valid_rects, domain_transform.ptr_data.size(), preimage_bvh.num_leaves, nullptr, d_target_counters, nullptr); - KERNEL_CHECK(stream); - - std::vector h_target_counters(targets.size()+1); - h_target_counters[0] = 0; // prefix sum starts at 0 - CUDA_CHECK(cudaMemcpyAsync(h_target_counters.data()+1, d_target_counters, targets.size() * sizeof(uint32_t), cudaMemcpyDeviceToHost, stream), stream); - CUDA_CHECK(cudaStreamSynchronize(stream), stream); - for (size_t i = 0; i < targets.size(); ++i) { - h_target_counters[i+1] += h_target_counters[i]; - } + buffer_arena.commit(false); + + size_t num_output = 0; + RectDesc* output_start = nullptr; + size_t num_completed = 0; + size_t curr_tile = tile_size / 2; + int count = 0; + + bool host_fallback = false; + std::vector h_instances(targets.size(), RegionInstance::NO_INST); + std::vector entry_counts(targets.size(), 0); + while (num_completed < inst_space.num_entries) { + try { + + std::cout << "Preimage iteration " << count++ << ", completed " << num_completed << " / " << inst_space.num_entries << " entries." 
<< std::endl; + buffer_arena.start(); + std::cout << "Amount Used: " << buffer_arena.used() << std::endl; + if (num_completed + curr_tile > inst_space.num_entries) { + curr_tile = inst_space.num_entries - num_completed; + } + + collapsed_space inst_space_tile = inst_space; + inst_space_tile.num_entries = curr_tile; + inst_space_tile.entries_buffer = buffer_arena.alloc>(curr_tile); + CUDA_CHECK(cudaMemcpyAsync(inst_space_tile.entries_buffer, inst_space.entries_buffer + num_completed, curr_tile * sizeof(SparsityMapEntry), cudaMemcpyHostToDevice, stream), stream); + + size_t num_valid_rects; + Rect* d_valid_rects; + // Here we intersect the instance spaces with the parent space, and make sure we know which instance each resulting rectangle came from. + GPUMicroOp::template construct_input_rectlist>(inst_space_tile, collapsed_parent, d_valid_rects, num_valid_rects, d_inst_counters, d_inst_prefix, buffer_arena, stream); + + if (num_valid_rects == 0) { + num_completed += curr_tile; + subtract_const<<>>(inst_space.offsets, domain_transform.ptr_data.size()+1, curr_tile); + KERNEL_CHECK(stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + curr_tile = tile_size / 2; + continue; + } + + // Prefix sum the valid rectangles by volume. 
+ size_t total_pts; + size_t* d_prefix_rects; + GPUMicroOp::volume_prefix_sum(d_valid_rects, num_valid_rects, d_prefix_rects, total_pts, buffer_arena, stream); + + nvtx_range_push("cuda", "build target entries"); + + PointDesc* d_points; + size_t num_valid_points; - num_valid_points = h_target_counters[targets.size()]; + CUDA_CHECK(cudaMemsetAsync(d_target_counters, 0, (targets.size()) * sizeof(uint32_t), stream), stream); - if (num_valid_points == 0) { - for (auto it : sparsity_outputs) { - SparsityMapImpl *impl = SparsityMapImpl::lookup(it); - if (this->exclusive) { - impl->gpu_finalize(); - } else { - impl->contribute_nothing(); + if (target_space.num_entries > targets.size()) { + + BVH preimage_bvh; + GPUMicroOp::build_bvh(target_space, preimage_bvh, buffer_arena, stream); + + preimage_gpuPopulateBitmasksPtrsKernel < N, T, N2, T2 ><<>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, preimage_bvh.root, preimage_bvh.childLeft, preimage_bvh.childRight, preimage_bvh.indices, + preimage_bvh.labels, preimage_bvh.boxes, total_pts, num_valid_rects, domain_transform.ptr_data.size(), preimage_bvh.num_leaves, nullptr, d_target_counters, nullptr); + KERNEL_CHECK(stream); + + std::vector h_target_counters(targets.size()+1); + h_target_counters[0] = 0; // prefix sum starts at 0 + CUDA_CHECK(cudaMemcpyAsync(h_target_counters.data()+1, d_target_counters, targets.size() * sizeof(uint32_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + for (size_t i = 0; i < targets.size(); ++i) { + h_target_counters[i+1] += h_target_counters[i]; } - } - target_counters_instance.destroy(); - accessors_instance.destroy(); - targets_entries_instance.destroy(); - offsets_instance.destroy(); - prefix_rects_instance.destroy(); - out_instance.destroy(); - inst_counters_instance.destroy(); - bvh_instance.destroy(); - return; - } - CUDA_CHECK(cudaMemcpyAsync(d_targets_prefix, h_target_counters.data(), (targets.size() + 1) * sizeof(uint32_t), 
cudaMemcpyHostToDevice, stream), stream); - - points_instance = this->realm_malloc(num_valid_points * sizeof(PointDesc), my_mem); - d_points = reinterpret_cast*>(AffineAccessor(points_instance, 0).base); - - CUDA_CHECK(cudaMemsetAsync(d_target_counters, 0, (targets.size()) * sizeof(uint32_t), stream), stream); - - preimage_gpuPopulateBitmasksPtrsKernel < N, T, N2, T2 ><<>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, preimage_bvh.root, preimage_bvh.childLeft, preimage_bvh.childRight, preimage_bvh.indices, - preimage_bvh.labels, preimage_bvh.boxes, total_pts, num_valid_rects, domain_transform.ptr_data.size(), preimage_bvh.num_leaves, d_targets_prefix, d_target_counters, d_points); - KERNEL_CHECK(stream); - CUDA_CHECK(cudaStreamSynchronize(stream), stream); - bvh_instance.destroy(); - } else { - preimage_dense_populate_bitmasks_kernel< N, T, N2, T2 ><<< COMPUTE_GRID(total_pts), THREADS_PER_BLOCK, 0, stream>>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, target_space.entries_buffer, target_space.offsets, total_pts, - num_valid_rects, domain_transform.ptr_data.size(), targets.size(), nullptr, d_target_counters, nullptr); - KERNEL_CHECK(stream); - - std::vector h_target_counters(targets.size()+1); - h_target_counters[0] = 0; // prefix sum starts at 0 - CUDA_CHECK(cudaMemcpyAsync(h_target_counters.data()+1, d_target_counters, targets.size() * sizeof(uint32_t), cudaMemcpyDeviceToHost, stream), stream); - CUDA_CHECK(cudaStreamSynchronize(stream), stream); - for (size_t i = 0; i < targets.size(); ++i) { - h_target_counters[i+1] += h_target_counters[i]; - } + num_valid_points = h_target_counters[targets.size()]; + + if (num_valid_points == 0) { + num_completed += curr_tile; + subtract_const<<>>(inst_space.offsets, domain_transform.ptr_data.size()+1, curr_tile); + KERNEL_CHECK(stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + curr_tile = tile_size / 2; + continue; + } - num_valid_points = h_target_counters[targets.size()]; + 
CUDA_CHECK(cudaMemcpyAsync(d_targets_prefix, h_target_counters.data(), (targets.size() + 1) * sizeof(uint32_t), cudaMemcpyHostToDevice, stream), stream); - if (num_valid_points == 0) { - for (auto it : sparsity_outputs) { - SparsityMapImpl *impl = SparsityMapImpl::lookup(it); - if (this->exclusive) { - impl->gpu_finalize(); - } else { - impl->contribute_nothing(); + buffer_arena.flip_parity(); + d_points = buffer_arena.alloc>(num_valid_points); + + CUDA_CHECK(cudaMemsetAsync(d_target_counters, 0, (targets.size()) * sizeof(uint32_t), stream), stream); + + preimage_gpuPopulateBitmasksPtrsKernel < N, T, N2, T2 ><<>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, preimage_bvh.root, preimage_bvh.childLeft, preimage_bvh.childRight, preimage_bvh.indices, + preimage_bvh.labels, preimage_bvh.boxes, total_pts, num_valid_rects, domain_transform.ptr_data.size(), preimage_bvh.num_leaves, d_targets_prefix, d_target_counters, d_points); + KERNEL_CHECK(stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + } else { + preimage_dense_populate_bitmasks_kernel< N, T, N2, T2 ><<< COMPUTE_GRID(total_pts), THREADS_PER_BLOCK, 0, stream>>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, target_space.entries_buffer, target_space.offsets, total_pts, + num_valid_rects, domain_transform.ptr_data.size(), targets.size(), nullptr, d_target_counters, nullptr); + KERNEL_CHECK(stream); + + std::vector h_target_counters(targets.size()+1); + h_target_counters[0] = 0; // prefix sum starts at 0 + CUDA_CHECK(cudaMemcpyAsync(h_target_counters.data()+1, d_target_counters, targets.size() * sizeof(uint32_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + for (size_t i = 0; i < targets.size(); ++i) { + h_target_counters[i+1] += h_target_counters[i]; } + + num_valid_points = h_target_counters[targets.size()]; + + if (num_valid_points == 0) { + num_completed += curr_tile; + subtract_const<<>>(inst_space.offsets, 
domain_transform.ptr_data.size()+1, curr_tile); + KERNEL_CHECK(stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + curr_tile = tile_size / 2; + continue; + } + + CUDA_CHECK(cudaMemcpyAsync(d_targets_prefix, h_target_counters.data(), (targets.size() + 1) * sizeof(uint32_t), cudaMemcpyHostToDevice, stream), stream); + + buffer_arena.flip_parity(); + d_points = buffer_arena.alloc>(num_valid_points); + + CUDA_CHECK(cudaMemsetAsync(d_target_counters, 0, (targets.size()) * sizeof(uint32_t), stream), stream); + + preimage_dense_populate_bitmasks_kernel< N, T, N2, T2 ><<< COMPUTE_GRID(total_pts), THREADS_PER_BLOCK, 0, stream>>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, target_space.entries_buffer, target_space.offsets, total_pts, + num_valid_rects, domain_transform.ptr_data.size(), targets.size(), d_targets_prefix, d_target_counters, d_points); + KERNEL_CHECK(stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); } - target_counters_instance.destroy(); - accessors_instance.destroy(); - targets_entries_instance.destroy(); - offsets_instance.destroy(); - prefix_rects_instance.destroy(); - out_instance.destroy(); - inst_counters_instance.destroy(); - return; - } - CUDA_CHECK(cudaMemcpyAsync(d_targets_prefix, h_target_counters.data(), (targets.size() + 1) * sizeof(uint32_t), cudaMemcpyHostToDevice, stream), stream); + buffer_arena.flip_parity(); + buffer_arena.flip_parity(); + d_points = buffer_arena.alloc>(num_valid_points); + + size_t num_new_rects = num_output == 0 ? 
1 : 2; + assert(!buffer_arena.get_parity()); + RectDesc* d_new_rects; + + this->complete_pipeline(d_points, num_valid_points, d_new_rects, num_new_rects, buffer_arena, + /* the Container: */ sparsity_outputs, + /* getIndex: */ [&](auto const& elem){ + // elem is a SparsityMap from the vector + return size_t(&elem - sparsity_outputs.data()); + }, + /* getMap: */ [&](auto const& elem){ + // return the SparsityMap key itself + return elem; + }); + + if (host_fallback) { + this->split_output(d_new_rects, num_new_rects, h_instances, entry_counts, buffer_arena); + } - points_instance = this->realm_malloc(num_valid_points * sizeof(PointDesc), my_mem); - d_points = reinterpret_cast*>(AffineAccessor(points_instance, 0).base); + if (num_output==0 || host_fallback) { + num_output = num_new_rects; + num_completed += curr_tile; + output_start = d_new_rects; + subtract_const<<>>(inst_space.offsets, domain_transform.ptr_data.size()+1, curr_tile); + KERNEL_CHECK(stream); + curr_tile = tile_size / 2; + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + continue; + } - CUDA_CHECK(cudaMemsetAsync(d_target_counters, 0, (targets.size()) * sizeof(uint32_t), stream), stream); + RectDesc* d_old_rects = buffer_arena.alloc>(num_output); + assert(d_old_rects == d_new_rects + num_new_rects); + CUDA_CHECK(cudaMemcpyAsync(d_old_rects, output_start, num_output * sizeof(RectDesc), cudaMemcpyDeviceToDevice, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + size_t num_final_rects = 1; + + //Send it off for processing + this->complete_rect_pipeline(d_new_rects, num_output + num_new_rects, output_start, num_final_rects, buffer_arena, + /* the Container: */ sparsity_outputs, + /* getIndex: */ [&](auto const& elem){ + // elem is a SparsityMap from the vector + return size_t(&elem - sparsity_outputs.data()); + }, + /* getMap: */ [&](auto const& elem){ + // return the SparsityMap key itself + return elem; + }); + num_completed += curr_tile; + num_output = num_final_rects; + 
subtract_const<<>>(inst_space.offsets, domain_transform.ptr_data.size()+1, curr_tile); + KERNEL_CHECK(stream); + curr_tile = tile_size / 2; + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + } catch (arena_oom&) { + std::cout << "Caught arena_oom, reducing tile size from " << curr_tile << " to " << curr_tile / 2 << std::endl; + std::cout << buffer_arena.used() << " bytes used in arena." << std::endl; + curr_tile /= 2; + if (curr_tile == 0) { + host_fallback = true; + if (num_output > 0) { + this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); + } + curr_tile = tile_size / 2; + } + } + } + if (num_output == 0) { + for (SparsityMap it : sparsity_outputs) { + SparsityMapImpl *impl = SparsityMapImpl::lookup(it); + if (this->exclusive) { + impl->gpu_finalize(); + } else { + impl->contribute_nothing(); + } + } + return; + } - preimage_dense_populate_bitmasks_kernel< N, T, N2, T2 ><<< COMPUTE_GRID(total_pts), THREADS_PER_BLOCK, 0, stream>>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, target_space.entries_buffer, target_space.offsets, total_pts, - num_valid_rects, domain_transform.ptr_data.size(), targets.size(), d_targets_prefix, d_target_counters, d_points); - KERNEL_CHECK(stream); - CUDA_CHECK(cudaStreamSynchronize(stream), stream); + if (!host_fallback) { + try { + this->send_output(output_start, num_output, buffer_arena, sparsity_outputs, + /* getIndex: */ [&](auto const& elem){ + // elem is a SparsityMap from the vector + return size_t(&elem - sparsity_outputs.data()); + }, + /* getMap: */ [&](auto const& elem){ + // return the SparsityMap key itself + return elem; + }); + } catch (arena_oom&) { + this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); + host_fallback = true; + } } - target_counters_instance.destroy(); - accessors_instance.destroy(); - targets_entries_instance.destroy(); - offsets_instance.destroy(); - prefix_rects_instance.destroy(); - out_instance.destroy(); - 
inst_counters_instance.destroy(); - - size_t out_rects = 0; - RectDesc* trash; - this->complete_pipeline(d_points, num_valid_points, trash, out_rects, buffer_arena, - /* the Container: */ sparsity_outputs, - /* getIndex: */ [&](auto const& elem){ - // elem is a SparsityMap from the vector - return size_t(&elem - sparsity_outputs.data()); - }, - /* getMap: */ [&](auto const& elem){ - // return the SparsityMap key itself - return elem; - }); - - points_instance.destroy(); + if (host_fallback) { + for (size_t idx = 0; idx < sparsity_outputs.size(); ++idx) { + SparsityMapImpl *impl = SparsityMapImpl::lookup(sparsity_outputs[idx]); + if (this->exclusive) { + impl->set_contributor_count(1); + } + if (entry_counts[idx] > 0) { + Rect* h_rects = reinterpret_cast *>(AffineAccessor(h_instances[idx], 0).base); + span> h_rects_span(h_rects, entry_counts[idx]); + impl->contribute_dense_rect_list(h_rects_span, false); + h_instances[idx].destroy(); + } else { + impl->contribute_nothing(); + } + } + } } } \ No newline at end of file diff --git a/src/realm/deppart/sparsity_impl.cc b/src/realm/deppart/sparsity_impl.cc index c674a98b32..b4938edb3b 100644 --- a/src/realm/deppart/sparsity_impl.cc +++ b/src/realm/deppart/sparsity_impl.cc @@ -1144,6 +1144,19 @@ SparsityMapImpl::~SparsityMapImpl(void) contribute_raw_rects((rects.empty() ? 
0 : &rects[0]), rects.size(), 1, disjoint, 0); } + template + void + SparsityMapImpl::contribute_dense_rect_list(const span> &rects, + bool disjoint) + { + + HybridRectangleList h_rect_list; + for (size_t i = 0; i < rects.size(); ++i) { + h_rect_list.add_rect(rects[i]); + } + contribute_dense_rect_list(h_rect_list.convert_to_vector(), disjoint); + } + template void SparsityMapImpl::contribute_raw_rects(const Rect *rects, size_t count, size_t piece_count, bool disjoint, diff --git a/src/realm/deppart/sparsity_impl.h b/src/realm/deppart/sparsity_impl.h index 2618f4decc..f9656e65b6 100644 --- a/src/realm/deppart/sparsity_impl.h +++ b/src/realm/deppart/sparsity_impl.h @@ -127,6 +127,7 @@ namespace Realm { void contribute_nothing(void); void contribute_dense_rect_list(const std::vector> &rects, bool disjoint); + void contribute_dense_rect_list(const span> &rects, bool disjoint); void contribute_raw_rects(const Rect *rects, size_t count, size_t piece_count, bool disjoint, size_t total_count); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index e166888637..bc6123b299 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -277,6 +277,7 @@ add_integration_test(transpose "${REALM_TEST_DIR}/transpose.cc") set(proc_group_ARGS -ll:cpu 4) add_integration_test(proc_group "${REALM_TEST_DIR}/proc_group.cc") add_integration_test(deppart "${REALM_TEST_DIR}/deppart.cc") +add_integration_test(benchmark "${REALM_TEST_DIR}/benchmark.cc") set(scatter_ARGS -p1 2 -p2 2) add_integration_test(scatter "${REALM_TEST_DIR}/scatter.cc") set(proc_group_ARGS -ll:cpu 4) diff --git a/tests/benchmark.cc b/tests/benchmark.cc new file mode 100644 index 0000000000..b6847f5513 --- /dev/null +++ b/tests/benchmark.cc @@ -0,0 +1,5019 @@ +/* + * Copyright 2025 Stanford University, NVIDIA Corporation + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "realm.h" + +#include +#include +#include +#include +#include +#include + +#include + +#include "osdep.h" + +#include "philox.h" + +using namespace Realm; + +#define USE_IMAGE_DIFF + +Logger log_app("app"); + +// Task IDs, some IDs are reserved so start at first available number +enum +{ + TOP_LEVEL_TASK = Processor::TASK_ID_FIRST_AVAILABLE + 0, + INIT_CIRCUIT_DATA_TASK, + INIT_BASIC_DATA_TASK, + INIT_TILE_DATA_TASK, + INIT_RANGE_DATA_TASK, + INIT_RANGE2D_DATA_TASK, + INIT_PENNANT_DATA_TASK, + INIT_MINIAERO_DATA_TASK, +}; + +enum TransformType +{ + AFFINE = 0, + TRANSLATION = 1, +}; + +namespace std { + template + std::ostream &operator<<(std::ostream &os, const std::vector &v) + { + os << v.size() << "{"; + if(v.empty()) { + os << "}"; + } else { + os << " "; + typename std::vector::const_iterator it = v.begin(); + os << *it; + ++it; + while(it != v.end()) { + os << ", " << *it; + ++it; + } + os << " }"; + } + return os; + } +}; // namespace std + +// we're going to use alarm() as a watchdog to detect deadlocks +void sigalrm_handler(int sig) +{ + fprintf(stderr, "HELP! 
Alarm triggered - likely deadlock!\n"); + exit(1); +} + +template +void dump_sparse_index_space(const char *pfx, IndexSpace is) +{ + std::cout << pfx << ": " << is << "\n"; + if(!is.sparsity.exists()) + return; + SparsityMapPublicImpl *impl = is.sparsity.impl(); + span> entries = impl->get_entries(); + for(size_t i = 0; i < entries.size(); i++) { + SparsityMapEntry entry = entries[i]; + std::cout << " " << entry.bounds; + if(entry.bitmap) + std::cout << " bitmap(" << entry.bitmap << ")"; + if(entry.sparsity.exists()) + std::cout << " sparsity(" << entry.sparsity << ")"; + std::cout << "\n"; + } +} + +static int check_empty(Event e, const std::vector> &p, const char *pfx) +{ + int errors = 0; + e.wait(); + for(size_t i = 0; i < p.size(); i++) { + p[i].make_valid().wait(); + if(p[i].volume() > 0) { + log_app.error() << "HELP! " << pfx << "[" << i << "] space " << p[i] + << " isn't empty?"; + dump_sparse_index_space(pfx, p[i]); + errors++; + } + } + return errors; +} + +class TestInterface { +public: + virtual ~TestInterface(void) {} + + virtual void print_info(void) = 0; + + virtual Event initialize_data(const std::vector &memories, + const std::vector &procs) = 0; + + virtual Event perform_partitioning(void) = 0; + + virtual int perform_dynamic_checks(void) = 0; + + virtual int check_partitioning(void) = 0; +}; + +// generic configuration settings +namespace { + int random_seed = 12345; + bool random_colors = false; + bool wait_on_events = false; + bool show_graph = false; + bool skip_check = false; + TestInterface *testcfg = 0; +}; // namespace + +template +void split_evenly(T total, T pieces, std::vector &cuts) +{ + cuts.resize(pieces + 1); + for(T i = 0; i <= pieces; i++) + cuts[i] = ((long long)total * i) / pieces; +} + +template +int find_split(const std::vector &cuts, T v) +{ + // dumb linear search + assert(v >= cuts[0]); + for(size_t i = 1; i < cuts.size(); i++) + if(v < cuts[i]) + return i - 1; + assert(false); + return 0; +} + +/* + * Basic test - create a 
graph, partition it by + * node subgraph id and then check that the partitioning + * is correct + */ +class BasicTest : public TestInterface { +public: + // graph config parameters + int num_nodes = 1000; + int num_edges = 1000; + int num_pieces = 4; + std::string filename; + + BasicTest(int argc, const char *argv[]) + { + for(int i = 1; i < argc; i++) { + + if(!strcmp(argv[i], "-p")) { + num_pieces = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-n")) { + num_nodes = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-e")) { + num_edges = atoi(argv[++i]); + continue; + } + } + + + if (num_nodes <= 0 || num_pieces <= 0 || num_edges <= 0) { + log_app.error() << "Invalid config: nodes=" << num_nodes << " edges=" << num_edges << " pieces=" << num_pieces << "\n"; + exit(1); + } + } + + struct InitDataArgs { + int index; + RegionInstance ri_nodes; + RegionInstance ri_edges; + }; + + enum PRNGStreams + { + NODE_SUBGRAPH_STREAM, + }; + + // assign subgraph ids to nodes + void random_node_data(int idx, int &subgraph) + { + if(random_colors) + subgraph = + Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_pieces); + else + subgraph = idx * num_pieces / num_nodes; + } + + void random_edge_data(int idx, int& src, int& dst) + { + src = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_nodes); + dst = Philox_2x32<>::rand_int(random_seed, idx + 1, NODE_SUBGRAPH_STREAM, num_nodes); + } + + static void init_data_task_wrapper(const void *args, size_t arglen, + const void *userdata, size_t userlen, Processor p) + { + BasicTest *me = (BasicTest *)testcfg; + me->init_data_task(args, arglen, p); + } + + //Each piece has a task to initialize its data + void init_data_task(const void *args, size_t arglen, Processor p) + { + const InitDataArgs &i_args = *(const InitDataArgs *)args; + + log_app.info() << "init task #" << i_args.index << " (ri_nodes=" << i_args.ri_nodes + << ")"; + + i_args.ri_nodes.fetch_metadata(p).wait(); + 
i_args.ri_edges.fetch_metadata(p).wait(); + + IndexSpace<1> is_nodes = i_args.ri_nodes.get_indexspace<1>(); + IndexSpace<1> is_edges = i_args.ri_edges.get_indexspace<1>(); + + log_app.debug() << "N: " << is_nodes; + log_app.debug() << "E: " << is_edges; + + //For each node in the graph, mark it with a random (or deterministic) subgraph id + { + AffineAccessor a_piece_id(i_args.ri_nodes, 0 /* offset */); + + for(int i = is_nodes.bounds.lo; i <= is_nodes.bounds.hi; i++) { + int subgraph; + random_node_data(i, subgraph); + a_piece_id.write(i, subgraph); + } + + AffineAccessor,1> a_src(i_args.ri_edges, 0 /* offset */); + AffineAccessor,1> a_dst(i_args.ri_edges, sizeof(Point<1>)/* offset */); + + for(int i = is_edges.bounds.lo; i <= is_edges.bounds.hi; i++) { + int src, dst; + random_edge_data(i, src, dst); + a_src.write(i, Point<1>(src)); + a_dst.write(i, Point<1>(dst)); + } + } + + //Optionally print out the assigned subgraph ids + if(show_graph) { + AffineAccessor a_piece_id(i_args.ri_nodes, 0 /* offset */); + + for(int i = is_nodes.bounds.lo; i <= is_nodes.bounds.hi; i++) + log_app.info() << "piece_id[" << i << "] = " << a_piece_id.read(i) << "\n"; + + AffineAccessor,1> a_src(i_args.ri_edges, 0 /* offset */); + AffineAccessor,1> a_dst(i_args.ri_edges, sizeof(Point<1>)/* offset */); + + for(int i = is_edges.bounds.lo; i <= is_edges.bounds.hi; i++) + log_app.info() << "src, dst[" << i << "] = " << a_src.read(i) << ", " << a_dst.read(i) << "\n"; + } + } + + IndexSpace<1> is_nodes, is_edges; + std::vector ri_nodes, ri_edges; + std::vector, int> > piece_id_field_data; + std::vector, Point<1> > > src_node_field_data, dst_node_field_data; + + virtual void print_info(void) + { + printf("Realm dependent partitioning test - basic: %d nodes, %d edges, %d pieces\n", + (int)num_nodes, (int) num_edges, (int)num_pieces); + } + + virtual Event initialize_data(const std::vector &memories, + const std::vector &procs) + { + // now create index space for nodes + is_nodes = Rect<1>(0, 
num_nodes - 1); + is_edges = Rect<1>(0, num_edges - 1); + + // equal partition is used to do initial population of edges and nodes + std::vector > ss_nodes_eq; + std::vector > ss_edges_eq; + + log_app.info() << "Creating equal subspaces\n"; + + is_nodes.create_equal_subspaces(num_pieces, 1, ss_nodes_eq, Realm::ProfilingRequestSet()).wait(); + is_edges.create_equal_subspaces(num_pieces, 1, ss_edges_eq, Realm::ProfilingRequestSet()).wait(); + + log_app.debug() << "Initial partitions:"; + for(size_t i = 0; i < ss_nodes_eq.size(); i++) + log_app.debug() << " Nodes #" << i << ": " << ss_nodes_eq[i]; + for(size_t i = 0; i < ss_edges_eq.size(); i++) + log_app.debug() << " Edges #" << i << ": " << ss_edges_eq[i]; + + // create instances for each of these subspaces + std::vector node_fields, edge_fields; + node_fields.push_back(sizeof(int)); // piece_id + assert(sizeof(int) == sizeof(Point<1>)); + edge_fields.push_back(sizeof(Point<1>)); // src_node + edge_fields.push_back(sizeof(Point<1>)); // dst_node + + ri_nodes.resize(num_pieces); + piece_id_field_data.resize(num_pieces); + + for(size_t i = 0; i < ss_nodes_eq.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, memories[i % memories.size()], ss_nodes_eq[i], + node_fields, 0 /*SOA*/, + Realm::ProfilingRequestSet()) + .wait(); + ri_nodes[i] = ri; + + piece_id_field_data[i].index_space = ss_nodes_eq[i]; + piece_id_field_data[i].inst = ri_nodes[i]; + piece_id_field_data[i].field_offset = 0; + } + + + // Fire off tasks to initialize data + ri_edges.resize(num_pieces); + src_node_field_data.resize(num_pieces); + dst_node_field_data.resize(num_pieces); + + for(size_t i = 0; i < ss_edges_eq.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, + memories[i % memories.size()], + ss_edges_eq[i], + edge_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + ri_edges[i] = ri; + + src_node_field_data[i].index_space = ss_edges_eq[i]; + src_node_field_data[i].inst = ri_edges[i]; + 
src_node_field_data[i].field_offset = 0 * sizeof(Point<1>); + + dst_node_field_data[i].index_space = ss_edges_eq[i]; + dst_node_field_data[i].inst = ri_edges[i]; + dst_node_field_data[i].field_offset = 1 * sizeof(Point<1>); + } + + // fire off tasks to initialize data + std::set events; + for(int i = 0; i < num_pieces; i++) { + Processor p = procs[i % procs.size()]; + InitDataArgs args; + args.index = i; + args.ri_nodes = ri_nodes[i]; + args.ri_edges = ri_edges[i]; + Event e = p.spawn(INIT_BASIC_DATA_TASK, &args, sizeof(args)); + events.insert(e); + } + + return Event::merge_events(events); + } + + // the outputs of our partitioning will be: + // p_nodes - nodes partitioned by subgraph id (from GPU) + // p_nodes_cpu - nodes partitioned by subgraph id (from CPU) + + + std::vector > p_nodes, p_rd; + std::vector > p_edges, p_preimage_edges; + + std::vector > p_nodes_cpu, p_rd_cpu; + std::vector > p_edges_cpu, p_preimage_edges_cpu; + + virtual Event perform_partitioning(void) + { + // Partition nodes by subgraph id - do this twice, once on CPU and once on GPU + // Ensure that the results are identical + + std::vector colors(num_pieces); + for(int i = 0; i < num_pieces; i++) + colors[i] = i; + + // We need a GPU memory for GPU partitioning + Memory gpu_memory; + bool found_gpu_memory = false; + Machine machine = Machine::get_machine(); + std::set all_memories; + machine.get_all_memories(all_memories); + for(Memory memory : all_memories) { + if(memory.kind() == Memory::GPU_FB_MEM) { + gpu_memory = memory; + found_gpu_memory = true; + break; + } + } + if (!found_gpu_memory) { + log_app.error() << "No GPU memory found for partitioning test\n"; + return Event::NO_EVENT; + } + std::vector edge_fields; + edge_fields.push_back(sizeof(Point<1>)); + edge_fields.push_back(sizeof(Point<1>)) ; + std::vector node_fields; + node_fields.push_back(sizeof(int)); + + std::vector, Point<1> > > src_field_data_gpu; + std::vector, Point<1> > > dst_field_data_gpu; + std::vector, int> > 
piece_field_data_gpu; + piece_field_data_gpu.resize(num_pieces); + src_field_data_gpu.resize(num_pieces); + dst_field_data_gpu.resize(num_pieces); + for (int i = 0; i < num_pieces; i++) { + RegionInstance src_gpu_instance; + RegionInstance dst_gpu_instance; + RegionInstance piece_gpu_instance; + RegionInstance::create_instance(src_gpu_instance, + gpu_memory, + src_node_field_data[i].index_space, + edge_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + RegionInstance::create_instance(dst_gpu_instance, + gpu_memory, + dst_node_field_data[i].index_space, + edge_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + RegionInstance::create_instance(piece_gpu_instance, + gpu_memory, + piece_id_field_data[i].index_space, + node_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + CopySrcDstField src_gpu_field, src_cpu_field, dst_gpu_field, dst_cpu_field, piece_gpu_field, piece_cpu_field; + src_gpu_field.inst = src_gpu_instance; + src_gpu_field.size = sizeof(Point<1>); + src_gpu_field.field_id = 0; + src_cpu_field.inst = src_node_field_data[i].inst; + src_cpu_field.size = sizeof(Point<1>); + src_cpu_field.field_id = 0; + dst_gpu_field.inst = dst_gpu_instance; + dst_gpu_field.size = sizeof(Point<1>); + dst_gpu_field.field_id = sizeof(Point<1>); + dst_cpu_field.inst = dst_node_field_data[i].inst; + dst_cpu_field.size = sizeof(Point<1>); + dst_cpu_field.field_id = sizeof(Point<1>); + piece_gpu_field.inst = piece_gpu_instance; + piece_gpu_field.size = sizeof(int); + piece_gpu_field.field_id = 0; + piece_cpu_field.inst = piece_id_field_data[i].inst; + piece_cpu_field.size = sizeof(int); + piece_cpu_field.field_id = 0; + std::vector src_cpu_data, src_gpu_data, dst_cpu_data, dst_gpu_data, piece_cpu_data, piece_gpu_data; + src_cpu_data.push_back(src_cpu_field); + dst_cpu_data.push_back(dst_cpu_field); + src_gpu_data.push_back(src_gpu_field); + dst_gpu_data.push_back(dst_gpu_field); + piece_gpu_data.push_back(piece_gpu_field); + 
piece_cpu_data.push_back(piece_cpu_field); + Event copy_event = src_node_field_data[i].index_space.copy(src_cpu_data, src_gpu_data, Realm::ProfilingRequestSet()); + copy_event.wait(); + Event second_copy_event = dst_node_field_data[i].index_space.copy(dst_cpu_data, dst_gpu_data, Realm::ProfilingRequestSet()); + second_copy_event.wait(); + Event third_copy_event = piece_id_field_data[i].index_space.copy(piece_cpu_data, piece_gpu_data, Realm::ProfilingRequestSet()); + third_copy_event.wait(); + src_field_data_gpu[i].inst = src_gpu_instance; + src_field_data_gpu[i].index_space = src_node_field_data[i].index_space; + src_field_data_gpu[i].field_offset = 0; + dst_field_data_gpu[i].inst = dst_gpu_instance; + dst_field_data_gpu[i].index_space = dst_node_field_data[i].index_space; + dst_field_data_gpu[i].field_offset = 1 * sizeof(Point<1>); + piece_field_data_gpu[i].inst = piece_gpu_instance; + piece_field_data_gpu[i].index_space = piece_id_field_data[i].index_space; + piece_field_data_gpu[i].field_offset = 0; + } + wait_on_events = true; + log_app.info() << "warming up" << Clock::current_time_in_microseconds() << "\n"; + const char* val = std::getenv("TILE_SIZE"); // or any env var + size_t tile_size = 10000000; //default + if (val) { + tile_size = atoi(val); + } + std::vector byte_fields = {sizeof(char)}; + IndexSpace<1> instance_index_space(Rect<1>(0, tile_size-1)); + IndexSpace<1> dst_index_space(Rect<1>(0, tile_size/100-1)); + for (size_t i = 0; i < piece_field_data_gpu.size(); i++) { + RegionInstance::create_instance(piece_field_data_gpu[i].scratch_buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); + } + for (size_t i = 0; i < src_field_data_gpu.size(); i++) { + RegionInstance::create_instance(src_field_data_gpu[i].scratch_buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); + } + for (size_t i = 0; i < dst_field_data_gpu.size(); i++) { + 
RegionInstance::create_instance(dst_field_data_gpu[i].scratch_buffer, gpu_memory, dst_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); + } + std::vector > p_garbage_nodes, p_garbage_edges, p_garbage_rd, p_garbage_preimage_edges; + Event e01 = is_nodes.create_subspaces_by_field(piece_field_data_gpu, + colors, + p_garbage_nodes, + Realm::ProfilingRequestSet()); + if (wait_on_events) e01.wait(); + Event e02 = is_edges.create_subspaces_by_preimage(dst_field_data_gpu, + p_garbage_nodes, + p_garbage_edges, + Realm::ProfilingRequestSet(), + e01); + if(wait_on_events) e02.wait(); + + // an image of p_edges through out_node gives us all the shared nodes, along + // with some private nodes + Event e03 = is_nodes.create_subspaces_by_image(src_field_data_gpu, + p_garbage_edges, + p_garbage_rd, + Realm::ProfilingRequestSet(), + e02); + if(wait_on_events) e03.wait(); + + Event e04 = is_edges.create_subspaces_by_preimage(dst_field_data_gpu, + p_garbage_rd, + p_garbage_preimage_edges, + Realm::ProfilingRequestSet(), + e03); + e04.wait(); + log_app.info() << "warming up complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting GPU Partitioning " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting GPU By Field " << Clock::current_time_in_microseconds() << "\n"; + Event e1 = is_nodes.create_subspaces_by_field(piece_field_data_gpu, + colors, + p_nodes, + Realm::ProfilingRequestSet()); + if(wait_on_events) e1.wait(); + log_app.info() << "GPU By Field complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting GPU Preimage " << Clock::current_time_in_microseconds() << "\n"; + // now compute p_edges based on the color of their in_node (i.e. 
a preimage) + Event e2 = is_edges.create_subspaces_by_preimage(dst_field_data_gpu, + p_nodes, + p_edges, + Realm::ProfilingRequestSet(), + e1); + if(wait_on_events) e2.wait(); + log_app.info() << "GPU Preimage complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting GPU Image " << Clock::current_time_in_microseconds() << "\n"; + + std::vector> spaces = {}; + std::vector requirements; + is_nodes.by_field_buffer_requirements(spaces, requirements); + // an image of p_edges through out_node gives us all the shared nodes, along + // with some private nodes + Event e3 = is_nodes.create_subspaces_by_image(src_field_data_gpu, + p_edges, + p_rd, + Realm::ProfilingRequestSet(), + e2); + if(wait_on_events) e3.wait(); + log_app.info() << "GPU Image complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting second GPU preimage " << Clock::current_time_in_microseconds() << "\n"; + + Event e4 = is_edges.create_subspaces_by_preimage(dst_field_data_gpu, + p_rd, + p_preimage_edges, + Realm::ProfilingRequestSet(), + e3); + e4.wait(); + log_app.info() << "Second GPU preimage complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "GPU Partitioning complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting CPU Partitioning " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting CPU By Field " << Clock::current_time_in_microseconds() << "\n"; + Event e5 = is_nodes.create_subspaces_by_field(piece_id_field_data, + colors, + p_nodes_cpu, + Realm::ProfilingRequestSet()); + if(wait_on_events) e5.wait(); + log_app.info() << "CPU By Field complete " << Clock::current_time_in_microseconds() << "\n"; + // now compute p_edges based on the color of their in_node (i.e. 
a preimage) + log_app.info() << "Starting CPU Preimage " << Clock::current_time_in_microseconds() << "\n"; + Event e6 = is_edges.create_subspaces_by_preimage(dst_node_field_data, + p_nodes_cpu, + p_edges_cpu, + Realm::ProfilingRequestSet(), + e5); + if(wait_on_events) e6.wait(); + log_app.info() << "CPU Preimage complete " << Clock::current_time_in_microseconds() << "\n"; + + // an image of p_edges through out_node gives us all the shared nodes, along + // with some private nodes + log_app.info() << "Starting CPU Image " << Clock::current_time_in_microseconds() << "\n"; + Event e7 = is_nodes.create_subspaces_by_image(src_node_field_data, + p_edges_cpu, + p_rd_cpu, + Realm::ProfilingRequestSet(), + e6); + if(wait_on_events) e7.wait(); + log_app.info() << "CPU Image complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting second CPU preimage " << Clock::current_time_in_microseconds() << "\n"; + + Event e8 = is_edges.create_subspaces_by_preimage(dst_node_field_data, + p_rd_cpu, + p_preimage_edges_cpu, + Realm::ProfilingRequestSet(), + e7); + e8.wait(); + log_app.info() << "Second CPU preimage complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "CPU Partitioning complete " << Clock::current_time_in_microseconds() << "\n"; + return e8; + } + + virtual int perform_dynamic_checks(void) + { + // Nothing to do here + return 0; + } + + virtual int check_partitioning(void) + { + int errors = 0; + + if (!p_nodes.size()) { + return 0; + } + + log_app.info() << "Checking correctness of partitioning " << "\n"; + + for(int i = 0; i < num_pieces; i++) { + for(IndexSpaceIterator<1> it(p_nodes[i]); it.valid; it.step()) { + for(PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_nodes_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! 
GPU has extra byfield point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for(IndexSpaceIterator<1> it(p_nodes_cpu[i]); it.valid; it.step()) { + for(PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_nodes[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing byfield point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_edges[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_edges_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU has extra preimage edge " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_edges_cpu[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_edges[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing preimage edge " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_rd[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_rd_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU has extra image node " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_rd_cpu[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_rd[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing image node " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_preimage_edges[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_preimage_edges_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! 
GPU has extra second preimage edge " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_preimage_edges_cpu[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_preimage_edges[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing second preimage edge " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + + } + return errors; + } +}; + +class TileTest : public TestInterface { +public: + // graph config parameters + int num_nodes = 1000; + int num_edges = 1000; + int num_pieces = 4; + int num_tiles = 1; + std::string filename; + + TileTest(int argc, const char *argv[]) + { + for(int i = 1; i < argc; i++) { + + if(!strcmp(argv[i], "-p")) { + num_pieces = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-n")) { + num_nodes = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-e")) { + num_edges = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-t")) { + num_tiles = atoi(argv[++i]); + continue; + } + } + + + if (num_nodes <= 0 || num_pieces <= 0 || num_edges <= 0) { + log_app.error() << "Invalid config: nodes=" << num_nodes << " edges=" << num_edges << " pieces=" << num_pieces << "\n"; + exit(1); + } + } + + struct InitDataArgs { + int index; + RegionInstance ri_nodes; + RegionInstance ri_edges; + }; + + enum PRNGStreams + { + NODE_SUBGRAPH_STREAM, + }; + + // assign subgraph ids to nodes + void random_node_data(int idx, int &subgraph) + { + if(random_colors) + subgraph = + Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_pieces); + else + subgraph = idx * num_pieces / num_nodes; + } + + void random_edge_data(int idx, int& src, int& dst) + { + src = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_nodes); + dst = Philox_2x32<>::rand_int(random_seed, idx + 1, NODE_SUBGRAPH_STREAM, num_nodes); + } + + static void init_data_task_wrapper(const void *args, size_t arglen, + const 
void *userdata, size_t userlen, Processor p) + { + TileTest *me = (TileTest *)testcfg; + me->init_data_task(args, arglen, p); + } + + //Each piece has a task to initialize its data + void init_data_task(const void *args, size_t arglen, Processor p) + { + const InitDataArgs &i_args = *(const InitDataArgs *)args; + + log_app.info() << "init task #" << i_args.index << " (ri_nodes=" << i_args.ri_nodes + << ")"; + + i_args.ri_nodes.fetch_metadata(p).wait(); + i_args.ri_edges.fetch_metadata(p).wait(); + + IndexSpace<1> is_nodes = i_args.ri_nodes.get_indexspace<1>(); + IndexSpace<1> is_edges = i_args.ri_edges.get_indexspace<1>(); + + log_app.debug() << "N: " << is_nodes; + log_app.debug() << "E: " << is_edges; + + //For each node in the graph, mark it with a random (or deterministic) subgraph id + { + AffineAccessor a_piece_id(i_args.ri_nodes, 0 /* offset */); + + for(int i = is_nodes.bounds.lo; i <= is_nodes.bounds.hi; i++) { + int subgraph; + random_node_data(i, subgraph); + a_piece_id.write(i, subgraph); + } + + AffineAccessor,1> a_src(i_args.ri_edges, 0 /* offset */); + AffineAccessor,1> a_dst(i_args.ri_edges, sizeof(Point<1>)/* offset */); + + for(int i = is_edges.bounds.lo; i <= is_edges.bounds.hi; i++) { + int src, dst; + random_edge_data(i, src, dst); + a_src.write(i, Point<1>(src)); + a_dst.write(i, Point<1>(dst)); + } + } + + //Optionally print out the assigned subgraph ids + if(show_graph) { + AffineAccessor a_piece_id(i_args.ri_nodes, 0 /* offset */); + + for(int i = is_nodes.bounds.lo; i <= is_nodes.bounds.hi; i++) + log_app.info() << "piece_id[" << i << "] = " << a_piece_id.read(i) << "\n"; + + AffineAccessor,1> a_src(i_args.ri_edges, 0 /* offset */); + AffineAccessor,1> a_dst(i_args.ri_edges, sizeof(Point<1>)/* offset */); + + for(int i = is_edges.bounds.lo; i <= is_edges.bounds.hi; i++) + log_app.info() << "src, dst[" << i << "] = " << a_src.read(i) << ", " << a_dst.read(i) << "\n"; + } + } + + IndexSpace<1> is_nodes, is_edges; + std::vector ri_nodes, 
ri_edges; + std::vector, int> > piece_id_field_data; + std::vector, Point<1> > > src_node_field_data, dst_node_field_data; + + virtual void print_info(void) + { + printf("Realm dependent partitioning test - tile: %d nodes, %d edges, %d pieces, %d tiles\n", + (int)num_nodes, (int) num_edges, (int)num_pieces, (int)num_tiles); + } + + virtual Event initialize_data(const std::vector &memories, + const std::vector &procs) + { + // now create index space for nodes + is_nodes = Rect<1>(0, num_nodes - 1); + is_edges = Rect<1>(0, num_edges - 1); + // equal partition is used to do initial population of edges and nodes + std::vector > ss_nodes_eq; + std::vector > ss_edges_eq; + + log_app.info() << "Creating equal subspaces\n"; + + is_nodes.create_equal_subspaces(num_pieces, 1, ss_nodes_eq, Realm::ProfilingRequestSet()).wait(); + is_edges.create_equal_subspaces(num_pieces, 1, ss_edges_eq, Realm::ProfilingRequestSet()).wait(); + + log_app.debug() << "Initial partitions:"; + for(size_t i = 0; i < ss_nodes_eq.size(); i++) + log_app.debug() << " Nodes #" << i << ": " << ss_nodes_eq[i]; + for(size_t i = 0; i < ss_edges_eq.size(); i++) + log_app.debug() << " Edges #" << i << ": " << ss_edges_eq[i]; + + // create instances for each of these subspaces + std::vector node_fields, edge_fields; + node_fields.push_back(sizeof(int)); // piece_id + assert(sizeof(int) == sizeof(Point<1>)); + edge_fields.push_back(sizeof(Point<1>)); // src_node + edge_fields.push_back(sizeof(Point<1>)); // dst_node + + ri_nodes.resize(num_pieces); + piece_id_field_data.resize(num_pieces); + + for(size_t i = 0; i < ss_nodes_eq.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, memories[i % memories.size()], ss_nodes_eq[i], + node_fields, 0 /*SOA*/, + Realm::ProfilingRequestSet()) + .wait(); + ri_nodes[i] = ri; + + piece_id_field_data[i].index_space = ss_nodes_eq[i]; + piece_id_field_data[i].inst = ri_nodes[i]; + piece_id_field_data[i].field_offset = 0; + } + + + // Fire off tasks to 
initialize data + ri_edges.resize(num_pieces); + src_node_field_data.resize(num_pieces); + dst_node_field_data.resize(num_pieces); + + for(size_t i = 0; i < ss_edges_eq.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, + memories[i % memories.size()], + ss_edges_eq[i], + edge_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + ri_edges[i] = ri; + + src_node_field_data[i].index_space = ss_edges_eq[i]; + src_node_field_data[i].inst = ri_edges[i]; + src_node_field_data[i].field_offset = 0 * sizeof(Point<1>); + + dst_node_field_data[i].index_space = ss_edges_eq[i]; + dst_node_field_data[i].inst = ri_edges[i]; + dst_node_field_data[i].field_offset = 1 * sizeof(Point<1>); + } + + // fire off tasks to initialize data + std::set events; + for(int i = 0; i < num_pieces; i++) { + Processor p = procs[i % procs.size()]; + InitDataArgs args; + args.index = i; + args.ri_nodes = ri_nodes[i]; + args.ri_edges = ri_edges[i]; + Event e = p.spawn(INIT_TILE_DATA_TASK, &args, sizeof(args)); + events.insert(e); + } + + return Event::merge_events(events); + } + + // the outputs of our partitioning will be: + // p_nodes - nodes partitioned by subgraph id (from GPU) + // p_nodes_cpu - nodes partitioned by subgraph id (from CPU) + + + std::vector > p_nodes, p_rd; + std::vector > p_edges, p_preimage_edges; + + std::vector > p_nodes_cpu, p_rd_cpu; + std::vector > p_edges_cpu, p_preimage_edges_cpu; + + virtual Event perform_partitioning(void) + { + // Partition nodes by subgraph id - do this twice, once on CPU and once on GPU + // Ensure that the results are identical + + std::vector colors(num_pieces); + for(int i = 0; i < num_pieces; i++) + colors[i] = i; + + // We need a GPU memory for GPU partitioning + Memory gpu_memory; + bool found_gpu_memory = false; + Machine machine = Machine::get_machine(); + std::set all_memories; + machine.get_all_memories(all_memories); + for(Memory memory : all_memories) { + if(memory.kind() == Memory::GPU_FB_MEM) { + gpu_memory 
= memory; + found_gpu_memory = true; + break; + } + } + if (!found_gpu_memory) { + log_app.error() << "No GPU memory found for partitioning test\n"; + return Event::NO_EVENT; + } + std::vector edge_fields; + edge_fields.push_back(sizeof(Point<1>)); + edge_fields.push_back(sizeof(Point<1>)) ; + std::vector node_fields; + node_fields.push_back(sizeof(int)); + + std::vector, Point<1> > > src_field_data_gpu; + std::vector, Point<1> > > dst_field_data_gpu; + std::vector, int> > piece_field_data_gpu; + piece_field_data_gpu.resize(num_pieces); + src_field_data_gpu.resize(num_pieces); + dst_field_data_gpu.resize(num_pieces); + for (int i = 0; i < num_pieces; i++) { + RegionInstance src_gpu_instance; + RegionInstance dst_gpu_instance; + RegionInstance piece_gpu_instance; + RegionInstance::create_instance(src_gpu_instance, + gpu_memory, + src_node_field_data[i].index_space, + edge_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + RegionInstance::create_instance(dst_gpu_instance, + gpu_memory, + dst_node_field_data[i].index_space, + edge_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + RegionInstance::create_instance(piece_gpu_instance, + gpu_memory, + piece_id_field_data[i].index_space, + node_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + CopySrcDstField src_gpu_field, src_cpu_field, dst_gpu_field, dst_cpu_field, piece_gpu_field, piece_cpu_field; + src_gpu_field.inst = src_gpu_instance; + src_gpu_field.size = sizeof(Point<1>); + src_gpu_field.field_id = 0; + src_cpu_field.inst = src_node_field_data[i].inst; + src_cpu_field.size = sizeof(Point<1>); + src_cpu_field.field_id = 0; + dst_gpu_field.inst = dst_gpu_instance; + dst_gpu_field.size = sizeof(Point<1>); + dst_gpu_field.field_id = sizeof(Point<1>); + dst_cpu_field.inst = dst_node_field_data[i].inst; + dst_cpu_field.size = sizeof(Point<1>); + dst_cpu_field.field_id = sizeof(Point<1>); + piece_gpu_field.inst = piece_gpu_instance; + piece_gpu_field.size = sizeof(int); + 
piece_gpu_field.field_id = 0; + piece_cpu_field.inst = piece_id_field_data[i].inst; + piece_cpu_field.size = sizeof(int); + piece_cpu_field.field_id = 0; + std::vector src_cpu_data, src_gpu_data, dst_cpu_data, dst_gpu_data, piece_cpu_data, piece_gpu_data; + src_cpu_data.push_back(src_cpu_field); + dst_cpu_data.push_back(dst_cpu_field); + src_gpu_data.push_back(src_gpu_field); + dst_gpu_data.push_back(dst_gpu_field); + piece_gpu_data.push_back(piece_gpu_field); + piece_cpu_data.push_back(piece_cpu_field); + Event copy_event = src_node_field_data[i].index_space.copy(src_cpu_data, src_gpu_data, Realm::ProfilingRequestSet()); + copy_event.wait(); + Event second_copy_event = dst_node_field_data[i].index_space.copy(dst_cpu_data, dst_gpu_data, Realm::ProfilingRequestSet()); + second_copy_event.wait(); + Event third_copy_event = piece_id_field_data[i].index_space.copy(piece_cpu_data, piece_gpu_data, Realm::ProfilingRequestSet()); + third_copy_event.wait(); + src_field_data_gpu[i].inst = src_gpu_instance; + src_field_data_gpu[i].index_space = src_node_field_data[i].index_space; + src_field_data_gpu[i].field_offset = 0; + dst_field_data_gpu[i].inst = dst_gpu_instance; + dst_field_data_gpu[i].index_space = dst_node_field_data[i].index_space; + dst_field_data_gpu[i].field_offset = 1 * sizeof(Point<1>); + piece_field_data_gpu[i].inst = piece_gpu_instance; + piece_field_data_gpu[i].index_space = piece_id_field_data[i].index_space; + piece_field_data_gpu[i].field_offset = 0; + } + wait_on_events = true; + log_app.info() << "warming up" << Clock::current_time_in_microseconds() << "\n"; + std::vector > p_garbage_nodes, p_garbage_edges, p_garbage_rd, p_garbage_preimage_edges; + Event e01 = is_nodes.create_subspaces_by_field(piece_field_data_gpu, + colors, + p_garbage_nodes, + Realm::ProfilingRequestSet()); + if (wait_on_events) e01.wait(); + Event e02 = is_edges.create_subspaces_by_preimage(dst_node_field_data, + p_garbage_nodes, + p_garbage_edges, + Realm::ProfilingRequestSet(), + 
e01); + if(wait_on_events) e02.wait(); + + // an image of p_edges through out_node gives us all the shared nodes, along + // with some private nodes + Event e03 = is_nodes.create_subspaces_by_image(src_field_data_gpu, + p_garbage_edges, + p_garbage_rd, + Realm::ProfilingRequestSet(), + e02); + if(wait_on_events) e03.wait(); + + Event e04 = is_edges.create_subspaces_by_preimage(dst_node_field_data, + p_garbage_rd, + p_garbage_preimage_edges, + Realm::ProfilingRequestSet(), + e03); + e04.wait(); + log_app.info() << "warming up complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting GPU Partitioning " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting GPU By Field " << Clock::current_time_in_microseconds() << "\n"; + Event e1 = is_nodes.create_subspaces_by_field(piece_field_data_gpu, + colors, + p_nodes, + Realm::ProfilingRequestSet()); + if(wait_on_events) e1.wait(); + log_app.info() << "GPU By Field complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting GPU Preimage " << Clock::current_time_in_microseconds() << "\n"; + // now compute p_edges based on the color of their in_node (i.e. 
a preimage) + Event e2 = is_edges.create_subspaces_by_preimage(dst_node_field_data, + p_nodes, + p_edges, + Realm::ProfilingRequestSet(), + e1); + if(wait_on_events) e2.wait(); + log_app.info() << "GPU Preimage complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting GPU Image " << Clock::current_time_in_microseconds() << "\n"; + + // an image of p_edges through out_node gives us all the shared nodes, along + // with some private nodes + Event e3 = is_nodes.create_subspaces_by_image(src_field_data_gpu, + p_edges, + p_rd, + Realm::ProfilingRequestSet(), + e2); + if(wait_on_events) e3.wait(); + log_app.info() << "GPU Image complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting second GPU preimage " << Clock::current_time_in_microseconds() << "\n"; + + Event e4 = is_edges.create_subspaces_by_preimage(dst_node_field_data, + p_rd, + p_preimage_edges, + Realm::ProfilingRequestSet(), + e3); + e4.wait(); + log_app.info() << "Second GPU preimage complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "GPU Partitioning complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting CPU Partitioning " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting CPU By Field " << Clock::current_time_in_microseconds() << "\n"; + Event e5 = is_nodes.create_subspaces_by_field(piece_id_field_data, + colors, + p_nodes_cpu, + Realm::ProfilingRequestSet()); + if(wait_on_events) e5.wait(); + log_app.info() << "CPU By Field complete " << Clock::current_time_in_microseconds() << "\n"; + // now compute p_edges based on the color of their in_node (i.e. 
a preimage) + log_app.info() << "Starting CPU Preimage " << Clock::current_time_in_microseconds() << "\n"; + Event e6 = is_edges.create_subspaces_by_preimage(dst_node_field_data, + p_nodes_cpu, + p_edges_cpu, + Realm::ProfilingRequestSet(), + e5); + if(wait_on_events) e6.wait(); + log_app.info() << "CPU Preimage complete " << Clock::current_time_in_microseconds() << "\n"; + + // an image of p_edges through out_node gives us all the shared nodes, along + // with some private nodes + log_app.info() << "Starting CPU Image " << Clock::current_time_in_microseconds() << "\n"; + Event e7 = is_nodes.create_subspaces_by_image(src_node_field_data, + p_edges_cpu, + p_rd_cpu, + Realm::ProfilingRequestSet(), + e6); + if(wait_on_events) e7.wait(); + log_app.info() << "CPU Image complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting second CPU preimage " << Clock::current_time_in_microseconds() << "\n"; + + Event e8 = is_edges.create_subspaces_by_preimage(dst_node_field_data, + p_rd_cpu, + p_preimage_edges_cpu, + Realm::ProfilingRequestSet(), + e7); + e8.wait(); + log_app.info() << "Second CPU preimage complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "CPU Partitioning complete " << Clock::current_time_in_microseconds() << "\n"; + return e8; + } + + virtual int perform_dynamic_checks(void) + { + // Nothing to do here + return 0; + } + + virtual int check_partitioning(void) + { + int errors = 0; + + if (!p_nodes.size()) { + return 0; + } + + log_app.info() << "Checking correctness of partitioning " << "\n"; + + for(int i = 0; i < num_pieces; i++) { + for(IndexSpaceIterator<1> it(p_nodes[i]); it.valid; it.step()) { + for(PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_nodes_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! 
GPU has extra byfield point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for(IndexSpaceIterator<1> it(p_nodes_cpu[i]); it.valid; it.step()) { + for(PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_nodes[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing byfield point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_edges[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_edges_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU has extra preimage edge " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_edges_cpu[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_edges[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing preimage edge " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_rd[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_rd_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU has extra image node " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_rd_cpu[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_rd[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing image node " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_preimage_edges[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_preimage_edges_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! 
GPU has extra second preimage edge " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_preimage_edges_cpu[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_preimage_edges[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing second preimage edge " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + + } + return errors; + } +}; + +class RangeTest : public TestInterface { +public: + // graph config parameters + int num_nodes = 1000; + int num_rects = 1000; + int max_rect_size = 10; + int num_pieces = 4; + std::string filename; + + RangeTest(int argc, const char *argv[]) + { + for(int i = 1; i < argc; i++) { + + if(!strcmp(argv[i], "-p")) { + num_pieces = atoi(argv[++i]); + continue; + } + + if(!strcmp(argv[i], "-n")) { + num_nodes = atoi(argv[++i]); + continue; + } + + if(!strcmp(argv[i], "-r")) { + num_rects = atoi(argv[++i]); + continue; + } + + if(!strcmp(argv[i], "-m")) { + max_rect_size = atoi(argv[++i]); + continue; + } + } + + + + if (num_nodes <= 0 || num_rects <= 0) { + log_app.error() << "Invalid graph dimensions in input file: rects=" << num_rects << " nodes=" << num_nodes; + exit(1); + } + + } + + + + struct InitDataArgs { + int index; + RegionInstance ri_nodes; + RegionInstance ri_rects; + }; + + enum PRNGStreams { + NODE_SUBGRAPH_STREAM, + }; + + void random_rect_data(int idx, int& subgraph) + { + if(random_colors) + subgraph = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_pieces); + else + subgraph = idx * num_pieces / num_rects; + } + + void random_node_data(int idx, int& subgraph) + { + if(true) + subgraph = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_pieces); + else + subgraph = idx * num_pieces / num_nodes; + } + + void initialize_rect_data(int idx, Rect<1> &rect, int max_rect_size = 10) + { + + int first = Philox_2x32<>::rand_int(random_seed, idx, 
NODE_SUBGRAPH_STREAM, num_nodes); + int amount = Philox_2x32<>::rand_int(random_seed, idx + 1, NODE_SUBGRAPH_STREAM, max_rect_size); + rect = Rect<1>(first, first + amount); + } + + + static void init_data_task_wrapper(const void *args, size_t arglen, + const void *userdata, size_t userlen, Processor p) + { + RangeTest *me = (RangeTest *)testcfg; + me->init_data_task(args, arglen, p); + } + + void init_data_task(const void *args, size_t arglen, Processor p) + { + const InitDataArgs& i_args = *(const InitDataArgs *)args; + + log_app.info() << "init task #" << i_args.index << " (ri_nodes=" << i_args.ri_nodes << ", ri_rects=" << i_args.ri_rects << ")"; + + i_args.ri_nodes.fetch_metadata(p).wait(); + i_args.ri_rects.fetch_metadata(p).wait(); + + IndexSpace<1> is_nodes = i_args.ri_nodes.get_indexspace<1>(); + IndexSpace<1> is_rects = i_args.ri_rects.get_indexspace<1>(); + + log_app.debug() << "N: " << is_nodes; + log_app.debug() << "E: " << is_rects; + + //Write out colors and rectangles + + { + AffineAccessor a_rect_id(i_args.ri_rects, 0 /* offset */); + + for(int i = is_rects.bounds.lo; i <= is_rects.bounds.hi; i++) { + int subgraph; + random_rect_data(i, subgraph); + a_rect_id.write(i, subgraph); + } + } + { + AffineAccessor a_piece_id(i_args.ri_nodes, 0 /* offset */); + + for(int i = is_nodes.bounds.lo; i <= is_nodes.bounds.hi; i++) { + int subgraph; + random_node_data(i, subgraph); + a_piece_id.write(i, subgraph); + } + } + + + { + + AffineAccessor, 1> a_rect_val(i_args.ri_rects, 1 * sizeof(int) /* offset */); + + for(int i = is_rects.bounds.lo; i <= is_rects.bounds.hi; i++) { + Rect<1> rect; + initialize_rect_data(i, rect, max_rect_size); + a_rect_val.write(i, rect); + } + } + + if(show_graph) { + AffineAccessor a_piece_id(i_args.ri_nodes, 0 /* offset */); + + for(int i = is_nodes.bounds.lo; i <= is_nodes.bounds.hi; i++) + log_app.info() << "node_id[" << i << "] = " << a_piece_id.read(i) << "\n"; + + AffineAccessor a_rect_id(i_args.ri_rects, 0 * sizeof(Point<1>) 
/* offset */); + + for(int i = is_rects.bounds.lo; i <= is_rects.bounds.hi; i++) + log_app.info() << "rect_id[" << i << "] = " << a_rect_id.read(i) << "\n"; + + AffineAccessor,1> a_rect_val(i_args.ri_rects, 1 * sizeof(int) /* offset */); + + for(int i = is_rects.bounds.lo; i <= is_rects.bounds.hi; i++) + log_app.info() << "rect_val[" << i << "] = " << a_rect_val.read(i) << "\n"; + } + } + + IndexSpace<1> is_nodes, is_rects; + std::vector ri_nodes; + std::vector, int> > node_id_field_data; + std::vector ri_rects; + std::vector, int> > rect_id_field_data; + std::vector, Rect<1> > > rect_val_field_data; + + virtual void print_info(void) + { + printf("Realm dependent partitioning test - ranges: %d nodes, %d rects, %d pieces\n", + (int)num_nodes, (int)num_rects, (int)num_pieces); + } + + virtual Event initialize_data(const std::vector& memories, + const std::vector& procs) + { + // now create index spaces for nodes and edges + is_nodes = Rect<1>(0, num_nodes - 1); + is_rects = Rect<1>(0, num_rects - 1); + + // equal partition is used to do initial population of edges and nodes + std::vector > ss_nodes_eq; + std::vector > ss_rects_eq; + + log_app.info() << "Creating equal subspaces" << "\n"; + + is_nodes.create_equal_subspaces(num_pieces, 1, ss_nodes_eq, Realm::ProfilingRequestSet()).wait(); + is_rects.create_equal_subspaces(num_pieces, 1, ss_rects_eq, Realm::ProfilingRequestSet()).wait(); + + log_app.debug() << "Initial partitions:"; + for(size_t i = 0; i < ss_nodes_eq.size(); i++) + log_app.debug() << " Nodes #" << i << ": " << ss_nodes_eq[i]; + for(size_t i = 0; i < ss_rects_eq.size(); i++) + log_app.debug() << " Rects #" << i << ": " << ss_rects_eq[i]; + + // create instances for each of these subspaces + std::vector node_fields, rect_fields; + node_fields.push_back(sizeof(int)); // piece_id + rect_fields.push_back(sizeof(int)); // src_node + rect_fields.push_back(sizeof(Rect<1>)); // dst_node + + ri_nodes.resize(num_pieces); + node_id_field_data.resize(num_pieces); 
+ + for(size_t i = 0; i < ss_nodes_eq.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, + memories[i % memories.size()], + ss_nodes_eq[i], + node_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + ri_nodes[i] = ri; + + node_id_field_data[i].index_space = ss_nodes_eq[i]; + node_id_field_data[i].inst = ri_nodes[i]; + node_id_field_data[i].field_offset = 0; + } + + ri_rects.resize(num_pieces); + rect_id_field_data.resize(num_pieces); + rect_val_field_data.resize(num_pieces); + + for(size_t i = 0; i < ss_rects_eq.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, + memories[i % memories.size()], + ss_rects_eq[i], + rect_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + ri_rects[i] = ri; + + rect_id_field_data[i].index_space = ss_rects_eq[i]; + rect_id_field_data[i].inst = ri_rects[i]; + rect_id_field_data[i].field_offset = 0; + + rect_val_field_data[i].index_space = ss_rects_eq[i]; + rect_val_field_data[i].inst = ri_rects[i]; + rect_val_field_data[i].field_offset = 1 * sizeof(int); + } + + // fire off tasks to initialize data + std::set events; + for(int i = 0; i < num_pieces; i++) { + Processor p = procs[i % procs.size()]; + InitDataArgs args; + args.index = i; + args.ri_nodes = ri_nodes[i]; + args.ri_rects = ri_rects[i]; + Event e = p.spawn(INIT_RANGE_DATA_TASK, &args, sizeof(args)); + events.insert(e); + } + + return Event::merge_events(events); + } + + // the outputs of our partitioning will be: + //p_colored_rects -> all of our rectangles marked with the color given by random_rect_data + //p_rects -> image range by p colored rects into nodes + + std::vector > p_colored_rects, p_rects; + std::vector > p_colored_rects_cpu, p_rects_cpu; + + virtual Event perform_partitioning(void) + { + + std::vector colors(num_pieces); + for(int i = 0; i < num_pieces; i++) + colors[i] = i; + + Memory gpu_memory; + bool found_gpu_memory = false; + Machine machine = Machine::get_machine(); + std::set all_memories; 
+ machine.get_all_memories(all_memories); + for(auto& memory : all_memories) { + if(memory.kind() == Memory::GPU_FB_MEM) { + gpu_memory = memory; + found_gpu_memory = true; + break; + } + } + assert(found_gpu_memory); + std::vector rect_fields; + rect_fields.push_back(sizeof(int)); + rect_fields.push_back(sizeof(Rect<1>)); + std::vector node_fields; + node_fields.push_back(sizeof(int)); + + std::vector, int > > node_id_data_gpu; + std::vector, int > > rect_id_data_gpu; + std::vector, Rect<1>>> rect_val_data_gpu; + node_id_data_gpu.resize(num_pieces); + rect_id_data_gpu.resize(num_pieces); + rect_val_data_gpu.resize(num_pieces); + for (int i = 0; i < num_pieces; i++) { + RegionInstance node_id_instance; + RegionInstance rect_id_instance; + RegionInstance rect_val_instance; + RegionInstance::create_instance(node_id_instance, + gpu_memory, + node_id_field_data[i].index_space, + node_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + RegionInstance::create_instance(rect_id_instance, + gpu_memory, + rect_id_field_data[i].index_space, + rect_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + RegionInstance::create_instance(rect_val_instance, + gpu_memory, + rect_val_field_data[i].index_space, + rect_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + CopySrcDstField node_id_gpu_field, node_id_cpu_field, rect_id_gpu_field, rect_id_cpu_field, rect_val_gpu_field, rect_val_cpu_field; + node_id_gpu_field.inst = node_id_instance; + node_id_gpu_field.size = sizeof(int); + node_id_gpu_field.field_id = 0; + node_id_cpu_field.inst = node_id_field_data[i].inst; + node_id_cpu_field.size = sizeof(int); + node_id_cpu_field.field_id = 0; + rect_id_gpu_field.inst = rect_id_instance; + rect_id_gpu_field.size = sizeof(int); + rect_id_gpu_field.field_id = 0; + rect_id_cpu_field.inst = rect_id_field_data[i].inst; + rect_id_cpu_field.size = sizeof(int); + rect_id_cpu_field.field_id = 0; + rect_val_gpu_field.inst = rect_val_instance; + 
rect_val_gpu_field.size = sizeof(Rect<1>); + rect_val_gpu_field.field_id = sizeof(int); + rect_val_cpu_field.inst = rect_val_field_data[i].inst; + rect_val_cpu_field.size = sizeof(Rect<1>); + rect_val_cpu_field.field_id = sizeof(int); + std::vector node_id_gpu_data, node_id_cpu_data, rect_id_gpu_data, rect_id_cpu_data, rect_val_gpu_data, rect_val_cpu_data; + node_id_gpu_data.push_back(node_id_gpu_field); + node_id_cpu_data.push_back(node_id_cpu_field); + rect_id_gpu_data.push_back(rect_id_gpu_field); + rect_id_cpu_data.push_back(rect_id_cpu_field); + rect_val_gpu_data.push_back(rect_val_gpu_field); + rect_val_cpu_data.push_back(rect_val_cpu_field); + Event copy_event = node_id_field_data[i].index_space.copy(node_id_cpu_data, node_id_gpu_data, Realm::ProfilingRequestSet()); + copy_event.wait(); + Event second_copy_event = rect_id_field_data[i].index_space.copy(rect_id_cpu_data, rect_id_gpu_data, Realm::ProfilingRequestSet()); + second_copy_event.wait(); + Event third_copy_event = rect_val_field_data[i].index_space.copy(rect_val_cpu_data, rect_val_gpu_data, Realm::ProfilingRequestSet()); + third_copy_event.wait(); + node_id_data_gpu[i].inst = node_id_instance; + node_id_data_gpu[i].index_space = node_id_field_data[i].index_space; + node_id_data_gpu[i].field_offset = 0; + rect_id_data_gpu[i].inst = rect_id_instance; + rect_id_data_gpu[i].index_space = rect_id_field_data[i].index_space; + rect_id_data_gpu[i].field_offset = 0; + rect_val_data_gpu[i].inst = rect_val_instance; + rect_val_data_gpu[i].index_space = rect_val_field_data[i].index_space; + rect_val_data_gpu[i].field_offset = sizeof(int); + } + wait_on_events = true; + std::vector> p_garbage_rects, p_garbage_colors; + log_app.info() << "WARMING UP " << "\n"; + + std::vector> field_estimate_input(rect_id_data_gpu.size()); + std::vector field_estimate_output(rect_id_data_gpu.size()); + std::vector> image_estimate_input(rect_val_data_gpu.size()); + std::vector image_estimate_output(rect_val_data_gpu.size()); + 
std::vector> subspace_input(colors.size()); + for (size_t i = 0; i < rect_id_data_gpu.size(); i++) { + field_estimate_input[i].location = rect_id_data_gpu[i].inst.get_location(); + field_estimate_input[i].space = rect_id_data_gpu[i].index_space; + } + for (size_t i = 0; i < rect_val_data_gpu.size(); i++) { + image_estimate_input[i].location = rect_val_data_gpu[i].inst.get_location(); + image_estimate_input[i].space = rect_val_data_gpu[i].index_space; + } + + is_rects.by_field_buffer_requirements(field_estimate_input, field_estimate_output); + std::vector byte_fields = {sizeof(char)}; + for (size_t i = 0; i < rect_id_data_gpu.size(); i++) { + IndexSpace<1> instance_index_space(Rect<1>(0, field_estimate_output[i].upper_bound-1)); + RegionInstance::create_instance(rect_id_data_gpu[i].scratch_buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); + } + + Event e001 = is_rects.create_subspaces_by_field(rect_id_data_gpu, + colors, + p_garbage_colors, + Realm::ProfilingRequestSet()); + if (wait_on_events) e001.wait(); + for (size_t i = 0; i < colors.size(); i++) { + subspace_input[i].space = p_garbage_colors[i]; + subspace_input[i].entries = p_garbage_colors[i].sparsity.impl()->get_entries().size(); + } + is_nodes.by_image_buffer_requirements(subspace_input, image_estimate_input, image_estimate_output); + for (size_t i = 0; i < rect_val_data_gpu.size(); i++) { + IndexSpace<1> instance_index_space(Rect<1>(0, (image_estimate_output[i].upper_bound)/12-1)); + RegionInstance::create_instance(rect_val_data_gpu[i].scratch_buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); + } + Event e002 = is_nodes.create_subspaces_by_image(rect_val_data_gpu, + p_garbage_colors, + p_garbage_rects, + Realm::ProfilingRequestSet(), + e001); + if(wait_on_events) e002.wait(); + + log_app.info() << "FINISHED WARMING UP " << "\n"; + log_app.info() << "starting GPU partitioning " << 
Clock::current_time_in_microseconds() << "\n"; + + log_app.info() << "STARTING GPU BY FIELD " << Clock::current_time_in_microseconds() << "\n"; + + Event e01 = is_rects.create_subspaces_by_field(rect_id_data_gpu, + colors, + p_colored_rects, + Realm::ProfilingRequestSet()); + if (wait_on_events) e01.wait(); + + log_app.info() << "FINISHED GPU BY FIELD " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "STARTING GPU BY IMAGE " << Clock::current_time_in_microseconds() << "\n"; + Event e02 = is_nodes.create_subspaces_by_image(rect_val_data_gpu, + p_colored_rects, + p_rects, + Realm::ProfilingRequestSet(), + e01); + if(wait_on_events) e02.wait(); + + log_app.info() << "FINISHED GPU BY IMAGE " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "STARTING CPU partitioning " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "STARTING CPU BY FIELD " << Clock::current_time_in_microseconds() << "\n"; + Event e1 = is_rects.create_subspaces_by_field(rect_id_field_data, + colors, + p_colored_rects_cpu, + Realm::ProfilingRequestSet()); + if (wait_on_events) e1.wait(); + log_app.info() << "FINISHED CPU BY FIELD " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "STARTING CPU BY IMAGE " << Clock::current_time_in_microseconds() << "\n"; + Event e2 = is_nodes.create_subspaces_by_image(rect_val_field_data, + p_colored_rects_cpu, + p_rects_cpu, + Realm::ProfilingRequestSet(), + e1); + if(wait_on_events) e2.wait(); + log_app.info() << "FINISHED CPU BY IMAGE " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "CPU Partitioning complete " << Clock::current_time_in_microseconds() << "\n"; + return e2; + } + + + + virtual int perform_dynamic_checks(void) + { + return 0; + } + + virtual int check_partitioning(void) + { + log_app.info() << "Checking correctness of partitioning " << "\n"; + int errors = 0; + + for (int i = 0; i < num_pieces; i++) { + for (IndexSpaceIterator<1> 
it(p_colored_rects[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_colored_rects_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU has extra colored rect point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_colored_rects_cpu[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if(!p_colored_rects[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing colored rect point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_rects[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_rects_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU has extra rect point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_rects_cpu[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if(!p_rects[i].contains(point.p)) { + log_app.error() << "Mismatch! 
GPU is missing rect point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + } + return errors; + } +}; + +class Range2DTest : public TestInterface { +public: + // graph config parameters + int num_nodes = 1000; + int num_rects = 1000; + int max_rect_size = 10; + int num_pieces = 4; + + Range2DTest(int argc, const char *argv[]) + { + for(int i = 1; i < argc; i++) { + + if(!strcmp(argv[i], "-p")) { + num_pieces = atoi(argv[++i]); + continue; + } + + if(!strcmp(argv[i], "-n")) { + num_nodes = atoi(argv[++i]); + continue; + } + + if (!strcmp(argv[i], "-r")) { + num_rects = atoi(argv[++i]); + continue; + } + + if (!strcmp(argv[i], "-m")) { + max_rect_size = atoi(argv[++i]); + continue; + } + } + + if (num_nodes <= 0 || num_rects <= 0) { + log_app.error() << "Invalid graph dimensions in input file: rects=" << num_rects << " nodes=" << num_nodes; + exit(1); + } + + } + + + + struct InitDataArgs { + int index; + RegionInstance ri_nodes; + RegionInstance ri_rects; + }; + + enum PRNGStreams { + NODE_SUBGRAPH_STREAM, + }; + + void random_rect_data(int idx, int& subgraph) + { + if(random_colors) + subgraph = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_pieces); + else + subgraph = idx * num_pieces / num_rects; + } + + void random_node_data(int idx, int& subgraph) + { + if(true) + subgraph = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_pieces); + else + subgraph = idx * num_pieces / num_nodes; + } + + void initialize_rect_data(int idx, Rect<2> &rect, int max_rect_size = 10) + { + + int x = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_nodes); + int y = Philox_2x32<>::rand_int(random_seed, idx + 1, NODE_SUBGRAPH_STREAM, num_nodes); + int length = Philox_2x32<>::rand_int(random_seed, idx + 2, NODE_SUBGRAPH_STREAM, max_rect_size); + int height = Philox_2x32<>::rand_int(random_seed, idx + 3, NODE_SUBGRAPH_STREAM, max_rect_size); + rect.lo[0] = x; + rect.hi[0] = x + length; + rect.lo[1] = y; + 
rect.hi[1] = y + height; + } + + + static void init_data_task_wrapper(const void *args, size_t arglen, + const void *userdata, size_t userlen, Processor p) + { + Range2DTest *me = (Range2DTest *)testcfg; + me->init_data_task(args, arglen, p); + } + + void init_data_task(const void *args, size_t arglen, Processor p) + { + const InitDataArgs& i_args = *(const InitDataArgs *)args; + + log_app.info() << "init task #" << i_args.index << " (ri_nodes=" << i_args.ri_nodes << ", ri_rects=" << i_args.ri_rects << ")"; + + i_args.ri_nodes.fetch_metadata(p).wait(); + i_args.ri_rects.fetch_metadata(p).wait(); + + IndexSpace<2> is_nodes = i_args.ri_nodes.get_indexspace<2>(); + IndexSpace<1> is_rects = i_args.ri_rects.get_indexspace<1>(); + + log_app.debug() << "N: " << is_nodes; + log_app.debug() << "E: " << is_rects; + + { + AffineAccessor a_piece_id(i_args.ri_rects, 0 /* offset */); + + for(int i = is_rects.bounds.lo; i <= is_rects.bounds.hi; i++) { + int subgraph; + random_rect_data(i, subgraph); + a_piece_id.write(i, subgraph); + } + } + { + AffineAccessor a_piece_id(i_args.ri_nodes, 0 /* offset */); + + for(int i = is_nodes.bounds.lo[0]; i <= is_nodes.bounds.hi[0]; i++) { + for (int j = is_nodes.bounds.lo[1]; j <= is_nodes.bounds.hi[1]; j++) { + int idx = i * (is_nodes.bounds.hi[1] - is_nodes.bounds.lo[1] + 1) + j; + int subgraph; + random_node_data(idx, subgraph); + a_piece_id.write(Point<2>(i, j), subgraph); + } + } + } + + + { + + AffineAccessor, 1> a_rect(i_args.ri_rects, 1 * sizeof(int) /* offset */); + + // Read edges line by line + for(int i = is_rects.bounds.lo; i <= is_rects.bounds.hi; i++) { + Rect<2> rect; + initialize_rect_data(i, rect, max_rect_size); + a_rect.write(i, rect); + } + } + + if(show_graph) { + AffineAccessor a_piece_id(i_args.ri_nodes, 0 /* offset */); + + for(int i = is_nodes.bounds.lo[0]; i <= is_nodes.bounds.hi[1]; i++) { + for (int j = is_nodes.bounds.lo[1]; j <= is_nodes.bounds.hi[1]; j++) { + Point<2> p(i, j); + log_app.info() << "node_id[" << 
p << "] = " << a_piece_id.read(p) << "\n"; + } + } + + AffineAccessor a_rect_id(i_args.ri_rects, 0 * sizeof(Point<1>) /* offset */); + + for(int i = is_rects.bounds.lo; i <= is_rects.bounds.hi; i++) + log_app.info() << "rect_id[" << i << "] = " << a_rect_id.read(i) << "\n"; + + AffineAccessor,1> a_rect_val(i_args.ri_rects, 1 * sizeof(int) /* offset */); + + for(int i = is_rects.bounds.lo; i <= is_rects.bounds.hi; i++) + log_app.info() << "rect_val[" << i << "] = " << a_rect_val.read(i) << "\n"; + } + } + + IndexSpace<1> is_rects; + IndexSpace<2> is_nodes; + std::vector ri_nodes; + std::vector, int> > node_id_field_data; + std::vector ri_rects; + std::vector, int> > rect_id_field_data; + std::vector, Rect<2> > > rect_val_field_data; + + virtual void print_info(void) + { + printf("Realm dependent partitioning test - 2D ranges: %d nodes, %d rects, %d pieces\n", + (int)num_nodes, (int)num_rects, (int)num_pieces); + } + + virtual Event initialize_data(const std::vector& memories, + const std::vector& procs) + { + // now create index spaces for nodes and edges + is_nodes = Rect<2>(Point<2>(0, 0), Point<2>(num_nodes - 1, num_nodes - 1)); + is_rects = Rect<1>(0, num_rects - 1); + + // equal partition is used to do initial population of edges and nodes + std::vector > ss_nodes_eq; + std::vector > ss_rects_eq; + + log_app.info() << "Creating equal subspaces" << "\n"; + + is_nodes.create_equal_subspaces(num_pieces, 1, ss_nodes_eq, Realm::ProfilingRequestSet()).wait(); + is_rects.create_equal_subspaces(num_pieces, 1, ss_rects_eq, Realm::ProfilingRequestSet()).wait(); + + log_app.debug() << "Initial partitions:\n"; + for(size_t i = 0; i < ss_nodes_eq.size(); i++) + log_app.debug() << " Nodes #" << i << ": " << ss_nodes_eq[i]; + for(size_t i = 0; i < ss_rects_eq.size(); i++) + log_app.debug() << " Rects #" << i << ": " << ss_rects_eq[i]; + + // create instances for each of these subspaces + std::vector node_fields, rect_fields; + node_fields.push_back(sizeof(int)); // piece_id + 
rect_fields.push_back(sizeof(int)); // src_node + rect_fields.push_back(sizeof(Rect<2>)); // dst_node + + ri_nodes.resize(num_pieces); + node_id_field_data.resize(num_pieces); + + for(size_t i = 0; i < ss_nodes_eq.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, + memories[i % memories.size()], + ss_nodes_eq[i], + node_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + ri_nodes[i] = ri; + + node_id_field_data[i].index_space = ss_nodes_eq[i]; + node_id_field_data[i].inst = ri_nodes[i]; + node_id_field_data[i].field_offset = 0; + } + + ri_rects.resize(num_pieces); + rect_id_field_data.resize(num_pieces); + rect_val_field_data.resize(num_pieces); + + for(size_t i = 0; i < ss_rects_eq.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, + memories[i % memories.size()], + ss_rects_eq[i], + rect_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + ri_rects[i] = ri; + + rect_id_field_data[i].index_space = ss_rects_eq[i]; + rect_id_field_data[i].inst = ri_rects[i]; + rect_id_field_data[i].field_offset = 0; + + rect_val_field_data[i].index_space = ss_rects_eq[i]; + rect_val_field_data[i].inst = ri_rects[i]; + rect_val_field_data[i].field_offset = 1 * sizeof(int); + } + + // fire off tasks to initialize data + std::set events; + for(int i = 0; i < num_pieces; i++) { + Processor p = procs[i % procs.size()]; + InitDataArgs args; + args.index = i; + args.ri_nodes = ri_nodes[i]; + args.ri_rects = ri_rects[i]; + Event e = p.spawn(INIT_RANGE2D_DATA_TASK, &args, sizeof(args)); + events.insert(e); + } + + return Event::merge_events(events); + } + + // the outputs of our partitioning will be: + // is_private, is_shared - subsets of is_nodes based on private/shared + // p_rd, p_wr, p_ghost - subsets of the above split by subckt + // p_edges - subsets of is_edges for each subckt + + std::vector > p_colored_rects; + std::vector> p_rects, p_intersect, p_diff; + std::vector> p_colored_rects_cpu; + std::vector> 
p_rects_cpu, p_intersect_cpu, p_diff_cpu; + + IndexSpace<2> cpu_union, gpu_union, garbage_union; + + virtual Event perform_partitioning(void) + { + // first partition nodes by subckt id (this is the independent partition, + // but not actually used by the app) + + std::vector colors(num_pieces); + for(int i = 0; i < num_pieces; i++) + colors[i] = i; + + Memory gpu_memory; + bool found_gpu_memory = false; + Machine machine = Machine::get_machine(); + std::set all_memories; + machine.get_all_memories(all_memories); + for(auto& memory : all_memories) { + if(memory.kind() == Memory::GPU_FB_MEM) { + gpu_memory = memory; + found_gpu_memory = true; + break; + } + } + assert(found_gpu_memory); + std::vector rect_fields; + rect_fields.push_back(sizeof(int)); + rect_fields.push_back(sizeof(Rect<2>)); + std::vector node_fields; + node_fields.push_back(sizeof(int)); + + std::vector, int > > node_id_data_gpu; + std::vector, int > > rect_id_data_gpu; + std::vector, Rect<2>>> rect_val_data_gpu; + node_id_data_gpu.resize(num_pieces); + rect_id_data_gpu.resize(num_pieces); + rect_val_data_gpu.resize(num_pieces); + for (int i = 0; i < num_pieces; i++) { + RegionInstance node_id_instance; + RegionInstance rect_id_instance; + RegionInstance rect_val_instance; + RegionInstance::create_instance(node_id_instance, + gpu_memory, + node_id_field_data[i].index_space, + node_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + RegionInstance::create_instance(rect_id_instance, + gpu_memory, + rect_id_field_data[i].index_space, + rect_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + RegionInstance::create_instance(rect_val_instance, + gpu_memory, + rect_val_field_data[i].index_space, + rect_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + CopySrcDstField node_id_gpu_field, node_id_cpu_field, rect_id_gpu_field, rect_id_cpu_field, rect_val_gpu_field, rect_val_cpu_field; + node_id_gpu_field.inst = node_id_instance; + node_id_gpu_field.size = sizeof(int); + 
node_id_gpu_field.field_id = 0; + node_id_cpu_field.inst = node_id_field_data[i].inst; + node_id_cpu_field.size = sizeof(int); + node_id_cpu_field.field_id = 0; + rect_id_gpu_field.inst = rect_id_instance; + rect_id_gpu_field.size = sizeof(int); + rect_id_gpu_field.field_id = 0; + rect_id_cpu_field.inst = rect_id_field_data[i].inst; + rect_id_cpu_field.size = sizeof(int); + rect_id_cpu_field.field_id = 0; + rect_val_gpu_field.inst = rect_val_instance; + rect_val_gpu_field.size = sizeof(Rect<2>); + rect_val_gpu_field.field_id = sizeof(int); + rect_val_cpu_field.inst = rect_val_field_data[i].inst; + rect_val_cpu_field.size = sizeof(Rect<2>); + rect_val_cpu_field.field_id = sizeof(int); + std::vector node_id_gpu_data, node_id_cpu_data, rect_id_gpu_data, rect_id_cpu_data, rect_val_gpu_data, rect_val_cpu_data; + node_id_gpu_data.push_back(node_id_gpu_field); + node_id_cpu_data.push_back(node_id_cpu_field); + rect_id_gpu_data.push_back(rect_id_gpu_field); + rect_id_cpu_data.push_back(rect_id_cpu_field); + rect_val_gpu_data.push_back(rect_val_gpu_field); + rect_val_cpu_data.push_back(rect_val_cpu_field); + Event copy_event = node_id_field_data[i].index_space.copy(node_id_cpu_data, node_id_gpu_data, Realm::ProfilingRequestSet()); + copy_event.wait(); + Event second_copy_event = rect_id_field_data[i].index_space.copy(rect_id_cpu_data, rect_id_gpu_data, Realm::ProfilingRequestSet()); + second_copy_event.wait(); + Event third_copy_event = rect_val_field_data[i].index_space.copy(rect_val_cpu_data, rect_val_gpu_data, Realm::ProfilingRequestSet()); + third_copy_event.wait(); + node_id_data_gpu[i].inst = node_id_instance; + node_id_data_gpu[i].index_space = node_id_field_data[i].index_space; + node_id_data_gpu[i].field_offset = 0; + rect_id_data_gpu[i].inst = rect_id_instance; + rect_id_data_gpu[i].index_space = rect_id_field_data[i].index_space; + rect_id_data_gpu[i].field_offset = 0; + rect_val_data_gpu[i].inst = rect_val_instance; + rect_val_data_gpu[i].index_space = 
rect_val_field_data[i].index_space; + rect_val_data_gpu[i].field_offset = sizeof(int); + } + wait_on_events = true; + std::vector> p_garbage_colors; + std::vector> p_garbage_rects; + log_app.info() << "WARMING UP " << "\n"; + + std::vector> field_estimate_input(rect_id_data_gpu.size()); + std::vector field_estimate_output(rect_id_data_gpu.size()); + std::vector> image_estimate_input(rect_val_data_gpu.size()); + std::vector image_estimate_output(rect_val_data_gpu.size()); + std::vector> subspace_input(colors.size()); + for (size_t i = 0; i < rect_id_data_gpu.size(); i++) { + field_estimate_input[i].location = rect_id_data_gpu[i].inst.get_location(); + field_estimate_input[i].space = rect_id_data_gpu[i].index_space; + } + for (size_t i = 0; i < rect_val_data_gpu.size(); i++) { + image_estimate_input[i].location = rect_val_data_gpu[i].inst.get_location(); + image_estimate_input[i].space = rect_val_data_gpu[i].index_space; + } + + is_rects.by_field_buffer_requirements(field_estimate_input, field_estimate_output); + std::vector byte_fields = {sizeof(char)}; + for (size_t i = 0; i < rect_id_data_gpu.size(); i++) { + IndexSpace<1> instance_index_space(Rect<1>(0, field_estimate_output[i].upper_bound-1)); + RegionInstance::create_instance(rect_id_data_gpu[i].scratch_buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); + } + + Event e001 = is_rects.create_subspaces_by_field(rect_id_data_gpu, + colors, + p_garbage_colors, + Realm::ProfilingRequestSet()); + if (wait_on_events) e001.wait(); + for (size_t i = 0; i < colors.size(); i++) { + subspace_input[i].space = p_garbage_colors[i]; + subspace_input[i].entries = p_garbage_colors[i].sparsity.impl()->get_entries().size(); + } + is_nodes.by_image_buffer_requirements(subspace_input, image_estimate_input, image_estimate_output); + for (size_t i = 0; i < rect_val_data_gpu.size(); i++) { + IndexSpace<1> instance_index_space(Rect<1>(0, (image_estimate_output[i].upper_bound*5)-1)); + 
RegionInstance::create_instance(rect_val_data_gpu[i].scratch_buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); + } + Event e002 = is_nodes.create_subspaces_by_image(rect_val_data_gpu, + p_garbage_colors, + p_garbage_rects, + Realm::ProfilingRequestSet(), + e001); + if(wait_on_events) e002.wait(); + + log_app.info() << "FINISHED WARMING UP " << "\n"; + log_app.info() << "starting GPU partitioning " << Clock::current_time_in_microseconds() << "\n"; + + log_app.info() << "STARTING GPU BY FIELD " << Clock::current_time_in_microseconds() << "\n"; + + Event e01 = is_rects.create_subspaces_by_field(rect_id_data_gpu, + colors, + p_colored_rects, + Realm::ProfilingRequestSet()); + if (wait_on_events) e01.wait(); + + log_app.info() << "FINISHED GPU BY FIELD " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "STARTING GPU BY IMAGE " << Clock::current_time_in_microseconds() << "\n"; + Event e02 = is_nodes.create_subspaces_by_image(rect_val_data_gpu, + p_colored_rects, + p_rects, + Realm::ProfilingRequestSet(), + e01); + if(wait_on_events) e02.wait(); + log_app.info() << "FINISHED GPU BY IMAGE " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "GPU Partitioning complete " << Clock::current_time_in_microseconds() << "\n"; + + log_app.info() << "STARTING CPU partitioning " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "STARTING CPU BY FIELD " << Clock::current_time_in_microseconds() << "\n"; + Event e1 = is_rects.create_subspaces_by_field(rect_id_field_data, + colors, + p_colored_rects_cpu, + Realm::ProfilingRequestSet()); + if (wait_on_events) e1.wait(); + log_app.info() << "FINISHED CPU BY FIELD " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "STARTING CPU BY IMAGE " << Clock::current_time_in_microseconds() << "\n"; + Event e2 = is_nodes.create_subspaces_by_image(rect_val_field_data, + p_colored_rects_cpu, + p_rects_cpu, + 
Realm::ProfilingRequestSet(), + e1); + if(wait_on_events) e2.wait(); + log_app.info() << "FINISHED CPU BY IMAGE " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "CPU Partitioning complete " << Clock::current_time_in_microseconds() << "\n"; + return e2; + } + + + + virtual int perform_dynamic_checks(void) + { + return 0; + } + + virtual int check_partitioning(void) + { + log_app.info() << "Checking correctness of partitioning " << "\n"; + int errors = 0; + + for (int i = 0; i < num_pieces; i++) { + for (IndexSpaceIterator<1> it(p_colored_rects[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if(!p_colored_rects_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU has extra colored rect point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_colored_rects_cpu[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_colored_rects[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing colored rect point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<2> it(p_rects[i]); it.valid; it.step()) { + for (PointInRectIterator<2> point(it.rect); point.valid; point.step()) { + if (!p_rects_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU has extra rect point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<2> it(p_rects_cpu[i]); it.valid; it.step()) { + for (PointInRectIterator<2> point(it.rect); point.valid; point.step()) { + if (!p_rects[i].contains(point.p)) { + log_app.error() << "Mismatch! 
GPU is missing rect point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + } + return errors; + } +}; + +class MiniAeroTest : public TestInterface { +public: + enum ProblemType + { + PTYPE_0, + PTYPE_1, + PTYPE_2, + }; + enum FaceType + { + BC_INTERIOR = 0, + BC_TANGENT = 1, + BC_EXTRAPOLATE = 2, + BC_INFLOW = 3, + BC_NOSLIP = 4, + BC_BLOCK_BORDER = 5, + BC_TOTAL = 6, + }; + + ProblemType problem_type = PTYPE_0; + int global_x = 4; + int global_y = 4; + int global_z = 4; + int blocks_x = 2; + int blocks_y = 2; + int blocks_z = 2; + + int n_cells; // total cell count + int n_blocks; // total block count + int n_faces; // total face count + std::vector xsplit, ysplit, zsplit; // cut planes + std::vector cells_per_block, faces_per_block; + + // can't do 64-bit index types right now, so at least get most of our 32-bit space + typedef int INDEXTYPE; + static const INDEXTYPE FIRST_INDEX = -2000000000; // easier to read than INT_MIN+1 + + MiniAeroTest(int argc, const char *argv[]) + { +#define INT_ARG(s, v) \ + if(!strcmp(argv[i], s)) { \ + v = atoi(argv[++i]); \ + continue; \ + } + for(int i = 1; i < argc; i++) { + if(!strcmp(argv[i], "-type")) { + problem_type = (ProblemType)atoi(argv[++i]); + continue; + } + INT_ARG("-gx", global_x); + INT_ARG("-gy", global_y); + INT_ARG("-gz", global_z); + INT_ARG("-bx", blocks_x); + INT_ARG("-by", blocks_y); + INT_ARG("-bz", blocks_z); + if(!strcmp(argv[i], "-g")) { + int v = atoi(argv[++i]); + global_x = global_y = global_z = v; + continue; + } + if(!strcmp(argv[i], "-b")) { + int v = atoi(argv[++i]); + blocks_x = blocks_y = blocks_z = v; + continue; + } + } +#undef INT_ARG + + // don't allow degenerate blocks + assert(global_x >= blocks_x); + assert(global_y >= blocks_y); + assert(global_z >= blocks_z); + + split_evenly(global_x, blocks_x, xsplit); + split_evenly(global_y, blocks_y, ysplit); + split_evenly(global_z, blocks_z, zsplit); + + n_blocks = blocks_x * blocks_y * blocks_z; + n_cells = 0; + n_faces = 0; 
+ for(int bz = 0; bz < blocks_z; bz++) + for(int by = 0; by < blocks_y; by++) + for(int bx = 0; bx < blocks_x; bx++) { + int nx = xsplit[bx + 1] - xsplit[bx]; + int ny = ysplit[by + 1] - ysplit[by]; + int nz = zsplit[bz + 1] - zsplit[bz]; + + int c = nx * ny * nz; + int f = (((nx + 1) * ny * nz) + (nx * (ny + 1) * nz) + (nx * ny * (nz + 1))); + cells_per_block.push_back(c); + faces_per_block.push_back(f); + + n_cells += c; + n_faces += f; + } + assert(n_cells == global_x * global_y * global_z); + assert(n_faces == (((global_x + blocks_x) * global_y * global_z) + + (global_x * (global_y + blocks_y) * global_z) + + (global_x * global_y * (global_z + blocks_z)))); + } + + virtual void print_info(void) + { + printf("Realm dependent partitioning test - miniaero: %d x %d x %d cells, %d x %d x " + "%d blocks\n", + (int)global_x, (int)global_y, (int)global_z, (int)blocks_x, (int)blocks_y, + (int)blocks_z); + } + + IndexSpace<1> is_cells, is_faces; + std::vector ri_cells; + std::vector, int>> cell_blockid_field_data; + std::vector ri_faces; + std::vector, Point<1>>> face_left_field_data; + std::vector, Point<1>>> face_right_field_data; + std::vector, int>> face_type_field_data; + + struct InitDataArgs { + int index; + RegionInstance ri_cells, ri_faces; + }; + + virtual Event initialize_data(const std::vector &memories, + const std::vector &procs) + { + // top level index spaces + is_cells = Rect<1>(FIRST_INDEX, FIRST_INDEX + n_cells - 1); + is_faces = Rect<1>(FIRST_INDEX, FIRST_INDEX + n_faces - 1); + + // weighted partitions based on the distribution we already computed + std::vector> ss_cells_w; + std::vector> ss_faces_w; + + is_cells + .create_weighted_subspaces(n_blocks, 1, cells_per_block, ss_cells_w, + Realm::ProfilingRequestSet()) + .wait(); + is_faces + .create_weighted_subspaces(n_blocks, 1, faces_per_block, ss_faces_w, + Realm::ProfilingRequestSet()) + .wait(); + + log_app.debug() << "Initial partitions:"; + for(size_t i = 0; i < ss_cells_w.size(); i++) + 
log_app.debug() << " Cells #" << i << ": " << ss_cells_w[i]; + for(size_t i = 0; i < ss_faces_w.size(); i++) + log_app.debug() << " Faces #" << i << ": " << ss_faces_w[i]; + + // create instances for each of these subspaces + std::vector cell_fields, face_fields; + cell_fields.push_back(sizeof(int)); // blockid + assert(sizeof(int) == sizeof(Point<1>)); + face_fields.push_back(sizeof(Point<1>)); // left + face_fields.push_back(sizeof(Point<1>)); // right + face_fields.push_back(sizeof(int)); // type + + ri_cells.resize(n_blocks); + cell_blockid_field_data.resize(n_blocks); + + for(size_t i = 0; i < ss_cells_w.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, memories[i % memories.size()], ss_cells_w[i], + cell_fields, 0 /*SOA*/, + Realm::ProfilingRequestSet()) + .wait(); + ri_cells[i] = ri; + + cell_blockid_field_data[i].index_space = ss_cells_w[i]; + cell_blockid_field_data[i].inst = ri_cells[i]; + cell_blockid_field_data[i].field_offset = 0; + } + + ri_faces.resize(n_blocks); + face_left_field_data.resize(n_blocks); + face_right_field_data.resize(n_blocks); + face_type_field_data.resize(n_blocks); + + for(size_t i = 0; i < ss_faces_w.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, memories[i % memories.size()], ss_faces_w[i], + face_fields, 0 /*SOA*/, + Realm::ProfilingRequestSet()) + .wait(); + ri_faces[i] = ri; + + face_left_field_data[i].index_space = ss_faces_w[i]; + face_left_field_data[i].inst = ri_faces[i]; + face_left_field_data[i].field_offset = 0 * sizeof(Point<1>); + + face_right_field_data[i].index_space = ss_faces_w[i]; + face_right_field_data[i].inst = ri_faces[i]; + face_right_field_data[i].field_offset = 1 * sizeof(Point<1>); + + face_type_field_data[i].index_space = ss_faces_w[i]; + face_type_field_data[i].inst = ri_faces[i]; + face_type_field_data[i].field_offset = 2 * sizeof(Point<1>); + } + + // fire off tasks to initialize data + std::set events; + for(int i = 0; i < n_blocks; i++) { + 
Processor p = procs[i % memories.size()]; + InitDataArgs args; + args.index = i; + args.ri_cells = ri_cells[i]; + args.ri_faces = ri_faces[i]; + Event e = p.spawn(INIT_MINIAERO_DATA_TASK, &args, sizeof(args)); + events.insert(e); + } + + return Event::merge_events(events); + } + + static void init_data_task_wrapper(const void *args, size_t arglen, + const void *userdata, size_t userlen, Processor p) + { + MiniAeroTest *me = (MiniAeroTest *)testcfg; + me->init_data_task(args, arglen, p); + } + + Point<1> global_cell_pointer(int cx, int cy, int cz) + { + INDEXTYPE p = FIRST_INDEX; + + // out of range? return -1 + if((cx < 0) || (cx >= global_x) || (cy < 0) || (cy >= global_y) || (cz < 0) || + (cz >= global_z)) + return -1; + + // first chunks in z, then y, then x + int zi = find_split(zsplit, cz); + p += global_x * global_y * zsplit[zi]; + cz -= zsplit[zi]; + int local_z = zsplit[zi + 1] - zsplit[zi]; + + int yi = find_split(ysplit, cy); + p += global_x * ysplit[yi] * local_z; + cy -= ysplit[yi]; + int local_y = ysplit[yi + 1] - ysplit[yi]; + + int xi = find_split(xsplit, cx); + p += xsplit[xi] * local_y * local_z; + cx -= xsplit[xi]; + int local_x = xsplit[xi + 1] - xsplit[xi]; + + // now local addressing within this block + p += (cx + (cy * local_x) + (cz * local_x * local_y)); + return p; + } + + void init_data_task(const void *args, size_t arglen, Processor p) + { + const InitDataArgs &i_args = *(const InitDataArgs *)args; + + i_args.ri_cells.fetch_metadata(p).wait(); + i_args.ri_faces.fetch_metadata(p).wait(); + + log_app.info() << "init task #" << i_args.index << " (ri_cells=" << i_args.ri_cells + << ", ri_faces=" << i_args.ri_faces << ")"; + + IndexSpace<1> is_cells = i_args.ri_cells.get_indexspace<1>(); + IndexSpace<1> is_faces = i_args.ri_faces.get_indexspace<1>(); + + log_app.debug() << "C: " << is_cells; + log_app.debug() << "F: " << is_faces; + + int bx = i_args.index % blocks_x; + int by = (i_args.index / blocks_x) % blocks_y; + int bz = i_args.index / 
blocks_x / blocks_y; + + size_t nx = xsplit[bx + 1] - xsplit[bx]; + size_t ny = ysplit[by + 1] - ysplit[by]; + size_t nz = zsplit[bz + 1] - zsplit[bz]; + + size_t c = nx * ny * nz; + size_t f = (((nx + 1) * ny * nz) + (nx * (ny + 1) * nz) + (nx * ny * (nz + 1))); + assert(is_cells.bounds.volume() == c); + assert(is_faces.bounds.volume() == f); + + // cells are all assigned to the local block + { + AffineAccessor a_cell_blockid(i_args.ri_cells, 0 /* offset */); + + for(int cz = zsplit[bz]; cz < zsplit[bz + 1]; cz++) + for(int cy = ysplit[by]; cy < ysplit[by + 1]; cy++) + for(int cx = xsplit[bx]; cx < xsplit[bx + 1]; cx++) { + Point<1> pz = global_cell_pointer(cx, cy, cz); + assert(is_cells.bounds.contains(pz)); + + a_cell_blockid.write(pz, i_args.index); + } + } + + // faces aren't in any globally-visible order + { + AffineAccessor, 1> a_face_left(i_args.ri_faces, + 0 * sizeof(Point<1>) /* offset */); + AffineAccessor, 1> a_face_right(i_args.ri_faces, + 1 * sizeof(Point<1>) /* offset */); + AffineAccessor a_face_type(i_args.ri_faces, + 2 * sizeof(Point<1>) /* offset */); + + Point<1> pf = is_faces.bounds.lo; + + // -- type 0 | type 1 | type 2 + // -- ------ | ------ | ------ + // -- left extrapolate | inflow | inflow + // -- right extrapolate | extrapolate | extrapolate + // -- down tangent | noslip | tangent + // -- up tangent | extrapolate | tangent + // -- back tangent | tangent | tangent + // -- front tangent | tangent | tangent + + // left/right faces first + for(int fx = xsplit[bx]; fx <= xsplit[bx + 1]; fx++) { + int ftype = BC_INTERIOR; + bool reversed = false; + if(fx == xsplit[bx]) { + // low boundary + reversed = true; + if(fx == 0) + switch(problem_type) { + case PTYPE_0: + ftype = BC_EXTRAPOLATE; + break; + case PTYPE_1: + ftype = BC_INFLOW; + break; + case PTYPE_2: + ftype = BC_INFLOW; + break; + } + else + ftype = BC_BLOCK_BORDER; + } else if(fx == xsplit[bx + 1]) { + // high boundary + if(fx == global_x) + switch(problem_type) { + case PTYPE_0: + 
ftype = BC_EXTRAPOLATE; + break; + case PTYPE_1: + ftype = BC_EXTRAPOLATE; + break; + case PTYPE_2: + ftype = BC_EXTRAPOLATE; + break; + } + else + ftype = BC_BLOCK_BORDER; + } + + for(int cz = zsplit[bz]; cz < zsplit[bz + 1]; cz++) + for(int cy = ysplit[by]; cy < ysplit[by + 1]; cy++) { + a_face_left.write(pf, global_cell_pointer(fx - (reversed ? 0 : 1), cy, cz)); + a_face_right.write(pf, global_cell_pointer(fx - (reversed ? 1 : 0), cy, cz)); + a_face_type.write(pf, ftype); + pf[0]++; + } + } + + // down/up faces next + for(int fy = ysplit[by]; fy <= ysplit[by + 1]; fy++) { + int ftype = BC_INTERIOR; + bool reversed = false; + if(fy == ysplit[by]) { + // low boundary + reversed = true; + if(fy == 0) + switch(problem_type) { + case PTYPE_0: + ftype = BC_TANGENT; + break; + case PTYPE_1: + ftype = BC_NOSLIP; + break; + case PTYPE_2: + ftype = BC_TANGENT; + break; + } + else + ftype = BC_BLOCK_BORDER; + } else if(fy == ysplit[by + 1]) { + // high boundary + if(fy == global_y) + switch(problem_type) { + case PTYPE_0: + ftype = BC_TANGENT; + break; + case PTYPE_1: + ftype = BC_EXTRAPOLATE; + break; + case PTYPE_2: + ftype = BC_TANGENT; + break; + } + else + ftype = BC_BLOCK_BORDER; + } + + for(int cz = zsplit[bz]; cz < zsplit[bz + 1]; cz++) + for(int cx = xsplit[bx]; cx < xsplit[bx + 1]; cx++) { + a_face_left.write(pf, global_cell_pointer(cx, fy - (reversed ? 0 : 1), cz)); + a_face_right.write(pf, global_cell_pointer(cx, fy - (reversed ? 
1 : 0), cz)); + a_face_type.write(pf, ftype); + pf[0]++; + } + } + + // back/front faces last + for(int fz = zsplit[bz]; fz <= zsplit[bz + 1]; fz++) { + int ftype = BC_INTERIOR; + bool reversed = false; + if(fz == zsplit[bz]) { + // low boundary + reversed = true; + if(fz == 0) + switch(problem_type) { + case PTYPE_0: + ftype = BC_TANGENT; + break; + case PTYPE_1: + ftype = BC_TANGENT; + break; + case PTYPE_2: + ftype = BC_TANGENT; + break; + } + else + ftype = BC_BLOCK_BORDER; + } else if(fz == zsplit[bz + 1]) { + // high boundary + if(fz == global_z) + switch(problem_type) { + case PTYPE_0: + ftype = BC_TANGENT; + break; + case PTYPE_1: + ftype = BC_TANGENT; + break; + case PTYPE_2: + ftype = BC_TANGENT; + break; + } + else + ftype = BC_BLOCK_BORDER; + } + + for(int cy = ysplit[by]; cy < ysplit[by + 1]; cy++) + for(int cx = xsplit[bx]; cx < xsplit[bx + 1]; cx++) { + a_face_left.write(pf, global_cell_pointer(cx, cy, fz - (reversed ? 0 : 1))); + a_face_right.write(pf, global_cell_pointer(cx, cy, fz - (reversed ? 
1 : 0))); + a_face_type.write(pf, ftype); + pf[0]++; + } + } + + assert(pf[0] == is_faces.bounds.hi[0] + 1); + } + + if(show_graph) { + AffineAccessor a_cell_blockid(i_args.ri_cells, 0 /* offset */); + + for(int i = is_cells.bounds.lo[0]; i <= is_cells.bounds.hi[0]; i++) + std::cout << "Z[" << i << "]: blockid=" << a_cell_blockid.read(i) << "\n"; + + AffineAccessor, 1> a_face_left(i_args.ri_faces, + 0 * sizeof(Point<1>) /* offset */); + AffineAccessor, 1> a_face_right(i_args.ri_faces, + 1 * sizeof(Point<1>) /* offset */); + AffineAccessor a_face_type(i_args.ri_faces, + 2 * sizeof(Point<1>) /* offset */); + + for(int i = is_faces.bounds.lo[0]; i <= is_faces.bounds.hi[0]; i++) + std::cout << "S[" << i << "]:" + << " left=" << a_face_left.read(i) << " right=" << a_face_right.read(i) + << " type=" << a_face_type.read(i) << "\n"; + } + } + + // the outputs of our partitioning will be: + // p_cells - subsets of is_cells split by block + // p_faces - subsets of_is_faces split by block (based on left cell) + // p_facetypes[6] - subsets of p_faces split further by face type + // p_ghost - subsets of is_cells reachable by each block's boundary faces + + std::vector> p_cells; + std::vector> p_faces; + std::vector>> p_facetypes; + std::vector> p_ghost; + + virtual Event perform_partitioning(void) + { + // partition cells first + std::vector colors(n_blocks); + for(int i = 0; i < n_blocks; i++) + colors[i] = i; + + Event e1 = is_cells.create_subspaces_by_field(cell_blockid_field_data, colors, + p_cells, Realm::ProfilingRequestSet()); + if(wait_on_events) + e1.wait(); + + // now a preimage to get faces + Event e2 = is_faces.create_subspaces_by_preimage( + face_left_field_data, p_cells, p_faces, Realm::ProfilingRequestSet(), e1); + if(wait_on_events) + e2.wait(); + + // now split by face type + std::set evs; + std::vector ftcolors(BC_TOTAL); + for(int i = 0; i < BC_TOTAL; i++) + ftcolors[i] = i; + p_facetypes.resize(n_blocks); + std::vector> p_border_faces(n_blocks); + + for(int 
idx = 0; idx < n_blocks; idx++) { + Event e = p_faces[idx].create_subspaces_by_field(face_type_field_data, ftcolors, + p_facetypes[idx], + Realm::ProfilingRequestSet(), e2); + if(wait_on_events) + e.wait(); + evs.insert(e); + p_border_faces[idx] = p_facetypes[idx][BC_BLOCK_BORDER]; + } + Event e3 = Event::merge_events(evs); + + // finally, the image of just the boundary faces through the right face gets us + // ghost cells + Event e4 = is_cells.create_subspaces_by_image( + face_right_field_data, p_border_faces, p_ghost, Realm::ProfilingRequestSet(), e3); + if(wait_on_events) + e4.wait(); + + return e4; + } + + virtual int perform_dynamic_checks(void) + { + int errors = 0; + + std::vector> p_int_faces, p_border_faces; + for(int idx = 0; idx < n_blocks; idx++) { + p_int_faces.push_back(p_facetypes[idx][BC_INTERIOR]); + p_border_faces.push_back(p_facetypes[idx][BC_BLOCK_BORDER]); + } + // miniaero's checks are faster with image/diff on 1 thread, but slower on 4 +#ifdef MINIAERO_USE_IMAGE_DIFF + std::vector> p_l_test, p_ri_test, p_rb_test; + Event e4 = is_cells.create_subspaces_by_image_with_difference( + face_left_field_data, p_faces, p_cells, p_l_test, Realm::ProfilingRequestSet()); + Event e5 = is_cells.create_subspaces_by_image_with_difference( + face_right_field_data, p_int_faces, p_cells, p_ri_test, + Realm::ProfilingRequestSet()); + Event e6 = is_cells.create_subspaces_by_image_with_difference( + face_right_field_data, p_border_faces, p_ghost, p_rb_test, + Realm::ProfilingRequestSet()); +#else + std::vector> p_img_left, p_img_right_i, p_img_right_b; + Event e1 = is_cells.create_subspaces_by_image( + face_left_field_data, p_faces, p_img_left, Realm::ProfilingRequestSet()); + Event e2 = is_cells.create_subspaces_by_image( + face_right_field_data, p_int_faces, p_img_right_i, Realm::ProfilingRequestSet()); + Event e3 = + is_cells.create_subspaces_by_image(face_right_field_data, p_border_faces, + p_img_right_b, Realm::ProfilingRequestSet()); + std::vector> p_l_test, 
p_ri_test, p_rb_test; + Event e4 = IndexSpace<1>::compute_differences(p_img_left, p_cells, p_l_test, + Realm::ProfilingRequestSet(), e1); + for(unsigned idx = 0; idx < p_img_left.size(); idx++) { + p_img_left[idx].destroy(e4); + } + Event e5 = IndexSpace<1>::compute_differences(p_img_right_i, p_cells, p_ri_test, + Realm::ProfilingRequestSet(), e2); + for(unsigned idx = 0; idx < p_img_right_i.size(); idx++) { + p_img_right_i[idx].destroy(e5); + } + Event e6 = IndexSpace<1>::compute_differences(p_img_right_b, p_ghost, p_rb_test, + Realm::ProfilingRequestSet(), e3); + for(unsigned idx = 0; idx < p_img_right_b.size(); idx++) { + p_img_right_b[idx].destroy(e6); + } +#endif + errors += check_empty(e4, p_l_test, "p_l_test"); + errors += check_empty(e5, p_ri_test, "p_ri_test"); + errors += check_empty(e6, p_rb_test, "p_rb_test"); + for(unsigned idx = 0; idx < p_l_test.size(); idx++) { + p_l_test[idx].destroy(e4); + } + for(unsigned idx = 0; idx < p_ri_test.size(); idx++) { + p_ri_test[idx].destroy(e5); + } + for(unsigned idx = 0; idx < p_rb_test.size(); idx++) { + p_rb_test[idx].destroy(e6); + } + + return errors; + } + + virtual int check_partitioning(void) + { + int errors = 0; + + Point<1> pc = is_cells.bounds.lo; + Point<1> pf = is_faces.bounds.lo; + + for(int blkid = 0; blkid < n_blocks; blkid++) { + int bx = blkid % blocks_x; + int by = (blkid / blocks_x) % blocks_y; + int bz = blkid / blocks_x / blocks_y; + + int nx = xsplit[bx + 1] - xsplit[bx]; + int ny = ysplit[by + 1] - ysplit[by]; + int nz = zsplit[bz + 1] - zsplit[bz]; + + // check cells + for(int i = 0; i < cells_per_block[blkid]; i++) { + for(int j = 0; j < n_blocks; j++) { + bool exp = (j == blkid); + bool act = p_cells[j].contains(pc); + if(exp != act) { + log_app.error() << "mismatch: cell " << pc << " in p_cells[" << j + << "]: exp=" << exp << " act=" << act; + errors++; + } + } + + std::set exp_ghosts; + int cx = i % nx; + int cy = (i / nx) % ny; + int cz = i / nx / ny; + if((cx == 0) && (bx > 0)) + 
exp_ghosts.insert(blkid - 1); + if((cx == (nx - 1)) && (bx < (blocks_x - 1))) + exp_ghosts.insert(blkid + 1); + if((cy == 0) && (by > 0)) + exp_ghosts.insert(blkid - blocks_x); + if((cy == (ny - 1)) && (by < (blocks_y - 1))) + exp_ghosts.insert(blkid + blocks_x); + if((cz == 0) && (bz > 0)) + exp_ghosts.insert(blkid - blocks_x * blocks_y); + if((cz == (nz - 1)) && (bz < (blocks_z - 1))) + exp_ghosts.insert(blkid + blocks_x * blocks_y); + + for(int j = 0; j < n_blocks; j++) { + bool exp = exp_ghosts.count(j) > 0; + bool act = p_ghost[j].contains(pc); + if(exp != act) { + log_app.error() << "mismatch: cell " << pc << " in p_ghost[" << j + << "]: exp=" << exp << " act=" << act; + errors++; + } + } + + pc[0]++; + } + + // check faces + for(int i = 0; i < faces_per_block[blkid]; i++) { + for(int j = 0; j < n_blocks; j++) { + bool exp = (j == blkid); + bool act = p_faces[j].contains(pf); + if(exp != act) { + log_app.error() << "mismatch: face " << pf << " in p_faces[" << j + << "]: exp=" << exp << " act=" << act; + errors++; + } + FaceType exptype = BC_INTERIOR; + // luckily the faces on the edge of a block come in chunks + int lr_faces = (nx + 1) * ny * nz; + int du_faces = nx * (ny + 1) * nz; + int bf_faces = nx * ny * (nz + 1); + assert((lr_faces + du_faces + bf_faces) == faces_per_block[blkid]); + if(i < lr_faces) { + int x = i / ny / nz; + if(x == 0) + exptype = ((bx == 0) ? ((problem_type == PTYPE_0) ? BC_EXTRAPOLATE + : (problem_type == PTYPE_1) ? BC_INFLOW + : BC_INFLOW) + : BC_BLOCK_BORDER); + if(x == nx) + exptype = + ((bx == blocks_x - 1) ? ((problem_type == PTYPE_0) ? BC_EXTRAPOLATE + : (problem_type == PTYPE_1) ? BC_EXTRAPOLATE + : BC_EXTRAPOLATE) + : BC_BLOCK_BORDER); + } else if(i < (lr_faces + du_faces)) { + int y = (i - lr_faces) / nx / nz; + if(y == 0) + exptype = ((by == 0) ? ((problem_type == PTYPE_0) ? BC_TANGENT + : (problem_type == PTYPE_1) ? BC_NOSLIP + : BC_TANGENT) + : BC_BLOCK_BORDER); + if(y == ny) + exptype = + ((by == blocks_y - 1) ? 
((problem_type == PTYPE_0) ? BC_TANGENT + : (problem_type == PTYPE_1) ? BC_EXTRAPOLATE + : BC_TANGENT) + : BC_BLOCK_BORDER); + } else { + int z = (i - lr_faces - du_faces) / nx / ny; + if(z == 0) + exptype = ((bz == 0) ? ((problem_type == PTYPE_0) ? BC_TANGENT + : (problem_type == PTYPE_1) ? BC_TANGENT + : BC_TANGENT) + : BC_BLOCK_BORDER); + if(z == nz) + exptype = ((bz == blocks_z - 1) ? ((problem_type == PTYPE_0) ? BC_TANGENT + : (problem_type == PTYPE_1) ? BC_TANGENT + : BC_TANGENT) + : BC_BLOCK_BORDER); + } + + for(int k = 0; k < BC_TOTAL; k++) { + bool exp = (j == blkid) && (k == exptype); + bool act = p_facetypes[j][k].contains(pf); + if(exp != act) { + log_app.error() << "mismatch: face " << pf << " in p_facetypes[" << j + << "][" << k << "]: exp=" << exp << " act=" << act; + errors++; + } + } + } + pf[0]++; + } + } + for(unsigned idx = 0; idx < p_cells.size(); idx++) { + p_cells[idx].destroy(); + } + for(unsigned idx = 0; idx < p_faces.size(); idx++) { + p_faces[idx].destroy(); + } + for(unsigned i = 0; i < p_facetypes.size(); i++) { + for(unsigned j = 0; j < p_facetypes[i].size(); j++) { + p_facetypes[i][j].destroy(); + } + } + for(unsigned idx = 0; idx < p_ghost.size(); idx++) { + p_ghost[idx].destroy(); + } + + return errors; + } +}; + +class CircuitTest : public TestInterface { +public: + // graph config parameters + int num_nodes = 100; + int num_edges = 10; + int num_pieces = 2; + int pct_wire_in_piece = 50; + + CircuitTest(int argc, const char *argv[]) + { + for(int i = 1; i < argc; i++) { + if(!strcmp(argv[i], "-n")) { + num_nodes = atoi(argv[++i]); + continue; + } + + if(!strcmp(argv[i], "-e")) { + num_edges = atoi(argv[++i]); + continue; + } + + if(!strcmp(argv[i], "-p")) { + num_pieces = atoi(argv[++i]); + continue; + } + } + } + + struct InitDataArgs { + int index; + RegionInstance ri_nodes, ri_edges; + }; + + enum PRNGStreams + { + NODE_SUBCKT_STREAM, + EDGE_IN_NODE_STREAM, + EDGE_OUT_NODE_STREAM1, + EDGE_OUT_NODE_STREAM2, + }; + + // nodes and 
edges are generated pseudo-randomly so that we can check the results + // without + // needing all the field data in any one place + void random_node_data(int idx, int &subckt) + { + if(random_colors) + subckt = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBCKT_STREAM, num_pieces); + else + subckt = idx * num_pieces / num_nodes; + } + + void random_edge_data(int idx, Point<1> &in_node, Point<1> &out_node) + { + if(random_colors) { + in_node = Philox_2x32<>::rand_int(random_seed, idx, EDGE_IN_NODE_STREAM, num_nodes); + out_node = + Philox_2x32<>::rand_int(random_seed, idx, EDGE_OUT_NODE_STREAM1, num_nodes); + } else { + int subckt = idx * num_pieces / num_edges; + int n_lo = subckt * num_nodes / num_pieces; + int n_hi = (subckt + 1) * num_nodes / num_pieces; + in_node = n_lo + Philox_2x32<>::rand_int(random_seed, idx, EDGE_IN_NODE_STREAM, + n_hi - n_lo); + int pct = Philox_2x32<>::rand_int(random_seed, idx, EDGE_OUT_NODE_STREAM2, 100); + if(pct < pct_wire_in_piece) + out_node = n_lo + Philox_2x32<>::rand_int(random_seed, idx, EDGE_OUT_NODE_STREAM1, + n_hi - n_lo); + else + out_node = + Philox_2x32<>::rand_int(random_seed, idx, EDGE_OUT_NODE_STREAM1, num_nodes); + } + } + + static void init_data_task_wrapper(const void *args, size_t arglen, + const void *userdata, size_t userlen, Processor p) + { + CircuitTest *me = (CircuitTest *)testcfg; + me->init_data_task(args, arglen, p); + } + + void init_data_task(const void *args, size_t arglen, Processor p) + { + const InitDataArgs &i_args = *(const InitDataArgs *)args; + + log_app.info() << "init task #" << i_args.index << " (ri_nodes=" << i_args.ri_nodes + << ", ri_edges=" << i_args.ri_edges << ")"; + + i_args.ri_nodes.fetch_metadata(p).wait(); + i_args.ri_edges.fetch_metadata(p).wait(); + + IndexSpace<1> is_nodes = i_args.ri_nodes.get_indexspace<1>(); + IndexSpace<1> is_edges = i_args.ri_edges.get_indexspace<1>(); + + log_app.debug() << "N: " << is_nodes; + log_app.debug() << "E: " << is_edges; + + { + AffineAccessor 
a_subckt_id(i_args.ri_nodes, 0 /* offset */); + + for(int i = is_nodes.bounds.lo; i <= is_nodes.bounds.hi; i++) { + int subckt; + random_node_data(i, subckt); + a_subckt_id.write(i, subckt); + } + } + + { + AffineAccessor, 1> a_in_node(i_args.ri_edges, + 0 * sizeof(Point<1>) /* offset */); + AffineAccessor, 1> a_out_node(i_args.ri_edges, + 1 * sizeof(Point<1>) /* offset */); + + for(int i = is_edges.bounds.lo; i <= is_edges.bounds.hi; i++) { + Point<1> in_node, out_node; + random_edge_data(i, in_node, out_node); + a_in_node.write(i, in_node); + a_out_node.write(i, out_node); + } + } + + if(show_graph) { + AffineAccessor a_subckt_id(i_args.ri_nodes, 0 /* offset */); + + for(int i = is_nodes.bounds.lo; i <= is_nodes.bounds.hi; i++) + std::cout << "subckt_id[" << i << "] = " << a_subckt_id.read(i) << "\n"; + + AffineAccessor, 1> a_in_node(i_args.ri_edges, + 0 * sizeof(Point<1>) /* offset */); + + for(int i = is_edges.bounds.lo; i <= is_edges.bounds.hi; i++) + std::cout << "in_node[" << i << "] = " << a_in_node.read(i) << "\n"; + + AffineAccessor, 1> a_out_node(i_args.ri_edges, + 1 * sizeof(Point<1>) /* offset */); + + for(int i = is_edges.bounds.lo; i <= is_edges.bounds.hi; i++) + std::cout << "out_node[" << i << "] = " << a_out_node.read(i) << "\n"; + } + } + + IndexSpace<1> is_nodes, is_edges; + std::vector ri_nodes; + std::vector, int>> subckt_field_data; + std::vector ri_edges; + std::vector, Point<1>>> in_node_field_data; + std::vector, Point<1>>> out_node_field_data; + + virtual void print_info(void) + { + printf("Realm dependent partitioning test - circuit: %d nodes, %d edges, %d pieces\n", + (int)num_nodes, (int)num_edges, (int)num_pieces); + } + + virtual Event initialize_data(const std::vector &memories, + const std::vector &procs) + { + // now create index spaces for nodes and edges + is_nodes = Rect<1>(0, num_nodes - 1); + is_edges = Rect<1>(0, num_edges - 1); + + // equal partition is used to do initial population of edges and nodes + std::vector> 
ss_nodes_eq; + std::vector> ss_edges_eq; + + is_nodes + .create_equal_subspaces(num_pieces, 1, ss_nodes_eq, Realm::ProfilingRequestSet()) + .wait(); + is_edges + .create_equal_subspaces(num_pieces, 1, ss_edges_eq, Realm::ProfilingRequestSet()) + .wait(); + + log_app.debug() << "Initial partitions:"; + for(size_t i = 0; i < ss_nodes_eq.size(); i++) + log_app.debug() << " Nodes #" << i << ": " << ss_nodes_eq[i]; + for(size_t i = 0; i < ss_edges_eq.size(); i++) + log_app.debug() << " Edges #" << i << ": " << ss_edges_eq[i]; + + // create instances for each of these subspaces + std::vector node_fields, edge_fields; + node_fields.push_back(sizeof(int)); // subckt_id + assert(sizeof(int) == sizeof(Point<1>)); + edge_fields.push_back(sizeof(Point<1>)); // in_node + edge_fields.push_back(sizeof(Point<1>)); // out_node + + ri_nodes.resize(num_pieces); + subckt_field_data.resize(num_pieces); + + for(size_t i = 0; i < ss_nodes_eq.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, memories[i % memories.size()], ss_nodes_eq[i], + node_fields, 0 /*SOA*/, + Realm::ProfilingRequestSet()) + .wait(); + ri_nodes[i] = ri; + + subckt_field_data[i].index_space = ss_nodes_eq[i]; + subckt_field_data[i].inst = ri_nodes[i]; + subckt_field_data[i].field_offset = 0; + } + + ri_edges.resize(num_pieces); + in_node_field_data.resize(num_pieces); + out_node_field_data.resize(num_pieces); + + for(size_t i = 0; i < ss_edges_eq.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, memories[i % memories.size()], ss_edges_eq[i], + edge_fields, 0 /*SOA*/, + Realm::ProfilingRequestSet()) + .wait(); + ri_edges[i] = ri; + + in_node_field_data[i].index_space = ss_edges_eq[i]; + in_node_field_data[i].inst = ri_edges[i]; + in_node_field_data[i].field_offset = 0 * sizeof(Point<1>); + + out_node_field_data[i].index_space = ss_edges_eq[i]; + out_node_field_data[i].inst = ri_edges[i]; + out_node_field_data[i].field_offset = 1 * sizeof(Point<1>); + } + + // fire off 
tasks to initialize data + std::set events; + for(int i = 0; i < num_pieces; i++) { + Processor p = procs[i % memories.size()]; + InitDataArgs args; + args.index = i; + args.ri_nodes = ri_nodes[i]; + args.ri_edges = ri_edges[i]; + Event e = p.spawn(INIT_CIRCUIT_DATA_TASK, &args, sizeof(args)); + events.insert(e); + } + + return Event::merge_events(events); + } + + // the outputs of our partitioning will be: + // is_private, is_shared - subsets of is_nodes based on private/shared + // p_pvt, p_shr, p_ghost - subsets of the above split by subckt + // p_edges - subsets of is_edges for each subckt + + IndexSpace<1> is_shared, is_private; + std::vector> p_pvt, p_shr, p_ghost; + std::vector> p_edges; + + virtual Event perform_partitioning(void) + { + // first partition nodes by subckt id (this is the independent partition, + // but not actually used by the app) + std::vector> p_nodes; + + std::vector colors(num_pieces); + for(int i = 0; i < num_pieces; i++) + colors[i] = i; + + Event e1 = is_nodes.create_subspaces_by_field(subckt_field_data, colors, p_nodes, + Realm::ProfilingRequestSet()); + if(wait_on_events) + e1.wait(); + + // now compute p_edges based on the color of their in_node (i.e. 
a preimage) + Event e2 = is_edges.create_subspaces_by_preimage(in_node_field_data, p_nodes, p_edges, + Realm::ProfilingRequestSet(), e1); + if(wait_on_events) + e2.wait(); + + // an image of p_edges through out_node gives us all the shared nodes, along + // with some private nodes +#ifdef USE_IMAGE_DIFF + Event e4 = is_nodes.create_subspaces_by_image_with_difference( + out_node_field_data, p_edges, p_nodes, p_ghost, Realm::ProfilingRequestSet(), e2); + if(wait_on_events) + e4.wait(); +#else + std::vector> p_extra_nodes; + + Event e3 = is_nodes.create_subspaces_by_image( + out_node_field_data, p_edges, p_extra_nodes, Realm::ProfilingRequestSet(), e2); + if(wait_on_events) + e3.wait(); + + // subtracting out those private nodes gives us p_ghost + Event e4 = IndexSpace<1>::compute_differences(p_extra_nodes, p_nodes, p_ghost, + Realm::ProfilingRequestSet(), e3); + if(wait_on_events) + e4.wait(); +#endif + + // the union of everybody's ghost nodes is is_shared + Event e5 = IndexSpace<1>::compute_union(p_ghost, is_shared, + Realm::ProfilingRequestSet(), e4); + if(wait_on_events) + e5.wait(); + + // and is_private is just the nodes of is_nodes that aren't in is_shared + Event e6 = IndexSpace<1>::compute_difference(is_nodes, is_shared, is_private, + Realm::ProfilingRequestSet(), e5); + if(wait_on_events) + e6.wait(); + + // the intersection of the original p_nodes with is_shared gives us p_shr + // (note that we can do this in parallel with the computation of is_private) + Event e7 = IndexSpace<1>::compute_intersections(p_nodes, is_shared, p_shr, + Realm::ProfilingRequestSet(), e5); + if(wait_on_events) + e7.wait(); + + // and finally, the intersection of p_nodes with is_private gives us p_pvt + Event e8 = IndexSpace<1>::compute_intersections(p_nodes, is_private, p_pvt, + Realm::ProfilingRequestSet(), e6); + if(wait_on_events) + e8.wait(); + + // all done - wait on e7 and e8, which dominate every other operation + Event e9 = Event::merge_events(e7, e8); + + for(unsigned 
idx = 0; idx < p_nodes.size(); idx++) { + p_nodes[idx].destroy(e9); + } + + return e9; + } + + virtual int perform_dynamic_checks(void) + { + int errors = 0; + // compute the intermediates for the checks - these duplicate things we + // already have, but we're not supposed to know that here + std::vector> p_pvt_and_shr, p_all; + Event e1 = IndexSpace<1>::compute_unions( + p_pvt, p_shr, p_pvt_and_shr, Realm::ProfilingRequestSet(), Event::NO_EVENT); + Event e2 = IndexSpace<1>::compute_unions(p_pvt_and_shr, p_ghost, p_all, + Realm::ProfilingRequestSet(), e1); +#ifdef USE_IMAGE_DIFF + std::vector> p_in_test, p_out_test; + Event e5 = is_nodes.create_subspaces_by_image_with_difference( + in_node_field_data, p_edges, p_pvt_and_shr, p_in_test, + Realm::ProfilingRequestSet(), e1); + Event e6 = is_nodes.create_subspaces_by_image_with_difference( + out_node_field_data, p_edges, p_all, p_out_test, Realm::ProfilingRequestSet(), + e2); +#else + std::vector> p_in_img, p_out_img; + Event e3 = + is_nodes.create_subspaces_by_image(in_node_field_data, p_edges, p_in_img, + Realm::ProfilingRequestSet(), Event::NO_EVENT); + Event e4 = + is_nodes.create_subspaces_by_image(out_node_field_data, p_edges, p_out_img, + Realm::ProfilingRequestSet(), Event::NO_EVENT); + std::vector> p_in_test, p_out_test; + Event e5 = IndexSpace<1>::compute_differences(p_in_img, p_pvt_and_shr, p_in_test, + Realm::ProfilingRequestSet(), + Event::merge_events(e1, e3)); + Event e6 = IndexSpace<1>::compute_differences(p_out_img, p_all, p_out_test, + Realm::ProfilingRequestSet(), + Event::merge_events(e2, e4)); + for(unsigned idx = 0; idx < p_in_img.size(); idx++) { + p_in_img[idx].destroy(e5); + } + for(unsigned idx = 0; idx < p_out_img.size(); idx++) { + p_out_img[idx].destroy(e6); + } +#endif + + errors += check_empty(e5, p_in_test, "p_in_test"); + errors += check_empty(e6, p_out_test, "p_out_test"); + for(unsigned idx = 0; idx < p_pvt_and_shr.size(); idx++) { + p_pvt_and_shr[idx].destroy(e5); + } + for(unsigned 
idx = 0; idx < p_all.size(); idx++) { + p_all[idx].destroy(e6); + } + for(unsigned idx = 0; idx < p_in_test.size(); idx++) { + p_in_test[idx].destroy(e5); + } + for(unsigned idx = 0; idx < p_out_test.size(); idx++) { + p_out_test[idx].destroy(e6); + } + + return errors; + } + + virtual int check_partitioning(void) + { + int errors = 0; + + // we'll make up the list of nodes we expect to be shared as we walk the edges + std::map> ghost_nodes; + +#ifdef DUMP_OUTPUT_SPACES + dump_sparse_index_space<1, int>("is_private", is_private); + dump_sparse_index_space<1, int>("is_shared", is_shared); + + for(int p = 0; p < num_pieces; p++) { + std::cout << "Piece #" << p << "\n"; + dump_sparse_index_space<1, int>("p_pvt", p_pvt[p]); + dump_sparse_index_space<1, int>("p_shr", p_shr[p]); + dump_sparse_index_space<1, int>("p_ghost", p_ghost[p]); + } +#endif + + for(int i = 0; i < num_edges; i++) { + // regenerate the random info for this edge and the two nodes it touches + Point<1> in_node, out_node; + int in_subckt, out_subckt; + random_edge_data(i, in_node, out_node); + random_node_data(in_node, in_subckt); + random_node_data(out_node, out_subckt); + + // the edge should be in exactly the p_edges for in_subckt + for(int p = 0; p < num_pieces; p++) { + bool exp = (p == in_subckt); + bool act = p_edges[p].contains(i); + if(exp != act) { + log_app.error() << "mismatch: edge " << i << " in p_edges[" << p + << "]: exp=" << exp << " act=" << act; + errors++; + } + } + + // is the output node a ghost for this wire? 
+ if(in_subckt != out_subckt) + ghost_nodes[out_node].insert(in_subckt); + } + + // now we can check the nodes + for(int i = 0; i < num_nodes; i++) { + int subckt; + random_node_data(i, subckt); + // check is_private and is_shared first + { + bool exp = ghost_nodes.count(i) == 0; + bool act = is_private.contains(i); + if(exp != act) { + log_app.error() << "mismatch: node " << i << " in is_private: exp=" << exp + << " act=" << act; + errors++; + } + } + { + bool exp = ghost_nodes.count(i) > 0; + bool act = is_shared.contains(i); + if(exp != act) { + log_app.error() << "mismatch: node " << i << " in is_shared: exp=" << exp + << " act=" << act; + errors++; + } + } + + // now check p_pvt/shr/ghost + for(int p = 0; p < num_pieces; p++) { + bool exp = (subckt == p) && (ghost_nodes.count(i) == 0); + bool act = p_pvt[p].contains(i); + if(exp != act) { + log_app.error() << "mismatch: node " << i << " in p_pvt[" << p + << "]: exp=" << exp << " act=" << act; + errors++; + } + } + for(int p = 0; p < num_pieces; p++) { + bool exp = (subckt == p) && (ghost_nodes.count(i) > 0); + bool act = p_shr[p].contains(i); + if(exp != act) { + log_app.error() << "mismatch: node " << i << " in p_shr[" << p + << "]: exp=" << exp << " act=" << act; + errors++; + } + } + for(int p = 0; p < num_pieces; p++) { + bool exp = + (subckt != p) && (ghost_nodes.count(i) > 0) && (ghost_nodes[i].count(p) > 0); + bool act = p_ghost[p].contains(i); + if(exp != act) { + log_app.error() << "mismatch: node " << i << " in p_ghost[" << p + << "]: exp=" << exp << " act=" << act; + errors++; + } + } + } + + is_shared.destroy(); + is_private.destroy(); + for(unsigned idx = 0; idx < p_pvt.size(); idx++) { + p_pvt[idx].destroy(); + } + for(unsigned idx = 0; idx < p_shr.size(); idx++) { + p_shr[idx].destroy(); + } + for(unsigned idx = 0; idx < p_ghost.size(); idx++) { + p_ghost[idx].destroy(); + } + for(unsigned idx = 0; idx < p_edges.size(); idx++) { + p_edges[idx].destroy(); + } + + return errors; + } +}; + +class 
PennantTest : public TestInterface { +public: +public: + // graph config parameters + enum MeshType + { + RectangularMesh, + }; + MeshType mesh_type = RectangularMesh; + int nzx = 10; // number of zones in x + int nzy = 10; // number of zones in y + int numpcx = 2; // number of submeshes in x + int numpcy = 2; // number of submeshes in y + + int npx, npy; // number of points in each dimension + int nz, ns, np, numpc; // total number of zones, sides, points, and pieces + std::vector zxbound, zybound; // x and y split points between submeshes + std::vector lz, ls, lp; // number of zones, sides, and points in each submesh + + // can't do 64-bit index types right now, so at least get most of our 32-bit space + typedef int INDEXTYPE; + static const INDEXTYPE FIRST_INDEX = -2000000000; // easier to read than INT_MIN+1 + + PennantTest(int argc, const char *argv[]) + { +#define INT_ARG(s, v) \ + if(!strcmp(argv[i], s)) { \ + v = atoi(argv[++i]); \ + continue; \ + } + for(int i = 1; i < argc; i++) { + INT_ARG("-nzx", nzx) + INT_ARG("-nzy", nzy) + INT_ARG("-numpcx", numpcx) + INT_ARG("-numpcy", numpcy) + if(!strcmp(argv[i], "-nz")) { + int v = atoi(argv[++i]); + nzx = nzy = v; + continue; + } + if(!strcmp(argv[i], "-numpc")) { + int v = atoi(argv[++i]); + numpcx = numpcy = v; + continue; + } + } +#undef INT_ARG + + switch(mesh_type) { + case RectangularMesh: + { + npx = nzx + 1; + npy = nzy + 1; + numpc = numpcx * numpcy; + + zxbound.resize(numpcx + 1); + for(int i = 0; i <= numpcx; i++) + zxbound[i] = (i * nzx) / numpcx; + + zybound.resize(numpcy + 1); + for(int i = 0; i <= numpcy; i++) + zybound[i] = (i * nzy) / numpcy; + + nz = ns = np = 0; + for(int pcy = 0; pcy < numpcy; pcy++) { + for(int pcx = 0; pcx < numpcx; pcx++) { + int lx = zxbound[pcx + 1] - zxbound[pcx]; + int ly = zybound[pcy + 1] - zybound[pcy]; + + int zones = lx * ly; + int sides = zones * 4; + // points are a little funny - shared edges go to the lower numbered piece + int points = ((pcx == 0) ? 
(lx + 1) : lx) * ((pcy == 0) ? (ly + 1) : ly); + + lz.push_back(zones); + ls.push_back(sides); + lp.push_back(points); + nz += zones; + ns += sides; + np += points; + } + } + + assert(nz == (nzx * nzy)); + assert(ns == (4 * nzx * nzy)); + assert(np == (npx * npy)); + + break; + } + } + } + + virtual void print_info(void) + { + printf("Realm dependent partitioning test - pennant: %d x %d zones, %d x %d pieces\n", + (int)nzx, (int)nzy, (int)numpcx, (int)numpcy); + } + + IndexSpace<1> is_zones, is_sides, is_points; + std::vector ri_zones; + std::vector, int>> zone_color_field_data; + std::vector ri_sides; + std::vector, Point<1>>> side_mapsz_field_data; + std::vector, Point<1>>> side_mapss3_field_data; + std::vector, Point<1>>> side_mapsp1_field_data; + std::vector, bool>> side_ok_field_data; + + struct InitDataArgs { + int index; + RegionInstance ri_zones, ri_sides; + }; + + virtual Event initialize_data(const std::vector &memories, + const std::vector &procs) + { + // top level index spaces + is_zones = Rect<1>(FIRST_INDEX, FIRST_INDEX + nz - 1); + is_sides = Rect<1>(FIRST_INDEX, FIRST_INDEX + ns - 1); + is_points = Rect<1>(FIRST_INDEX, FIRST_INDEX + np - 1); + + // weighted partitions based on the distribution we already computed + std::vector> ss_zones_w; + std::vector> ss_sides_w; + std::vector> ss_points_w; + + is_zones + .create_weighted_subspaces(numpc, 1, lz, ss_zones_w, Realm::ProfilingRequestSet()) + .wait(); + is_sides + .create_weighted_subspaces(numpc, 1, ls, ss_sides_w, Realm::ProfilingRequestSet()) + .wait(); + is_points + .create_weighted_subspaces(numpc, 1, lp, ss_points_w, + Realm::ProfilingRequestSet()) + .wait(); + + log_app.debug() << "Initial partitions:"; + for(size_t i = 0; i < ss_zones_w.size(); i++) + log_app.debug() << " Zones #" << i << ": " << ss_zones_w[i]; + for(size_t i = 0; i < ss_sides_w.size(); i++) + log_app.debug() << " Sides #" << i << ": " << ss_sides_w[i]; + for(size_t i = 0; i < ss_points_w.size(); i++) + log_app.debug() << " 
Points #" << i << ": " << ss_points_w[i]; + + // create instances for each of these subspaces + std::vector zone_fields, side_fields; + zone_fields.push_back(sizeof(int)); // color + assert(sizeof(int) == sizeof(Point<1>)); + side_fields.push_back(sizeof(Point<1>)); // mapsz + side_fields.push_back(sizeof(Point<1>)); // mapss3 + side_fields.push_back(sizeof(Point<1>)); // mapsp1 + side_fields.push_back(sizeof(bool)); // ok + + ri_zones.resize(numpc); + zone_color_field_data.resize(numpc); + + for(size_t i = 0; i < ss_zones_w.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, memories[i % memories.size()], ss_zones_w[i], + zone_fields, 0 /*SOA*/, + Realm::ProfilingRequestSet()) + .wait(); + ri_zones[i] = ri; + + zone_color_field_data[i].index_space = ss_zones_w[i]; + zone_color_field_data[i].inst = ri_zones[i]; + zone_color_field_data[i].field_offset = 0; + } + + ri_sides.resize(numpc); + side_mapsz_field_data.resize(numpc); + side_mapss3_field_data.resize(numpc); + side_mapsp1_field_data.resize(numpc); + side_ok_field_data.resize(numpc); + + for(size_t i = 0; i < ss_sides_w.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, memories[i % memories.size()], ss_sides_w[i], + side_fields, 0 /*SOA*/, + Realm::ProfilingRequestSet()) + .wait(); + ri_sides[i] = ri; + + side_mapsz_field_data[i].index_space = ss_sides_w[i]; + side_mapsz_field_data[i].inst = ri_sides[i]; + side_mapsz_field_data[i].field_offset = 0 * sizeof(Point<1>); + + side_mapss3_field_data[i].index_space = ss_sides_w[i]; + side_mapss3_field_data[i].inst = ri_sides[i]; + side_mapss3_field_data[i].field_offset = 1 * sizeof(Point<1>); + + side_mapsp1_field_data[i].index_space = ss_sides_w[i]; + side_mapsp1_field_data[i].inst = ri_sides[i]; + side_mapsp1_field_data[i].field_offset = 2 * sizeof(Point<1>); + + side_ok_field_data[i].index_space = ss_sides_w[i]; + side_ok_field_data[i].inst = ri_sides[i]; + side_ok_field_data[i].field_offset = 3 * sizeof(Point<1>); + } 
+ + // fire off tasks to initialize data + std::set events; + for(int i = 0; i < numpc; i++) { + Processor p = procs[i % memories.size()]; + InitDataArgs args; + args.index = i; + args.ri_zones = ri_zones[i]; + args.ri_sides = ri_sides[i]; + Event e = p.spawn(INIT_PENNANT_DATA_TASK, &args, sizeof(args)); + events.insert(e); + } + + return Event::merge_events(events); + } + + static void init_data_task_wrapper(const void *args, size_t arglen, + const void *userdata, size_t userlen, Processor p) + { + PennantTest *me = (PennantTest *)testcfg; + me->init_data_task(args, arglen, p); + } + + Point<1> global_point_pointer(int py, int px) const + { + int pp = FIRST_INDEX; + + // start by steping over whole y slabs - again be careful that the extra slab belongs + // to pcy == 0 + int dy; + if(py > zybound[1]) { + int pcy = 1; + while(py > zybound[pcy + 1]) + pcy++; + int slabs = zybound[pcy] + 1; + pp += npx * slabs; + py -= slabs; + dy = zybound[pcy + 1] - zybound[pcy]; + } else { + dy = zybound[1] + 1; + } + + // now chunks in x, using just the y width of this row of chunks + int dx; + if(px > zxbound[1]) { + int pcx = 1; + while(px > zxbound[pcx + 1]) + pcx++; + int strips = zxbound[pcx] + 1; + pp += dy * strips; + px -= strips; + dx = zxbound[pcx + 1] - zxbound[pcx]; + } else { + dx = zxbound[1] + 1; + } + + // finally, px and py are now local and are handled easily + pp += py * dx + px; + + return pp; + } + + void init_data_task(const void *args, size_t arglen, Processor p) + { + const InitDataArgs &i_args = *(const InitDataArgs *)args; + + log_app.info() << "init task #" << i_args.index << " (ri_zones=" << i_args.ri_zones + << ", ri_sides=" << i_args.ri_sides << ")"; + + i_args.ri_zones.fetch_metadata(p).wait(); + i_args.ri_sides.fetch_metadata(p).wait(); + + IndexSpace<1> is_zones = i_args.ri_zones.get_indexspace<1>(); + IndexSpace<1> is_sides = i_args.ri_sides.get_indexspace<1>(); + + log_app.debug() << "Z: " << is_zones; + log_app.debug() << "S: " << is_sides; + + 
int pcx = i_args.index % numpcx; + int pcy = i_args.index / numpcx; + + int zxlo = zxbound[pcx]; + int zxhi = zxbound[pcx + 1]; + int zylo = zybound[pcy]; + int zyhi = zybound[pcy + 1]; + + { + AffineAccessor a_zone_color(i_args.ri_zones, 0 /* offset */); + AffineAccessor, 1> a_side_mapsz(i_args.ri_sides, + 0 * sizeof(Point<1>) /* offset */); + AffineAccessor, 1> a_side_mapss3(i_args.ri_sides, + 1 * sizeof(Point<1>) /* offset */); + AffineAccessor, 1> a_side_mapsp1(i_args.ri_sides, + 2 * sizeof(Point<1>) /* offset */); + AffineAccessor a_side_ok(i_args.ri_sides, + 3 * sizeof(Point<1>) /* offset */); + + Point<1> pz = is_zones.bounds.lo; + Point<1> ps = is_sides.bounds.lo; + + for(int zy = zylo; zy < zyhi; zy++) { + for(int zx = zxlo; zx < zxhi; zx++) { + // get 4 side pointers + Point<1> ps0 = ps; + ps[0]++; + Point<1> ps1 = ps; + ps[0]++; + Point<1> ps2 = ps; + ps[0]++; + Point<1> ps3 = ps; + ps[0]++; + + // point pointers are ugly because they can be in neighbors - use a helper + Point<1> pp0 = global_point_pointer(zy, zx); // go CCW + Point<1> pp1 = global_point_pointer(zy + 1, zx); + Point<1> pp2 = global_point_pointer(zy + 1, zx + 1); + Point<1> pp3 = global_point_pointer(zy, zx + 1); + + a_zone_color.write(pz, i_args.index); + + a_side_mapsz.write(ps0, pz); + a_side_mapsz.write(ps1, pz); + a_side_mapsz.write(ps2, pz); + a_side_mapsz.write(ps3, pz); + + a_side_mapss3.write(ps0, ps1); + a_side_mapss3.write(ps1, ps2); + a_side_mapss3.write(ps2, ps3); + a_side_mapss3.write(ps3, ps0); + + a_side_mapsp1.write(ps0, pp0); + a_side_mapsp1.write(ps1, pp1); + a_side_mapsp1.write(ps2, pp2); + a_side_mapsp1.write(ps3, pp3); + + a_side_ok.write(ps0, true); + a_side_ok.write(ps1, true); + a_side_ok.write(ps2, true); + a_side_ok.write(ps3, true); + + pz[0]++; + } + } + assert(pz[0] == is_zones.bounds.hi[0] + 1); + assert(ps[0] == is_sides.bounds.hi[0] + 1); + } + + if(show_graph) { + AffineAccessor a_zone_color(i_args.ri_zones, 0 /* offset */); + + for(int i = 
is_zones.bounds.lo; i <= is_zones.bounds.hi; i++) + std::cout << "Z[" << i << "]: color=" << a_zone_color.read(i) << "\n"; + + AffineAccessor, 1> a_side_mapsz(i_args.ri_sides, + 0 * sizeof(Point<1>) /* offset */); + AffineAccessor, 1> a_side_mapss3(i_args.ri_sides, + 1 * sizeof(Point<1>) /* offset */); + AffineAccessor, 1> a_side_mapsp1(i_args.ri_sides, + 2 * sizeof(Point<1>) /* offset */); + AffineAccessor a_side_ok(i_args.ri_sides, + 3 * sizeof(Point<1>) /* offset */); + + for(int i = is_sides.bounds.lo; i <= is_sides.bounds.hi; i++) + std::cout << "S[" << i << "]:" + << " mapsz=" << a_side_mapsz.read(i) + << " mapss3=" << a_side_mapss3.read(i) + << " mapsp1=" << a_side_mapsp1.read(i) << " ok=" << a_side_ok.read(i) + << "\n"; + } + } + + // the outputs of our partitioning will be: + // p_zones - subsets of is_zones split by piece + // p_sides - subsets of is_sides split by piece (with bad sides removed) + // p_points - subsets of is_points by piece (aliased) + + std::vector> p_zones; + std::vector> p_sides; + std::vector> p_points; + + virtual Event perform_partitioning(void) + { + // first get the set of bad sides (i.e. 
ok == false) + IndexSpace<1> bad_sides; + + Event e1 = is_sides.create_subspace_by_field(side_ok_field_data, false, bad_sides, + Realm::ProfilingRequestSet()); + if(wait_on_events) + e1.wait(); + + // map the bad sides through to bad zones + IndexSpace<1> bad_zones; + Event e2 = is_zones.create_subspace_by_image( + side_mapsz_field_data, bad_sides, bad_zones, Realm::ProfilingRequestSet(), e1); + if(wait_on_events) + e2.wait(); + bad_sides.destroy(e2); + + // subtract bad zones to get good zones + IndexSpace<1> good_zones; + Event e3 = IndexSpace<1>::compute_difference(is_zones, bad_zones, good_zones, + Realm::ProfilingRequestSet(), e2); + if(wait_on_events) + e3.wait(); + bad_zones.destroy(e3); + + // now do actual partitions with just good zones + std::vector colors(numpc); + for(int i = 0; i < numpc; i++) + colors[i] = i; + + Event e4 = good_zones.create_subspaces_by_field( + zone_color_field_data, colors, p_zones, Realm::ProfilingRequestSet(), e3); + if(wait_on_events) + e4.wait(); + good_zones.destroy(e4); + + // preimage of zones is sides + Event e5 = is_sides.create_subspaces_by_preimage( + side_mapsz_field_data, p_zones, p_sides, Realm::ProfilingRequestSet(), e4); + if(wait_on_events) + e5.wait(); + + // and image of sides->mapsp1 is points + Event e6 = is_points.create_subspaces_by_image( + side_mapsp1_field_data, p_sides, p_points, Realm::ProfilingRequestSet(), e5); + if(wait_on_events) + e6.wait(); + + return e6; + } + + virtual int perform_dynamic_checks(void) + { + int errors = 0; + + // pennant's checks are actually slower with the fused image/diff +#ifdef PENNANT_USE_IMAGE_DIFF + std::vector> p_z_test, p_p_test, p_s_test; + Event e4 = is_zones.create_subspaces_by_image_with_difference( + side_mapsz_field_data, p_sides, p_zones, p_z_test, Realm::ProfilingRequestSet()); + Event e5 = is_points.create_subspaces_by_image_with_difference( + side_mapsp1_field_data, p_sides, p_points, p_p_test, + Realm::ProfilingRequestSet()); + Event e6 = 
is_sides.create_subspaces_by_image_with_difference( + side_mapss3_field_data, p_sides, p_sides, p_s_test, Realm::ProfilingRequestSet()); +#else + std::vector> p_img_mapsz, p_img_mapsp1, p_img_mapss3; + Event e1 = is_zones.create_subspaces_by_image( + side_mapsz_field_data, p_sides, p_img_mapsz, Realm::ProfilingRequestSet()); + Event e2 = is_points.create_subspaces_by_image( + side_mapsp1_field_data, p_sides, p_img_mapsp1, Realm::ProfilingRequestSet()); + Event e3 = is_sides.create_subspaces_by_image( + side_mapss3_field_data, p_sides, p_img_mapss3, Realm::ProfilingRequestSet()); + std::vector> p_z_test, p_p_test, p_s_test; + Event e4 = IndexSpace<1>::compute_differences(p_img_mapsz, p_zones, p_z_test, + Realm::ProfilingRequestSet(), e1); + for(unsigned idx = 0; idx < p_img_mapsz.size(); idx++) { + p_img_mapsz[idx].destroy(e4); + } + Event e5 = IndexSpace<1>::compute_differences(p_img_mapsp1, p_points, p_p_test, + Realm::ProfilingRequestSet(), e2); + for(unsigned idx = 0; idx < p_img_mapsp1.size(); idx++) { + p_img_mapsp1[idx].destroy(e5); + } + Event e6 = IndexSpace<1>::compute_differences(p_img_mapss3, p_sides, p_s_test, + Realm::ProfilingRequestSet(), e3); + for(unsigned idx = 0; idx < p_img_mapss3.size(); idx++) { + p_img_mapss3[idx].destroy(e6); + } +#endif + errors += check_empty(e4, p_z_test, "p_z_test"); + errors += check_empty(e5, p_p_test, "p_p_test"); + errors += check_empty(e6, p_s_test, "p_s_test"); + for(unsigned idx = 0; idx < p_z_test.size(); idx++) { + p_z_test[idx].destroy(e4); + } + for(unsigned idx = 0; idx < p_p_test.size(); idx++) { + p_p_test[idx].destroy(e5); + } + for(unsigned idx = 0; idx < p_s_test.size(); idx++) { + p_s_test[idx].destroy(e6); + } + + return errors; + } + + virtual int check_partitioning(void) + { + int errors = 0; + + for(int pcy = 0; pcy < numpcy; pcy++) { + for(int pcx = 0; pcx < numpcx; pcx++) { + int idx = pcy * numpcx + pcx; + + int lx = zxbound[pcx + 1] - zxbound[pcx]; + int ly = zybound[pcy + 1] - zybound[pcy]; + + 
int exp_zones = lx * ly; + int exp_sides = exp_zones * 4; + int exp_points = (lx + 1) * (ly + 1); // easier because of aliasing + + int act_zones = p_zones[idx].volume(); + int act_sides = p_sides[idx].volume(); + int act_points = p_points[idx].volume(); + + if(exp_zones != act_zones) { + log_app.error() << "Piece #" << idx + << ": zone count mismatch: exp = " << exp_zones + << ", act = " << act_zones; + errors++; + } + if(exp_sides != act_sides) { + log_app.error() << "Piece #" << idx + << ": side count mismatch: exp = " << exp_sides + << ", act = " << act_sides; + errors++; + } + if(exp_points != act_points) { + log_app.error() << "Piece #" << idx + << ": point count mismatch: exp = " << exp_points + << ", act = " << act_points; + errors++; + } + } + } + + // check zones + Point<1> pz = is_zones.bounds.lo; + for(int pc = 0; pc < numpc; pc++) { + for(int i = 0; i < lz[pc]; i++) { + for(int j = 0; j < numpc; j++) { + bool exp = (j == pc); + bool act = p_zones[j].contains(pz); + if(exp != act) { + log_app.error() << "mismatch: zone " << pz << " in p_zones[" << j + << "]: exp=" << exp << " act=" << act; + errors++; + } + } + pz[0]++; + } + } + + // check sides + Point<1> ps = is_sides.bounds.lo; + for(int pc = 0; pc < numpc; pc++) { + for(int i = 0; i < ls[pc]; i++) { + for(int j = 0; j < numpc; j++) { + bool exp = (j == pc); + bool act = p_sides[j].contains(ps); + if(exp != act) { + log_app.error() << "mismatch: side " << ps << " in p_sides[" << j + << "]: exp=" << exp << " act=" << act; + errors++; + } + } + ps[0]++; + } + } + + // check points (trickier due to ghosting) + for(int py = 0; py < npy; py++) + for(int px = 0; px < npx; px++) { + Point<1> pp = global_point_pointer(py, px); + for(int pc = 0; pc < numpc; pc++) { + int pcy = pc / numpcx; + int pcx = pc % numpcx; + bool exp = ((py >= zybound[pcy]) && (py <= zybound[pcy + 1]) && + (px >= zxbound[pcx]) && (px <= zxbound[pcx + 1])); + bool act = p_points[pc].contains(pp); + if(exp != act) { + log_app.error() 
<< "mismatch: point " << pp << " in p_points[" << pc + << "]: exp=" << exp << " act=" << act; + errors++; + } + } + } + + for(unsigned idx = 0; idx < p_zones.size(); idx++) { + p_zones[idx].destroy(); + } + for(unsigned idx = 0; idx < p_sides.size(); idx++) { + p_sides[idx].destroy(); + } + for(unsigned idx = 0; idx < p_points.size(); idx++) { + p_points[idx].destroy(); + } + + return errors; + } +}; + +template > +class RandStream { +public: + RandStream(unsigned _seed) + : seed(_seed) + , idx(0) + {} + + void setpos(unsigned long long _idx) { idx = _idx; } + void adjpos(long long _adj) { idx += _adj; } + + unsigned rand_int(unsigned n) + { + unsigned v = PRNG::rand_int(seed, idx >> 32, idx, n); + idx++; + return v; + } + + float rand_float(void) + { + float v = PRNG::rand_float(seed, idx >> 32, idx); + idx++; + return v; + } + + unsigned seed; + unsigned long long idx; +}; + +template +FT randval(RandStream<> &rs); + +template <> +float randval(RandStream<> &rs) +{ + return rs.rand_float(); +} + +template <> +int randval(RandStream<> &rs) +{ + return rs.rand_int(INT_MAX); +} + +template +class RandomTest : public TestInterface { +public: + RandomTest(int argc, const char *argv[]); + virtual ~RandomTest(void); + + virtual void print_info(void); + + virtual Event initialize_data(const std::vector &memories, + const std::vector &procs); + + virtual Event perform_partitioning(void); + + virtual int perform_dynamic_checks(void); + + virtual int check_partitioning(void); + + void fill_instance_data(IndexSpace ibounds, RegionInstance inst); + +protected: + T1 base1_min, base1_max, extent1_min, extent1_max; + T2 base2_min, base2_max, extent2_min, extent2_max; + int num_pieces, num_colors; + + Rect bounds1; + Rect bounds2; + IndexSpace root1; + IndexSpace root2; + std::vector colors; + std::vector ri_data1; + std::vector, FT>> fd_vals1; + std::vector, Point>> fd_ptrs1; +}; + +template +RandomTest::RandomTest(int argc, const char *argv[]) + : base1_min(0) + , base1_max(0) 
+ , extent1_min(4) + , extent1_max(6) + , base2_min(0) + , base2_max(0) + , extent2_min(4) + , extent2_max(6) + , num_pieces(2) + , num_colors(4) +{ + RandStream<> rs(random_seed + 0); + + for(int i = 0; i < N1; i++) { + bounds1.lo[i] = base1_min + rs.rand_int(base1_max - base1_min + 1); + bounds1.hi[i] = + (bounds1.lo[i] + extent1_min + rs.rand_int(extent1_max - extent1_min + 1)); + } + for(int i = 0; i < N2; i++) { + bounds2.lo[i] = base2_min + rs.rand_int(base2_max - base2_min + 1); + bounds2.hi[i] = + (bounds2.lo[i] + extent2_min + rs.rand_int(extent2_max - extent2_min + 1)); + } + + colors.resize(num_colors); + for(int i = 0; i < num_colors; i++) + colors[i] = randval(rs); +} + +template +RandomTest::~RandomTest(void) +{} + +template +void RandomTest::print_info(void) +{ + printf("Realm dependent partitioning test - random\n"); +} + +template +void RandomTest::fill_instance_data(IndexSpace ibounds, + RegionInstance inst) +{ + { + // start with value field + AffineAccessor a_vals(inst, 0); + + // iterate over all points in root1 with initial random values + RandStream<> rs1(random_seed + 1); + for(PointInRectIterator pir(bounds1); pir.valid; pir.step()) { + FT v = colors[rs1.rand_int(colors.size())]; + if(ibounds.contains(pir.p)) + a_vals.write(pir.p, v); + } + + // print results + for(PointInRectIterator pir(bounds1); pir.valid; pir.step()) { + if(ibounds.contains(pir.p)) + log_app.debug() << "v[" << pir.p << "] = " << a_vals.read(pir.p); + } + } + + { + // now pointer field + AffineAccessor, N1, T1> a_ptrs(inst, 0 + sizeof(FT)); + + // iterate over all points in root1 with initial random values + RandStream<> rs2(random_seed + 2); + for(PointInRectIterator pir(bounds1); pir.valid; pir.step()) { + Point p2; + for(int i = 0; i < N2; i++) + p2[i] = bounds2.lo[i] + rs2.rand_int(bounds2.hi[i] - bounds2.lo[i] + 1); + if(ibounds.contains(pir.p)) + a_ptrs.write(pir.p, p2); + } + + // print results + for(PointInRectIterator pir(bounds1); pir.valid; pir.step()) { + 
if(ibounds.contains(pir.p)) + log_app.debug() << "p[" << pir.p << "] = " << a_ptrs.read(pir.p); + } + } +} + +template +Event RandomTest::initialize_data(const std::vector &memories, + const std::vector &procs) +{ + root1 = IndexSpace(bounds1); + root2 = IndexSpace(bounds2); + log_app.debug() << "root1 = " << root1; + log_app.debug() << "root2 = " << root2; + + // create instances to hold actual data + size_t num_insts = memories.size(); + log_app.debug() << "procs: " << procs; + log_app.debug() << "mems: " << memories; + std::vector> ss_inst1; + root1.create_equal_subspaces(num_insts, 1, ss_inst1, Realm::ProfilingRequestSet()) + .wait(); + + std::vector field_sizes; + field_sizes.push_back(sizeof(FT)); + field_sizes.push_back(sizeof(Point)); + + ri_data1.resize(num_insts); + fd_vals1.resize(num_insts); + fd_ptrs1.resize(num_insts); + + for(size_t i = 0; i < num_insts; i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, memories[i], ss_inst1[i], field_sizes, 0 /*SOA*/, + Realm::ProfilingRequestSet()) + .wait(); + log_app.debug() << "inst[" << i << "] = " << ri << " (" << ss_inst1[i] << ")"; + ri_data1[i] = ri; + + fd_vals1[i].index_space = ss_inst1[i]; + fd_vals1[i].inst = ri; + fd_vals1[i].field_offset = 0; + + fd_ptrs1[i].index_space = ss_inst1[i]; + fd_ptrs1[i].inst = ri; + fd_ptrs1[i].field_offset = 0 + sizeof(FT); + } + + log_app.debug() << "colors = " << colors; + + for(size_t i = 0; i < num_insts; i++) { + fill_instance_data(root1 /*ss_inst1[i]*/, ri_data1[i]); + } + + return Event::NO_EVENT; +} + +template +Event RandomTest::perform_partitioning(void) +{ + // start by filtering root1 by color + std::vector piece_colors(colors.begin(), colors.begin() + num_pieces); + std::vector> ss_by_color; + Event e1 = root1.create_subspaces_by_field(fd_vals1, piece_colors, ss_by_color, + ProfilingRequestSet()); + e1.wait(); + + for(int i = 0; i < num_pieces; i++) { + log_app.debug() << "bycolor[" << i << "] (" << colors[i] << ") = " << ss_by_color[i]; + 
dump_sparse_index_space("", ss_by_color[i]); + } + + // images + std::vector> ss_images; + Event e2 = root2.create_subspaces_by_image(fd_ptrs1, ss_by_color, ss_images, + ProfilingRequestSet(), e1); + + e2.wait(); + + for(int i = 0; i < num_pieces; i++) { + log_app.debug() << "image[" << i << "] = " << ss_images[i]; + dump_sparse_index_space("", ss_images[i]); + } + + // preimages + std::vector> ss_preimages; + Event e3 = root1.create_subspaces_by_preimage(fd_ptrs1, ss_images, ss_preimages, + ProfilingRequestSet(), e2); + + e3.wait(); + + for(int i = 0; i < num_pieces; i++) { + log_app.debug() << "preimage[" << i << "] = " << ss_preimages[i]; + dump_sparse_index_space("", ss_preimages[i]); + ss_by_color[i].destroy(); + ss_images[i].destroy(); + ss_preimages[i].destroy(); + } + + return Event::NO_EVENT; +} + +template +int RandomTest::perform_dynamic_checks(void) +{ + return 0; +} + +template +int RandomTest::check_partitioning(void) +{ + return 0; +} + +void top_level_task(const void *args, size_t arglen, const void *userdata, size_t userlen, + Processor p) +{ + int errors = 0; + + testcfg->print_info(); + + // find all the system memories - we'll stride our data across them + // for each memory, we'll need one CPU that can do the initialization of the data + std::vector sysmems; + std::vector procs; + + Machine machine = Machine::get_machine(); + { + std::set all_memories; + machine.get_all_memories(all_memories); + for(std::set::const_iterator it = all_memories.begin(); + it != all_memories.end(); it++) { + Memory m = *it; + + // skip memories with no capacity for creating instances + if(m.capacity() == 0) + continue; + + if(m.kind() == Memory::SYSTEM_MEM) { + sysmems.push_back(m); + std::set pset; + machine.get_shared_processors(m, pset); + Processor p = Processor::NO_PROC; + for(std::set::const_iterator it2 = pset.begin(); it2 != pset.end(); + it2++) { + if(it2->kind() == Processor::LOC_PROC) { + p = *it2; + break; + } + } + assert(p.exists()); + 
procs.push_back(p); + log_app.debug() << "System mem #" << (sysmems.size() - 1) << " = " + << *sysmems.rbegin() << " (" << *procs.rbegin() << ")"; + } + } + } + assert(sysmems.size() > 0); + + { + Realm::TimeStamp ts("initialization", true, &log_app); + + Event e = testcfg->initialize_data(sysmems, procs); + // wait for all initialization to be done + e.wait(); + } + + // now actual partitioning work + { + Realm::TimeStamp ts("dependent partitioning work", true, &log_app); + + Event e = testcfg->perform_partitioning(); + + e.wait(); + } + + // dynamic checks (which would be eliminated by compiler) + { + Realm::TimeStamp ts("dynamic checks", true, &log_app); + errors += testcfg->perform_dynamic_checks(); + } + + if(!skip_check) { + log_app.print() << "checking correctness of partitioning"; + Realm::TimeStamp ts("verification", true, &log_app); + errors += testcfg->check_partitioning(); + } + + if(errors > 0) { + printf("Exiting with errors\n"); + exit(1); + } + + printf("all done!\n"); +} + +template +class RandomAffineTest : public TestInterface { +public: + RandomAffineTest(int argc, const char *argv[], + const std::vector &transforms); + virtual ~RandomAffineTest(void); + + virtual void print_info(void); + + virtual Event initialize_data(const std::vector &memories, + const std::vector &procs); + + virtual Event perform_partitioning(void); + + virtual int perform_dynamic_checks(void); + + virtual int check_partitioning(void); + + void fill_instance_data(IndexSpace ibounds, RegionInstance inst); + + int verify_results(const IndexSpace &root, const TRANSFORM &transform, + const std::vector>> &images, + const std::vector>> &preimages); + +protected: + std::vector transforms; + T1 base1_min, base1_max, extent1_min, extent1_max; + T2 base2_min, base2_max, extent2_min, extent2_max; + int num_pieces, num_colors; + + // std::vector> transforms; + + std::vector>> dense_images; + std::vector>> sparse_images; + + std::vector> ss_by_color; + + std::vector>> dense_preimages; 
+ std::vector>> sparse_preimages; + + Rect bounds1; + Rect bounds2; + IndexSpace root1; + IndexSpace root2; + IndexSpace root2_sparse; + std::vector colors; + std::vector ri_data1; + std::vector, FT>> fd_vals1; +}; + +template +RandomAffineTest::RandomAffineTest( + int argc, const char *argv[], const std::vector &_transforms) + : transforms(_transforms) + , base1_min(0) + , base1_max(0) + , extent1_min(4) + , extent1_max(6) + , base2_min(0) + , base2_max(0) + , extent2_min(4) + , extent2_max(6) + , num_pieces(2) + , num_colors(4) +{ + RandStream<> rs(random_seed + 2); + + for(int i = 0; i < N1; i++) { + bounds1.lo[i] = base1_min + rs.rand_int(base1_max - base1_min + 1); + bounds1.hi[i] = + (bounds1.lo[i] + extent1_min + rs.rand_int(extent1_max - extent1_min + 1)); + } + for(int i = 0; i < N2; i++) { + bounds2.lo[i] = base2_min + rs.rand_int(base2_max - base2_min + 1); + bounds2.hi[i] = + (bounds2.lo[i] + extent2_min + rs.rand_int(extent2_max - extent2_min + 1)); + } + + colors.resize(num_colors); + + for(int i = 0; i < num_colors; i++) + colors[i] = randval(rs); + + dense_images.resize(transforms.size()); + sparse_images.resize(transforms.size()); + + dense_preimages.resize(transforms.size()); + sparse_preimages.resize(transforms.size()); +} + +template +RandomAffineTest::~RandomAffineTest(void) +{} + +template +void RandomAffineTest::print_info(void) +{ + printf("Realm dependent partitioning test - random affine\n"); +} + +template +void RandomAffineTest::fill_instance_data( + IndexSpace ibounds, RegionInstance inst) +{ + { + // start with value field + AffineAccessor a_vals(inst, 0); + + // iterate over all points in root1 with initial random values + RandStream<> rs1(random_seed + 1); + for(PointInRectIterator pir(bounds1); pir.valid; pir.step()) { + FT v = colors[rs1.rand_int(2)]; + if(ibounds.contains(pir.p)) + a_vals.write(pir.p, v); + } + + // print results + for(PointInRectIterator pir(bounds1); pir.valid; pir.step()) { + if(ibounds.contains(pir.p)) + 
log_app.debug() << "v[" << pir.p << "] = " << a_vals.read(pir.p); + } + } +} + +template +Event RandomAffineTest::initialize_data( + const std::vector &memories, const std::vector &procs) +{ + std::vector> sparse_points; + int index = 0; + for(PointInRectIterator pir(bounds2); pir.valid; pir.step()) { + if(index % 2 == 0) { + sparse_points.push_back(pir.p); + } + index++; + } + SparsityMap sparse_map = + SparsityMap::construct(sparse_points, true, true); + + root1 = IndexSpace(bounds1); + root2 = IndexSpace(bounds2); + root2_sparse = IndexSpace(bounds2, sparse_map); + + log_app.debug() << "root1 = " << root1; + log_app.debug() << "root2 = " << root2; + log_app.debug() << "root2_sparse = " << root2_sparse; + + // create instances to hold actual data + size_t num_insts = memories.size(); + log_app.debug() << "procs: " << procs; + log_app.debug() << "mems: " << memories; + std::vector> ss_inst1; + root1.create_equal_subspaces(num_insts, 1, ss_inst1, Realm::ProfilingRequestSet()) + .wait(); + + std::vector field_sizes; + field_sizes.push_back(sizeof(FT)); + field_sizes.push_back(sizeof(Point)); + + ri_data1.resize(num_insts); + fd_vals1.resize(num_insts); + + for(size_t i = 0; i < num_insts; i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, memories[i], ss_inst1[i], field_sizes, 0 /*SOA*/, + Realm::ProfilingRequestSet()) + .wait(); + log_app.debug() << "inst[" << i << "] = " << ri << " (" << ss_inst1[i] << ")"; + ri_data1[i] = ri; + + fd_vals1[i].index_space = ss_inst1[i]; + fd_vals1[i].inst = ri; + fd_vals1[i].field_offset = 0; + } + + log_app.debug() << "colors = " << colors; + + for(size_t i = 0; i < num_insts; i++) { + fill_instance_data(root1 /*ss_inst1[i]*/, ri_data1[i]); + } + + return Event::NO_EVENT; +} + +template +Event RandomAffineTest::perform_partitioning(void) +{ + // start by filtering root1 by color + std::vector piece_colors(colors.begin(), colors.begin() + num_pieces); + + Event e1 = root1.create_subspaces_by_field(fd_vals1, 
piece_colors, ss_by_color, + ProfilingRequestSet()); + e1.wait(); + + for(int i = 0; i < num_pieces; i++) { + log_app.debug() << "bycolor[" << i << "] (" << colors[i] << ") = " << ss_by_color[i]; + dump_sparse_index_space("", ss_by_color[i]); + } + + for(size_t idx = 0; idx < transforms.size(); idx++) { + log_app.debug() << "Compute images for transform idx=" << idx; + + unsigned long long start_time = Clock::current_time_in_nanoseconds(); + // images + Event e2 = root2.create_subspaces_by_image( + transforms[idx], ss_by_color, dense_images[idx], ProfilingRequestSet(), e1); + e2.wait(); + + log_app.debug() << "affine image time=" + << (Clock::current_time_in_nanoseconds() - start_time); + + for(int i = 0; i < num_pieces; i++) { + log_app.debug() << "image[" << i << "] = " << dense_images[idx][i]; + dump_sparse_index_space("", dense_images[idx][i]); + } + + start_time = Clock::current_time_in_nanoseconds(); + Event e3 = root2_sparse.create_subspaces_by_image( + transforms[idx], ss_by_color, sparse_images[idx], ProfilingRequestSet(), e2); + + e3.wait(); + log_app.debug() << "affine sparse image time=" + << (Clock::current_time_in_nanoseconds() - start_time); + + for(int i = 0; i < num_pieces; i++) { + log_app.debug() << "sparse_image1[" << i << "] = " << sparse_images[idx][i]; + dump_sparse_index_space("", sparse_images[idx][i]); + } + + // preimages + Event e4 = root1.create_subspaces_by_preimage(transforms[idx], dense_images[idx], + dense_preimages[idx], + ProfilingRequestSet(), e3); + e4.wait(); + + for(int i = 0; i < num_pieces; i++) { + log_app.debug() << "dense_preimage[" << i << "] = " << dense_preimages[idx][i]; + dump_sparse_index_space("", dense_preimages[idx][i]); + } + + Event e5 = root1.create_subspaces_by_preimage(transforms[idx], sparse_images[idx], + sparse_preimages[idx], + ProfilingRequestSet(), e4); + e5.wait(); + + for(int i = 0; i < num_pieces; i++) { + log_app.debug() << "sparse_preimage[" << i << "] = " << sparse_preimages[idx][i]; + 
dump_sparse_index_space("", sparse_preimages[idx][i]); + } + } + + return Event::NO_EVENT; +} + +template +int RandomAffineTest::perform_dynamic_checks(void) +{ + return 0; +} + +template +int RandomAffineTest::verify_results( + const IndexSpace &root, const TRANSFORM &transform, + const std::vector>> &images, + const std::vector>> &preimages) +{ + for(size_t idx = 0; idx < transforms.size(); idx++) { + assert(ss_by_color.size() == images[idx].size() && + images[idx].size() == preimages[idx].size()); + int image_total = 0; + for(const auto &image : images[idx]) { + for(IndexSpaceIterator it2(image); it2.valid; it2.step()) { + image_total += it2.rect.volume(); + } + } + + int preimage_total = 0; + for(const auto &preimage : preimages[idx]) { + for(IndexSpaceIterator it2(preimage); it2.valid; it2.step()) { + preimage_total += it2.rect.volume(); + } + } + + if(image_total != preimage_total) + return 1; + + for(size_t i = 0; i < ss_by_color.size(); i++) { + for(IndexSpaceIterator it(ss_by_color[i]); it.valid; it.step()) { + for(PointInRectIterator point(it.rect); point.valid; point.step()) { + auto target_point = transforms[idx][point.p]; + if(root.contains(target_point)) { + if(!images[idx][i].contains(target_point)) { + return 1; + } + if(!preimages[idx][i].contains(point.p)) { + return 1; + } + } + } + } + } + } + return 0; +} + +template +int RandomAffineTest::check_partitioning(void) +{ + int result = 0; + for(size_t i = 0; i < transforms.size(); i++) { + if(verify_results(root2, transforms[i], dense_images, dense_preimages) || + verify_results(root2_sparse, transforms[i], sparse_images, sparse_preimages)) { + result++; + } + } + root1.destroy(); + root2.destroy(); + root2_sparse.destroy(); + for(unsigned i = 0; i < dense_images.size(); i++) { + for(unsigned j = 0; j < dense_images[i].size(); j++) { + dense_images[i][j].destroy(); + } + } + for(unsigned i = 0; i < sparse_images.size(); i++) { + for(unsigned j = 0; j < sparse_images[i].size(); j++) { + 
sparse_images[i][j].destroy(); + } + } + for(unsigned i = 0; i < dense_preimages.size(); i++) { + for(unsigned j = 0; j < dense_preimages[i].size(); j++) { + dense_preimages[i][j].destroy(); + } + } + for(unsigned i = 0; i < sparse_preimages.size(); i++) { + for(unsigned j = 0; j < sparse_preimages[i].size(); j++) { + sparse_preimages[i][j].destroy(); + } + } + return result; +} + +template +std::vector> create_translate_transforms(int size) +{ + RandStream<> rs(random_seed + 2); + std::vector> transforms; + { + TranslationTransform translate; + translate.offset = Point::ZEROES(); + for(int i = 0; i < N2; i++) { + translate.offset[i] = rs.rand_int(size - 1); + } + transforms.push_back(translate); + } + return transforms; +} + +template +std::vector> create_affine_transforms() +{ + std::vector> transforms; + + { + AffineTransform transpose; + for(int i = 0; i < N2; i++) { + for(int j = 0; j < N1; j++) { + transpose.transform[i][j] = (i == N1 - j - 1); + } + } + transpose.offset = Point::ZEROES(); + transforms.push_back(transpose); + } + + { + AffineTransform translate; + for(int i = 0; i < N2; i++) { + for(int j = 0; j < N1; j++) { + translate.transform[i][j] = (i == j); + } + } + translate.offset = Point::ZEROES(); + transforms.push_back(translate); + } + + { + AffineTransform scale; + for(int i = 0; i < N2; i++) { + for(int j = 0; j < N1; j++) { + scale.transform[i][j] = (i == j) ? 2 : 0; + } + } + scale.offset = Point::ZEROES(); + transforms.push_back(scale); + } + + { + AffineTransform shear; + for(int i = 0; i < N2; i++) { + for(int j = 0; j < N1; j++) { + shear.transform[i][j] = (i == j); + } + shear.transform[i][i + 1] = 1; + } + shear.offset = Point::ZEROES(); + transforms.push_back(shear); + } + + { + AffineTransform reflect; + for(int i = 0; i < N2; i++) { + for(int j = 0; j < N1; j++) { + reflect.transform[i][j] = (i == j) ? 
-1 : 0; + } + } + reflect.offset = Point::ZEROES(); + // transforms.push_back(reflect); + } + return transforms; +} + +TestInterface *run_structured_test(TransformType type, int argc, char **argv) +{ + switch(type) { + case TransformType::AFFINE: + return new RandomAffineTest<2, int, 2, int, int, AffineTransform<2, 2, int>>( + argc, const_cast(argv), + create_affine_transforms<2, int, 2, int, int>()); + case TransformType::TRANSLATION: + return new RandomAffineTest<2, int, 2, int, int, TranslationTransform<2, int>>( + argc, const_cast(argv), + create_translate_transforms<2, int, 2, int, int>(4)); + } + return nullptr; +} + +int main(int argc, char **argv) +{ + Runtime rt; + + rt.init(&argc, &argv); + + // parse global options + for(int i = 1; i < argc; i++) { + if(!strcmp(argv[i], "-seed")) { + random_seed = atoi(argv[++i]); + continue; + } + + if(!strcmp(argv[i], "-random")) { + random_colors = true; + continue; + } + + if(!strcmp(argv[i], "-wait")) { + wait_on_events = true; + continue; + } + + if(!strcmp(argv[i], "-show")) { + show_graph = true; + continue; + } + + if(!strcmp(argv[i], "-nocheck")) { + skip_check = true; + continue; + } + + // test cases consume the rest of the args + if(!strcmp(argv[i], "circuit")) { + testcfg = new CircuitTest(argc - i, const_cast(argv + i)); + break; + } + + if(!strcmp(argv[i], "basic")) { + testcfg = new BasicTest(argc - i, const_cast(argv + i)); + break; + } + + if(!strcmp(argv[i], "tile")) { + testcfg = new TileTest(argc - i, const_cast(argv + i)); + break; + } + + if (!strcmp(argv[i], "range")) { + testcfg = new RangeTest(argc - i, const_cast(argv + i)); + break; + } + + if (!strcmp(argv[i], "multi")) { + testcfg = new Range2DTest(argc - i, const_cast(argv + i)); + break; + } + + if(!strcmp(argv[i], "pennant")) { + testcfg = new PennantTest(argc - i, const_cast(argv + i)); + break; + } + + if(!strcmp(argv[i], "miniaero")) { + testcfg = new MiniAeroTest(argc - i, const_cast(argv + i)); + break; + } + + if(!strcmp(argv[i], 
"random")) { + testcfg = new RandomTest<1, int, 2, int, int>(argc - i, + const_cast(argv + i)); + break; + } + + if(!strcmp(argv[i], "affine")) { + TransformType type = TransformType::AFFINE; + if(i < argc - 1 && !strcmp(argv[++i], "-type")) { + type = static_cast(atoi(argv[++i])); + } + testcfg = run_structured_test(type, argc, argv); + break; + } + + // printf("unknown parameter: %s\n", argv[i]); + } + + // if no test specified, use circuit (with default parameters) + if(!testcfg) { + testcfg = new CircuitTest(0, 0); + } + + rt.register_task(TOP_LEVEL_TASK, top_level_task); + rt.register_task(INIT_CIRCUIT_DATA_TASK, CircuitTest::init_data_task_wrapper); + rt.register_task(INIT_PENNANT_DATA_TASK, PennantTest::init_data_task_wrapper); + rt.register_task(INIT_BASIC_DATA_TASK, BasicTest::init_data_task_wrapper); + rt.register_task(INIT_TILE_DATA_TASK, TileTest::init_data_task_wrapper); + rt.register_task(INIT_RANGE_DATA_TASK, RangeTest::init_data_task_wrapper); + rt.register_task(INIT_RANGE2D_DATA_TASK, Range2DTest::init_data_task_wrapper); + rt.register_task(INIT_MINIAERO_DATA_TASK, MiniAeroTest::init_data_task_wrapper); + + signal(SIGALRM, sigalrm_handler); + + Processor p = Machine::ProcessorQuery(Machine::get_machine()) + .only_kind(Processor::LOC_PROC) + .first(); + assert(p.exists()); + + // collective launch of a single task - everybody gets the same finish + // event + Event e = rt.collective_spawn(p, TOP_LEVEL_TASK, 0, 0); + + // request shutdown once that task is complete + rt.shutdown(e); + + // now sleep this thread until that shutdown actually happens + rt.wait_for_shutdown(); + + delete testcfg; + + return 0; +} diff --git a/tests/deppart.cc b/tests/deppart.cc index 8fde66845d..b6847f5513 100644 --- a/tests/deppart.cc +++ b/tests/deppart.cc @@ -502,25 +502,29 @@ class BasicTest : public TestInterface { wait_on_events = true; log_app.info() << "warming up" << Clock::current_time_in_microseconds() << "\n"; const char* val = std::getenv("TILE_SIZE"); // or 
any env var - size_t tile_size = 100000000; //default + size_t tile_size = 10000000; //default if (val) { tile_size = atoi(val); } std::vector byte_fields = {sizeof(char)}; IndexSpace<1> instance_index_space(Rect<1>(0, tile_size-1)); + IndexSpace<1> dst_index_space(Rect<1>(0, tile_size/100-1)); for (size_t i = 0; i < piece_field_data_gpu.size(); i++) { RegionInstance::create_instance(piece_field_data_gpu[i].scratch_buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); } for (size_t i = 0; i < src_field_data_gpu.size(); i++) { RegionInstance::create_instance(src_field_data_gpu[i].scratch_buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); } + for (size_t i = 0; i < dst_field_data_gpu.size(); i++) { + RegionInstance::create_instance(dst_field_data_gpu[i].scratch_buffer, gpu_memory, dst_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); + } std::vector > p_garbage_nodes, p_garbage_edges, p_garbage_rd, p_garbage_preimage_edges; Event e01 = is_nodes.create_subspaces_by_field(piece_field_data_gpu, colors, p_garbage_nodes, Realm::ProfilingRequestSet()); if (wait_on_events) e01.wait(); - Event e02 = is_edges.create_subspaces_by_preimage(dst_node_field_data, + Event e02 = is_edges.create_subspaces_by_preimage(dst_field_data_gpu, p_garbage_nodes, p_garbage_edges, Realm::ProfilingRequestSet(), @@ -536,7 +540,7 @@ class BasicTest : public TestInterface { e02); if(wait_on_events) e03.wait(); - Event e04 = is_edges.create_subspaces_by_preimage(dst_node_field_data, + Event e04 = is_edges.create_subspaces_by_preimage(dst_field_data_gpu, p_garbage_rd, p_garbage_preimage_edges, Realm::ProfilingRequestSet(), @@ -553,7 +557,7 @@ class BasicTest : public TestInterface { log_app.info() << "GPU By Field complete " << Clock::current_time_in_microseconds() << "\n"; log_app.info() << "Starting GPU Preimage " << Clock::current_time_in_microseconds() << "\n"; // now compute p_edges based on 
the color of their in_node (i.e. a preimage) - Event e2 = is_edges.create_subspaces_by_preimage(dst_node_field_data, + Event e2 = is_edges.create_subspaces_by_preimage(dst_field_data_gpu, p_nodes, p_edges, Realm::ProfilingRequestSet(), @@ -576,7 +580,7 @@ class BasicTest : public TestInterface { log_app.info() << "GPU Image complete " << Clock::current_time_in_microseconds() << "\n"; log_app.info() << "Starting second GPU preimage " << Clock::current_time_in_microseconds() << "\n"; - Event e4 = is_edges.create_subspaces_by_preimage(dst_node_field_data, + Event e4 = is_edges.create_subspaces_by_preimage(dst_field_data_gpu, p_rd, p_preimage_edges, Realm::ProfilingRequestSet(), @@ -1647,7 +1651,7 @@ class RangeTest : public TestInterface { } is_nodes.by_image_buffer_requirements(subspace_input, image_estimate_input, image_estimate_output); for (size_t i = 0; i < rect_val_data_gpu.size(); i++) { - IndexSpace<1> instance_index_space(Rect<1>(0, (image_estimate_output[i].upper_bound)/4-1)); + IndexSpace<1> instance_index_space(Rect<1>(0, (image_estimate_output[i].upper_bound)/12-1)); RegionInstance::create_instance(rect_val_data_gpu[i].scratch_buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); } Event e002 = is_nodes.create_subspaces_by_image(rect_val_data_gpu, From 1fc63681df000fe7706324aed709821ac9b584b3 Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Mon, 23 Feb 2026 01:41:38 -0800 Subject: [PATCH 17/32] benchmarks done for byfield and image --- tests/benchmark.cc | 4793 ++++---------------------------------------- 1 file changed, 362 insertions(+), 4431 deletions(-) diff --git a/tests/benchmark.cc b/tests/benchmark.cc index b6847f5513..9615a3bcbc 100644 --- a/tests/benchmark.cc +++ b/tests/benchmark.cc @@ -40,13 +40,12 @@ Logger log_app("app"); enum { TOP_LEVEL_TASK = Processor::TASK_ID_FIRST_AVAILABLE + 0, - INIT_CIRCUIT_DATA_TASK, - INIT_BASIC_DATA_TASK, - INIT_TILE_DATA_TASK, - INIT_RANGE_DATA_TASK, - 
INIT_RANGE2D_DATA_TASK, - INIT_PENNANT_DATA_TASK, - INIT_MINIAERO_DATA_TASK, + INIT_BYFIELD_DATA_TASK, + INIT_IMAGE_DATA_TASK, +}; + +enum TestType { + BYFIELD = 0 }; enum TransformType @@ -84,41 +83,6 @@ void sigalrm_handler(int sig) exit(1); } -template -void dump_sparse_index_space(const char *pfx, IndexSpace is) -{ - std::cout << pfx << ": " << is << "\n"; - if(!is.sparsity.exists()) - return; - SparsityMapPublicImpl *impl = is.sparsity.impl(); - span> entries = impl->get_entries(); - for(size_t i = 0; i < entries.size(); i++) { - SparsityMapEntry entry = entries[i]; - std::cout << " " << entry.bounds; - if(entry.bitmap) - std::cout << " bitmap(" << entry.bitmap << ")"; - if(entry.sparsity.exists()) - std::cout << " sparsity(" << entry.sparsity << ")"; - std::cout << "\n"; - } -} - -static int check_empty(Event e, const std::vector> &p, const char *pfx) -{ - int errors = 0; - e.wait(); - for(size_t i = 0; i < p.size(); i++) { - p[i].make_valid().wait(); - if(p[i].volume() > 0) { - log_app.error() << "HELP! 
" << pfx << "[" << i << "] space " << p[i] - << " isn't empty?"; - dump_sparse_index_space(pfx, p[i]); - errors++; - } - } - return errors; -} - class TestInterface { public: virtual ~TestInterface(void) {} @@ -142,43 +106,64 @@ namespace { bool wait_on_events = false; bool show_graph = false; bool skip_check = false; + int dimension1 = 1; + int dimension2 = 1; + TestType test_type = BYFIELD; + TestInterface *testcfg = 0; }; // namespace -template -void split_evenly(T total, T pieces, std::vector &cuts) +template +Event copy_piece(FieldDataDescriptor src_data, FieldDataDescriptor &dst_data, const std::vector &fields, size_t field_idx, Memory dst_memory) { - cuts.resize(pieces + 1); - for(T i = 0; i <= pieces; i++) - cuts[i] = ((long long)total * i) / pieces; + size_t offset = 0; + for (size_t i = 0; i < field_idx; i++) { + offset += fields[i]; + } + size_t size = fields[field_idx]; + RegionInstance::create_instance(dst_data.inst, + dst_memory, + src_data.index_space, + fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + CopySrcDstField src_field, dst_field; + src_field.inst = src_data.inst; + src_field.size = size; + src_field.field_id = offset; + dst_field.inst = dst_data.inst; + dst_field.size = size; + dst_field.field_id = offset; + dst_data.index_space = src_data.index_space; + dst_data.field_offset = src_data.field_offset; + std::vector src_fields = {src_field}; + std::vector dst_fields = {dst_field}; + return src_data.index_space.copy(src_fields, dst_fields, Realm::ProfilingRequestSet()); } -template -int find_split(const std::vector &cuts, T v) -{ - // dumb linear search - assert(v >= cuts[0]); - for(size_t i = 1; i < cuts.size(); i++) - if(v < cuts[i]) - return i - 1; - assert(false); - return 0; +Event alloc_piece(RegionInstance &result, size_t size, Memory location) { + assert(location != Memory::NO_MEMORY); + assert(size > 0); + std::vector byte_fields = {sizeof(char)}; + IndexSpace<1> instance_index_space(Rect<1>(0, size-1)); + return 
RegionInstance::create_instance(result, location, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()); } /* - * Basic test - create a graph, partition it by + * Byfield test - create a graph, partition it by * node subgraph id and then check that the partitioning * is correct */ -class BasicTest : public TestInterface { +template +class ByfieldTest : public TestInterface { public: // graph config parameters int num_nodes = 1000; - int num_edges = 1000; int num_pieces = 4; + int num_colors = 4; std::string filename; - BasicTest(int argc, const char *argv[]) + ByfieldTest(int argc, const char *argv[]) { for(int i = 1; i < argc; i++) { @@ -190,23 +175,22 @@ class BasicTest : public TestInterface { num_nodes = atoi(argv[++i]); continue; } - if(!strcmp(argv[i], "-e")) { - num_edges = atoi(argv[++i]); + if(!strcmp(argv[i], "-c")) { + num_colors = atoi(argv[++i]); continue; } } - if (num_nodes <= 0 || num_pieces <= 0 || num_edges <= 0) { - log_app.error() << "Invalid config: nodes=" << num_nodes << " edges=" << num_edges << " pieces=" << num_pieces << "\n"; + if (num_nodes <= 0 || num_pieces <= 0 || num_colors <= 0) { + log_app.error() << "Invalid config: nodes=" << num_nodes << " colors=" << num_colors << " pieces=" << num_pieces << "\n"; exit(1); } } struct InitDataArgs { int index; - RegionInstance ri_nodes; - RegionInstance ri_edges; + RegionInstance ri_colors; }; enum PRNGStreams @@ -215,25 +199,18 @@ class BasicTest : public TestInterface { }; // assign subgraph ids to nodes - void random_node_data(int idx, int &subgraph) + void color_point(int idx, int& color) { if(random_colors) - subgraph = - Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_pieces); - else - subgraph = idx * num_pieces / num_nodes; - } - - void random_edge_data(int idx, int& src, int& dst) - { - src = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_nodes); - dst = Philox_2x32<>::rand_int(random_seed, idx + 1, NODE_SUBGRAPH_STREAM, num_nodes); 
+ color = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_colors); + else + color = (idx * num_colors / num_nodes) % num_colors; } static void init_data_task_wrapper(const void *args, size_t arglen, const void *userdata, size_t userlen, Processor p) { - BasicTest *me = (BasicTest *)testcfg; + ByfieldTest *me = (ByfieldTest *)testcfg; me->init_data_task(args, arglen, p); } @@ -242,95 +219,68 @@ class BasicTest : public TestInterface { { const InitDataArgs &i_args = *(const InitDataArgs *)args; - log_app.info() << "init task #" << i_args.index << " (ri_nodes=" << i_args.ri_nodes + log_app.info() << "init task #" << i_args.index << " (ri_nodes=" << i_args.ri_colors << ")"; - i_args.ri_nodes.fetch_metadata(p).wait(); - i_args.ri_edges.fetch_metadata(p).wait(); + i_args.ri_colors.fetch_metadata(p).wait(); - IndexSpace<1> is_nodes = i_args.ri_nodes.get_indexspace<1>(); - IndexSpace<1> is_edges = i_args.ri_edges.get_indexspace<1>(); + IndexSpace colors_space = i_args.ri_colors.template get_indexspace(); - log_app.debug() << "N: " << is_nodes; - log_app.debug() << "E: " << is_edges; + log_app.debug() << "N: " << is_colors; //For each node in the graph, mark it with a random (or deterministic) subgraph id { - AffineAccessor a_piece_id(i_args.ri_nodes, 0 /* offset */); - - for(int i = is_nodes.bounds.lo; i <= is_nodes.bounds.hi; i++) { - int subgraph; - random_node_data(i, subgraph); - a_piece_id.write(i, subgraph); - } - - AffineAccessor,1> a_src(i_args.ri_edges, 0 /* offset */); - AffineAccessor,1> a_dst(i_args.ri_edges, sizeof(Point<1>)/* offset */); - - for(int i = is_edges.bounds.lo; i <= is_edges.bounds.hi; i++) { - int src, dst; - random_edge_data(i, src, dst); - a_src.write(i, Point<1>(src)); - a_dst.write(i, Point<1>(dst)); + AffineAccessor a_piece_id(i_args.ri_colors, 0 /* offset */); + + for (IndexSpaceIterator it(is_colors); it.valid; it.step()) { + for (PointInRectIterator point(it.rect); point.valid; point.step()) { + int idx = 0; + int 
stride = 1; + for (int d = 0; d < N; d++) { + idx += (point.p[d] - is_colors.bounds.lo[d]) * stride; + stride *= (is_colors.bounds.hi[d] - is_colors.bounds.lo[d] + 1); + } + int subgraph; + color_point(idx, subgraph); + a_piece_id.write(point.p, subgraph); + } } } - - //Optionally print out the assigned subgraph ids - if(show_graph) { - AffineAccessor a_piece_id(i_args.ri_nodes, 0 /* offset */); - - for(int i = is_nodes.bounds.lo; i <= is_nodes.bounds.hi; i++) - log_app.info() << "piece_id[" << i << "] = " << a_piece_id.read(i) << "\n"; - - AffineAccessor,1> a_src(i_args.ri_edges, 0 /* offset */); - AffineAccessor,1> a_dst(i_args.ri_edges, sizeof(Point<1>)/* offset */); - - for(int i = is_edges.bounds.lo; i <= is_edges.bounds.hi; i++) - log_app.info() << "src, dst[" << i << "] = " << a_src.read(i) << ", " << a_dst.read(i) << "\n"; - } } - IndexSpace<1> is_nodes, is_edges; - std::vector ri_nodes, ri_edges; - std::vector, int> > piece_id_field_data; - std::vector, Point<1> > > src_node_field_data, dst_node_field_data; + IndexSpace is_colors; + std::vector ri_colors; + std::vector, int> > piece_id_field_data; virtual void print_info(void) { - printf("Realm dependent partitioning test - basic: %d nodes, %d edges, %d pieces\n", - (int)num_nodes, (int) num_edges, (int)num_pieces); + printf("Realm %dD Byfield dependent partitioning test: %d nodes, %d colors, %d pieces\n", (int) N, + (int)num_nodes, (int) num_colors, (int)num_pieces); } virtual Event initialize_data(const std::vector &memories, const std::vector &procs) { // now create index space for nodes - is_nodes = Rect<1>(0, num_nodes - 1); - is_edges = Rect<1>(0, num_edges - 1); + Point lo, hi; + for (int d = 0; d < N; d++) { + lo[d] = 0; + hi[d] = num_nodes - 1; + } + is_colors = Rect(lo, hi); // equal partition is used to do initial population of edges and nodes - std::vector > ss_nodes_eq; - std::vector > ss_edges_eq; + std::vector > ss_nodes_eq; log_app.info() << "Creating equal subspaces\n"; - 
is_nodes.create_equal_subspaces(num_pieces, 1, ss_nodes_eq, Realm::ProfilingRequestSet()).wait(); - is_edges.create_equal_subspaces(num_pieces, 1, ss_edges_eq, Realm::ProfilingRequestSet()).wait(); - - log_app.debug() << "Initial partitions:"; - for(size_t i = 0; i < ss_nodes_eq.size(); i++) - log_app.debug() << " Nodes #" << i << ": " << ss_nodes_eq[i]; - for(size_t i = 0; i < ss_edges_eq.size(); i++) - log_app.debug() << " Edges #" << i << ": " << ss_edges_eq[i]; + is_colors.create_equal_subspaces(num_pieces, 1, ss_nodes_eq, Realm::ProfilingRequestSet()).wait(); // create instances for each of these subspaces - std::vector node_fields, edge_fields; - node_fields.push_back(sizeof(int)); // piece_id - assert(sizeof(int) == sizeof(Point<1>)); - edge_fields.push_back(sizeof(Point<1>)); // src_node - edge_fields.push_back(sizeof(Point<1>)); // dst_node - - ri_nodes.resize(num_pieces); + std::vector node_fields; + node_fields.push_back(sizeof(int)); + + ri_colors.resize(num_pieces); piece_id_field_data.resize(num_pieces); for(size_t i = 0; i < ss_nodes_eq.size(); i++) { @@ -339,47 +289,21 @@ class BasicTest : public TestInterface { node_fields, 0 /*SOA*/, Realm::ProfilingRequestSet()) .wait(); - ri_nodes[i] = ri; + ri_colors[i] = ri; piece_id_field_data[i].index_space = ss_nodes_eq[i]; - piece_id_field_data[i].inst = ri_nodes[i]; + piece_id_field_data[i].inst = ri_colors[i]; piece_id_field_data[i].field_offset = 0; } - - // Fire off tasks to initialize data - ri_edges.resize(num_pieces); - src_node_field_data.resize(num_pieces); - dst_node_field_data.resize(num_pieces); - - for(size_t i = 0; i < ss_edges_eq.size(); i++) { - RegionInstance ri; - RegionInstance::create_instance(ri, - memories[i % memories.size()], - ss_edges_eq[i], - edge_fields, - 0 /*SOA*/, - Realm::ProfilingRequestSet()).wait(); - ri_edges[i] = ri; - - src_node_field_data[i].index_space = ss_edges_eq[i]; - src_node_field_data[i].inst = ri_edges[i]; - src_node_field_data[i].field_offset = 0 * 
sizeof(Point<1>); - - dst_node_field_data[i].index_space = ss_edges_eq[i]; - dst_node_field_data[i].inst = ri_edges[i]; - dst_node_field_data[i].field_offset = 1 * sizeof(Point<1>); - } - // fire off tasks to initialize data std::set events; for(int i = 0; i < num_pieces; i++) { Processor p = procs[i % procs.size()]; InitDataArgs args; args.index = i; - args.ri_nodes = ri_nodes[i]; - args.ri_edges = ri_edges[i]; - Event e = p.spawn(INIT_BASIC_DATA_TASK, &args, sizeof(args)); + args.ri_colors = ri_colors[i]; + Event e = p.spawn(INIT_BYFIELD_DATA_TASK, &args, sizeof(args)); events.insert(e); } @@ -391,19 +315,15 @@ class BasicTest : public TestInterface { // p_nodes_cpu - nodes partitioned by subgraph id (from CPU) - std::vector > p_nodes, p_rd; - std::vector > p_edges, p_preimage_edges; - - std::vector > p_nodes_cpu, p_rd_cpu; - std::vector > p_edges_cpu, p_preimage_edges_cpu; + std::vector > p_nodes, p_garbage_nodes, p_nodes_cpu; virtual Event perform_partitioning(void) { // Partition nodes by subgraph id - do this twice, once on CPU and once on GPU // Ensure that the results are identical - std::vector colors(num_pieces); - for(int i = 0; i < num_pieces; i++) + std::vector colors(num_colors); + for(int i = 0; i < num_colors; i++) colors[i] = i; // We need a GPU memory for GPU partitioning @@ -423,210 +343,52 @@ class BasicTest : public TestInterface { log_app.error() << "No GPU memory found for partitioning test\n"; return Event::NO_EVENT; } - std::vector edge_fields; - edge_fields.push_back(sizeof(Point<1>)); - edge_fields.push_back(sizeof(Point<1>)) ; + + std::vector node_fields; node_fields.push_back(sizeof(int)); - std::vector, Point<1> > > src_field_data_gpu; - std::vector, Point<1> > > dst_field_data_gpu; - std::vector, int> > piece_field_data_gpu; + std::vector, int> > piece_field_data_gpu; piece_field_data_gpu.resize(num_pieces); - src_field_data_gpu.resize(num_pieces); - dst_field_data_gpu.resize(num_pieces); + for (int i = 0; i < num_pieces; i++) { - 
RegionInstance src_gpu_instance; - RegionInstance dst_gpu_instance; - RegionInstance piece_gpu_instance; - RegionInstance::create_instance(src_gpu_instance, - gpu_memory, - src_node_field_data[i].index_space, - edge_fields, - 0 /*SOA*/, - Realm::ProfilingRequestSet()).wait(); - RegionInstance::create_instance(dst_gpu_instance, - gpu_memory, - dst_node_field_data[i].index_space, - edge_fields, - 0 /*SOA*/, - Realm::ProfilingRequestSet()).wait(); - RegionInstance::create_instance(piece_gpu_instance, - gpu_memory, - piece_id_field_data[i].index_space, - node_fields, - 0 /*SOA*/, - Realm::ProfilingRequestSet()).wait(); - CopySrcDstField src_gpu_field, src_cpu_field, dst_gpu_field, dst_cpu_field, piece_gpu_field, piece_cpu_field; - src_gpu_field.inst = src_gpu_instance; - src_gpu_field.size = sizeof(Point<1>); - src_gpu_field.field_id = 0; - src_cpu_field.inst = src_node_field_data[i].inst; - src_cpu_field.size = sizeof(Point<1>); - src_cpu_field.field_id = 0; - dst_gpu_field.inst = dst_gpu_instance; - dst_gpu_field.size = sizeof(Point<1>); - dst_gpu_field.field_id = sizeof(Point<1>); - dst_cpu_field.inst = dst_node_field_data[i].inst; - dst_cpu_field.size = sizeof(Point<1>); - dst_cpu_field.field_id = sizeof(Point<1>); - piece_gpu_field.inst = piece_gpu_instance; - piece_gpu_field.size = sizeof(int); - piece_gpu_field.field_id = 0; - piece_cpu_field.inst = piece_id_field_data[i].inst; - piece_cpu_field.size = sizeof(int); - piece_cpu_field.field_id = 0; - std::vector src_cpu_data, src_gpu_data, dst_cpu_data, dst_gpu_data, piece_cpu_data, piece_gpu_data; - src_cpu_data.push_back(src_cpu_field); - dst_cpu_data.push_back(dst_cpu_field); - src_gpu_data.push_back(src_gpu_field); - dst_gpu_data.push_back(dst_gpu_field); - piece_gpu_data.push_back(piece_gpu_field); - piece_cpu_data.push_back(piece_cpu_field); - Event copy_event = src_node_field_data[i].index_space.copy(src_cpu_data, src_gpu_data, Realm::ProfilingRequestSet()); - copy_event.wait(); - Event second_copy_event = 
dst_node_field_data[i].index_space.copy(dst_cpu_data, dst_gpu_data, Realm::ProfilingRequestSet()); - second_copy_event.wait(); - Event third_copy_event = piece_id_field_data[i].index_space.copy(piece_cpu_data, piece_gpu_data, Realm::ProfilingRequestSet()); - third_copy_event.wait(); - src_field_data_gpu[i].inst = src_gpu_instance; - src_field_data_gpu[i].index_space = src_node_field_data[i].index_space; - src_field_data_gpu[i].field_offset = 0; - dst_field_data_gpu[i].inst = dst_gpu_instance; - dst_field_data_gpu[i].index_space = dst_node_field_data[i].index_space; - dst_field_data_gpu[i].field_offset = 1 * sizeof(Point<1>); - piece_field_data_gpu[i].inst = piece_gpu_instance; - piece_field_data_gpu[i].index_space = piece_id_field_data[i].index_space; - piece_field_data_gpu[i].field_offset = 0; + copy_piece(piece_id_field_data[i], piece_field_data_gpu[i], node_fields, 0, gpu_memory).wait(); } - wait_on_events = true; - log_app.info() << "warming up" << Clock::current_time_in_microseconds() << "\n"; - const char* val = std::getenv("TILE_SIZE"); // or any env var - size_t tile_size = 10000000; //default - if (val) { - tile_size = atoi(val); - } - std::vector byte_fields = {sizeof(char)}; - IndexSpace<1> instance_index_space(Rect<1>(0, tile_size-1)); - IndexSpace<1> dst_index_space(Rect<1>(0, tile_size/100-1)); - for (size_t i = 0; i < piece_field_data_gpu.size(); i++) { - RegionInstance::create_instance(piece_field_data_gpu[i].scratch_buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); - } - for (size_t i = 0; i < src_field_data_gpu.size(); i++) { - RegionInstance::create_instance(src_field_data_gpu[i].scratch_buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); - } - for (size_t i = 0; i < dst_field_data_gpu.size(); i++) { - RegionInstance::create_instance(dst_field_data_gpu[i].scratch_buffer, gpu_memory, dst_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); - } - 
std::vector > p_garbage_nodes, p_garbage_edges, p_garbage_rd, p_garbage_preimage_edges; - Event e01 = is_nodes.create_subspaces_by_field(piece_field_data_gpu, + + std::vector> byfield_inputs(num_pieces); + std::vector byfield_requirements(num_pieces); + + for (int i = 0; i < num_pieces; i++) { + byfield_inputs[i].location = piece_field_data_gpu[i].inst.get_location(); + byfield_inputs[i].space = piece_field_data_gpu[i].index_space; + } + + is_colors.by_field_buffer_requirements(byfield_inputs, byfield_requirements); + + for (int i = 0; i < num_pieces; i++) { + alloc_piece(piece_field_data_gpu[i].scratch_buffer, byfield_requirements[i].upper_bound, gpu_memory).wait(); + } + + wait_on_events = true; + log_app.info() << "warming up" << Clock::current_time_in_microseconds() << "\n"; + Event warmup = is_colors.create_subspaces_by_field(piece_field_data_gpu, colors, p_garbage_nodes, Realm::ProfilingRequestSet()); - if (wait_on_events) e01.wait(); - Event e02 = is_edges.create_subspaces_by_preimage(dst_field_data_gpu, - p_garbage_nodes, - p_garbage_edges, - Realm::ProfilingRequestSet(), - e01); - if(wait_on_events) e02.wait(); - - // an image of p_edges through out_node gives us all the shared nodes, along - // with some private nodes - Event e03 = is_nodes.create_subspaces_by_image(src_field_data_gpu, - p_garbage_edges, - p_garbage_rd, - Realm::ProfilingRequestSet(), - e02); - if(wait_on_events) e03.wait(); - - Event e04 = is_edges.create_subspaces_by_preimage(dst_field_data_gpu, - p_garbage_rd, - p_garbage_preimage_edges, - Realm::ProfilingRequestSet(), - e03); - e04.wait(); - log_app.info() << "warming up complete " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "Starting GPU Partitioning " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "Starting GPU By Field " << Clock::current_time_in_microseconds() << "\n"; - Event e1 = is_nodes.create_subspaces_by_field(piece_field_data_gpu, - colors, - p_nodes, - 
Realm::ProfilingRequestSet()); - if(wait_on_events) e1.wait(); - log_app.info() << "GPU By Field complete " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "Starting GPU Preimage " << Clock::current_time_in_microseconds() << "\n"; - // now compute p_edges based on the color of their in_node (i.e. a preimage) - Event e2 = is_edges.create_subspaces_by_preimage(dst_field_data_gpu, - p_nodes, - p_edges, - Realm::ProfilingRequestSet(), - e1); - if(wait_on_events) e2.wait(); - log_app.info() << "GPU Preimage complete " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "Starting GPU Image " << Clock::current_time_in_microseconds() << "\n"; - - std::vector> spaces = {}; - std::vector requirements; - is_nodes.by_field_buffer_requirements(spaces, requirements); - // an image of p_edges through out_node gives us all the shared nodes, along - // with some private nodes - Event e3 = is_nodes.create_subspaces_by_image(src_field_data_gpu, - p_edges, - p_rd, - Realm::ProfilingRequestSet(), - e2); - if(wait_on_events) e3.wait(); - log_app.info() << "GPU Image complete " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "Starting second GPU preimage " << Clock::current_time_in_microseconds() << "\n"; - - Event e4 = is_edges.create_subspaces_by_preimage(dst_field_data_gpu, - p_rd, - p_preimage_edges, - Realm::ProfilingRequestSet(), - e3); - e4.wait(); - log_app.info() << "Second GPU preimage complete " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "GPU Partitioning complete " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "Starting CPU Partitioning " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "Starting CPU By Field " << Clock::current_time_in_microseconds() << "\n"; - Event e5 = is_nodes.create_subspaces_by_field(piece_id_field_data, - colors, - p_nodes_cpu, - Realm::ProfilingRequestSet()); - if(wait_on_events) e5.wait(); - log_app.info() << "CPU By 
Field complete " << Clock::current_time_in_microseconds() << "\n"; - // now compute p_edges based on the color of their in_node (i.e. a preimage) - log_app.info() << "Starting CPU Preimage " << Clock::current_time_in_microseconds() << "\n"; - Event e6 = is_edges.create_subspaces_by_preimage(dst_node_field_data, - p_nodes_cpu, - p_edges_cpu, - Realm::ProfilingRequestSet(), - e5); - if(wait_on_events) e6.wait(); - log_app.info() << "CPU Preimage complete " << Clock::current_time_in_microseconds() << "\n"; - - // an image of p_edges through out_node gives us all the shared nodes, along - // with some private nodes - log_app.info() << "Starting CPU Image " << Clock::current_time_in_microseconds() << "\n"; - Event e7 = is_nodes.create_subspaces_by_image(src_node_field_data, - p_edges_cpu, - p_rd_cpu, - Realm::ProfilingRequestSet(), - e6); - if(wait_on_events) e7.wait(); - log_app.info() << "CPU Image complete " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "Starting second CPU preimage " << Clock::current_time_in_microseconds() << "\n"; - - Event e8 = is_edges.create_subspaces_by_preimage(dst_node_field_data, - p_rd_cpu, - p_preimage_edges_cpu, - Realm::ProfilingRequestSet(), - e7); - e8.wait(); - log_app.info() << "Second CPU preimage complete " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "CPU Partitioning complete " << Clock::current_time_in_microseconds() << "\n"; - return e8; + warmup.wait(); + + Event gpu_call = is_colors.create_subspaces_by_field(piece_field_data_gpu, + colors, + p_nodes, + Realm::ProfilingRequestSet()); + + Event cpu_call = is_colors.create_subspaces_by_field(piece_id_field_data, + colors, + p_nodes_cpu, + Realm::ProfilingRequestSet()); + + return Event::merge_events({gpu_call, cpu_call}); + } virtual int perform_dynamic_checks(void) @@ -640,14 +402,14 @@ class BasicTest : public TestInterface { int errors = 0; if (!p_nodes.size()) { - return 0; + return p_nodes.size() == p_nodes_cpu.size(); } 
log_app.info() << "Checking correctness of partitioning " << "\n"; for(int i = 0; i < num_pieces; i++) { - for(IndexSpaceIterator<1> it(p_nodes[i]); it.valid; it.step()) { - for(PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + for(IndexSpaceIterator it(p_nodes[i]); it.valid; it.step()) { + for(PointInRectIterator point(it.rect); point.valid; point.step()) { if (!p_nodes_cpu[i].contains(point.p)) { log_app.error() << "Mismatch! GPU has extra byfield point " << point.p << " on piece " << i << "\n"; @@ -655,8 +417,8 @@ class BasicTest : public TestInterface { } } } - for(IndexSpaceIterator<1> it(p_nodes_cpu[i]); it.valid; it.step()) { - for(PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + for(IndexSpaceIterator it(p_nodes_cpu[i]); it.valid; it.step()) { + for(PointInRectIterator point(it.rect); point.valid; point.step()) { if (!p_nodes[i].contains(point.p)) { log_app.error() << "Mismatch! GPU is missing byfield point " << point.p << " on piece " << i << "\n"; @@ -664,76 +426,23 @@ class BasicTest : public TestInterface { } } } - for (IndexSpaceIterator<1> it(p_edges[i]); it.valid; it.step()) { - for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { - if (!p_edges_cpu[i].contains(point.p)) { - log_app.error() << "Mismatch! GPU has extra preimage edge " << point.p - << " on piece " << i << "\n"; - errors++; - } - } - } - for (IndexSpaceIterator<1> it(p_edges_cpu[i]); it.valid; it.step()) { - for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { - if (!p_edges[i].contains(point.p)) { - log_app.error() << "Mismatch! GPU is missing preimage edge " << point.p - << " on piece " << i << "\n"; - errors++; - } - } - } - for (IndexSpaceIterator<1> it(p_rd[i]); it.valid; it.step()) { - for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { - if (!p_rd_cpu[i].contains(point.p)) { - log_app.error() << "Mismatch! 
GPU has extra image node " << point.p - << " on piece " << i << "\n"; - errors++; - } - } - } - for (IndexSpaceIterator<1> it(p_rd_cpu[i]); it.valid; it.step()) { - for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { - if (!p_rd[i].contains(point.p)) { - log_app.error() << "Mismatch! GPU is missing image node " << point.p - << " on piece " << i << "\n"; - errors++; - } - } - } - for (IndexSpaceIterator<1> it(p_preimage_edges[i]); it.valid; it.step()) { - for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { - if (!p_preimage_edges_cpu[i].contains(point.p)) { - log_app.error() << "Mismatch! GPU has extra second preimage edge " << point.p - << " on piece " << i << "\n"; - errors++; - } - } - } - for (IndexSpaceIterator<1> it(p_preimage_edges_cpu[i]); it.valid; it.step()) { - for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { - if (!p_preimage_edges[i].contains(point.p)) { - log_app.error() << "Mismatch! GPU is missing second preimage edge " << point.p - << " on piece " << i << "\n"; - errors++; - } - } - } } return errors; } }; -class TileTest : public TestInterface { +template +class ImageTest : public TestInterface { public: // graph config parameters int num_nodes = 1000; int num_edges = 1000; + int num_sources = 4; int num_pieces = 4; - int num_tiles = 1; std::string filename; - TileTest(int argc, const char *argv[]) + ImageTest(int argc, const char *argv[]) { for(int i = 1; i < argc; i++) { @@ -749,15 +458,15 @@ class TileTest : public TestInterface { num_edges = atoi(argv[++i]); continue; } - if(!strcmp(argv[i], "-t")) { - num_tiles = atoi(argv[++i]); + if(!strcmp(argv[i], "-s")) { + num_sources = atoi(argv[++i]); continue; } } - if (num_nodes <= 0 || num_pieces <= 0 || num_edges <= 0) { - log_app.error() << "Invalid config: nodes=" << num_nodes << " edges=" << num_edges << " pieces=" << num_pieces << "\n"; + if (num_nodes <= 0 || num_pieces <= 0 || num_edges <= 0 || num_sources <= 0) { + 
log_app.error() << "Invalid config: nodes=" << num_nodes << " colors=" << num_edges << " pieces=" << num_pieces << " sources=" << num_sources << "\n"; exit(1); } } @@ -765,7 +474,6 @@ class TileTest : public TestInterface { struct InitDataArgs { int index; RegionInstance ri_nodes; - RegionInstance ri_edges; }; enum PRNGStreams @@ -774,25 +482,20 @@ class TileTest : public TestInterface { }; // assign subgraph ids to nodes - void random_node_data(int idx, int &subgraph) - { - if(random_colors) - subgraph = - Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_pieces); - else - subgraph = idx * num_pieces / num_nodes; - } - - void random_edge_data(int idx, int& src, int& dst) + void chase_point(int idx, Point& color) { - src = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_nodes); - dst = Philox_2x32<>::rand_int(random_seed, idx + 1, NODE_SUBGRAPH_STREAM, num_nodes); + for (int d = 0; d < N1; d++) { + if(random_colors) + color[d] = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_edges); + else + color[d] = (idx * num_edges / num_nodes) % num_edges; + } } static void init_data_task_wrapper(const void *args, size_t arglen, const void *userdata, size_t userlen, Processor p) { - TileTest *me = (TileTest *)testcfg; + ImageTest *me = (ImageTest *)testcfg; me->init_data_task(args, arglen, p); } @@ -805,128 +508,84 @@ class TileTest : public TestInterface { << ")"; i_args.ri_nodes.fetch_metadata(p).wait(); - i_args.ri_edges.fetch_metadata(p).wait(); - IndexSpace<1> is_nodes = i_args.ri_nodes.get_indexspace<1>(); - IndexSpace<1> is_edges = i_args.ri_edges.get_indexspace<1>(); + IndexSpace nodes_space = i_args.ri_nodes.template get_indexspace(); log_app.debug() << "N: " << is_nodes; - log_app.debug() << "E: " << is_edges; //For each node in the graph, mark it with a random (or deterministic) subgraph id { - AffineAccessor a_piece_id(i_args.ri_nodes, 0 /* offset */); - - for(int i = is_nodes.bounds.lo; i <= 
is_nodes.bounds.hi; i++) { - int subgraph; - random_node_data(i, subgraph); - a_piece_id.write(i, subgraph); - } - - AffineAccessor,1> a_src(i_args.ri_edges, 0 /* offset */); - AffineAccessor,1> a_dst(i_args.ri_edges, sizeof(Point<1>)/* offset */); - - for(int i = is_edges.bounds.lo; i <= is_edges.bounds.hi; i++) { - int src, dst; - random_edge_data(i, src, dst); - a_src.write(i, Point<1>(src)); - a_dst.write(i, Point<1>(dst)); + AffineAccessor, N2> a_point(i_args.ri_nodes, 0 /* offset */); + + for (IndexSpaceIterator it(is_nodes); it.valid; it.step()) { + for (PointInRectIterator point(it.rect); point.valid; point.step()) { + int idx = 0; + int stride = 1; + for (int d = 0; d < N2; d++) { + idx += (point.p[d] - is_nodes.bounds.lo[d]) * stride; + stride *= (is_nodes.bounds.hi[d] - is_nodes.bounds.lo[d] + 1); + } + Point destination; + chase_point(idx, destination); + a_point.write(point.p, destination); + } } } - - //Optionally print out the assigned subgraph ids - if(show_graph) { - AffineAccessor a_piece_id(i_args.ri_nodes, 0 /* offset */); - - for(int i = is_nodes.bounds.lo; i <= is_nodes.bounds.hi; i++) - log_app.info() << "piece_id[" << i << "] = " << a_piece_id.read(i) << "\n"; - - AffineAccessor,1> a_src(i_args.ri_edges, 0 /* offset */); - AffineAccessor,1> a_dst(i_args.ri_edges, sizeof(Point<1>)/* offset */); - - for(int i = is_edges.bounds.lo; i <= is_edges.bounds.hi; i++) - log_app.info() << "src, dst[" << i << "] = " << a_src.read(i) << ", " << a_dst.read(i) << "\n"; - } } - IndexSpace<1> is_nodes, is_edges; - std::vector ri_nodes, ri_edges; - std::vector, int> > piece_id_field_data; - std::vector, Point<1> > > src_node_field_data, dst_node_field_data; + IndexSpace is_nodes; + IndexSpace is_edges; + std::vector ri_nodes; + std::vector, Point> > point_field_data; virtual void print_info(void) { - printf("Realm dependent partitioning test - tile: %d nodes, %d edges, %d pieces, %d tiles\n", - (int)num_nodes, (int) num_edges, (int)num_pieces, 
(int)num_tiles); + printf("Realm %dD -> %dD Image dependent partitioning test: %d nodes, %d edges, %d pieces ,%d sources\n", (int) N2, (int) N1, + (int)num_nodes, (int) num_edges, (int)num_pieces, (int) num_sources); } virtual Event initialize_data(const std::vector &memories, const std::vector &procs) { // now create index space for nodes - is_nodes = Rect<1>(0, num_nodes - 1); - is_edges = Rect<1>(0, num_edges - 1); + Point node_lo, node_hi; + for (int d = 0; d < N2; d++) { + node_lo[d] = 0; + node_hi[d] = num_nodes - 1; + } + is_nodes = Rect(node_lo, node_hi); + + Point edge_lo, edge_hi; + for (int d = 0; d < N1; d++) { + edge_lo[d] = 0; + edge_hi[d] = num_edges - 1; + } + is_edges = Rect(edge_lo, edge_hi); + // equal partition is used to do initial population of edges and nodes - std::vector > ss_nodes_eq; - std::vector > ss_edges_eq; + std::vector > ss_nodes_eq; log_app.info() << "Creating equal subspaces\n"; is_nodes.create_equal_subspaces(num_pieces, 1, ss_nodes_eq, Realm::ProfilingRequestSet()).wait(); - is_edges.create_equal_subspaces(num_pieces, 1, ss_edges_eq, Realm::ProfilingRequestSet()).wait(); - - log_app.debug() << "Initial partitions:"; - for(size_t i = 0; i < ss_nodes_eq.size(); i++) - log_app.debug() << " Nodes #" << i << ": " << ss_nodes_eq[i]; - for(size_t i = 0; i < ss_edges_eq.size(); i++) - log_app.debug() << " Edges #" << i << ": " << ss_edges_eq[i]; // create instances for each of these subspaces - std::vector node_fields, edge_fields; - node_fields.push_back(sizeof(int)); // piece_id - assert(sizeof(int) == sizeof(Point<1>)); - edge_fields.push_back(sizeof(Point<1>)); // src_node - edge_fields.push_back(sizeof(Point<1>)); // dst_node + std::vector node_fields; + node_fields.push_back(sizeof(Point)); ri_nodes.resize(num_pieces); - piece_id_field_data.resize(num_pieces); + point_field_data.resize(num_pieces); for(size_t i = 0; i < ss_nodes_eq.size(); i++) { RegionInstance ri; RegionInstance::create_instance(ri, memories[i % 
memories.size()], ss_nodes_eq[i], node_fields, 0 /*SOA*/, - Realm::ProfilingRequestSet()) - .wait(); + Realm::ProfilingRequestSet()).wait(); ri_nodes[i] = ri; - piece_id_field_data[i].index_space = ss_nodes_eq[i]; - piece_id_field_data[i].inst = ri_nodes[i]; - piece_id_field_data[i].field_offset = 0; - } - - - // Fire off tasks to initialize data - ri_edges.resize(num_pieces); - src_node_field_data.resize(num_pieces); - dst_node_field_data.resize(num_pieces); - - for(size_t i = 0; i < ss_edges_eq.size(); i++) { - RegionInstance ri; - RegionInstance::create_instance(ri, - memories[i % memories.size()], - ss_edges_eq[i], - edge_fields, - 0 /*SOA*/, - Realm::ProfilingRequestSet()).wait(); - ri_edges[i] = ri; - - src_node_field_data[i].index_space = ss_edges_eq[i]; - src_node_field_data[i].inst = ri_edges[i]; - src_node_field_data[i].field_offset = 0 * sizeof(Point<1>); - - dst_node_field_data[i].index_space = ss_edges_eq[i]; - dst_node_field_data[i].inst = ri_edges[i]; - dst_node_field_data[i].field_offset = 1 * sizeof(Point<1>); + point_field_data[i].index_space = ss_nodes_eq[i]; + point_field_data[i].inst = ri_nodes[i]; + point_field_data[i].field_offset = 0; } // fire off tasks to initialize data @@ -936,8 +595,7 @@ class TileTest : public TestInterface { InitDataArgs args; args.index = i; args.ri_nodes = ri_nodes[i]; - args.ri_edges = ri_edges[i]; - Event e = p.spawn(INIT_TILE_DATA_TASK, &args, sizeof(args)); + Event e = p.spawn(INIT_IMAGE_DATA_TASK, &args, sizeof(args)); events.insert(e); } @@ -949,20 +607,16 @@ class TileTest : public TestInterface { // p_nodes_cpu - nodes partitioned by subgraph id (from CPU) - std::vector > p_nodes, p_rd; - std::vector > p_edges, p_preimage_edges; - - std::vector > p_nodes_cpu, p_rd_cpu; - std::vector > p_edges_cpu, p_preimage_edges_cpu; + std::vector > p_edges, p_garbage_edges, p_edges_cpu; virtual Event perform_partitioning(void) { // Partition nodes by subgraph id - do this twice, once on CPU and once on GPU // Ensure that 
the results are identical - std::vector colors(num_pieces); - for(int i = 0; i < num_pieces; i++) - colors[i] = i; + std::vector> sources(num_pieces); + for(int i = 0; i < num_sources; i++) + sources[i] = point_field_data[i % num_pieces].index_space; // We need a GPU memory for GPU partitioning Memory gpu_memory; @@ -981,190 +635,58 @@ class TileTest : public TestInterface { log_app.error() << "No GPU memory found for partitioning test\n"; return Event::NO_EVENT; } - std::vector edge_fields; - edge_fields.push_back(sizeof(Point<1>)); - edge_fields.push_back(sizeof(Point<1>)) ; + + std::vector node_fields; - node_fields.push_back(sizeof(int)); + node_fields.push_back(sizeof(Point)); + + std::vector, Point>> point_field_data_gpu; + point_field_data_gpu.resize(num_pieces); - std::vector, Point<1> > > src_field_data_gpu; - std::vector, Point<1> > > dst_field_data_gpu; - std::vector, int> > piece_field_data_gpu; - piece_field_data_gpu.resize(num_pieces); - src_field_data_gpu.resize(num_pieces); - dst_field_data_gpu.resize(num_pieces); for (int i = 0; i < num_pieces; i++) { - RegionInstance src_gpu_instance; - RegionInstance dst_gpu_instance; - RegionInstance piece_gpu_instance; - RegionInstance::create_instance(src_gpu_instance, - gpu_memory, - src_node_field_data[i].index_space, - edge_fields, - 0 /*SOA*/, - Realm::ProfilingRequestSet()).wait(); - RegionInstance::create_instance(dst_gpu_instance, - gpu_memory, - dst_node_field_data[i].index_space, - edge_fields, - 0 /*SOA*/, - Realm::ProfilingRequestSet()).wait(); - RegionInstance::create_instance(piece_gpu_instance, - gpu_memory, - piece_id_field_data[i].index_space, - node_fields, - 0 /*SOA*/, - Realm::ProfilingRequestSet()).wait(); - CopySrcDstField src_gpu_field, src_cpu_field, dst_gpu_field, dst_cpu_field, piece_gpu_field, piece_cpu_field; - src_gpu_field.inst = src_gpu_instance; - src_gpu_field.size = sizeof(Point<1>); - src_gpu_field.field_id = 0; - src_cpu_field.inst = src_node_field_data[i].inst; - 
src_cpu_field.size = sizeof(Point<1>); - src_cpu_field.field_id = 0; - dst_gpu_field.inst = dst_gpu_instance; - dst_gpu_field.size = sizeof(Point<1>); - dst_gpu_field.field_id = sizeof(Point<1>); - dst_cpu_field.inst = dst_node_field_data[i].inst; - dst_cpu_field.size = sizeof(Point<1>); - dst_cpu_field.field_id = sizeof(Point<1>); - piece_gpu_field.inst = piece_gpu_instance; - piece_gpu_field.size = sizeof(int); - piece_gpu_field.field_id = 0; - piece_cpu_field.inst = piece_id_field_data[i].inst; - piece_cpu_field.size = sizeof(int); - piece_cpu_field.field_id = 0; - std::vector src_cpu_data, src_gpu_data, dst_cpu_data, dst_gpu_data, piece_cpu_data, piece_gpu_data; - src_cpu_data.push_back(src_cpu_field); - dst_cpu_data.push_back(dst_cpu_field); - src_gpu_data.push_back(src_gpu_field); - dst_gpu_data.push_back(dst_gpu_field); - piece_gpu_data.push_back(piece_gpu_field); - piece_cpu_data.push_back(piece_cpu_field); - Event copy_event = src_node_field_data[i].index_space.copy(src_cpu_data, src_gpu_data, Realm::ProfilingRequestSet()); - copy_event.wait(); - Event second_copy_event = dst_node_field_data[i].index_space.copy(dst_cpu_data, dst_gpu_data, Realm::ProfilingRequestSet()); - second_copy_event.wait(); - Event third_copy_event = piece_id_field_data[i].index_space.copy(piece_cpu_data, piece_gpu_data, Realm::ProfilingRequestSet()); - third_copy_event.wait(); - src_field_data_gpu[i].inst = src_gpu_instance; - src_field_data_gpu[i].index_space = src_node_field_data[i].index_space; - src_field_data_gpu[i].field_offset = 0; - dst_field_data_gpu[i].inst = dst_gpu_instance; - dst_field_data_gpu[i].index_space = dst_node_field_data[i].index_space; - dst_field_data_gpu[i].field_offset = 1 * sizeof(Point<1>); - piece_field_data_gpu[i].inst = piece_gpu_instance; - piece_field_data_gpu[i].index_space = piece_id_field_data[i].index_space; - piece_field_data_gpu[i].field_offset = 0; + copy_piece(point_field_data[i], point_field_data_gpu[i], node_fields, 0, gpu_memory).wait(); 
} - wait_on_events = true; - log_app.info() << "warming up" << Clock::current_time_in_microseconds() << "\n"; - std::vector > p_garbage_nodes, p_garbage_edges, p_garbage_rd, p_garbage_preimage_edges; - Event e01 = is_nodes.create_subspaces_by_field(piece_field_data_gpu, - colors, - p_garbage_nodes, - Realm::ProfilingRequestSet()); - if (wait_on_events) e01.wait(); - Event e02 = is_edges.create_subspaces_by_preimage(dst_node_field_data, - p_garbage_nodes, - p_garbage_edges, - Realm::ProfilingRequestSet(), - e01); - if(wait_on_events) e02.wait(); - - // an image of p_edges through out_node gives us all the shared nodes, along - // with some private nodes - Event e03 = is_nodes.create_subspaces_by_image(src_field_data_gpu, + + std::vector> image_inputs(num_pieces); + std::vector> image_subspaces(num_sources); + std::vector image_requirements(num_pieces); + + for (int i = 0; i < num_pieces; i++) { + image_inputs[i].location = point_field_data_gpu[i].inst.get_location(); + image_inputs[i].space = point_field_data_gpu[i].index_space; + } + + for (int i = 0; i < num_sources; i++) { + image_subspaces[i].space = sources[i]; + image_subspaces[i].entries = sources[i].dense() ? 
1 : sources[i].sparsity.impl()->get_entries().size(); + } + + is_edges.by_image_buffer_requirements(image_subspaces, image_inputs, image_requirements); + + for (int i = 0; i < num_pieces; i++) { + alloc_piece(point_field_data_gpu[i].scratch_buffer, image_requirements[i].upper_bound, gpu_memory).wait(); + } + + wait_on_events = true; + log_app.info() << "warming up" << Clock::current_time_in_microseconds() << "\n"; + Event warmup = is_edges.create_subspaces_by_image(point_field_data_gpu, + sources, p_garbage_edges, - p_garbage_rd, - Realm::ProfilingRequestSet(), - e02); - if(wait_on_events) e03.wait(); - - Event e04 = is_edges.create_subspaces_by_preimage(dst_node_field_data, - p_garbage_rd, - p_garbage_preimage_edges, - Realm::ProfilingRequestSet(), - e03); - e04.wait(); - log_app.info() << "warming up complete " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "Starting GPU Partitioning " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "Starting GPU By Field " << Clock::current_time_in_microseconds() << "\n"; - Event e1 = is_nodes.create_subspaces_by_field(piece_field_data_gpu, - colors, - p_nodes, - Realm::ProfilingRequestSet()); - if(wait_on_events) e1.wait(); - log_app.info() << "GPU By Field complete " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "Starting GPU Preimage " << Clock::current_time_in_microseconds() << "\n"; - // now compute p_edges based on the color of their in_node (i.e. 
a preimage) - Event e2 = is_edges.create_subspaces_by_preimage(dst_node_field_data, - p_nodes, - p_edges, - Realm::ProfilingRequestSet(), - e1); - if(wait_on_events) e2.wait(); - log_app.info() << "GPU Preimage complete " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "Starting GPU Image " << Clock::current_time_in_microseconds() << "\n"; - - // an image of p_edges through out_node gives us all the shared nodes, along - // with some private nodes - Event e3 = is_nodes.create_subspaces_by_image(src_field_data_gpu, - p_edges, - p_rd, - Realm::ProfilingRequestSet(), - e2); - if(wait_on_events) e3.wait(); - log_app.info() << "GPU Image complete " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "Starting second GPU preimage " << Clock::current_time_in_microseconds() << "\n"; - - Event e4 = is_edges.create_subspaces_by_preimage(dst_node_field_data, - p_rd, - p_preimage_edges, - Realm::ProfilingRequestSet(), - e3); - e4.wait(); - log_app.info() << "Second GPU preimage complete " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "GPU Partitioning complete " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "Starting CPU Partitioning " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "Starting CPU By Field " << Clock::current_time_in_microseconds() << "\n"; - Event e5 = is_nodes.create_subspaces_by_field(piece_id_field_data, - colors, - p_nodes_cpu, - Realm::ProfilingRequestSet()); - if(wait_on_events) e5.wait(); - log_app.info() << "CPU By Field complete " << Clock::current_time_in_microseconds() << "\n"; - // now compute p_edges based on the color of their in_node (i.e. 
a preimage) - log_app.info() << "Starting CPU Preimage " << Clock::current_time_in_microseconds() << "\n"; - Event e6 = is_edges.create_subspaces_by_preimage(dst_node_field_data, - p_nodes_cpu, - p_edges_cpu, - Realm::ProfilingRequestSet(), - e5); - if(wait_on_events) e6.wait(); - log_app.info() << "CPU Preimage complete " << Clock::current_time_in_microseconds() << "\n"; - - // an image of p_edges through out_node gives us all the shared nodes, along - // with some private nodes - log_app.info() << "Starting CPU Image " << Clock::current_time_in_microseconds() << "\n"; - Event e7 = is_nodes.create_subspaces_by_image(src_node_field_data, - p_edges_cpu, - p_rd_cpu, - Realm::ProfilingRequestSet(), - e6); - if(wait_on_events) e7.wait(); - log_app.info() << "CPU Image complete " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "Starting second CPU preimage " << Clock::current_time_in_microseconds() << "\n"; - - Event e8 = is_edges.create_subspaces_by_preimage(dst_node_field_data, - p_rd_cpu, - p_preimage_edges_cpu, - Realm::ProfilingRequestSet(), - e7); - e8.wait(); - log_app.info() << "Second CPU preimage complete " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "CPU Partitioning complete " << Clock::current_time_in_microseconds() << "\n"; - return e8; + Realm::ProfilingRequestSet()); + warmup.wait(); + + Event gpu_call = is_edges.create_subspaces_by_image(point_field_data_gpu, + sources, + p_edges, + Realm::ProfilingRequestSet()); + + Event cpu_call = is_edges.create_subspaces_by_image(point_field_data, + sources, + p_edges_cpu, + Realm::ProfilingRequestSet()); + + return Event::merge_events({gpu_call, cpu_call}); + } virtual int perform_dynamic_checks(void) @@ -1177,3273 +699,103 @@ class TileTest : public TestInterface { { int errors = 0; - if (!p_nodes.size()) { - return 0; + if (!p_edges.size()) { + return p_edges.size() == p_edges_cpu.size(); } log_app.info() << "Checking correctness of partitioning " << "\n"; for(int 
i = 0; i < num_pieces; i++) { - for(IndexSpaceIterator<1> it(p_nodes[i]); it.valid; it.step()) { - for(PointInRectIterator<1> point(it.rect); point.valid; point.step()) { - if (!p_nodes_cpu[i].contains(point.p)) { - log_app.error() << "Mismatch! GPU has extra byfield point " << point.p + for(IndexSpaceIterator it(p_edges[i]); it.valid; it.step()) { + for(PointInRectIterator point(it.rect); point.valid; point.step()) { + if (!p_edges_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU has extra image point " << point.p << " on piece " << i << "\n"; errors++; } } } - for(IndexSpaceIterator<1> it(p_nodes_cpu[i]); it.valid; it.step()) { - for(PointInRectIterator<1> point(it.rect); point.valid; point.step()) { - if (!p_nodes[i].contains(point.p)) { - log_app.error() << "Mismatch! GPU is missing byfield point " << point.p + for(IndexSpaceIterator it(p_edges_cpu[i]); it.valid; it.step()) { + for(PointInRectIterator point(it.rect); point.valid; point.step()) { + if (!p_edges[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing image point " << point.p << " on piece " << i << "\n"; errors++; } } } - for (IndexSpaceIterator<1> it(p_edges[i]); it.valid; it.step()) { - for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { - if (!p_edges_cpu[i].contains(point.p)) { - log_app.error() << "Mismatch! GPU has extra preimage edge " << point.p - << " on piece " << i << "\n"; - errors++; - } - } - } - for (IndexSpaceIterator<1> it(p_edges_cpu[i]); it.valid; it.step()) { - for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { - if (!p_edges[i].contains(point.p)) { - log_app.error() << "Mismatch! GPU is missing preimage edge " << point.p - << " on piece " << i << "\n"; - errors++; - } - } - } - for (IndexSpaceIterator<1> it(p_rd[i]); it.valid; it.step()) { - for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { - if (!p_rd_cpu[i].contains(point.p)) { - log_app.error() << "Mismatch! 
GPU has extra image node " << point.p - << " on piece " << i << "\n"; - errors++; - } - } - } - for (IndexSpaceIterator<1> it(p_rd_cpu[i]); it.valid; it.step()) { - for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { - if (!p_rd[i].contains(point.p)) { - log_app.error() << "Mismatch! GPU is missing image node " << point.p - << " on piece " << i << "\n"; - errors++; - } - } - } - for (IndexSpaceIterator<1> it(p_preimage_edges[i]); it.valid; it.step()) { - for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { - if (!p_preimage_edges_cpu[i].contains(point.p)) { - log_app.error() << "Mismatch! GPU has extra second preimage edge " << point.p - << " on piece " << i << "\n"; - errors++; - } - } - } - for (IndexSpaceIterator<1> it(p_preimage_edges_cpu[i]); it.valid; it.step()) { - for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { - if (!p_preimage_edges[i].contains(point.p)) { - log_app.error() << "Mismatch! GPU is missing second preimage edge " << point.p - << " on piece " << i << "\n"; - errors++; - } - } - } } return errors; } }; -class RangeTest : public TestInterface { -public: - // graph config parameters - int num_nodes = 1000; - int num_rects = 1000; - int max_rect_size = 10; - int num_pieces = 4; - std::string filename; +void top_level_task(const void *args, size_t arglen, const void *userdata, size_t userlen, + Processor p) +{ + int errors = 0; - RangeTest(int argc, const char *argv[]) - { - for(int i = 1; i < argc; i++) { + testcfg->print_info(); - if(!strcmp(argv[i], "-p")) { - num_pieces = atoi(argv[++i]); - continue; - } + // find all the system memories - we'll stride our data across them + // for each memory, we'll need one CPU that can do the initialization of the data + std::vector sysmems; + std::vector procs; - if(!strcmp(argv[i], "-n")) { - num_nodes = atoi(argv[++i]); - continue; - } + Machine machine = Machine::get_machine(); + { + std::set all_memories; + 
machine.get_all_memories(all_memories); + for(std::set::const_iterator it = all_memories.begin(); + it != all_memories.end(); it++) { + Memory m = *it; - if(!strcmp(argv[i], "-r")) { - num_rects = atoi(argv[++i]); + // skip memories with no capacity for creating instances + if(m.capacity() == 0) continue; - } - if(!strcmp(argv[i], "-m")) { - max_rect_size = atoi(argv[++i]); - continue; + if(m.kind() == Memory::SYSTEM_MEM) { + sysmems.push_back(m); + std::set pset; + machine.get_shared_processors(m, pset); + Processor p = Processor::NO_PROC; + for(std::set::const_iterator it2 = pset.begin(); it2 != pset.end(); + it2++) { + if(it2->kind() == Processor::LOC_PROC) { + p = *it2; + break; + } + } + assert(p.exists()); + procs.push_back(p); + log_app.debug() << "System mem #" << (sysmems.size() - 1) << " = " + << *sysmems.rbegin() << " (" << *procs.rbegin() << ")"; } } - - - - if (num_nodes <= 0 || num_rects <= 0) { - log_app.error() << "Invalid graph dimensions in input file: rects=" << num_rects << " nodes=" << num_nodes; - exit(1); - } - } + assert(sysmems.size() > 0); + { + Realm::TimeStamp ts("initialization", true, &log_app); + Event e = testcfg->initialize_data(sysmems, procs); + // wait for all initialization to be done + e.wait(); + } - struct InitDataArgs { - int index; - RegionInstance ri_nodes; - RegionInstance ri_rects; - }; + // now actual partitioning work + { + Realm::TimeStamp ts("dependent partitioning work", true, &log_app); - enum PRNGStreams { - NODE_SUBGRAPH_STREAM, - }; + Event e = testcfg->perform_partitioning(); - void random_rect_data(int idx, int& subgraph) - { - if(random_colors) - subgraph = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_pieces); - else - subgraph = idx * num_pieces / num_rects; + e.wait(); } - void random_node_data(int idx, int& subgraph) + // dynamic checks (which would be eliminated by compiler) { - if(true) - subgraph = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_pieces); - else 
- subgraph = idx * num_pieces / num_nodes; - } - - void initialize_rect_data(int idx, Rect<1> &rect, int max_rect_size = 10) - { - - int first = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_nodes); - int amount = Philox_2x32<>::rand_int(random_seed, idx + 1, NODE_SUBGRAPH_STREAM, max_rect_size); - rect = Rect<1>(first, first + amount); - } - - - static void init_data_task_wrapper(const void *args, size_t arglen, - const void *userdata, size_t userlen, Processor p) - { - RangeTest *me = (RangeTest *)testcfg; - me->init_data_task(args, arglen, p); - } - - void init_data_task(const void *args, size_t arglen, Processor p) - { - const InitDataArgs& i_args = *(const InitDataArgs *)args; - - log_app.info() << "init task #" << i_args.index << " (ri_nodes=" << i_args.ri_nodes << ", ri_rects=" << i_args.ri_rects << ")"; - - i_args.ri_nodes.fetch_metadata(p).wait(); - i_args.ri_rects.fetch_metadata(p).wait(); - - IndexSpace<1> is_nodes = i_args.ri_nodes.get_indexspace<1>(); - IndexSpace<1> is_rects = i_args.ri_rects.get_indexspace<1>(); - - log_app.debug() << "N: " << is_nodes; - log_app.debug() << "E: " << is_rects; - - //Write out colors and rectangles - - { - AffineAccessor a_rect_id(i_args.ri_rects, 0 /* offset */); - - for(int i = is_rects.bounds.lo; i <= is_rects.bounds.hi; i++) { - int subgraph; - random_rect_data(i, subgraph); - a_rect_id.write(i, subgraph); - } - } - { - AffineAccessor a_piece_id(i_args.ri_nodes, 0 /* offset */); - - for(int i = is_nodes.bounds.lo; i <= is_nodes.bounds.hi; i++) { - int subgraph; - random_node_data(i, subgraph); - a_piece_id.write(i, subgraph); - } - } - - - { - - AffineAccessor, 1> a_rect_val(i_args.ri_rects, 1 * sizeof(int) /* offset */); - - for(int i = is_rects.bounds.lo; i <= is_rects.bounds.hi; i++) { - Rect<1> rect; - initialize_rect_data(i, rect, max_rect_size); - a_rect_val.write(i, rect); - } - } - - if(show_graph) { - AffineAccessor a_piece_id(i_args.ri_nodes, 0 /* offset */); - - for(int i = 
is_nodes.bounds.lo; i <= is_nodes.bounds.hi; i++) - log_app.info() << "node_id[" << i << "] = " << a_piece_id.read(i) << "\n"; - - AffineAccessor a_rect_id(i_args.ri_rects, 0 * sizeof(Point<1>) /* offset */); - - for(int i = is_rects.bounds.lo; i <= is_rects.bounds.hi; i++) - log_app.info() << "rect_id[" << i << "] = " << a_rect_id.read(i) << "\n"; - - AffineAccessor,1> a_rect_val(i_args.ri_rects, 1 * sizeof(int) /* offset */); - - for(int i = is_rects.bounds.lo; i <= is_rects.bounds.hi; i++) - log_app.info() << "rect_val[" << i << "] = " << a_rect_val.read(i) << "\n"; - } - } - - IndexSpace<1> is_nodes, is_rects; - std::vector ri_nodes; - std::vector, int> > node_id_field_data; - std::vector ri_rects; - std::vector, int> > rect_id_field_data; - std::vector, Rect<1> > > rect_val_field_data; - - virtual void print_info(void) - { - printf("Realm dependent partitioning test - ranges: %d nodes, %d rects, %d pieces\n", - (int)num_nodes, (int)num_rects, (int)num_pieces); - } - - virtual Event initialize_data(const std::vector& memories, - const std::vector& procs) - { - // now create index spaces for nodes and edges - is_nodes = Rect<1>(0, num_nodes - 1); - is_rects = Rect<1>(0, num_rects - 1); - - // equal partition is used to do initial population of edges and nodes - std::vector > ss_nodes_eq; - std::vector > ss_rects_eq; - - log_app.info() << "Creating equal subspaces" << "\n"; - - is_nodes.create_equal_subspaces(num_pieces, 1, ss_nodes_eq, Realm::ProfilingRequestSet()).wait(); - is_rects.create_equal_subspaces(num_pieces, 1, ss_rects_eq, Realm::ProfilingRequestSet()).wait(); - - log_app.debug() << "Initial partitions:"; - for(size_t i = 0; i < ss_nodes_eq.size(); i++) - log_app.debug() << " Nodes #" << i << ": " << ss_nodes_eq[i]; - for(size_t i = 0; i < ss_rects_eq.size(); i++) - log_app.debug() << " Rects #" << i << ": " << ss_rects_eq[i]; - - // create instances for each of these subspaces - std::vector node_fields, rect_fields; - 
node_fields.push_back(sizeof(int)); // piece_id - rect_fields.push_back(sizeof(int)); // src_node - rect_fields.push_back(sizeof(Rect<1>)); // dst_node - - ri_nodes.resize(num_pieces); - node_id_field_data.resize(num_pieces); - - for(size_t i = 0; i < ss_nodes_eq.size(); i++) { - RegionInstance ri; - RegionInstance::create_instance(ri, - memories[i % memories.size()], - ss_nodes_eq[i], - node_fields, - 0 /*SOA*/, - Realm::ProfilingRequestSet()).wait(); - ri_nodes[i] = ri; - - node_id_field_data[i].index_space = ss_nodes_eq[i]; - node_id_field_data[i].inst = ri_nodes[i]; - node_id_field_data[i].field_offset = 0; - } - - ri_rects.resize(num_pieces); - rect_id_field_data.resize(num_pieces); - rect_val_field_data.resize(num_pieces); - - for(size_t i = 0; i < ss_rects_eq.size(); i++) { - RegionInstance ri; - RegionInstance::create_instance(ri, - memories[i % memories.size()], - ss_rects_eq[i], - rect_fields, - 0 /*SOA*/, - Realm::ProfilingRequestSet()).wait(); - ri_rects[i] = ri; - - rect_id_field_data[i].index_space = ss_rects_eq[i]; - rect_id_field_data[i].inst = ri_rects[i]; - rect_id_field_data[i].field_offset = 0; - - rect_val_field_data[i].index_space = ss_rects_eq[i]; - rect_val_field_data[i].inst = ri_rects[i]; - rect_val_field_data[i].field_offset = 1 * sizeof(int); - } - - // fire off tasks to initialize data - std::set events; - for(int i = 0; i < num_pieces; i++) { - Processor p = procs[i % procs.size()]; - InitDataArgs args; - args.index = i; - args.ri_nodes = ri_nodes[i]; - args.ri_rects = ri_rects[i]; - Event e = p.spawn(INIT_RANGE_DATA_TASK, &args, sizeof(args)); - events.insert(e); - } - - return Event::merge_events(events); - } - - // the outputs of our partitioning will be: - //p_colored_rects -> all of our rectangles marked with the color given by random_rect_data - //p_rects -> image range by p colored rects into nodes - - std::vector > p_colored_rects, p_rects; - std::vector > p_colored_rects_cpu, p_rects_cpu; - - virtual Event 
perform_partitioning(void) - { - - std::vector colors(num_pieces); - for(int i = 0; i < num_pieces; i++) - colors[i] = i; - - Memory gpu_memory; - bool found_gpu_memory = false; - Machine machine = Machine::get_machine(); - std::set all_memories; - machine.get_all_memories(all_memories); - for(auto& memory : all_memories) { - if(memory.kind() == Memory::GPU_FB_MEM) { - gpu_memory = memory; - found_gpu_memory = true; - break; - } - } - assert(found_gpu_memory); - std::vector rect_fields; - rect_fields.push_back(sizeof(int)); - rect_fields.push_back(sizeof(Rect<1>)); - std::vector node_fields; - node_fields.push_back(sizeof(int)); - - std::vector, int > > node_id_data_gpu; - std::vector, int > > rect_id_data_gpu; - std::vector, Rect<1>>> rect_val_data_gpu; - node_id_data_gpu.resize(num_pieces); - rect_id_data_gpu.resize(num_pieces); - rect_val_data_gpu.resize(num_pieces); - for (int i = 0; i < num_pieces; i++) { - RegionInstance node_id_instance; - RegionInstance rect_id_instance; - RegionInstance rect_val_instance; - RegionInstance::create_instance(node_id_instance, - gpu_memory, - node_id_field_data[i].index_space, - node_fields, - 0 /*SOA*/, - Realm::ProfilingRequestSet()).wait(); - RegionInstance::create_instance(rect_id_instance, - gpu_memory, - rect_id_field_data[i].index_space, - rect_fields, - 0 /*SOA*/, - Realm::ProfilingRequestSet()).wait(); - RegionInstance::create_instance(rect_val_instance, - gpu_memory, - rect_val_field_data[i].index_space, - rect_fields, - 0 /*SOA*/, - Realm::ProfilingRequestSet()).wait(); - CopySrcDstField node_id_gpu_field, node_id_cpu_field, rect_id_gpu_field, rect_id_cpu_field, rect_val_gpu_field, rect_val_cpu_field; - node_id_gpu_field.inst = node_id_instance; - node_id_gpu_field.size = sizeof(int); - node_id_gpu_field.field_id = 0; - node_id_cpu_field.inst = node_id_field_data[i].inst; - node_id_cpu_field.size = sizeof(int); - node_id_cpu_field.field_id = 0; - rect_id_gpu_field.inst = rect_id_instance; - rect_id_gpu_field.size = 
sizeof(int); - rect_id_gpu_field.field_id = 0; - rect_id_cpu_field.inst = rect_id_field_data[i].inst; - rect_id_cpu_field.size = sizeof(int); - rect_id_cpu_field.field_id = 0; - rect_val_gpu_field.inst = rect_val_instance; - rect_val_gpu_field.size = sizeof(Rect<1>); - rect_val_gpu_field.field_id = sizeof(int); - rect_val_cpu_field.inst = rect_val_field_data[i].inst; - rect_val_cpu_field.size = sizeof(Rect<1>); - rect_val_cpu_field.field_id = sizeof(int); - std::vector node_id_gpu_data, node_id_cpu_data, rect_id_gpu_data, rect_id_cpu_data, rect_val_gpu_data, rect_val_cpu_data; - node_id_gpu_data.push_back(node_id_gpu_field); - node_id_cpu_data.push_back(node_id_cpu_field); - rect_id_gpu_data.push_back(rect_id_gpu_field); - rect_id_cpu_data.push_back(rect_id_cpu_field); - rect_val_gpu_data.push_back(rect_val_gpu_field); - rect_val_cpu_data.push_back(rect_val_cpu_field); - Event copy_event = node_id_field_data[i].index_space.copy(node_id_cpu_data, node_id_gpu_data, Realm::ProfilingRequestSet()); - copy_event.wait(); - Event second_copy_event = rect_id_field_data[i].index_space.copy(rect_id_cpu_data, rect_id_gpu_data, Realm::ProfilingRequestSet()); - second_copy_event.wait(); - Event third_copy_event = rect_val_field_data[i].index_space.copy(rect_val_cpu_data, rect_val_gpu_data, Realm::ProfilingRequestSet()); - third_copy_event.wait(); - node_id_data_gpu[i].inst = node_id_instance; - node_id_data_gpu[i].index_space = node_id_field_data[i].index_space; - node_id_data_gpu[i].field_offset = 0; - rect_id_data_gpu[i].inst = rect_id_instance; - rect_id_data_gpu[i].index_space = rect_id_field_data[i].index_space; - rect_id_data_gpu[i].field_offset = 0; - rect_val_data_gpu[i].inst = rect_val_instance; - rect_val_data_gpu[i].index_space = rect_val_field_data[i].index_space; - rect_val_data_gpu[i].field_offset = sizeof(int); - } - wait_on_events = true; - std::vector> p_garbage_rects, p_garbage_colors; - log_app.info() << "WARMING UP " << "\n"; - - std::vector> 
field_estimate_input(rect_id_data_gpu.size()); - std::vector field_estimate_output(rect_id_data_gpu.size()); - std::vector> image_estimate_input(rect_val_data_gpu.size()); - std::vector image_estimate_output(rect_val_data_gpu.size()); - std::vector> subspace_input(colors.size()); - for (size_t i = 0; i < rect_id_data_gpu.size(); i++) { - field_estimate_input[i].location = rect_id_data_gpu[i].inst.get_location(); - field_estimate_input[i].space = rect_id_data_gpu[i].index_space; - } - for (size_t i = 0; i < rect_val_data_gpu.size(); i++) { - image_estimate_input[i].location = rect_val_data_gpu[i].inst.get_location(); - image_estimate_input[i].space = rect_val_data_gpu[i].index_space; - } - - is_rects.by_field_buffer_requirements(field_estimate_input, field_estimate_output); - std::vector byte_fields = {sizeof(char)}; - for (size_t i = 0; i < rect_id_data_gpu.size(); i++) { - IndexSpace<1> instance_index_space(Rect<1>(0, field_estimate_output[i].upper_bound-1)); - RegionInstance::create_instance(rect_id_data_gpu[i].scratch_buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); - } - - Event e001 = is_rects.create_subspaces_by_field(rect_id_data_gpu, - colors, - p_garbage_colors, - Realm::ProfilingRequestSet()); - if (wait_on_events) e001.wait(); - for (size_t i = 0; i < colors.size(); i++) { - subspace_input[i].space = p_garbage_colors[i]; - subspace_input[i].entries = p_garbage_colors[i].sparsity.impl()->get_entries().size(); - } - is_nodes.by_image_buffer_requirements(subspace_input, image_estimate_input, image_estimate_output); - for (size_t i = 0; i < rect_val_data_gpu.size(); i++) { - IndexSpace<1> instance_index_space(Rect<1>(0, (image_estimate_output[i].upper_bound)/12-1)); - RegionInstance::create_instance(rect_val_data_gpu[i].scratch_buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); - } - Event e002 = is_nodes.create_subspaces_by_image(rect_val_data_gpu, - 
p_garbage_colors, - p_garbage_rects, - Realm::ProfilingRequestSet(), - e001); - if(wait_on_events) e002.wait(); - - log_app.info() << "FINISHED WARMING UP " << "\n"; - log_app.info() << "starting GPU partitioning " << Clock::current_time_in_microseconds() << "\n"; - - log_app.info() << "STARTING GPU BY FIELD " << Clock::current_time_in_microseconds() << "\n"; - - Event e01 = is_rects.create_subspaces_by_field(rect_id_data_gpu, - colors, - p_colored_rects, - Realm::ProfilingRequestSet()); - if (wait_on_events) e01.wait(); - - log_app.info() << "FINISHED GPU BY FIELD " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "STARTING GPU BY IMAGE " << Clock::current_time_in_microseconds() << "\n"; - Event e02 = is_nodes.create_subspaces_by_image(rect_val_data_gpu, - p_colored_rects, - p_rects, - Realm::ProfilingRequestSet(), - e01); - if(wait_on_events) e02.wait(); - - log_app.info() << "FINISHED GPU BY IMAGE " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "STARTING CPU partitioning " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "STARTING CPU BY FIELD " << Clock::current_time_in_microseconds() << "\n"; - Event e1 = is_rects.create_subspaces_by_field(rect_id_field_data, - colors, - p_colored_rects_cpu, - Realm::ProfilingRequestSet()); - if (wait_on_events) e1.wait(); - log_app.info() << "FINISHED CPU BY FIELD " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "STARTING CPU BY IMAGE " << Clock::current_time_in_microseconds() << "\n"; - Event e2 = is_nodes.create_subspaces_by_image(rect_val_field_data, - p_colored_rects_cpu, - p_rects_cpu, - Realm::ProfilingRequestSet(), - e1); - if(wait_on_events) e2.wait(); - log_app.info() << "FINISHED CPU BY IMAGE " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "CPU Partitioning complete " << Clock::current_time_in_microseconds() << "\n"; - return e2; - } - - - - virtual int perform_dynamic_checks(void) - { - return 0; - } - - 
virtual int check_partitioning(void) - { - log_app.info() << "Checking correctness of partitioning " << "\n"; - int errors = 0; - - for (int i = 0; i < num_pieces; i++) { - for (IndexSpaceIterator<1> it(p_colored_rects[i]); it.valid; it.step()) { - for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { - if (!p_colored_rects_cpu[i].contains(point.p)) { - log_app.error() << "Mismatch! GPU has extra colored rect point " << point.p - << " on piece " << i << "\n"; - errors++; - } - } - } - for (IndexSpaceIterator<1> it(p_colored_rects_cpu[i]); it.valid; it.step()) { - for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { - if(!p_colored_rects[i].contains(point.p)) { - log_app.error() << "Mismatch! GPU is missing colored rect point " << point.p - << " on piece " << i << "\n"; - errors++; - } - } - } - for (IndexSpaceIterator<1> it(p_rects[i]); it.valid; it.step()) { - for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { - if (!p_rects_cpu[i].contains(point.p)) { - log_app.error() << "Mismatch! GPU has extra rect point " << point.p - << " on piece " << i << "\n"; - errors++; - } - } - } - for (IndexSpaceIterator<1> it(p_rects_cpu[i]); it.valid; it.step()) { - for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { - if(!p_rects[i].contains(point.p)) { - log_app.error() << "Mismatch! 
GPU is missing rect point " << point.p - << " on piece " << i << "\n"; - errors++; - } - } - } - } - return errors; - } -}; - -class Range2DTest : public TestInterface { -public: - // graph config parameters - int num_nodes = 1000; - int num_rects = 1000; - int max_rect_size = 10; - int num_pieces = 4; - - Range2DTest(int argc, const char *argv[]) - { - for(int i = 1; i < argc; i++) { - - if(!strcmp(argv[i], "-p")) { - num_pieces = atoi(argv[++i]); - continue; - } - - if(!strcmp(argv[i], "-n")) { - num_nodes = atoi(argv[++i]); - continue; - } - - if (!strcmp(argv[i], "-r")) { - num_rects = atoi(argv[++i]); - continue; - } - - if (!strcmp(argv[i], "-m")) { - max_rect_size = atoi(argv[++i]); - continue; - } - } - - if (num_nodes <= 0 || num_rects <= 0) { - log_app.error() << "Invalid graph dimensions in input file: rects=" << num_rects << " nodes=" << num_nodes; - exit(1); - } - - } - - - - struct InitDataArgs { - int index; - RegionInstance ri_nodes; - RegionInstance ri_rects; - }; - - enum PRNGStreams { - NODE_SUBGRAPH_STREAM, - }; - - void random_rect_data(int idx, int& subgraph) - { - if(random_colors) - subgraph = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_pieces); - else - subgraph = idx * num_pieces / num_rects; - } - - void random_node_data(int idx, int& subgraph) - { - if(true) - subgraph = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_pieces); - else - subgraph = idx * num_pieces / num_nodes; - } - - void initialize_rect_data(int idx, Rect<2> &rect, int max_rect_size = 10) - { - - int x = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_nodes); - int y = Philox_2x32<>::rand_int(random_seed, idx + 1, NODE_SUBGRAPH_STREAM, num_nodes); - int length = Philox_2x32<>::rand_int(random_seed, idx + 2, NODE_SUBGRAPH_STREAM, max_rect_size); - int height = Philox_2x32<>::rand_int(random_seed, idx + 3, NODE_SUBGRAPH_STREAM, max_rect_size); - rect.lo[0] = x; - rect.hi[0] = x + length; - rect.lo[1] = y; - 
rect.hi[1] = y + height; - } - - - static void init_data_task_wrapper(const void *args, size_t arglen, - const void *userdata, size_t userlen, Processor p) - { - Range2DTest *me = (Range2DTest *)testcfg; - me->init_data_task(args, arglen, p); - } - - void init_data_task(const void *args, size_t arglen, Processor p) - { - const InitDataArgs& i_args = *(const InitDataArgs *)args; - - log_app.info() << "init task #" << i_args.index << " (ri_nodes=" << i_args.ri_nodes << ", ri_rects=" << i_args.ri_rects << ")"; - - i_args.ri_nodes.fetch_metadata(p).wait(); - i_args.ri_rects.fetch_metadata(p).wait(); - - IndexSpace<2> is_nodes = i_args.ri_nodes.get_indexspace<2>(); - IndexSpace<1> is_rects = i_args.ri_rects.get_indexspace<1>(); - - log_app.debug() << "N: " << is_nodes; - log_app.debug() << "E: " << is_rects; - - { - AffineAccessor a_piece_id(i_args.ri_rects, 0 /* offset */); - - for(int i = is_rects.bounds.lo; i <= is_rects.bounds.hi; i++) { - int subgraph; - random_rect_data(i, subgraph); - a_piece_id.write(i, subgraph); - } - } - { - AffineAccessor a_piece_id(i_args.ri_nodes, 0 /* offset */); - - for(int i = is_nodes.bounds.lo[0]; i <= is_nodes.bounds.hi[0]; i++) { - for (int j = is_nodes.bounds.lo[1]; j <= is_nodes.bounds.hi[1]; j++) { - int idx = i * (is_nodes.bounds.hi[1] - is_nodes.bounds.lo[1] + 1) + j; - int subgraph; - random_node_data(idx, subgraph); - a_piece_id.write(Point<2>(i, j), subgraph); - } - } - } - - - { - - AffineAccessor, 1> a_rect(i_args.ri_rects, 1 * sizeof(int) /* offset */); - - // Read edges line by line - for(int i = is_rects.bounds.lo; i <= is_rects.bounds.hi; i++) { - Rect<2> rect; - initialize_rect_data(i, rect, max_rect_size); - a_rect.write(i, rect); - } - } - - if(show_graph) { - AffineAccessor a_piece_id(i_args.ri_nodes, 0 /* offset */); - - for(int i = is_nodes.bounds.lo[0]; i <= is_nodes.bounds.hi[1]; i++) { - for (int j = is_nodes.bounds.lo[1]; j <= is_nodes.bounds.hi[1]; j++) { - Point<2> p(i, j); - log_app.info() << "node_id[" << 
p << "] = " << a_piece_id.read(p) << "\n"; - } - } - - AffineAccessor a_rect_id(i_args.ri_rects, 0 * sizeof(Point<1>) /* offset */); - - for(int i = is_rects.bounds.lo; i <= is_rects.bounds.hi; i++) - log_app.info() << "rect_id[" << i << "] = " << a_rect_id.read(i) << "\n"; - - AffineAccessor,1> a_rect_val(i_args.ri_rects, 1 * sizeof(int) /* offset */); - - for(int i = is_rects.bounds.lo; i <= is_rects.bounds.hi; i++) - log_app.info() << "rect_val[" << i << "] = " << a_rect_val.read(i) << "\n"; - } - } - - IndexSpace<1> is_rects; - IndexSpace<2> is_nodes; - std::vector ri_nodes; - std::vector, int> > node_id_field_data; - std::vector ri_rects; - std::vector, int> > rect_id_field_data; - std::vector, Rect<2> > > rect_val_field_data; - - virtual void print_info(void) - { - printf("Realm dependent partitioning test - 2D ranges: %d nodes, %d rects, %d pieces\n", - (int)num_nodes, (int)num_rects, (int)num_pieces); - } - - virtual Event initialize_data(const std::vector& memories, - const std::vector& procs) - { - // now create index spaces for nodes and edges - is_nodes = Rect<2>(Point<2>(0, 0), Point<2>(num_nodes - 1, num_nodes - 1)); - is_rects = Rect<1>(0, num_rects - 1); - - // equal partition is used to do initial population of edges and nodes - std::vector > ss_nodes_eq; - std::vector > ss_rects_eq; - - log_app.info() << "Creating equal subspaces" << "\n"; - - is_nodes.create_equal_subspaces(num_pieces, 1, ss_nodes_eq, Realm::ProfilingRequestSet()).wait(); - is_rects.create_equal_subspaces(num_pieces, 1, ss_rects_eq, Realm::ProfilingRequestSet()).wait(); - - log_app.debug() << "Initial partitions:\n"; - for(size_t i = 0; i < ss_nodes_eq.size(); i++) - log_app.debug() << " Nodes #" << i << ": " << ss_nodes_eq[i]; - for(size_t i = 0; i < ss_rects_eq.size(); i++) - log_app.debug() << " Rects #" << i << ": " << ss_rects_eq[i]; - - // create instances for each of these subspaces - std::vector node_fields, rect_fields; - node_fields.push_back(sizeof(int)); // piece_id - 
rect_fields.push_back(sizeof(int)); // src_node - rect_fields.push_back(sizeof(Rect<2>)); // dst_node - - ri_nodes.resize(num_pieces); - node_id_field_data.resize(num_pieces); - - for(size_t i = 0; i < ss_nodes_eq.size(); i++) { - RegionInstance ri; - RegionInstance::create_instance(ri, - memories[i % memories.size()], - ss_nodes_eq[i], - node_fields, - 0 /*SOA*/, - Realm::ProfilingRequestSet()).wait(); - ri_nodes[i] = ri; - - node_id_field_data[i].index_space = ss_nodes_eq[i]; - node_id_field_data[i].inst = ri_nodes[i]; - node_id_field_data[i].field_offset = 0; - } - - ri_rects.resize(num_pieces); - rect_id_field_data.resize(num_pieces); - rect_val_field_data.resize(num_pieces); - - for(size_t i = 0; i < ss_rects_eq.size(); i++) { - RegionInstance ri; - RegionInstance::create_instance(ri, - memories[i % memories.size()], - ss_rects_eq[i], - rect_fields, - 0 /*SOA*/, - Realm::ProfilingRequestSet()).wait(); - ri_rects[i] = ri; - - rect_id_field_data[i].index_space = ss_rects_eq[i]; - rect_id_field_data[i].inst = ri_rects[i]; - rect_id_field_data[i].field_offset = 0; - - rect_val_field_data[i].index_space = ss_rects_eq[i]; - rect_val_field_data[i].inst = ri_rects[i]; - rect_val_field_data[i].field_offset = 1 * sizeof(int); - } - - // fire off tasks to initialize data - std::set events; - for(int i = 0; i < num_pieces; i++) { - Processor p = procs[i % procs.size()]; - InitDataArgs args; - args.index = i; - args.ri_nodes = ri_nodes[i]; - args.ri_rects = ri_rects[i]; - Event e = p.spawn(INIT_RANGE2D_DATA_TASK, &args, sizeof(args)); - events.insert(e); - } - - return Event::merge_events(events); - } - - // the outputs of our partitioning will be: - // is_private, is_shared - subsets of is_nodes based on private/shared - // p_rd, p_wr, p_ghost - subsets of the above split by subckt - // p_edges - subsets of is_edges for each subckt - - std::vector > p_colored_rects; - std::vector> p_rects, p_intersect, p_diff; - std::vector> p_colored_rects_cpu; - std::vector> 
p_rects_cpu, p_intersect_cpu, p_diff_cpu; - - IndexSpace<2> cpu_union, gpu_union, garbage_union; - - virtual Event perform_partitioning(void) - { - // first partition nodes by subckt id (this is the independent partition, - // but not actually used by the app) - - std::vector colors(num_pieces); - for(int i = 0; i < num_pieces; i++) - colors[i] = i; - - Memory gpu_memory; - bool found_gpu_memory = false; - Machine machine = Machine::get_machine(); - std::set all_memories; - machine.get_all_memories(all_memories); - for(auto& memory : all_memories) { - if(memory.kind() == Memory::GPU_FB_MEM) { - gpu_memory = memory; - found_gpu_memory = true; - break; - } - } - assert(found_gpu_memory); - std::vector rect_fields; - rect_fields.push_back(sizeof(int)); - rect_fields.push_back(sizeof(Rect<2>)); - std::vector node_fields; - node_fields.push_back(sizeof(int)); - - std::vector, int > > node_id_data_gpu; - std::vector, int > > rect_id_data_gpu; - std::vector, Rect<2>>> rect_val_data_gpu; - node_id_data_gpu.resize(num_pieces); - rect_id_data_gpu.resize(num_pieces); - rect_val_data_gpu.resize(num_pieces); - for (int i = 0; i < num_pieces; i++) { - RegionInstance node_id_instance; - RegionInstance rect_id_instance; - RegionInstance rect_val_instance; - RegionInstance::create_instance(node_id_instance, - gpu_memory, - node_id_field_data[i].index_space, - node_fields, - 0 /*SOA*/, - Realm::ProfilingRequestSet()).wait(); - RegionInstance::create_instance(rect_id_instance, - gpu_memory, - rect_id_field_data[i].index_space, - rect_fields, - 0 /*SOA*/, - Realm::ProfilingRequestSet()).wait(); - RegionInstance::create_instance(rect_val_instance, - gpu_memory, - rect_val_field_data[i].index_space, - rect_fields, - 0 /*SOA*/, - Realm::ProfilingRequestSet()).wait(); - CopySrcDstField node_id_gpu_field, node_id_cpu_field, rect_id_gpu_field, rect_id_cpu_field, rect_val_gpu_field, rect_val_cpu_field; - node_id_gpu_field.inst = node_id_instance; - node_id_gpu_field.size = sizeof(int); - 
node_id_gpu_field.field_id = 0; - node_id_cpu_field.inst = node_id_field_data[i].inst; - node_id_cpu_field.size = sizeof(int); - node_id_cpu_field.field_id = 0; - rect_id_gpu_field.inst = rect_id_instance; - rect_id_gpu_field.size = sizeof(int); - rect_id_gpu_field.field_id = 0; - rect_id_cpu_field.inst = rect_id_field_data[i].inst; - rect_id_cpu_field.size = sizeof(int); - rect_id_cpu_field.field_id = 0; - rect_val_gpu_field.inst = rect_val_instance; - rect_val_gpu_field.size = sizeof(Rect<2>); - rect_val_gpu_field.field_id = sizeof(int); - rect_val_cpu_field.inst = rect_val_field_data[i].inst; - rect_val_cpu_field.size = sizeof(Rect<2>); - rect_val_cpu_field.field_id = sizeof(int); - std::vector node_id_gpu_data, node_id_cpu_data, rect_id_gpu_data, rect_id_cpu_data, rect_val_gpu_data, rect_val_cpu_data; - node_id_gpu_data.push_back(node_id_gpu_field); - node_id_cpu_data.push_back(node_id_cpu_field); - rect_id_gpu_data.push_back(rect_id_gpu_field); - rect_id_cpu_data.push_back(rect_id_cpu_field); - rect_val_gpu_data.push_back(rect_val_gpu_field); - rect_val_cpu_data.push_back(rect_val_cpu_field); - Event copy_event = node_id_field_data[i].index_space.copy(node_id_cpu_data, node_id_gpu_data, Realm::ProfilingRequestSet()); - copy_event.wait(); - Event second_copy_event = rect_id_field_data[i].index_space.copy(rect_id_cpu_data, rect_id_gpu_data, Realm::ProfilingRequestSet()); - second_copy_event.wait(); - Event third_copy_event = rect_val_field_data[i].index_space.copy(rect_val_cpu_data, rect_val_gpu_data, Realm::ProfilingRequestSet()); - third_copy_event.wait(); - node_id_data_gpu[i].inst = node_id_instance; - node_id_data_gpu[i].index_space = node_id_field_data[i].index_space; - node_id_data_gpu[i].field_offset = 0; - rect_id_data_gpu[i].inst = rect_id_instance; - rect_id_data_gpu[i].index_space = rect_id_field_data[i].index_space; - rect_id_data_gpu[i].field_offset = 0; - rect_val_data_gpu[i].inst = rect_val_instance; - rect_val_data_gpu[i].index_space = 
rect_val_field_data[i].index_space; - rect_val_data_gpu[i].field_offset = sizeof(int); - } - wait_on_events = true; - std::vector> p_garbage_colors; - std::vector> p_garbage_rects; - log_app.info() << "WARMING UP " << "\n"; - - std::vector> field_estimate_input(rect_id_data_gpu.size()); - std::vector field_estimate_output(rect_id_data_gpu.size()); - std::vector> image_estimate_input(rect_val_data_gpu.size()); - std::vector image_estimate_output(rect_val_data_gpu.size()); - std::vector> subspace_input(colors.size()); - for (size_t i = 0; i < rect_id_data_gpu.size(); i++) { - field_estimate_input[i].location = rect_id_data_gpu[i].inst.get_location(); - field_estimate_input[i].space = rect_id_data_gpu[i].index_space; - } - for (size_t i = 0; i < rect_val_data_gpu.size(); i++) { - image_estimate_input[i].location = rect_val_data_gpu[i].inst.get_location(); - image_estimate_input[i].space = rect_val_data_gpu[i].index_space; - } - - is_rects.by_field_buffer_requirements(field_estimate_input, field_estimate_output); - std::vector byte_fields = {sizeof(char)}; - for (size_t i = 0; i < rect_id_data_gpu.size(); i++) { - IndexSpace<1> instance_index_space(Rect<1>(0, field_estimate_output[i].upper_bound-1)); - RegionInstance::create_instance(rect_id_data_gpu[i].scratch_buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); - } - - Event e001 = is_rects.create_subspaces_by_field(rect_id_data_gpu, - colors, - p_garbage_colors, - Realm::ProfilingRequestSet()); - if (wait_on_events) e001.wait(); - for (size_t i = 0; i < colors.size(); i++) { - subspace_input[i].space = p_garbage_colors[i]; - subspace_input[i].entries = p_garbage_colors[i].sparsity.impl()->get_entries().size(); - } - is_nodes.by_image_buffer_requirements(subspace_input, image_estimate_input, image_estimate_output); - for (size_t i = 0; i < rect_val_data_gpu.size(); i++) { - IndexSpace<1> instance_index_space(Rect<1>(0, (image_estimate_output[i].upper_bound*5)-1)); - 
RegionInstance::create_instance(rect_val_data_gpu[i].scratch_buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); - } - Event e002 = is_nodes.create_subspaces_by_image(rect_val_data_gpu, - p_garbage_colors, - p_garbage_rects, - Realm::ProfilingRequestSet(), - e001); - if(wait_on_events) e002.wait(); - - log_app.info() << "FINISHED WARMING UP " << "\n"; - log_app.info() << "starting GPU partitioning " << Clock::current_time_in_microseconds() << "\n"; - - log_app.info() << "STARTING GPU BY FIELD " << Clock::current_time_in_microseconds() << "\n"; - - Event e01 = is_rects.create_subspaces_by_field(rect_id_data_gpu, - colors, - p_colored_rects, - Realm::ProfilingRequestSet()); - if (wait_on_events) e01.wait(); - - log_app.info() << "FINISHED GPU BY FIELD " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "STARTING GPU BY IMAGE " << Clock::current_time_in_microseconds() << "\n"; - Event e02 = is_nodes.create_subspaces_by_image(rect_val_data_gpu, - p_colored_rects, - p_rects, - Realm::ProfilingRequestSet(), - e01); - if(wait_on_events) e02.wait(); - log_app.info() << "FINISHED GPU BY IMAGE " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "GPU Partitioning complete " << Clock::current_time_in_microseconds() << "\n"; - - log_app.info() << "STARTING CPU partitioning " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "STARTING CPU BY FIELD " << Clock::current_time_in_microseconds() << "\n"; - Event e1 = is_rects.create_subspaces_by_field(rect_id_field_data, - colors, - p_colored_rects_cpu, - Realm::ProfilingRequestSet()); - if (wait_on_events) e1.wait(); - log_app.info() << "FINISHED CPU BY FIELD " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "STARTING CPU BY IMAGE " << Clock::current_time_in_microseconds() << "\n"; - Event e2 = is_nodes.create_subspaces_by_image(rect_val_field_data, - p_colored_rects_cpu, - p_rects_cpu, - 
Realm::ProfilingRequestSet(), - e1); - if(wait_on_events) e2.wait(); - log_app.info() << "FINISHED CPU BY IMAGE " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "CPU Partitioning complete " << Clock::current_time_in_microseconds() << "\n"; - return e2; - } - - - - virtual int perform_dynamic_checks(void) - { - return 0; - } - - virtual int check_partitioning(void) - { - log_app.info() << "Checking correctness of partitioning " << "\n"; - int errors = 0; - - for (int i = 0; i < num_pieces; i++) { - for (IndexSpaceIterator<1> it(p_colored_rects[i]); it.valid; it.step()) { - for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { - if(!p_colored_rects_cpu[i].contains(point.p)) { - log_app.error() << "Mismatch! GPU has extra colored rect point " << point.p - << " on piece " << i << "\n"; - errors++; - } - } - } - for (IndexSpaceIterator<1> it(p_colored_rects_cpu[i]); it.valid; it.step()) { - for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { - if (!p_colored_rects[i].contains(point.p)) { - log_app.error() << "Mismatch! GPU is missing colored rect point " << point.p - << " on piece " << i << "\n"; - errors++; - } - } - } - for (IndexSpaceIterator<2> it(p_rects[i]); it.valid; it.step()) { - for (PointInRectIterator<2> point(it.rect); point.valid; point.step()) { - if (!p_rects_cpu[i].contains(point.p)) { - log_app.error() << "Mismatch! GPU has extra rect point " << point.p - << " on piece " << i << "\n"; - errors++; - } - } - } - for (IndexSpaceIterator<2> it(p_rects_cpu[i]); it.valid; it.step()) { - for (PointInRectIterator<2> point(it.rect); point.valid; point.step()) { - if (!p_rects[i].contains(point.p)) { - log_app.error() << "Mismatch! 
GPU is missing rect point " << point.p - << " on piece " << i << "\n"; - errors++; - } - } - } - } - return errors; - } -}; - -class MiniAeroTest : public TestInterface { -public: - enum ProblemType - { - PTYPE_0, - PTYPE_1, - PTYPE_2, - }; - enum FaceType - { - BC_INTERIOR = 0, - BC_TANGENT = 1, - BC_EXTRAPOLATE = 2, - BC_INFLOW = 3, - BC_NOSLIP = 4, - BC_BLOCK_BORDER = 5, - BC_TOTAL = 6, - }; - - ProblemType problem_type = PTYPE_0; - int global_x = 4; - int global_y = 4; - int global_z = 4; - int blocks_x = 2; - int blocks_y = 2; - int blocks_z = 2; - - int n_cells; // total cell count - int n_blocks; // total block count - int n_faces; // total face count - std::vector xsplit, ysplit, zsplit; // cut planes - std::vector cells_per_block, faces_per_block; - - // can't do 64-bit index types right now, so at least get most of our 32-bit space - typedef int INDEXTYPE; - static const INDEXTYPE FIRST_INDEX = -2000000000; // easier to read than INT_MIN+1 - - MiniAeroTest(int argc, const char *argv[]) - { -#define INT_ARG(s, v) \ - if(!strcmp(argv[i], s)) { \ - v = atoi(argv[++i]); \ - continue; \ - } - for(int i = 1; i < argc; i++) { - if(!strcmp(argv[i], "-type")) { - problem_type = (ProblemType)atoi(argv[++i]); - continue; - } - INT_ARG("-gx", global_x); - INT_ARG("-gy", global_y); - INT_ARG("-gz", global_z); - INT_ARG("-bx", blocks_x); - INT_ARG("-by", blocks_y); - INT_ARG("-bz", blocks_z); - if(!strcmp(argv[i], "-g")) { - int v = atoi(argv[++i]); - global_x = global_y = global_z = v; - continue; - } - if(!strcmp(argv[i], "-b")) { - int v = atoi(argv[++i]); - blocks_x = blocks_y = blocks_z = v; - continue; - } - } -#undef INT_ARG - - // don't allow degenerate blocks - assert(global_x >= blocks_x); - assert(global_y >= blocks_y); - assert(global_z >= blocks_z); - - split_evenly(global_x, blocks_x, xsplit); - split_evenly(global_y, blocks_y, ysplit); - split_evenly(global_z, blocks_z, zsplit); - - n_blocks = blocks_x * blocks_y * blocks_z; - n_cells = 0; - n_faces = 0; 
- for(int bz = 0; bz < blocks_z; bz++) - for(int by = 0; by < blocks_y; by++) - for(int bx = 0; bx < blocks_x; bx++) { - int nx = xsplit[bx + 1] - xsplit[bx]; - int ny = ysplit[by + 1] - ysplit[by]; - int nz = zsplit[bz + 1] - zsplit[bz]; - - int c = nx * ny * nz; - int f = (((nx + 1) * ny * nz) + (nx * (ny + 1) * nz) + (nx * ny * (nz + 1))); - cells_per_block.push_back(c); - faces_per_block.push_back(f); - - n_cells += c; - n_faces += f; - } - assert(n_cells == global_x * global_y * global_z); - assert(n_faces == (((global_x + blocks_x) * global_y * global_z) + - (global_x * (global_y + blocks_y) * global_z) + - (global_x * global_y * (global_z + blocks_z)))); - } - - virtual void print_info(void) - { - printf("Realm dependent partitioning test - miniaero: %d x %d x %d cells, %d x %d x " - "%d blocks\n", - (int)global_x, (int)global_y, (int)global_z, (int)blocks_x, (int)blocks_y, - (int)blocks_z); - } - - IndexSpace<1> is_cells, is_faces; - std::vector ri_cells; - std::vector, int>> cell_blockid_field_data; - std::vector ri_faces; - std::vector, Point<1>>> face_left_field_data; - std::vector, Point<1>>> face_right_field_data; - std::vector, int>> face_type_field_data; - - struct InitDataArgs { - int index; - RegionInstance ri_cells, ri_faces; - }; - - virtual Event initialize_data(const std::vector &memories, - const std::vector &procs) - { - // top level index spaces - is_cells = Rect<1>(FIRST_INDEX, FIRST_INDEX + n_cells - 1); - is_faces = Rect<1>(FIRST_INDEX, FIRST_INDEX + n_faces - 1); - - // weighted partitions based on the distribution we already computed - std::vector> ss_cells_w; - std::vector> ss_faces_w; - - is_cells - .create_weighted_subspaces(n_blocks, 1, cells_per_block, ss_cells_w, - Realm::ProfilingRequestSet()) - .wait(); - is_faces - .create_weighted_subspaces(n_blocks, 1, faces_per_block, ss_faces_w, - Realm::ProfilingRequestSet()) - .wait(); - - log_app.debug() << "Initial partitions:"; - for(size_t i = 0; i < ss_cells_w.size(); i++) - 
log_app.debug() << " Cells #" << i << ": " << ss_cells_w[i]; - for(size_t i = 0; i < ss_faces_w.size(); i++) - log_app.debug() << " Faces #" << i << ": " << ss_faces_w[i]; - - // create instances for each of these subspaces - std::vector cell_fields, face_fields; - cell_fields.push_back(sizeof(int)); // blockid - assert(sizeof(int) == sizeof(Point<1>)); - face_fields.push_back(sizeof(Point<1>)); // left - face_fields.push_back(sizeof(Point<1>)); // right - face_fields.push_back(sizeof(int)); // type - - ri_cells.resize(n_blocks); - cell_blockid_field_data.resize(n_blocks); - - for(size_t i = 0; i < ss_cells_w.size(); i++) { - RegionInstance ri; - RegionInstance::create_instance(ri, memories[i % memories.size()], ss_cells_w[i], - cell_fields, 0 /*SOA*/, - Realm::ProfilingRequestSet()) - .wait(); - ri_cells[i] = ri; - - cell_blockid_field_data[i].index_space = ss_cells_w[i]; - cell_blockid_field_data[i].inst = ri_cells[i]; - cell_blockid_field_data[i].field_offset = 0; - } - - ri_faces.resize(n_blocks); - face_left_field_data.resize(n_blocks); - face_right_field_data.resize(n_blocks); - face_type_field_data.resize(n_blocks); - - for(size_t i = 0; i < ss_faces_w.size(); i++) { - RegionInstance ri; - RegionInstance::create_instance(ri, memories[i % memories.size()], ss_faces_w[i], - face_fields, 0 /*SOA*/, - Realm::ProfilingRequestSet()) - .wait(); - ri_faces[i] = ri; - - face_left_field_data[i].index_space = ss_faces_w[i]; - face_left_field_data[i].inst = ri_faces[i]; - face_left_field_data[i].field_offset = 0 * sizeof(Point<1>); - - face_right_field_data[i].index_space = ss_faces_w[i]; - face_right_field_data[i].inst = ri_faces[i]; - face_right_field_data[i].field_offset = 1 * sizeof(Point<1>); - - face_type_field_data[i].index_space = ss_faces_w[i]; - face_type_field_data[i].inst = ri_faces[i]; - face_type_field_data[i].field_offset = 2 * sizeof(Point<1>); - } - - // fire off tasks to initialize data - std::set events; - for(int i = 0; i < n_blocks; i++) { - 
Processor p = procs[i % memories.size()]; - InitDataArgs args; - args.index = i; - args.ri_cells = ri_cells[i]; - args.ri_faces = ri_faces[i]; - Event e = p.spawn(INIT_MINIAERO_DATA_TASK, &args, sizeof(args)); - events.insert(e); - } - - return Event::merge_events(events); - } - - static void init_data_task_wrapper(const void *args, size_t arglen, - const void *userdata, size_t userlen, Processor p) - { - MiniAeroTest *me = (MiniAeroTest *)testcfg; - me->init_data_task(args, arglen, p); - } - - Point<1> global_cell_pointer(int cx, int cy, int cz) - { - INDEXTYPE p = FIRST_INDEX; - - // out of range? return -1 - if((cx < 0) || (cx >= global_x) || (cy < 0) || (cy >= global_y) || (cz < 0) || - (cz >= global_z)) - return -1; - - // first chunks in z, then y, then x - int zi = find_split(zsplit, cz); - p += global_x * global_y * zsplit[zi]; - cz -= zsplit[zi]; - int local_z = zsplit[zi + 1] - zsplit[zi]; - - int yi = find_split(ysplit, cy); - p += global_x * ysplit[yi] * local_z; - cy -= ysplit[yi]; - int local_y = ysplit[yi + 1] - ysplit[yi]; - - int xi = find_split(xsplit, cx); - p += xsplit[xi] * local_y * local_z; - cx -= xsplit[xi]; - int local_x = xsplit[xi + 1] - xsplit[xi]; - - // now local addressing within this block - p += (cx + (cy * local_x) + (cz * local_x * local_y)); - return p; - } - - void init_data_task(const void *args, size_t arglen, Processor p) - { - const InitDataArgs &i_args = *(const InitDataArgs *)args; - - i_args.ri_cells.fetch_metadata(p).wait(); - i_args.ri_faces.fetch_metadata(p).wait(); - - log_app.info() << "init task #" << i_args.index << " (ri_cells=" << i_args.ri_cells - << ", ri_faces=" << i_args.ri_faces << ")"; - - IndexSpace<1> is_cells = i_args.ri_cells.get_indexspace<1>(); - IndexSpace<1> is_faces = i_args.ri_faces.get_indexspace<1>(); - - log_app.debug() << "C: " << is_cells; - log_app.debug() << "F: " << is_faces; - - int bx = i_args.index % blocks_x; - int by = (i_args.index / blocks_x) % blocks_y; - int bz = i_args.index / 
blocks_x / blocks_y; - - size_t nx = xsplit[bx + 1] - xsplit[bx]; - size_t ny = ysplit[by + 1] - ysplit[by]; - size_t nz = zsplit[bz + 1] - zsplit[bz]; - - size_t c = nx * ny * nz; - size_t f = (((nx + 1) * ny * nz) + (nx * (ny + 1) * nz) + (nx * ny * (nz + 1))); - assert(is_cells.bounds.volume() == c); - assert(is_faces.bounds.volume() == f); - - // cells are all assigned to the local block - { - AffineAccessor a_cell_blockid(i_args.ri_cells, 0 /* offset */); - - for(int cz = zsplit[bz]; cz < zsplit[bz + 1]; cz++) - for(int cy = ysplit[by]; cy < ysplit[by + 1]; cy++) - for(int cx = xsplit[bx]; cx < xsplit[bx + 1]; cx++) { - Point<1> pz = global_cell_pointer(cx, cy, cz); - assert(is_cells.bounds.contains(pz)); - - a_cell_blockid.write(pz, i_args.index); - } - } - - // faces aren't in any globally-visible order - { - AffineAccessor, 1> a_face_left(i_args.ri_faces, - 0 * sizeof(Point<1>) /* offset */); - AffineAccessor, 1> a_face_right(i_args.ri_faces, - 1 * sizeof(Point<1>) /* offset */); - AffineAccessor a_face_type(i_args.ri_faces, - 2 * sizeof(Point<1>) /* offset */); - - Point<1> pf = is_faces.bounds.lo; - - // -- type 0 | type 1 | type 2 - // -- ------ | ------ | ------ - // -- left extrapolate | inflow | inflow - // -- right extrapolate | extrapolate | extrapolate - // -- down tangent | noslip | tangent - // -- up tangent | extrapolate | tangent - // -- back tangent | tangent | tangent - // -- front tangent | tangent | tangent - - // left/right faces first - for(int fx = xsplit[bx]; fx <= xsplit[bx + 1]; fx++) { - int ftype = BC_INTERIOR; - bool reversed = false; - if(fx == xsplit[bx]) { - // low boundary - reversed = true; - if(fx == 0) - switch(problem_type) { - case PTYPE_0: - ftype = BC_EXTRAPOLATE; - break; - case PTYPE_1: - ftype = BC_INFLOW; - break; - case PTYPE_2: - ftype = BC_INFLOW; - break; - } - else - ftype = BC_BLOCK_BORDER; - } else if(fx == xsplit[bx + 1]) { - // high boundary - if(fx == global_x) - switch(problem_type) { - case PTYPE_0: - 
ftype = BC_EXTRAPOLATE; - break; - case PTYPE_1: - ftype = BC_EXTRAPOLATE; - break; - case PTYPE_2: - ftype = BC_EXTRAPOLATE; - break; - } - else - ftype = BC_BLOCK_BORDER; - } - - for(int cz = zsplit[bz]; cz < zsplit[bz + 1]; cz++) - for(int cy = ysplit[by]; cy < ysplit[by + 1]; cy++) { - a_face_left.write(pf, global_cell_pointer(fx - (reversed ? 0 : 1), cy, cz)); - a_face_right.write(pf, global_cell_pointer(fx - (reversed ? 1 : 0), cy, cz)); - a_face_type.write(pf, ftype); - pf[0]++; - } - } - - // down/up faces next - for(int fy = ysplit[by]; fy <= ysplit[by + 1]; fy++) { - int ftype = BC_INTERIOR; - bool reversed = false; - if(fy == ysplit[by]) { - // low boundary - reversed = true; - if(fy == 0) - switch(problem_type) { - case PTYPE_0: - ftype = BC_TANGENT; - break; - case PTYPE_1: - ftype = BC_NOSLIP; - break; - case PTYPE_2: - ftype = BC_TANGENT; - break; - } - else - ftype = BC_BLOCK_BORDER; - } else if(fy == ysplit[by + 1]) { - // high boundary - if(fy == global_y) - switch(problem_type) { - case PTYPE_0: - ftype = BC_TANGENT; - break; - case PTYPE_1: - ftype = BC_EXTRAPOLATE; - break; - case PTYPE_2: - ftype = BC_TANGENT; - break; - } - else - ftype = BC_BLOCK_BORDER; - } - - for(int cz = zsplit[bz]; cz < zsplit[bz + 1]; cz++) - for(int cx = xsplit[bx]; cx < xsplit[bx + 1]; cx++) { - a_face_left.write(pf, global_cell_pointer(cx, fy - (reversed ? 0 : 1), cz)); - a_face_right.write(pf, global_cell_pointer(cx, fy - (reversed ? 
1 : 0), cz)); - a_face_type.write(pf, ftype); - pf[0]++; - } - } - - // back/front faces last - for(int fz = zsplit[bz]; fz <= zsplit[bz + 1]; fz++) { - int ftype = BC_INTERIOR; - bool reversed = false; - if(fz == zsplit[bz]) { - // low boundary - reversed = true; - if(fz == 0) - switch(problem_type) { - case PTYPE_0: - ftype = BC_TANGENT; - break; - case PTYPE_1: - ftype = BC_TANGENT; - break; - case PTYPE_2: - ftype = BC_TANGENT; - break; - } - else - ftype = BC_BLOCK_BORDER; - } else if(fz == zsplit[bz + 1]) { - // high boundary - if(fz == global_z) - switch(problem_type) { - case PTYPE_0: - ftype = BC_TANGENT; - break; - case PTYPE_1: - ftype = BC_TANGENT; - break; - case PTYPE_2: - ftype = BC_TANGENT; - break; - } - else - ftype = BC_BLOCK_BORDER; - } - - for(int cy = ysplit[by]; cy < ysplit[by + 1]; cy++) - for(int cx = xsplit[bx]; cx < xsplit[bx + 1]; cx++) { - a_face_left.write(pf, global_cell_pointer(cx, cy, fz - (reversed ? 0 : 1))); - a_face_right.write(pf, global_cell_pointer(cx, cy, fz - (reversed ? 
1 : 0))); - a_face_type.write(pf, ftype); - pf[0]++; - } - } - - assert(pf[0] == is_faces.bounds.hi[0] + 1); - } - - if(show_graph) { - AffineAccessor a_cell_blockid(i_args.ri_cells, 0 /* offset */); - - for(int i = is_cells.bounds.lo[0]; i <= is_cells.bounds.hi[0]; i++) - std::cout << "Z[" << i << "]: blockid=" << a_cell_blockid.read(i) << "\n"; - - AffineAccessor, 1> a_face_left(i_args.ri_faces, - 0 * sizeof(Point<1>) /* offset */); - AffineAccessor, 1> a_face_right(i_args.ri_faces, - 1 * sizeof(Point<1>) /* offset */); - AffineAccessor a_face_type(i_args.ri_faces, - 2 * sizeof(Point<1>) /* offset */); - - for(int i = is_faces.bounds.lo[0]; i <= is_faces.bounds.hi[0]; i++) - std::cout << "S[" << i << "]:" - << " left=" << a_face_left.read(i) << " right=" << a_face_right.read(i) - << " type=" << a_face_type.read(i) << "\n"; - } - } - - // the outputs of our partitioning will be: - // p_cells - subsets of is_cells split by block - // p_faces - subsets of_is_faces split by block (based on left cell) - // p_facetypes[6] - subsets of p_faces split further by face type - // p_ghost - subsets of is_cells reachable by each block's boundary faces - - std::vector> p_cells; - std::vector> p_faces; - std::vector>> p_facetypes; - std::vector> p_ghost; - - virtual Event perform_partitioning(void) - { - // partition cells first - std::vector colors(n_blocks); - for(int i = 0; i < n_blocks; i++) - colors[i] = i; - - Event e1 = is_cells.create_subspaces_by_field(cell_blockid_field_data, colors, - p_cells, Realm::ProfilingRequestSet()); - if(wait_on_events) - e1.wait(); - - // now a preimage to get faces - Event e2 = is_faces.create_subspaces_by_preimage( - face_left_field_data, p_cells, p_faces, Realm::ProfilingRequestSet(), e1); - if(wait_on_events) - e2.wait(); - - // now split by face type - std::set evs; - std::vector ftcolors(BC_TOTAL); - for(int i = 0; i < BC_TOTAL; i++) - ftcolors[i] = i; - p_facetypes.resize(n_blocks); - std::vector> p_border_faces(n_blocks); - - for(int 
idx = 0; idx < n_blocks; idx++) { - Event e = p_faces[idx].create_subspaces_by_field(face_type_field_data, ftcolors, - p_facetypes[idx], - Realm::ProfilingRequestSet(), e2); - if(wait_on_events) - e.wait(); - evs.insert(e); - p_border_faces[idx] = p_facetypes[idx][BC_BLOCK_BORDER]; - } - Event e3 = Event::merge_events(evs); - - // finally, the image of just the boundary faces through the right face gets us - // ghost cells - Event e4 = is_cells.create_subspaces_by_image( - face_right_field_data, p_border_faces, p_ghost, Realm::ProfilingRequestSet(), e3); - if(wait_on_events) - e4.wait(); - - return e4; - } - - virtual int perform_dynamic_checks(void) - { - int errors = 0; - - std::vector> p_int_faces, p_border_faces; - for(int idx = 0; idx < n_blocks; idx++) { - p_int_faces.push_back(p_facetypes[idx][BC_INTERIOR]); - p_border_faces.push_back(p_facetypes[idx][BC_BLOCK_BORDER]); - } - // miniaero's checks are faster with image/diff on 1 thread, but slower on 4 -#ifdef MINIAERO_USE_IMAGE_DIFF - std::vector> p_l_test, p_ri_test, p_rb_test; - Event e4 = is_cells.create_subspaces_by_image_with_difference( - face_left_field_data, p_faces, p_cells, p_l_test, Realm::ProfilingRequestSet()); - Event e5 = is_cells.create_subspaces_by_image_with_difference( - face_right_field_data, p_int_faces, p_cells, p_ri_test, - Realm::ProfilingRequestSet()); - Event e6 = is_cells.create_subspaces_by_image_with_difference( - face_right_field_data, p_border_faces, p_ghost, p_rb_test, - Realm::ProfilingRequestSet()); -#else - std::vector> p_img_left, p_img_right_i, p_img_right_b; - Event e1 = is_cells.create_subspaces_by_image( - face_left_field_data, p_faces, p_img_left, Realm::ProfilingRequestSet()); - Event e2 = is_cells.create_subspaces_by_image( - face_right_field_data, p_int_faces, p_img_right_i, Realm::ProfilingRequestSet()); - Event e3 = - is_cells.create_subspaces_by_image(face_right_field_data, p_border_faces, - p_img_right_b, Realm::ProfilingRequestSet()); - std::vector> p_l_test, 
p_ri_test, p_rb_test; - Event e4 = IndexSpace<1>::compute_differences(p_img_left, p_cells, p_l_test, - Realm::ProfilingRequestSet(), e1); - for(unsigned idx = 0; idx < p_img_left.size(); idx++) { - p_img_left[idx].destroy(e4); - } - Event e5 = IndexSpace<1>::compute_differences(p_img_right_i, p_cells, p_ri_test, - Realm::ProfilingRequestSet(), e2); - for(unsigned idx = 0; idx < p_img_right_i.size(); idx++) { - p_img_right_i[idx].destroy(e5); - } - Event e6 = IndexSpace<1>::compute_differences(p_img_right_b, p_ghost, p_rb_test, - Realm::ProfilingRequestSet(), e3); - for(unsigned idx = 0; idx < p_img_right_b.size(); idx++) { - p_img_right_b[idx].destroy(e6); - } -#endif - errors += check_empty(e4, p_l_test, "p_l_test"); - errors += check_empty(e5, p_ri_test, "p_ri_test"); - errors += check_empty(e6, p_rb_test, "p_rb_test"); - for(unsigned idx = 0; idx < p_l_test.size(); idx++) { - p_l_test[idx].destroy(e4); - } - for(unsigned idx = 0; idx < p_ri_test.size(); idx++) { - p_ri_test[idx].destroy(e5); - } - for(unsigned idx = 0; idx < p_rb_test.size(); idx++) { - p_rb_test[idx].destroy(e6); - } - - return errors; - } - - virtual int check_partitioning(void) - { - int errors = 0; - - Point<1> pc = is_cells.bounds.lo; - Point<1> pf = is_faces.bounds.lo; - - for(int blkid = 0; blkid < n_blocks; blkid++) { - int bx = blkid % blocks_x; - int by = (blkid / blocks_x) % blocks_y; - int bz = blkid / blocks_x / blocks_y; - - int nx = xsplit[bx + 1] - xsplit[bx]; - int ny = ysplit[by + 1] - ysplit[by]; - int nz = zsplit[bz + 1] - zsplit[bz]; - - // check cells - for(int i = 0; i < cells_per_block[blkid]; i++) { - for(int j = 0; j < n_blocks; j++) { - bool exp = (j == blkid); - bool act = p_cells[j].contains(pc); - if(exp != act) { - log_app.error() << "mismatch: cell " << pc << " in p_cells[" << j - << "]: exp=" << exp << " act=" << act; - errors++; - } - } - - std::set exp_ghosts; - int cx = i % nx; - int cy = (i / nx) % ny; - int cz = i / nx / ny; - if((cx == 0) && (bx > 0)) - 
exp_ghosts.insert(blkid - 1); - if((cx == (nx - 1)) && (bx < (blocks_x - 1))) - exp_ghosts.insert(blkid + 1); - if((cy == 0) && (by > 0)) - exp_ghosts.insert(blkid - blocks_x); - if((cy == (ny - 1)) && (by < (blocks_y - 1))) - exp_ghosts.insert(blkid + blocks_x); - if((cz == 0) && (bz > 0)) - exp_ghosts.insert(blkid - blocks_x * blocks_y); - if((cz == (nz - 1)) && (bz < (blocks_z - 1))) - exp_ghosts.insert(blkid + blocks_x * blocks_y); - - for(int j = 0; j < n_blocks; j++) { - bool exp = exp_ghosts.count(j) > 0; - bool act = p_ghost[j].contains(pc); - if(exp != act) { - log_app.error() << "mismatch: cell " << pc << " in p_ghost[" << j - << "]: exp=" << exp << " act=" << act; - errors++; - } - } - - pc[0]++; - } - - // check faces - for(int i = 0; i < faces_per_block[blkid]; i++) { - for(int j = 0; j < n_blocks; j++) { - bool exp = (j == blkid); - bool act = p_faces[j].contains(pf); - if(exp != act) { - log_app.error() << "mismatch: face " << pf << " in p_faces[" << j - << "]: exp=" << exp << " act=" << act; - errors++; - } - FaceType exptype = BC_INTERIOR; - // luckily the faces on the edge of a block come in chunks - int lr_faces = (nx + 1) * ny * nz; - int du_faces = nx * (ny + 1) * nz; - int bf_faces = nx * ny * (nz + 1); - assert((lr_faces + du_faces + bf_faces) == faces_per_block[blkid]); - if(i < lr_faces) { - int x = i / ny / nz; - if(x == 0) - exptype = ((bx == 0) ? ((problem_type == PTYPE_0) ? BC_EXTRAPOLATE - : (problem_type == PTYPE_1) ? BC_INFLOW - : BC_INFLOW) - : BC_BLOCK_BORDER); - if(x == nx) - exptype = - ((bx == blocks_x - 1) ? ((problem_type == PTYPE_0) ? BC_EXTRAPOLATE - : (problem_type == PTYPE_1) ? BC_EXTRAPOLATE - : BC_EXTRAPOLATE) - : BC_BLOCK_BORDER); - } else if(i < (lr_faces + du_faces)) { - int y = (i - lr_faces) / nx / nz; - if(y == 0) - exptype = ((by == 0) ? ((problem_type == PTYPE_0) ? BC_TANGENT - : (problem_type == PTYPE_1) ? BC_NOSLIP - : BC_TANGENT) - : BC_BLOCK_BORDER); - if(y == ny) - exptype = - ((by == blocks_y - 1) ? 
((problem_type == PTYPE_0) ? BC_TANGENT - : (problem_type == PTYPE_1) ? BC_EXTRAPOLATE - : BC_TANGENT) - : BC_BLOCK_BORDER); - } else { - int z = (i - lr_faces - du_faces) / nx / ny; - if(z == 0) - exptype = ((bz == 0) ? ((problem_type == PTYPE_0) ? BC_TANGENT - : (problem_type == PTYPE_1) ? BC_TANGENT - : BC_TANGENT) - : BC_BLOCK_BORDER); - if(z == nz) - exptype = ((bz == blocks_z - 1) ? ((problem_type == PTYPE_0) ? BC_TANGENT - : (problem_type == PTYPE_1) ? BC_TANGENT - : BC_TANGENT) - : BC_BLOCK_BORDER); - } - - for(int k = 0; k < BC_TOTAL; k++) { - bool exp = (j == blkid) && (k == exptype); - bool act = p_facetypes[j][k].contains(pf); - if(exp != act) { - log_app.error() << "mismatch: face " << pf << " in p_facetypes[" << j - << "][" << k << "]: exp=" << exp << " act=" << act; - errors++; - } - } - } - pf[0]++; - } - } - for(unsigned idx = 0; idx < p_cells.size(); idx++) { - p_cells[idx].destroy(); - } - for(unsigned idx = 0; idx < p_faces.size(); idx++) { - p_faces[idx].destroy(); - } - for(unsigned i = 0; i < p_facetypes.size(); i++) { - for(unsigned j = 0; j < p_facetypes[i].size(); j++) { - p_facetypes[i][j].destroy(); - } - } - for(unsigned idx = 0; idx < p_ghost.size(); idx++) { - p_ghost[idx].destroy(); - } - - return errors; - } -}; - -class CircuitTest : public TestInterface { -public: - // graph config parameters - int num_nodes = 100; - int num_edges = 10; - int num_pieces = 2; - int pct_wire_in_piece = 50; - - CircuitTest(int argc, const char *argv[]) - { - for(int i = 1; i < argc; i++) { - if(!strcmp(argv[i], "-n")) { - num_nodes = atoi(argv[++i]); - continue; - } - - if(!strcmp(argv[i], "-e")) { - num_edges = atoi(argv[++i]); - continue; - } - - if(!strcmp(argv[i], "-p")) { - num_pieces = atoi(argv[++i]); - continue; - } - } - } - - struct InitDataArgs { - int index; - RegionInstance ri_nodes, ri_edges; - }; - - enum PRNGStreams - { - NODE_SUBCKT_STREAM, - EDGE_IN_NODE_STREAM, - EDGE_OUT_NODE_STREAM1, - EDGE_OUT_NODE_STREAM2, - }; - - // nodes and 
edges are generated pseudo-randomly so that we can check the results - // without - // needing all the field data in any one place - void random_node_data(int idx, int &subckt) - { - if(random_colors) - subckt = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBCKT_STREAM, num_pieces); - else - subckt = idx * num_pieces / num_nodes; - } - - void random_edge_data(int idx, Point<1> &in_node, Point<1> &out_node) - { - if(random_colors) { - in_node = Philox_2x32<>::rand_int(random_seed, idx, EDGE_IN_NODE_STREAM, num_nodes); - out_node = - Philox_2x32<>::rand_int(random_seed, idx, EDGE_OUT_NODE_STREAM1, num_nodes); - } else { - int subckt = idx * num_pieces / num_edges; - int n_lo = subckt * num_nodes / num_pieces; - int n_hi = (subckt + 1) * num_nodes / num_pieces; - in_node = n_lo + Philox_2x32<>::rand_int(random_seed, idx, EDGE_IN_NODE_STREAM, - n_hi - n_lo); - int pct = Philox_2x32<>::rand_int(random_seed, idx, EDGE_OUT_NODE_STREAM2, 100); - if(pct < pct_wire_in_piece) - out_node = n_lo + Philox_2x32<>::rand_int(random_seed, idx, EDGE_OUT_NODE_STREAM1, - n_hi - n_lo); - else - out_node = - Philox_2x32<>::rand_int(random_seed, idx, EDGE_OUT_NODE_STREAM1, num_nodes); - } - } - - static void init_data_task_wrapper(const void *args, size_t arglen, - const void *userdata, size_t userlen, Processor p) - { - CircuitTest *me = (CircuitTest *)testcfg; - me->init_data_task(args, arglen, p); - } - - void init_data_task(const void *args, size_t arglen, Processor p) - { - const InitDataArgs &i_args = *(const InitDataArgs *)args; - - log_app.info() << "init task #" << i_args.index << " (ri_nodes=" << i_args.ri_nodes - << ", ri_edges=" << i_args.ri_edges << ")"; - - i_args.ri_nodes.fetch_metadata(p).wait(); - i_args.ri_edges.fetch_metadata(p).wait(); - - IndexSpace<1> is_nodes = i_args.ri_nodes.get_indexspace<1>(); - IndexSpace<1> is_edges = i_args.ri_edges.get_indexspace<1>(); - - log_app.debug() << "N: " << is_nodes; - log_app.debug() << "E: " << is_edges; - - { - AffineAccessor 
a_subckt_id(i_args.ri_nodes, 0 /* offset */); - - for(int i = is_nodes.bounds.lo; i <= is_nodes.bounds.hi; i++) { - int subckt; - random_node_data(i, subckt); - a_subckt_id.write(i, subckt); - } - } - - { - AffineAccessor, 1> a_in_node(i_args.ri_edges, - 0 * sizeof(Point<1>) /* offset */); - AffineAccessor, 1> a_out_node(i_args.ri_edges, - 1 * sizeof(Point<1>) /* offset */); - - for(int i = is_edges.bounds.lo; i <= is_edges.bounds.hi; i++) { - Point<1> in_node, out_node; - random_edge_data(i, in_node, out_node); - a_in_node.write(i, in_node); - a_out_node.write(i, out_node); - } - } - - if(show_graph) { - AffineAccessor a_subckt_id(i_args.ri_nodes, 0 /* offset */); - - for(int i = is_nodes.bounds.lo; i <= is_nodes.bounds.hi; i++) - std::cout << "subckt_id[" << i << "] = " << a_subckt_id.read(i) << "\n"; - - AffineAccessor, 1> a_in_node(i_args.ri_edges, - 0 * sizeof(Point<1>) /* offset */); - - for(int i = is_edges.bounds.lo; i <= is_edges.bounds.hi; i++) - std::cout << "in_node[" << i << "] = " << a_in_node.read(i) << "\n"; - - AffineAccessor, 1> a_out_node(i_args.ri_edges, - 1 * sizeof(Point<1>) /* offset */); - - for(int i = is_edges.bounds.lo; i <= is_edges.bounds.hi; i++) - std::cout << "out_node[" << i << "] = " << a_out_node.read(i) << "\n"; - } - } - - IndexSpace<1> is_nodes, is_edges; - std::vector ri_nodes; - std::vector, int>> subckt_field_data; - std::vector ri_edges; - std::vector, Point<1>>> in_node_field_data; - std::vector, Point<1>>> out_node_field_data; - - virtual void print_info(void) - { - printf("Realm dependent partitioning test - circuit: %d nodes, %d edges, %d pieces\n", - (int)num_nodes, (int)num_edges, (int)num_pieces); - } - - virtual Event initialize_data(const std::vector &memories, - const std::vector &procs) - { - // now create index spaces for nodes and edges - is_nodes = Rect<1>(0, num_nodes - 1); - is_edges = Rect<1>(0, num_edges - 1); - - // equal partition is used to do initial population of edges and nodes - std::vector> 
ss_nodes_eq; - std::vector> ss_edges_eq; - - is_nodes - .create_equal_subspaces(num_pieces, 1, ss_nodes_eq, Realm::ProfilingRequestSet()) - .wait(); - is_edges - .create_equal_subspaces(num_pieces, 1, ss_edges_eq, Realm::ProfilingRequestSet()) - .wait(); - - log_app.debug() << "Initial partitions:"; - for(size_t i = 0; i < ss_nodes_eq.size(); i++) - log_app.debug() << " Nodes #" << i << ": " << ss_nodes_eq[i]; - for(size_t i = 0; i < ss_edges_eq.size(); i++) - log_app.debug() << " Edges #" << i << ": " << ss_edges_eq[i]; - - // create instances for each of these subspaces - std::vector node_fields, edge_fields; - node_fields.push_back(sizeof(int)); // subckt_id - assert(sizeof(int) == sizeof(Point<1>)); - edge_fields.push_back(sizeof(Point<1>)); // in_node - edge_fields.push_back(sizeof(Point<1>)); // out_node - - ri_nodes.resize(num_pieces); - subckt_field_data.resize(num_pieces); - - for(size_t i = 0; i < ss_nodes_eq.size(); i++) { - RegionInstance ri; - RegionInstance::create_instance(ri, memories[i % memories.size()], ss_nodes_eq[i], - node_fields, 0 /*SOA*/, - Realm::ProfilingRequestSet()) - .wait(); - ri_nodes[i] = ri; - - subckt_field_data[i].index_space = ss_nodes_eq[i]; - subckt_field_data[i].inst = ri_nodes[i]; - subckt_field_data[i].field_offset = 0; - } - - ri_edges.resize(num_pieces); - in_node_field_data.resize(num_pieces); - out_node_field_data.resize(num_pieces); - - for(size_t i = 0; i < ss_edges_eq.size(); i++) { - RegionInstance ri; - RegionInstance::create_instance(ri, memories[i % memories.size()], ss_edges_eq[i], - edge_fields, 0 /*SOA*/, - Realm::ProfilingRequestSet()) - .wait(); - ri_edges[i] = ri; - - in_node_field_data[i].index_space = ss_edges_eq[i]; - in_node_field_data[i].inst = ri_edges[i]; - in_node_field_data[i].field_offset = 0 * sizeof(Point<1>); - - out_node_field_data[i].index_space = ss_edges_eq[i]; - out_node_field_data[i].inst = ri_edges[i]; - out_node_field_data[i].field_offset = 1 * sizeof(Point<1>); - } - - // fire off 
tasks to initialize data - std::set events; - for(int i = 0; i < num_pieces; i++) { - Processor p = procs[i % memories.size()]; - InitDataArgs args; - args.index = i; - args.ri_nodes = ri_nodes[i]; - args.ri_edges = ri_edges[i]; - Event e = p.spawn(INIT_CIRCUIT_DATA_TASK, &args, sizeof(args)); - events.insert(e); - } - - return Event::merge_events(events); - } - - // the outputs of our partitioning will be: - // is_private, is_shared - subsets of is_nodes based on private/shared - // p_pvt, p_shr, p_ghost - subsets of the above split by subckt - // p_edges - subsets of is_edges for each subckt - - IndexSpace<1> is_shared, is_private; - std::vector> p_pvt, p_shr, p_ghost; - std::vector> p_edges; - - virtual Event perform_partitioning(void) - { - // first partition nodes by subckt id (this is the independent partition, - // but not actually used by the app) - std::vector> p_nodes; - - std::vector colors(num_pieces); - for(int i = 0; i < num_pieces; i++) - colors[i] = i; - - Event e1 = is_nodes.create_subspaces_by_field(subckt_field_data, colors, p_nodes, - Realm::ProfilingRequestSet()); - if(wait_on_events) - e1.wait(); - - // now compute p_edges based on the color of their in_node (i.e. 
a preimage) - Event e2 = is_edges.create_subspaces_by_preimage(in_node_field_data, p_nodes, p_edges, - Realm::ProfilingRequestSet(), e1); - if(wait_on_events) - e2.wait(); - - // an image of p_edges through out_node gives us all the shared nodes, along - // with some private nodes -#ifdef USE_IMAGE_DIFF - Event e4 = is_nodes.create_subspaces_by_image_with_difference( - out_node_field_data, p_edges, p_nodes, p_ghost, Realm::ProfilingRequestSet(), e2); - if(wait_on_events) - e4.wait(); -#else - std::vector> p_extra_nodes; - - Event e3 = is_nodes.create_subspaces_by_image( - out_node_field_data, p_edges, p_extra_nodes, Realm::ProfilingRequestSet(), e2); - if(wait_on_events) - e3.wait(); - - // subtracting out those private nodes gives us p_ghost - Event e4 = IndexSpace<1>::compute_differences(p_extra_nodes, p_nodes, p_ghost, - Realm::ProfilingRequestSet(), e3); - if(wait_on_events) - e4.wait(); -#endif - - // the union of everybody's ghost nodes is is_shared - Event e5 = IndexSpace<1>::compute_union(p_ghost, is_shared, - Realm::ProfilingRequestSet(), e4); - if(wait_on_events) - e5.wait(); - - // and is_private is just the nodes of is_nodes that aren't in is_shared - Event e6 = IndexSpace<1>::compute_difference(is_nodes, is_shared, is_private, - Realm::ProfilingRequestSet(), e5); - if(wait_on_events) - e6.wait(); - - // the intersection of the original p_nodes with is_shared gives us p_shr - // (note that we can do this in parallel with the computation of is_private) - Event e7 = IndexSpace<1>::compute_intersections(p_nodes, is_shared, p_shr, - Realm::ProfilingRequestSet(), e5); - if(wait_on_events) - e7.wait(); - - // and finally, the intersection of p_nodes with is_private gives us p_pvt - Event e8 = IndexSpace<1>::compute_intersections(p_nodes, is_private, p_pvt, - Realm::ProfilingRequestSet(), e6); - if(wait_on_events) - e8.wait(); - - // all done - wait on e7 and e8, which dominate every other operation - Event e9 = Event::merge_events(e7, e8); - - for(unsigned 
idx = 0; idx < p_nodes.size(); idx++) { - p_nodes[idx].destroy(e9); - } - - return e9; - } - - virtual int perform_dynamic_checks(void) - { - int errors = 0; - // compute the intermediates for the checks - these duplicate things we - // already have, but we're not supposed to know that here - std::vector> p_pvt_and_shr, p_all; - Event e1 = IndexSpace<1>::compute_unions( - p_pvt, p_shr, p_pvt_and_shr, Realm::ProfilingRequestSet(), Event::NO_EVENT); - Event e2 = IndexSpace<1>::compute_unions(p_pvt_and_shr, p_ghost, p_all, - Realm::ProfilingRequestSet(), e1); -#ifdef USE_IMAGE_DIFF - std::vector> p_in_test, p_out_test; - Event e5 = is_nodes.create_subspaces_by_image_with_difference( - in_node_field_data, p_edges, p_pvt_and_shr, p_in_test, - Realm::ProfilingRequestSet(), e1); - Event e6 = is_nodes.create_subspaces_by_image_with_difference( - out_node_field_data, p_edges, p_all, p_out_test, Realm::ProfilingRequestSet(), - e2); -#else - std::vector> p_in_img, p_out_img; - Event e3 = - is_nodes.create_subspaces_by_image(in_node_field_data, p_edges, p_in_img, - Realm::ProfilingRequestSet(), Event::NO_EVENT); - Event e4 = - is_nodes.create_subspaces_by_image(out_node_field_data, p_edges, p_out_img, - Realm::ProfilingRequestSet(), Event::NO_EVENT); - std::vector> p_in_test, p_out_test; - Event e5 = IndexSpace<1>::compute_differences(p_in_img, p_pvt_and_shr, p_in_test, - Realm::ProfilingRequestSet(), - Event::merge_events(e1, e3)); - Event e6 = IndexSpace<1>::compute_differences(p_out_img, p_all, p_out_test, - Realm::ProfilingRequestSet(), - Event::merge_events(e2, e4)); - for(unsigned idx = 0; idx < p_in_img.size(); idx++) { - p_in_img[idx].destroy(e5); - } - for(unsigned idx = 0; idx < p_out_img.size(); idx++) { - p_out_img[idx].destroy(e6); - } -#endif - - errors += check_empty(e5, p_in_test, "p_in_test"); - errors += check_empty(e6, p_out_test, "p_out_test"); - for(unsigned idx = 0; idx < p_pvt_and_shr.size(); idx++) { - p_pvt_and_shr[idx].destroy(e5); - } - for(unsigned 
idx = 0; idx < p_all.size(); idx++) { - p_all[idx].destroy(e6); - } - for(unsigned idx = 0; idx < p_in_test.size(); idx++) { - p_in_test[idx].destroy(e5); - } - for(unsigned idx = 0; idx < p_out_test.size(); idx++) { - p_out_test[idx].destroy(e6); - } - - return errors; - } - - virtual int check_partitioning(void) - { - int errors = 0; - - // we'll make up the list of nodes we expect to be shared as we walk the edges - std::map> ghost_nodes; - -#ifdef DUMP_OUTPUT_SPACES - dump_sparse_index_space<1, int>("is_private", is_private); - dump_sparse_index_space<1, int>("is_shared", is_shared); - - for(int p = 0; p < num_pieces; p++) { - std::cout << "Piece #" << p << "\n"; - dump_sparse_index_space<1, int>("p_pvt", p_pvt[p]); - dump_sparse_index_space<1, int>("p_shr", p_shr[p]); - dump_sparse_index_space<1, int>("p_ghost", p_ghost[p]); - } -#endif - - for(int i = 0; i < num_edges; i++) { - // regenerate the random info for this edge and the two nodes it touches - Point<1> in_node, out_node; - int in_subckt, out_subckt; - random_edge_data(i, in_node, out_node); - random_node_data(in_node, in_subckt); - random_node_data(out_node, out_subckt); - - // the edge should be in exactly the p_edges for in_subckt - for(int p = 0; p < num_pieces; p++) { - bool exp = (p == in_subckt); - bool act = p_edges[p].contains(i); - if(exp != act) { - log_app.error() << "mismatch: edge " << i << " in p_edges[" << p - << "]: exp=" << exp << " act=" << act; - errors++; - } - } - - // is the output node a ghost for this wire? 
- if(in_subckt != out_subckt) - ghost_nodes[out_node].insert(in_subckt); - } - - // now we can check the nodes - for(int i = 0; i < num_nodes; i++) { - int subckt; - random_node_data(i, subckt); - // check is_private and is_shared first - { - bool exp = ghost_nodes.count(i) == 0; - bool act = is_private.contains(i); - if(exp != act) { - log_app.error() << "mismatch: node " << i << " in is_private: exp=" << exp - << " act=" << act; - errors++; - } - } - { - bool exp = ghost_nodes.count(i) > 0; - bool act = is_shared.contains(i); - if(exp != act) { - log_app.error() << "mismatch: node " << i << " in is_shared: exp=" << exp - << " act=" << act; - errors++; - } - } - - // now check p_pvt/shr/ghost - for(int p = 0; p < num_pieces; p++) { - bool exp = (subckt == p) && (ghost_nodes.count(i) == 0); - bool act = p_pvt[p].contains(i); - if(exp != act) { - log_app.error() << "mismatch: node " << i << " in p_pvt[" << p - << "]: exp=" << exp << " act=" << act; - errors++; - } - } - for(int p = 0; p < num_pieces; p++) { - bool exp = (subckt == p) && (ghost_nodes.count(i) > 0); - bool act = p_shr[p].contains(i); - if(exp != act) { - log_app.error() << "mismatch: node " << i << " in p_shr[" << p - << "]: exp=" << exp << " act=" << act; - errors++; - } - } - for(int p = 0; p < num_pieces; p++) { - bool exp = - (subckt != p) && (ghost_nodes.count(i) > 0) && (ghost_nodes[i].count(p) > 0); - bool act = p_ghost[p].contains(i); - if(exp != act) { - log_app.error() << "mismatch: node " << i << " in p_ghost[" << p - << "]: exp=" << exp << " act=" << act; - errors++; - } - } - } - - is_shared.destroy(); - is_private.destroy(); - for(unsigned idx = 0; idx < p_pvt.size(); idx++) { - p_pvt[idx].destroy(); - } - for(unsigned idx = 0; idx < p_shr.size(); idx++) { - p_shr[idx].destroy(); - } - for(unsigned idx = 0; idx < p_ghost.size(); idx++) { - p_ghost[idx].destroy(); - } - for(unsigned idx = 0; idx < p_edges.size(); idx++) { - p_edges[idx].destroy(); - } - - return errors; - } -}; - -class 
PennantTest : public TestInterface { -public: -public: - // graph config parameters - enum MeshType - { - RectangularMesh, - }; - MeshType mesh_type = RectangularMesh; - int nzx = 10; // number of zones in x - int nzy = 10; // number of zones in y - int numpcx = 2; // number of submeshes in x - int numpcy = 2; // number of submeshes in y - - int npx, npy; // number of points in each dimension - int nz, ns, np, numpc; // total number of zones, sides, points, and pieces - std::vector zxbound, zybound; // x and y split points between submeshes - std::vector lz, ls, lp; // number of zones, sides, and points in each submesh - - // can't do 64-bit index types right now, so at least get most of our 32-bit space - typedef int INDEXTYPE; - static const INDEXTYPE FIRST_INDEX = -2000000000; // easier to read than INT_MIN+1 - - PennantTest(int argc, const char *argv[]) - { -#define INT_ARG(s, v) \ - if(!strcmp(argv[i], s)) { \ - v = atoi(argv[++i]); \ - continue; \ - } - for(int i = 1; i < argc; i++) { - INT_ARG("-nzx", nzx) - INT_ARG("-nzy", nzy) - INT_ARG("-numpcx", numpcx) - INT_ARG("-numpcy", numpcy) - if(!strcmp(argv[i], "-nz")) { - int v = atoi(argv[++i]); - nzx = nzy = v; - continue; - } - if(!strcmp(argv[i], "-numpc")) { - int v = atoi(argv[++i]); - numpcx = numpcy = v; - continue; - } - } -#undef INT_ARG - - switch(mesh_type) { - case RectangularMesh: - { - npx = nzx + 1; - npy = nzy + 1; - numpc = numpcx * numpcy; - - zxbound.resize(numpcx + 1); - for(int i = 0; i <= numpcx; i++) - zxbound[i] = (i * nzx) / numpcx; - - zybound.resize(numpcy + 1); - for(int i = 0; i <= numpcy; i++) - zybound[i] = (i * nzy) / numpcy; - - nz = ns = np = 0; - for(int pcy = 0; pcy < numpcy; pcy++) { - for(int pcx = 0; pcx < numpcx; pcx++) { - int lx = zxbound[pcx + 1] - zxbound[pcx]; - int ly = zybound[pcy + 1] - zybound[pcy]; - - int zones = lx * ly; - int sides = zones * 4; - // points are a little funny - shared edges go to the lower numbered piece - int points = ((pcx == 0) ? 
(lx + 1) : lx) * ((pcy == 0) ? (ly + 1) : ly); - - lz.push_back(zones); - ls.push_back(sides); - lp.push_back(points); - nz += zones; - ns += sides; - np += points; - } - } - - assert(nz == (nzx * nzy)); - assert(ns == (4 * nzx * nzy)); - assert(np == (npx * npy)); - - break; - } - } - } - - virtual void print_info(void) - { - printf("Realm dependent partitioning test - pennant: %d x %d zones, %d x %d pieces\n", - (int)nzx, (int)nzy, (int)numpcx, (int)numpcy); - } - - IndexSpace<1> is_zones, is_sides, is_points; - std::vector ri_zones; - std::vector, int>> zone_color_field_data; - std::vector ri_sides; - std::vector, Point<1>>> side_mapsz_field_data; - std::vector, Point<1>>> side_mapss3_field_data; - std::vector, Point<1>>> side_mapsp1_field_data; - std::vector, bool>> side_ok_field_data; - - struct InitDataArgs { - int index; - RegionInstance ri_zones, ri_sides; - }; - - virtual Event initialize_data(const std::vector &memories, - const std::vector &procs) - { - // top level index spaces - is_zones = Rect<1>(FIRST_INDEX, FIRST_INDEX + nz - 1); - is_sides = Rect<1>(FIRST_INDEX, FIRST_INDEX + ns - 1); - is_points = Rect<1>(FIRST_INDEX, FIRST_INDEX + np - 1); - - // weighted partitions based on the distribution we already computed - std::vector> ss_zones_w; - std::vector> ss_sides_w; - std::vector> ss_points_w; - - is_zones - .create_weighted_subspaces(numpc, 1, lz, ss_zones_w, Realm::ProfilingRequestSet()) - .wait(); - is_sides - .create_weighted_subspaces(numpc, 1, ls, ss_sides_w, Realm::ProfilingRequestSet()) - .wait(); - is_points - .create_weighted_subspaces(numpc, 1, lp, ss_points_w, - Realm::ProfilingRequestSet()) - .wait(); - - log_app.debug() << "Initial partitions:"; - for(size_t i = 0; i < ss_zones_w.size(); i++) - log_app.debug() << " Zones #" << i << ": " << ss_zones_w[i]; - for(size_t i = 0; i < ss_sides_w.size(); i++) - log_app.debug() << " Sides #" << i << ": " << ss_sides_w[i]; - for(size_t i = 0; i < ss_points_w.size(); i++) - log_app.debug() << " 
Points #" << i << ": " << ss_points_w[i]; - - // create instances for each of these subspaces - std::vector zone_fields, side_fields; - zone_fields.push_back(sizeof(int)); // color - assert(sizeof(int) == sizeof(Point<1>)); - side_fields.push_back(sizeof(Point<1>)); // mapsz - side_fields.push_back(sizeof(Point<1>)); // mapss3 - side_fields.push_back(sizeof(Point<1>)); // mapsp1 - side_fields.push_back(sizeof(bool)); // ok - - ri_zones.resize(numpc); - zone_color_field_data.resize(numpc); - - for(size_t i = 0; i < ss_zones_w.size(); i++) { - RegionInstance ri; - RegionInstance::create_instance(ri, memories[i % memories.size()], ss_zones_w[i], - zone_fields, 0 /*SOA*/, - Realm::ProfilingRequestSet()) - .wait(); - ri_zones[i] = ri; - - zone_color_field_data[i].index_space = ss_zones_w[i]; - zone_color_field_data[i].inst = ri_zones[i]; - zone_color_field_data[i].field_offset = 0; - } - - ri_sides.resize(numpc); - side_mapsz_field_data.resize(numpc); - side_mapss3_field_data.resize(numpc); - side_mapsp1_field_data.resize(numpc); - side_ok_field_data.resize(numpc); - - for(size_t i = 0; i < ss_sides_w.size(); i++) { - RegionInstance ri; - RegionInstance::create_instance(ri, memories[i % memories.size()], ss_sides_w[i], - side_fields, 0 /*SOA*/, - Realm::ProfilingRequestSet()) - .wait(); - ri_sides[i] = ri; - - side_mapsz_field_data[i].index_space = ss_sides_w[i]; - side_mapsz_field_data[i].inst = ri_sides[i]; - side_mapsz_field_data[i].field_offset = 0 * sizeof(Point<1>); - - side_mapss3_field_data[i].index_space = ss_sides_w[i]; - side_mapss3_field_data[i].inst = ri_sides[i]; - side_mapss3_field_data[i].field_offset = 1 * sizeof(Point<1>); - - side_mapsp1_field_data[i].index_space = ss_sides_w[i]; - side_mapsp1_field_data[i].inst = ri_sides[i]; - side_mapsp1_field_data[i].field_offset = 2 * sizeof(Point<1>); - - side_ok_field_data[i].index_space = ss_sides_w[i]; - side_ok_field_data[i].inst = ri_sides[i]; - side_ok_field_data[i].field_offset = 3 * sizeof(Point<1>); - } 
- - // fire off tasks to initialize data - std::set events; - for(int i = 0; i < numpc; i++) { - Processor p = procs[i % memories.size()]; - InitDataArgs args; - args.index = i; - args.ri_zones = ri_zones[i]; - args.ri_sides = ri_sides[i]; - Event e = p.spawn(INIT_PENNANT_DATA_TASK, &args, sizeof(args)); - events.insert(e); - } - - return Event::merge_events(events); - } - - static void init_data_task_wrapper(const void *args, size_t arglen, - const void *userdata, size_t userlen, Processor p) - { - PennantTest *me = (PennantTest *)testcfg; - me->init_data_task(args, arglen, p); - } - - Point<1> global_point_pointer(int py, int px) const - { - int pp = FIRST_INDEX; - - // start by steping over whole y slabs - again be careful that the extra slab belongs - // to pcy == 0 - int dy; - if(py > zybound[1]) { - int pcy = 1; - while(py > zybound[pcy + 1]) - pcy++; - int slabs = zybound[pcy] + 1; - pp += npx * slabs; - py -= slabs; - dy = zybound[pcy + 1] - zybound[pcy]; - } else { - dy = zybound[1] + 1; - } - - // now chunks in x, using just the y width of this row of chunks - int dx; - if(px > zxbound[1]) { - int pcx = 1; - while(px > zxbound[pcx + 1]) - pcx++; - int strips = zxbound[pcx] + 1; - pp += dy * strips; - px -= strips; - dx = zxbound[pcx + 1] - zxbound[pcx]; - } else { - dx = zxbound[1] + 1; - } - - // finally, px and py are now local and are handled easily - pp += py * dx + px; - - return pp; - } - - void init_data_task(const void *args, size_t arglen, Processor p) - { - const InitDataArgs &i_args = *(const InitDataArgs *)args; - - log_app.info() << "init task #" << i_args.index << " (ri_zones=" << i_args.ri_zones - << ", ri_sides=" << i_args.ri_sides << ")"; - - i_args.ri_zones.fetch_metadata(p).wait(); - i_args.ri_sides.fetch_metadata(p).wait(); - - IndexSpace<1> is_zones = i_args.ri_zones.get_indexspace<1>(); - IndexSpace<1> is_sides = i_args.ri_sides.get_indexspace<1>(); - - log_app.debug() << "Z: " << is_zones; - log_app.debug() << "S: " << is_sides; - - 
int pcx = i_args.index % numpcx; - int pcy = i_args.index / numpcx; - - int zxlo = zxbound[pcx]; - int zxhi = zxbound[pcx + 1]; - int zylo = zybound[pcy]; - int zyhi = zybound[pcy + 1]; - - { - AffineAccessor a_zone_color(i_args.ri_zones, 0 /* offset */); - AffineAccessor, 1> a_side_mapsz(i_args.ri_sides, - 0 * sizeof(Point<1>) /* offset */); - AffineAccessor, 1> a_side_mapss3(i_args.ri_sides, - 1 * sizeof(Point<1>) /* offset */); - AffineAccessor, 1> a_side_mapsp1(i_args.ri_sides, - 2 * sizeof(Point<1>) /* offset */); - AffineAccessor a_side_ok(i_args.ri_sides, - 3 * sizeof(Point<1>) /* offset */); - - Point<1> pz = is_zones.bounds.lo; - Point<1> ps = is_sides.bounds.lo; - - for(int zy = zylo; zy < zyhi; zy++) { - for(int zx = zxlo; zx < zxhi; zx++) { - // get 4 side pointers - Point<1> ps0 = ps; - ps[0]++; - Point<1> ps1 = ps; - ps[0]++; - Point<1> ps2 = ps; - ps[0]++; - Point<1> ps3 = ps; - ps[0]++; - - // point pointers are ugly because they can be in neighbors - use a helper - Point<1> pp0 = global_point_pointer(zy, zx); // go CCW - Point<1> pp1 = global_point_pointer(zy + 1, zx); - Point<1> pp2 = global_point_pointer(zy + 1, zx + 1); - Point<1> pp3 = global_point_pointer(zy, zx + 1); - - a_zone_color.write(pz, i_args.index); - - a_side_mapsz.write(ps0, pz); - a_side_mapsz.write(ps1, pz); - a_side_mapsz.write(ps2, pz); - a_side_mapsz.write(ps3, pz); - - a_side_mapss3.write(ps0, ps1); - a_side_mapss3.write(ps1, ps2); - a_side_mapss3.write(ps2, ps3); - a_side_mapss3.write(ps3, ps0); - - a_side_mapsp1.write(ps0, pp0); - a_side_mapsp1.write(ps1, pp1); - a_side_mapsp1.write(ps2, pp2); - a_side_mapsp1.write(ps3, pp3); - - a_side_ok.write(ps0, true); - a_side_ok.write(ps1, true); - a_side_ok.write(ps2, true); - a_side_ok.write(ps3, true); - - pz[0]++; - } - } - assert(pz[0] == is_zones.bounds.hi[0] + 1); - assert(ps[0] == is_sides.bounds.hi[0] + 1); - } - - if(show_graph) { - AffineAccessor a_zone_color(i_args.ri_zones, 0 /* offset */); - - for(int i = 
is_zones.bounds.lo; i <= is_zones.bounds.hi; i++) - std::cout << "Z[" << i << "]: color=" << a_zone_color.read(i) << "\n"; - - AffineAccessor, 1> a_side_mapsz(i_args.ri_sides, - 0 * sizeof(Point<1>) /* offset */); - AffineAccessor, 1> a_side_mapss3(i_args.ri_sides, - 1 * sizeof(Point<1>) /* offset */); - AffineAccessor, 1> a_side_mapsp1(i_args.ri_sides, - 2 * sizeof(Point<1>) /* offset */); - AffineAccessor a_side_ok(i_args.ri_sides, - 3 * sizeof(Point<1>) /* offset */); - - for(int i = is_sides.bounds.lo; i <= is_sides.bounds.hi; i++) - std::cout << "S[" << i << "]:" - << " mapsz=" << a_side_mapsz.read(i) - << " mapss3=" << a_side_mapss3.read(i) - << " mapsp1=" << a_side_mapsp1.read(i) << " ok=" << a_side_ok.read(i) - << "\n"; - } - } - - // the outputs of our partitioning will be: - // p_zones - subsets of is_zones split by piece - // p_sides - subsets of is_sides split by piece (with bad sides removed) - // p_points - subsets of is_points by piece (aliased) - - std::vector> p_zones; - std::vector> p_sides; - std::vector> p_points; - - virtual Event perform_partitioning(void) - { - // first get the set of bad sides (i.e. 
ok == false) - IndexSpace<1> bad_sides; - - Event e1 = is_sides.create_subspace_by_field(side_ok_field_data, false, bad_sides, - Realm::ProfilingRequestSet()); - if(wait_on_events) - e1.wait(); - - // map the bad sides through to bad zones - IndexSpace<1> bad_zones; - Event e2 = is_zones.create_subspace_by_image( - side_mapsz_field_data, bad_sides, bad_zones, Realm::ProfilingRequestSet(), e1); - if(wait_on_events) - e2.wait(); - bad_sides.destroy(e2); - - // subtract bad zones to get good zones - IndexSpace<1> good_zones; - Event e3 = IndexSpace<1>::compute_difference(is_zones, bad_zones, good_zones, - Realm::ProfilingRequestSet(), e2); - if(wait_on_events) - e3.wait(); - bad_zones.destroy(e3); - - // now do actual partitions with just good zones - std::vector colors(numpc); - for(int i = 0; i < numpc; i++) - colors[i] = i; - - Event e4 = good_zones.create_subspaces_by_field( - zone_color_field_data, colors, p_zones, Realm::ProfilingRequestSet(), e3); - if(wait_on_events) - e4.wait(); - good_zones.destroy(e4); - - // preimage of zones is sides - Event e5 = is_sides.create_subspaces_by_preimage( - side_mapsz_field_data, p_zones, p_sides, Realm::ProfilingRequestSet(), e4); - if(wait_on_events) - e5.wait(); - - // and image of sides->mapsp1 is points - Event e6 = is_points.create_subspaces_by_image( - side_mapsp1_field_data, p_sides, p_points, Realm::ProfilingRequestSet(), e5); - if(wait_on_events) - e6.wait(); - - return e6; - } - - virtual int perform_dynamic_checks(void) - { - int errors = 0; - - // pennant's checks are actually slower with the fused image/diff -#ifdef PENNANT_USE_IMAGE_DIFF - std::vector> p_z_test, p_p_test, p_s_test; - Event e4 = is_zones.create_subspaces_by_image_with_difference( - side_mapsz_field_data, p_sides, p_zones, p_z_test, Realm::ProfilingRequestSet()); - Event e5 = is_points.create_subspaces_by_image_with_difference( - side_mapsp1_field_data, p_sides, p_points, p_p_test, - Realm::ProfilingRequestSet()); - Event e6 = 
is_sides.create_subspaces_by_image_with_difference( - side_mapss3_field_data, p_sides, p_sides, p_s_test, Realm::ProfilingRequestSet()); -#else - std::vector> p_img_mapsz, p_img_mapsp1, p_img_mapss3; - Event e1 = is_zones.create_subspaces_by_image( - side_mapsz_field_data, p_sides, p_img_mapsz, Realm::ProfilingRequestSet()); - Event e2 = is_points.create_subspaces_by_image( - side_mapsp1_field_data, p_sides, p_img_mapsp1, Realm::ProfilingRequestSet()); - Event e3 = is_sides.create_subspaces_by_image( - side_mapss3_field_data, p_sides, p_img_mapss3, Realm::ProfilingRequestSet()); - std::vector> p_z_test, p_p_test, p_s_test; - Event e4 = IndexSpace<1>::compute_differences(p_img_mapsz, p_zones, p_z_test, - Realm::ProfilingRequestSet(), e1); - for(unsigned idx = 0; idx < p_img_mapsz.size(); idx++) { - p_img_mapsz[idx].destroy(e4); - } - Event e5 = IndexSpace<1>::compute_differences(p_img_mapsp1, p_points, p_p_test, - Realm::ProfilingRequestSet(), e2); - for(unsigned idx = 0; idx < p_img_mapsp1.size(); idx++) { - p_img_mapsp1[idx].destroy(e5); - } - Event e6 = IndexSpace<1>::compute_differences(p_img_mapss3, p_sides, p_s_test, - Realm::ProfilingRequestSet(), e3); - for(unsigned idx = 0; idx < p_img_mapss3.size(); idx++) { - p_img_mapss3[idx].destroy(e6); - } -#endif - errors += check_empty(e4, p_z_test, "p_z_test"); - errors += check_empty(e5, p_p_test, "p_p_test"); - errors += check_empty(e6, p_s_test, "p_s_test"); - for(unsigned idx = 0; idx < p_z_test.size(); idx++) { - p_z_test[idx].destroy(e4); - } - for(unsigned idx = 0; idx < p_p_test.size(); idx++) { - p_p_test[idx].destroy(e5); - } - for(unsigned idx = 0; idx < p_s_test.size(); idx++) { - p_s_test[idx].destroy(e6); - } - - return errors; - } - - virtual int check_partitioning(void) - { - int errors = 0; - - for(int pcy = 0; pcy < numpcy; pcy++) { - for(int pcx = 0; pcx < numpcx; pcx++) { - int idx = pcy * numpcx + pcx; - - int lx = zxbound[pcx + 1] - zxbound[pcx]; - int ly = zybound[pcy + 1] - zybound[pcy]; - - 
int exp_zones = lx * ly; - int exp_sides = exp_zones * 4; - int exp_points = (lx + 1) * (ly + 1); // easier because of aliasing - - int act_zones = p_zones[idx].volume(); - int act_sides = p_sides[idx].volume(); - int act_points = p_points[idx].volume(); - - if(exp_zones != act_zones) { - log_app.error() << "Piece #" << idx - << ": zone count mismatch: exp = " << exp_zones - << ", act = " << act_zones; - errors++; - } - if(exp_sides != act_sides) { - log_app.error() << "Piece #" << idx - << ": side count mismatch: exp = " << exp_sides - << ", act = " << act_sides; - errors++; - } - if(exp_points != act_points) { - log_app.error() << "Piece #" << idx - << ": point count mismatch: exp = " << exp_points - << ", act = " << act_points; - errors++; - } - } - } - - // check zones - Point<1> pz = is_zones.bounds.lo; - for(int pc = 0; pc < numpc; pc++) { - for(int i = 0; i < lz[pc]; i++) { - for(int j = 0; j < numpc; j++) { - bool exp = (j == pc); - bool act = p_zones[j].contains(pz); - if(exp != act) { - log_app.error() << "mismatch: zone " << pz << " in p_zones[" << j - << "]: exp=" << exp << " act=" << act; - errors++; - } - } - pz[0]++; - } - } - - // check sides - Point<1> ps = is_sides.bounds.lo; - for(int pc = 0; pc < numpc; pc++) { - for(int i = 0; i < ls[pc]; i++) { - for(int j = 0; j < numpc; j++) { - bool exp = (j == pc); - bool act = p_sides[j].contains(ps); - if(exp != act) { - log_app.error() << "mismatch: side " << ps << " in p_sides[" << j - << "]: exp=" << exp << " act=" << act; - errors++; - } - } - ps[0]++; - } - } - - // check points (trickier due to ghosting) - for(int py = 0; py < npy; py++) - for(int px = 0; px < npx; px++) { - Point<1> pp = global_point_pointer(py, px); - for(int pc = 0; pc < numpc; pc++) { - int pcy = pc / numpcx; - int pcx = pc % numpcx; - bool exp = ((py >= zybound[pcy]) && (py <= zybound[pcy + 1]) && - (px >= zxbound[pcx]) && (px <= zxbound[pcx + 1])); - bool act = p_points[pc].contains(pp); - if(exp != act) { - log_app.error() 
<< "mismatch: point " << pp << " in p_points[" << pc - << "]: exp=" << exp << " act=" << act; - errors++; - } - } - } - - for(unsigned idx = 0; idx < p_zones.size(); idx++) { - p_zones[idx].destroy(); - } - for(unsigned idx = 0; idx < p_sides.size(); idx++) { - p_sides[idx].destroy(); - } - for(unsigned idx = 0; idx < p_points.size(); idx++) { - p_points[idx].destroy(); - } - - return errors; - } -}; - -template > -class RandStream { -public: - RandStream(unsigned _seed) - : seed(_seed) - , idx(0) - {} - - void setpos(unsigned long long _idx) { idx = _idx; } - void adjpos(long long _adj) { idx += _adj; } - - unsigned rand_int(unsigned n) - { - unsigned v = PRNG::rand_int(seed, idx >> 32, idx, n); - idx++; - return v; - } - - float rand_float(void) - { - float v = PRNG::rand_float(seed, idx >> 32, idx); - idx++; - return v; - } - - unsigned seed; - unsigned long long idx; -}; - -template -FT randval(RandStream<> &rs); - -template <> -float randval(RandStream<> &rs) -{ - return rs.rand_float(); -} - -template <> -int randval(RandStream<> &rs) -{ - return rs.rand_int(INT_MAX); -} - -template -class RandomTest : public TestInterface { -public: - RandomTest(int argc, const char *argv[]); - virtual ~RandomTest(void); - - virtual void print_info(void); - - virtual Event initialize_data(const std::vector &memories, - const std::vector &procs); - - virtual Event perform_partitioning(void); - - virtual int perform_dynamic_checks(void); - - virtual int check_partitioning(void); - - void fill_instance_data(IndexSpace ibounds, RegionInstance inst); - -protected: - T1 base1_min, base1_max, extent1_min, extent1_max; - T2 base2_min, base2_max, extent2_min, extent2_max; - int num_pieces, num_colors; - - Rect bounds1; - Rect bounds2; - IndexSpace root1; - IndexSpace root2; - std::vector colors; - std::vector ri_data1; - std::vector, FT>> fd_vals1; - std::vector, Point>> fd_ptrs1; -}; - -template -RandomTest::RandomTest(int argc, const char *argv[]) - : base1_min(0) - , base1_max(0) 
- , extent1_min(4) - , extent1_max(6) - , base2_min(0) - , base2_max(0) - , extent2_min(4) - , extent2_max(6) - , num_pieces(2) - , num_colors(4) -{ - RandStream<> rs(random_seed + 0); - - for(int i = 0; i < N1; i++) { - bounds1.lo[i] = base1_min + rs.rand_int(base1_max - base1_min + 1); - bounds1.hi[i] = - (bounds1.lo[i] + extent1_min + rs.rand_int(extent1_max - extent1_min + 1)); - } - for(int i = 0; i < N2; i++) { - bounds2.lo[i] = base2_min + rs.rand_int(base2_max - base2_min + 1); - bounds2.hi[i] = - (bounds2.lo[i] + extent2_min + rs.rand_int(extent2_max - extent2_min + 1)); - } - - colors.resize(num_colors); - for(int i = 0; i < num_colors; i++) - colors[i] = randval(rs); -} - -template -RandomTest::~RandomTest(void) -{} - -template -void RandomTest::print_info(void) -{ - printf("Realm dependent partitioning test - random\n"); -} - -template -void RandomTest::fill_instance_data(IndexSpace ibounds, - RegionInstance inst) -{ - { - // start with value field - AffineAccessor a_vals(inst, 0); - - // iterate over all points in root1 with initial random values - RandStream<> rs1(random_seed + 1); - for(PointInRectIterator pir(bounds1); pir.valid; pir.step()) { - FT v = colors[rs1.rand_int(colors.size())]; - if(ibounds.contains(pir.p)) - a_vals.write(pir.p, v); - } - - // print results - for(PointInRectIterator pir(bounds1); pir.valid; pir.step()) { - if(ibounds.contains(pir.p)) - log_app.debug() << "v[" << pir.p << "] = " << a_vals.read(pir.p); - } - } - - { - // now pointer field - AffineAccessor, N1, T1> a_ptrs(inst, 0 + sizeof(FT)); - - // iterate over all points in root1 with initial random values - RandStream<> rs2(random_seed + 2); - for(PointInRectIterator pir(bounds1); pir.valid; pir.step()) { - Point p2; - for(int i = 0; i < N2; i++) - p2[i] = bounds2.lo[i] + rs2.rand_int(bounds2.hi[i] - bounds2.lo[i] + 1); - if(ibounds.contains(pir.p)) - a_ptrs.write(pir.p, p2); - } - - // print results - for(PointInRectIterator pir(bounds1); pir.valid; pir.step()) { - 
if(ibounds.contains(pir.p)) - log_app.debug() << "p[" << pir.p << "] = " << a_ptrs.read(pir.p); - } - } -} - -template -Event RandomTest::initialize_data(const std::vector &memories, - const std::vector &procs) -{ - root1 = IndexSpace(bounds1); - root2 = IndexSpace(bounds2); - log_app.debug() << "root1 = " << root1; - log_app.debug() << "root2 = " << root2; - - // create instances to hold actual data - size_t num_insts = memories.size(); - log_app.debug() << "procs: " << procs; - log_app.debug() << "mems: " << memories; - std::vector> ss_inst1; - root1.create_equal_subspaces(num_insts, 1, ss_inst1, Realm::ProfilingRequestSet()) - .wait(); - - std::vector field_sizes; - field_sizes.push_back(sizeof(FT)); - field_sizes.push_back(sizeof(Point)); - - ri_data1.resize(num_insts); - fd_vals1.resize(num_insts); - fd_ptrs1.resize(num_insts); - - for(size_t i = 0; i < num_insts; i++) { - RegionInstance ri; - RegionInstance::create_instance(ri, memories[i], ss_inst1[i], field_sizes, 0 /*SOA*/, - Realm::ProfilingRequestSet()) - .wait(); - log_app.debug() << "inst[" << i << "] = " << ri << " (" << ss_inst1[i] << ")"; - ri_data1[i] = ri; - - fd_vals1[i].index_space = ss_inst1[i]; - fd_vals1[i].inst = ri; - fd_vals1[i].field_offset = 0; - - fd_ptrs1[i].index_space = ss_inst1[i]; - fd_ptrs1[i].inst = ri; - fd_ptrs1[i].field_offset = 0 + sizeof(FT); - } - - log_app.debug() << "colors = " << colors; - - for(size_t i = 0; i < num_insts; i++) { - fill_instance_data(root1 /*ss_inst1[i]*/, ri_data1[i]); - } - - return Event::NO_EVENT; -} - -template -Event RandomTest::perform_partitioning(void) -{ - // start by filtering root1 by color - std::vector piece_colors(colors.begin(), colors.begin() + num_pieces); - std::vector> ss_by_color; - Event e1 = root1.create_subspaces_by_field(fd_vals1, piece_colors, ss_by_color, - ProfilingRequestSet()); - e1.wait(); - - for(int i = 0; i < num_pieces; i++) { - log_app.debug() << "bycolor[" << i << "] (" << colors[i] << ") = " << ss_by_color[i]; - 
dump_sparse_index_space("", ss_by_color[i]); - } - - // images - std::vector> ss_images; - Event e2 = root2.create_subspaces_by_image(fd_ptrs1, ss_by_color, ss_images, - ProfilingRequestSet(), e1); - - e2.wait(); - - for(int i = 0; i < num_pieces; i++) { - log_app.debug() << "image[" << i << "] = " << ss_images[i]; - dump_sparse_index_space("", ss_images[i]); - } - - // preimages - std::vector> ss_preimages; - Event e3 = root1.create_subspaces_by_preimage(fd_ptrs1, ss_images, ss_preimages, - ProfilingRequestSet(), e2); - - e3.wait(); - - for(int i = 0; i < num_pieces; i++) { - log_app.debug() << "preimage[" << i << "] = " << ss_preimages[i]; - dump_sparse_index_space("", ss_preimages[i]); - ss_by_color[i].destroy(); - ss_images[i].destroy(); - ss_preimages[i].destroy(); - } - - return Event::NO_EVENT; -} - -template -int RandomTest::perform_dynamic_checks(void) -{ - return 0; -} - -template -int RandomTest::check_partitioning(void) -{ - return 0; -} - -void top_level_task(const void *args, size_t arglen, const void *userdata, size_t userlen, - Processor p) -{ - int errors = 0; - - testcfg->print_info(); - - // find all the system memories - we'll stride our data across them - // for each memory, we'll need one CPU that can do the initialization of the data - std::vector sysmems; - std::vector procs; - - Machine machine = Machine::get_machine(); - { - std::set all_memories; - machine.get_all_memories(all_memories); - for(std::set::const_iterator it = all_memories.begin(); - it != all_memories.end(); it++) { - Memory m = *it; - - // skip memories with no capacity for creating instances - if(m.capacity() == 0) - continue; - - if(m.kind() == Memory::SYSTEM_MEM) { - sysmems.push_back(m); - std::set pset; - machine.get_shared_processors(m, pset); - Processor p = Processor::NO_PROC; - for(std::set::const_iterator it2 = pset.begin(); it2 != pset.end(); - it2++) { - if(it2->kind() == Processor::LOC_PROC) { - p = *it2; - break; - } - } - assert(p.exists()); - 
procs.push_back(p); - log_app.debug() << "System mem #" << (sysmems.size() - 1) << " = " - << *sysmems.rbegin() << " (" << *procs.rbegin() << ")"; - } - } - } - assert(sysmems.size() > 0); - - { - Realm::TimeStamp ts("initialization", true, &log_app); - - Event e = testcfg->initialize_data(sysmems, procs); - // wait for all initialization to be done - e.wait(); - } - - // now actual partitioning work - { - Realm::TimeStamp ts("dependent partitioning work", true, &log_app); - - Event e = testcfg->perform_partitioning(); - - e.wait(); - } - - // dynamic checks (which would be eliminated by compiler) - { - Realm::TimeStamp ts("dynamic checks", true, &log_app); - errors += testcfg->perform_dynamic_checks(); + Realm::TimeStamp ts("dynamic checks", true, &log_app); + errors += testcfg->perform_dynamic_checks(); } if(!skip_check) { @@ -4460,440 +812,46 @@ void top_level_task(const void *args, size_t arglen, const void *userdata, size_ printf("all done!\n"); } -template -class RandomAffineTest : public TestInterface { -public: - RandomAffineTest(int argc, const char *argv[], - const std::vector &transforms); - virtual ~RandomAffineTest(void); - - virtual void print_info(void); - - virtual Event initialize_data(const std::vector &memories, - const std::vector &procs); - - virtual Event perform_partitioning(void); - - virtual int perform_dynamic_checks(void); - - virtual int check_partitioning(void); - - void fill_instance_data(IndexSpace ibounds, RegionInstance inst); - - int verify_results(const IndexSpace &root, const TRANSFORM &transform, - const std::vector>> &images, - const std::vector>> &preimages); - -protected: - std::vector transforms; - T1 base1_min, base1_max, extent1_min, extent1_max; - T2 base2_min, base2_max, extent2_min, extent2_max; - int num_pieces, num_colors; - - // std::vector> transforms; - - std::vector>> dense_images; - std::vector>> sparse_images; - - std::vector> ss_by_color; - - std::vector>> dense_preimages; - std::vector>> sparse_preimages; - - 
Rect bounds1; - Rect bounds2; - IndexSpace root1; - IndexSpace root2; - IndexSpace root2_sparse; - std::vector colors; - std::vector ri_data1; - std::vector, FT>> fd_vals1; -}; - -template -RandomAffineTest::RandomAffineTest( - int argc, const char *argv[], const std::vector &_transforms) - : transforms(_transforms) - , base1_min(0) - , base1_max(0) - , extent1_min(4) - , extent1_max(6) - , base2_min(0) - , base2_max(0) - , extent2_min(4) - , extent2_max(6) - , num_pieces(2) - , num_colors(4) -{ - RandStream<> rs(random_seed + 2); - - for(int i = 0; i < N1; i++) { - bounds1.lo[i] = base1_min + rs.rand_int(base1_max - base1_min + 1); - bounds1.hi[i] = - (bounds1.lo[i] + extent1_min + rs.rand_int(extent1_max - extent1_min + 1)); - } - for(int i = 0; i < N2; i++) { - bounds2.lo[i] = base2_min + rs.rand_int(base2_max - base2_min + 1); - bounds2.hi[i] = - (bounds2.lo[i] + extent2_min + rs.rand_int(extent2_max - extent2_min + 1)); - } - - colors.resize(num_colors); - - for(int i = 0; i < num_colors; i++) - colors[i] = randval(rs); - - dense_images.resize(transforms.size()); - sparse_images.resize(transforms.size()); - - dense_preimages.resize(transforms.size()); - sparse_preimages.resize(transforms.size()); -} - -template -RandomAffineTest::~RandomAffineTest(void) -{} - -template -void RandomAffineTest::print_info(void) -{ - printf("Realm dependent partitioning test - random affine\n"); -} - -template -void RandomAffineTest::fill_instance_data( - IndexSpace ibounds, RegionInstance inst) -{ - { - // start with value field - AffineAccessor a_vals(inst, 0); - - // iterate over all points in root1 with initial random values - RandStream<> rs1(random_seed + 1); - for(PointInRectIterator pir(bounds1); pir.valid; pir.step()) { - FT v = colors[rs1.rand_int(2)]; - if(ibounds.contains(pir.p)) - a_vals.write(pir.p, v); - } - - // print results - for(PointInRectIterator pir(bounds1); pir.valid; pir.step()) { - if(ibounds.contains(pir.p)) - log_app.debug() << "v[" << pir.p << "] = " 
<< a_vals.read(pir.p); - } - } -} - -template -Event RandomAffineTest::initialize_data( - const std::vector &memories, const std::vector &procs) -{ - std::vector> sparse_points; - int index = 0; - for(PointInRectIterator pir(bounds2); pir.valid; pir.step()) { - if(index % 2 == 0) { - sparse_points.push_back(pir.p); - } - index++; - } - SparsityMap sparse_map = - SparsityMap::construct(sparse_points, true, true); - - root1 = IndexSpace(bounds1); - root2 = IndexSpace(bounds2); - root2_sparse = IndexSpace(bounds2, sparse_map); - - log_app.debug() << "root1 = " << root1; - log_app.debug() << "root2 = " << root2; - log_app.debug() << "root2_sparse = " << root2_sparse; - - // create instances to hold actual data - size_t num_insts = memories.size(); - log_app.debug() << "procs: " << procs; - log_app.debug() << "mems: " << memories; - std::vector> ss_inst1; - root1.create_equal_subspaces(num_insts, 1, ss_inst1, Realm::ProfilingRequestSet()) - .wait(); - - std::vector field_sizes; - field_sizes.push_back(sizeof(FT)); - field_sizes.push_back(sizeof(Point)); - - ri_data1.resize(num_insts); - fd_vals1.resize(num_insts); - - for(size_t i = 0; i < num_insts; i++) { - RegionInstance ri; - RegionInstance::create_instance(ri, memories[i], ss_inst1[i], field_sizes, 0 /*SOA*/, - Realm::ProfilingRequestSet()) - .wait(); - log_app.debug() << "inst[" << i << "] = " << ri << " (" << ss_inst1[i] << ")"; - ri_data1[i] = ri; - - fd_vals1[i].index_space = ss_inst1[i]; - fd_vals1[i].inst = ri; - fd_vals1[i].field_offset = 0; - } - - log_app.debug() << "colors = " << colors; - - for(size_t i = 0; i < num_insts; i++) { - fill_instance_data(root1 /*ss_inst1[i]*/, ri_data1[i]); - } - - return Event::NO_EVENT; -} - -template -Event RandomAffineTest::perform_partitioning(void) -{ - // start by filtering root1 by color - std::vector piece_colors(colors.begin(), colors.begin() + num_pieces); - - Event e1 = root1.create_subspaces_by_field(fd_vals1, piece_colors, ss_by_color, - ProfilingRequestSet()); 
- e1.wait(); - - for(int i = 0; i < num_pieces; i++) { - log_app.debug() << "bycolor[" << i << "] (" << colors[i] << ") = " << ss_by_color[i]; - dump_sparse_index_space("", ss_by_color[i]); - } - - for(size_t idx = 0; idx < transforms.size(); idx++) { - log_app.debug() << "Compute images for transform idx=" << idx; - - unsigned long long start_time = Clock::current_time_in_nanoseconds(); - // images - Event e2 = root2.create_subspaces_by_image( - transforms[idx], ss_by_color, dense_images[idx], ProfilingRequestSet(), e1); - e2.wait(); - - log_app.debug() << "affine image time=" - << (Clock::current_time_in_nanoseconds() - start_time); - - for(int i = 0; i < num_pieces; i++) { - log_app.debug() << "image[" << i << "] = " << dense_images[idx][i]; - dump_sparse_index_space("", dense_images[idx][i]); - } - - start_time = Clock::current_time_in_nanoseconds(); - Event e3 = root2_sparse.create_subspaces_by_image( - transforms[idx], ss_by_color, sparse_images[idx], ProfilingRequestSet(), e2); - - e3.wait(); - log_app.debug() << "affine sparse image time=" - << (Clock::current_time_in_nanoseconds() - start_time); - - for(int i = 0; i < num_pieces; i++) { - log_app.debug() << "sparse_image1[" << i << "] = " << sparse_images[idx][i]; - dump_sparse_index_space("", sparse_images[idx][i]); - } - - // preimages - Event e4 = root1.create_subspaces_by_preimage(transforms[idx], dense_images[idx], - dense_preimages[idx], - ProfilingRequestSet(), e3); - e4.wait(); - - for(int i = 0; i < num_pieces; i++) { - log_app.debug() << "dense_preimage[" << i << "] = " << dense_preimages[idx][i]; - dump_sparse_index_space("", dense_preimages[idx][i]); - } - - Event e5 = root1.create_subspaces_by_preimage(transforms[idx], sparse_images[idx], - sparse_preimages[idx], - ProfilingRequestSet(), e4); - e5.wait(); - - for(int i = 0; i < num_pieces; i++) { - log_app.debug() << "sparse_preimage[" << i << "] = " << sparse_preimages[idx][i]; - dump_sparse_index_space("", sparse_preimages[idx][i]); - } - } 
- - return Event::NO_EVENT; -} - -template -int RandomAffineTest::perform_dynamic_checks(void) -{ - return 0; -} - -template -int RandomAffineTest::verify_results( - const IndexSpace &root, const TRANSFORM &transform, - const std::vector>> &images, - const std::vector>> &preimages) -{ - for(size_t idx = 0; idx < transforms.size(); idx++) { - assert(ss_by_color.size() == images[idx].size() && - images[idx].size() == preimages[idx].size()); - int image_total = 0; - for(const auto &image : images[idx]) { - for(IndexSpaceIterator it2(image); it2.valid; it2.step()) { - image_total += it2.rect.volume(); - } - } - - int preimage_total = 0; - for(const auto &preimage : preimages[idx]) { - for(IndexSpaceIterator it2(preimage); it2.valid; it2.step()) { - preimage_total += it2.rect.volume(); - } - } +// Constructor function-pointer type +using CtorFn = TestInterface* (*)(int, const char** argv); - if(image_total != preimage_total) - return 1; - - for(size_t i = 0; i < ss_by_color.size(); i++) { - for(IndexSpaceIterator it(ss_by_color[i]); it.valid; it.step()) { - for(PointInRectIterator point(it.rect); point.valid; point.step()) { - auto target_point = transforms[idx][point.p]; - if(root.contains(target_point)) { - if(!images[idx][i].contains(target_point)) { - return 1; - } - if(!preimages[idx][i].contains(point.p)) { - return 1; - } - } - } - } - } - } - return 0; +// ---- Byfield constructors ---- +template +static TestInterface* make_byfield(int argc, const char** argv) { + return new ByfieldTest(argc, argv); } -template -int RandomAffineTest::check_partitioning(void) -{ - int result = 0; - for(size_t i = 0; i < transforms.size(); i++) { - if(verify_results(root2, transforms[i], dense_images, dense_preimages) || - verify_results(root2_sparse, transforms[i], sparse_images, sparse_preimages)) { - result++; - } - } - root1.destroy(); - root2.destroy(); - root2_sparse.destroy(); - for(unsigned i = 0; i < dense_images.size(); i++) { - for(unsigned j = 0; j < 
dense_images[i].size(); j++) { - dense_images[i][j].destroy(); - } - } - for(unsigned i = 0; i < sparse_images.size(); i++) { - for(unsigned j = 0; j < sparse_images[i].size(); j++) { - sparse_images[i][j].destroy(); - } - } - for(unsigned i = 0; i < dense_preimages.size(); i++) { - for(unsigned j = 0; j < dense_preimages[i].size(); j++) { - dense_preimages[i][j].destroy(); - } - } - for(unsigned i = 0; i < sparse_preimages.size(); i++) { - for(unsigned j = 0; j < sparse_preimages[i].size(); j++) { - sparse_preimages[i][j].destroy(); - } - } - return result; -} +static constexpr CtorFn BYFIELD_CTORS[3] = { + &make_byfield<1>, + &make_byfield<2>, + &make_byfield<3>, +}; -template -std::vector> create_translate_transforms(int size) -{ - RandStream<> rs(random_seed + 2); - std::vector> transforms; - { - TranslationTransform translate; - translate.offset = Point::ZEROES(); - for(int i = 0; i < N2; i++) { - translate.offset[i] = rs.rand_int(size - 1); - } - transforms.push_back(translate); - } - return transforms; +// ---- Image constructors ---- +template +static TestInterface* make_image(int argc, const char** argv) { + return new ImageTest(argc, argv); } -template -std::vector> create_affine_transforms() -{ - std::vector> transforms; - - { - AffineTransform transpose; - for(int i = 0; i < N2; i++) { - for(int j = 0; j < N1; j++) { - transpose.transform[i][j] = (i == N1 - j - 1); - } - } - transpose.offset = Point::ZEROES(); - transforms.push_back(transpose); - } - - { - AffineTransform translate; - for(int i = 0; i < N2; i++) { - for(int j = 0; j < N1; j++) { - translate.transform[i][j] = (i == j); - } - } - translate.offset = Point::ZEROES(); - transforms.push_back(translate); - } - - { - AffineTransform scale; - for(int i = 0; i < N2; i++) { - for(int j = 0; j < N1; j++) { - scale.transform[i][j] = (i == j) ? 
2 : 0; - } - } - scale.offset = Point::ZEROES(); - transforms.push_back(scale); - } +static constexpr CtorFn IMAGE_CTORS[3][3] = { + { &make_image<1,1>, &make_image<1,2>, &make_image<1,3> }, + { &make_image<2,1>, &make_image<2,2>, &make_image<2,3> }, + { &make_image<3,1>, &make_image<3,2>, &make_image<3,3> }, +}; - { - AffineTransform shear; - for(int i = 0; i < N2; i++) { - for(int j = 0; j < N1; j++) { - shear.transform[i][j] = (i == j); - } - shear.transform[i][i + 1] = 1; - } - shear.offset = Point::ZEROES(); - transforms.push_back(shear); - } +using TaskWrapperFn = void (*)(const void*, size_t, const void*, size_t, Processor); - { - AffineTransform reflect; - for(int i = 0; i < N2; i++) { - for(int j = 0; j < N1; j++) { - reflect.transform[i][j] = (i == j) ? -1 : 0; - } - } - reflect.offset = Point::ZEROES(); - // transforms.push_back(reflect); - } - return transforms; -} +static constexpr TaskWrapperFn BYFIELD_INIT_TBL[3] = { + &ByfieldTest<1>::init_data_task_wrapper, + &ByfieldTest<2>::init_data_task_wrapper, + &ByfieldTest<3>::init_data_task_wrapper, +}; -TestInterface *run_structured_test(TransformType type, int argc, char **argv) -{ - switch(type) { - case TransformType::AFFINE: - return new RandomAffineTest<2, int, 2, int, int, AffineTransform<2, 2, int>>( - argc, const_cast(argv), - create_affine_transforms<2, int, 2, int, int>()); - case TransformType::TRANSLATION: - return new RandomAffineTest<2, int, 2, int, int, TranslationTransform<2, int>>( - argc, const_cast(argv), - create_translate_transforms<2, int, 2, int, int>(4)); - } - return nullptr; -} +static constexpr TaskWrapperFn IMAGE_INIT_TBL[3][3] = { + { &ImageTest<1,1>::init_data_task_wrapper, &ImageTest<1,2>::init_data_task_wrapper, &ImageTest<1,3>::init_data_task_wrapper }, + { &ImageTest<2,1>::init_data_task_wrapper, &ImageTest<2,2>::init_data_task_wrapper, &ImageTest<2,3>::init_data_task_wrapper }, + { &ImageTest<3,1>::init_data_task_wrapper, &ImageTest<3,2>::init_data_task_wrapper, 
&ImageTest<3,3>::init_data_task_wrapper }, +}; int main(int argc, char **argv) { @@ -4928,54 +886,28 @@ int main(int argc, char **argv) continue; } - // test cases consume the rest of the args - if(!strcmp(argv[i], "circuit")) { - testcfg = new CircuitTest(argc - i, const_cast(argv + i)); - break; - } - - if(!strcmp(argv[i], "basic")) { - testcfg = new BasicTest(argc - i, const_cast(argv + i)); - break; - } - - if(!strcmp(argv[i], "tile")) { - testcfg = new TileTest(argc - i, const_cast(argv + i)); - break; - } - - if (!strcmp(argv[i], "range")) { - testcfg = new RangeTest(argc - i, const_cast(argv + i)); - break; - } - - if (!strcmp(argv[i], "multi")) { - testcfg = new Range2DTest(argc - i, const_cast(argv + i)); - break; + if(!strcmp(argv[i], "-d1")) { + dimension1 = atoi(argv[++i]); + continue; } - if(!strcmp(argv[i], "pennant")) { - testcfg = new PennantTest(argc - i, const_cast(argv + i)); - break; + if(!strcmp(argv[i], "-d2")) { + dimension2 = atoi(argv[++i]); + continue; } - if(!strcmp(argv[i], "miniaero")) { - testcfg = new MiniAeroTest(argc - i, const_cast(argv + i)); - break; - } + if(!strcmp(argv[i], "byfield")) { + if (dimension1 < 1 || dimension1 > 3) + assert(false && "invalid dimension"); - if(!strcmp(argv[i], "random")) { - testcfg = new RandomTest<1, int, 2, int, int>(argc - i, - const_cast(argv + i)); + testcfg = BYFIELD_CTORS[dimension1 - 1](argc - i, const_cast(argv + i)); break; } - if(!strcmp(argv[i], "affine")) { - TransformType type = TransformType::AFFINE; - if(i < argc - 1 && !strcmp(argv[++i], "-type")) { - type = static_cast(atoi(argv[++i])); - } - testcfg = run_structured_test(type, argc, argv); + if(!strcmp(argv[i], "image")) { + if (dimension1 < 1 || dimension1 > 3 || dimension2 < 1 || dimension2 > 3) + assert(false && "invalid dimension"); + testcfg = IMAGE_CTORS[dimension1 - 1][dimension2 - 1](argc - i, const_cast(argv + i)); break; } @@ -4984,17 +916,16 @@ int main(int argc, char **argv) // if no test specified, use circuit (with 
default parameters) if(!testcfg) { - testcfg = new CircuitTest(0, 0); + assert(false); } rt.register_task(TOP_LEVEL_TASK, top_level_task); - rt.register_task(INIT_CIRCUIT_DATA_TASK, CircuitTest::init_data_task_wrapper); - rt.register_task(INIT_PENNANT_DATA_TASK, PennantTest::init_data_task_wrapper); - rt.register_task(INIT_BASIC_DATA_TASK, BasicTest::init_data_task_wrapper); - rt.register_task(INIT_TILE_DATA_TASK, TileTest::init_data_task_wrapper); - rt.register_task(INIT_RANGE_DATA_TASK, RangeTest::init_data_task_wrapper); - rt.register_task(INIT_RANGE2D_DATA_TASK, Range2DTest::init_data_task_wrapper); - rt.register_task(INIT_MINIAERO_DATA_TASK, MiniAeroTest::init_data_task_wrapper); + + if (dimension1 < 1 || dimension1 > 3 || dimension2 < 1 || dimension2 > 3) + assert(false && "invalid dimension"); + + rt.register_task(INIT_BYFIELD_DATA_TASK, BYFIELD_INIT_TBL[dimension1 - 1]); + rt.register_task(INIT_IMAGE_DATA_TASK, IMAGE_INIT_TBL[dimension1 - 1][dimension2 - 1]); signal(SIGALRM, sigalrm_handler); From 3434f6a0246e6fe0060dae02ded9449bc50c0fdf Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Mon, 9 Mar 2026 00:37:51 -0700 Subject: [PATCH 18/32] implemented cpu bvh --- src/realm/deppart/byfield.cc | 2 +- src/realm/deppart/byfield_gpu_impl.hpp | 18 +- src/realm/deppart/image.cc | 5 +- src/realm/deppart/image.h | 2 + src/realm/deppart/image_gpu_impl.hpp | 38 +- src/realm/deppart/partitions.cc | 1 - src/realm/deppart/partitions.h | 5 + src/realm/deppart/partitions_gpu_impl.hpp | 108 ++- src/realm/deppart/preimage.cc | 24 +- src/realm/deppart/preimage_gpu_impl.hpp | 48 +- src/realm/deppart/preimage_gpu_tmpl.cu | 10 - src/realm/deppart/sparsity_impl.cc | 331 ++++++- src/realm/indexspace.inl | 90 +- src/realm/sparsity.h | 66 +- tests/benchmark.cc | 1031 +++++++++++++++++---- tests/unit_tests/sparsity_map_test.cc | 2 +- 16 files changed, 1513 insertions(+), 268 deletions(-) diff --git a/src/realm/deppart/byfield.cc b/src/realm/deppart/byfield.cc index 
ce543e1b44..7c1fe148c1 100644 --- a/src/realm/deppart/byfield.cc +++ b/src/realm/deppart/byfield.cc @@ -44,7 +44,7 @@ namespace Realm { if (val) { device_size = atoi(val); } - size_t optimal_size = is.bounds.volume() * sizeof(RectDesc); + size_t optimal_size = is.bounds.volume() * 10 * sizeof(RectDesc); std::vector affinities; unsigned best_bandwidth = 0; Processor best_proc = Processor::NO_PROC; diff --git a/src/realm/deppart/byfield_gpu_impl.hpp b/src/realm/deppart/byfield_gpu_impl.hpp index e309cf7609..849556a53d 100644 --- a/src/realm/deppart/byfield_gpu_impl.hpp +++ b/src/realm/deppart/byfield_gpu_impl.hpp @@ -27,7 +27,7 @@ void GPUByFieldMicroOp::execute() size_t tile_size = field_data[0].scratch_buffer.get_layout()->bytes_used; - Arena buffer_arena(reinterpret_cast(AffineAccessor(field_data[0].scratch_buffer, 0).base), tile_size); + Arena buffer_arena(field_data[0].scratch_buffer.pointer_untyped(0, tile_size), tile_size); inst_space.offsets = buffer_arena.alloc(field_data.size() + 1); inst_space.num_children = field_data.size(); @@ -203,14 +203,18 @@ void GPUByFieldMicroOp::execute() } catch (arena_oom&) { std::cout << "Caught arena_oom, reducing tile size from " << curr_tile << " to " << curr_tile / 2 << std::endl; - std::cout << buffer_arena.used() << " bytes used in arena." 
<< std::endl; curr_tile /= 2; if (curr_tile == 0) { - host_fallback = true; - if (num_output > 0) { - this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); + if (host_fallback) { + GPUMicroOp::shatter_rects(inst_space, num_completed); + curr_tile = 1; + } else { + host_fallback = true; + if (num_output > 0) { + this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); + } + curr_tile = tile_size / 2; } - curr_tile = tile_size / 2; } } } @@ -254,7 +258,7 @@ void GPUByFieldMicroOp::execute() if (entry_counts[idx] > 0) { Rect* h_rects = reinterpret_cast *>(AffineAccessor(h_instances[idx], 0).base); span> h_rects_span(h_rects, entry_counts[idx]); - impl->contribute_dense_rect_list(h_rects_span, false); + impl->contribute_dense_rect_list(h_rects_span, true); h_instances[idx].destroy(); } else { impl->contribute_nothing(); diff --git a/src/realm/deppart/image.cc b/src/realm/deppart/image.cc index b0dcd4383a..8d37d81969 100644 --- a/src/realm/deppart/image.cc +++ b/src/realm/deppart/image.cc @@ -38,9 +38,12 @@ namespace Realm { std::vector& requirements) const { size_t minimal_size = 0; size_t source_entries = 0; - bool bvh = true; + bool bvh = false; for (auto subspace : source_spaces) { source_entries += subspace.entries == 0 ? 
1 : subspace.entries; + if (subspace.entries > 1) { + bvh = true; + } } minimal_size += sizeof(Rect) * source_entries; if (this->dense()) { diff --git a/src/realm/deppart/image.h b/src/realm/deppart/image.h index ab81ecafae..4eed6da566 100644 --- a/src/realm/deppart/image.h +++ b/src/realm/deppart/image.h @@ -171,6 +171,8 @@ namespace Realm { void add_sparsity_output(IndexSpace _source, SparsityMap _sparsity); + bool is_image_microop() const override { return true; } + protected: IndexSpace parent_space; DomainTransform domain_transform; diff --git a/src/realm/deppart/image_gpu_impl.hpp b/src/realm/deppart/image_gpu_impl.hpp index 643845296d..83f907d922 100644 --- a/src/realm/deppart/image_gpu_impl.hpp +++ b/src/realm/deppart/image_gpu_impl.hpp @@ -7,8 +7,6 @@ #include #include "realm/nvtx.h" -#include - namespace Realm { //TODO: INTERSECTING INPUT/OUTPUT RECTS CAN BE DONE WITH BVH IF BECOME EXPENSIVE @@ -46,12 +44,12 @@ void GPUImageMicroOp::gpu_populate_rngs() return; } - NVTX_DEPPART(gpu_image); + NVTX_DEPPART(gpu_image_range); RegionInstance buffer = domain_transform.range_data[0].scratch_buffer; size_t tile_size = buffer.get_layout()->bytes_used; std::cout << "Using tile size of " << tile_size << " bytes." << std::endl; - Arena buffer_arena(reinterpret_cast(AffineAccessor(buffer, 0).base), tile_size); + Arena buffer_arena(buffer.pointer_untyped(0, tile_size), tile_size); cudaStream_t stream = Cuda::get_task_cuda_stream(); @@ -244,14 +242,18 @@ void GPUImageMicroOp::gpu_populate_rngs() } catch (arena_oom&) { std::cout << "Caught arena_oom, reducing tile size from " << curr_tile << " to " << curr_tile / 2 << std::endl; - std::cout << buffer_arena.used() << " bytes used in arena." 
<< std::endl; curr_tile /= 2; if (curr_tile == 0) { - host_fallback = true; - if (num_output > 0) { - this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); + if (host_fallback) { + GPUMicroOp::shatter_rects(inst_space, num_completed); + curr_tile = 1; + } else { + host_fallback = true; + if (num_output > 0) { + this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); + } + curr_tile = tile_size / 2; } - curr_tile = tile_size / 2; } } } @@ -331,7 +333,7 @@ void GPUImageMicroOp::gpu_populate_ptrs() size_t tile_size = buffer.get_layout()->bytes_used; std::cout << "Using tile size of " << tile_size << " bytes." << std::endl; - Arena buffer_arena(reinterpret_cast(AffineAccessor(buffer, 0).base), tile_size); + Arena buffer_arena(buffer.pointer_untyped(0, tile_size), tile_size); collapsed_space src_space; src_space.offsets = buffer_arena.alloc(sources.size()+1); @@ -390,8 +392,6 @@ void GPUImageMicroOp::gpu_populate_ptrs() try { std::cout << "Image iteration " << count++ << ", completed " << num_completed << " / " << inst_space.num_entries << " entries." << std::endl; buffer_arena.start(); - std::cout << "Amount Used: " << buffer_arena.used() << std::endl; - std::cout << "Expected Amount Used: " << left + num_output * sizeof(RectDesc) << std::endl; if (num_completed + curr_tile > inst_space.num_entries) { curr_tile = inst_space.num_entries - num_completed; } @@ -514,14 +514,18 @@ void GPUImageMicroOp::gpu_populate_ptrs() } catch (arena_oom&) { std::cout << "Caught arena_oom, reducing tile size from " << curr_tile << " to " << curr_tile / 2 << std::endl; - std::cout << buffer_arena.used() << " bytes used in arena." 
<< std::endl; curr_tile /= 2; if (curr_tile == 0) { - host_fallback = true; - if (num_output > 0) { - this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); + if (host_fallback) { + GPUMicroOp::shatter_rects(inst_space, num_completed); + curr_tile = 1; + } else { + host_fallback = true; + if (num_output > 0) { + this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); + } + curr_tile = tile_size / 2; } - curr_tile = tile_size / 2; } } } diff --git a/src/realm/deppart/partitions.cc b/src/realm/deppart/partitions.cc index f342519f71..1c16670c47 100644 --- a/src/realm/deppart/partitions.cc +++ b/src/realm/deppart/partitions.cc @@ -18,7 +18,6 @@ // index space partitioning for Realm #include "realm/deppart/partitions.h" - #include "realm/profiling.h" #include "realm/runtime_impl.h" diff --git a/src/realm/deppart/partitions.h b/src/realm/deppart/partitions.h index 8b67e5e642..68e5b40084 100644 --- a/src/realm/deppart/partitions.h +++ b/src/realm/deppart/partitions.h @@ -81,6 +81,7 @@ namespace Realm { size_t* offsets; size_t num_children; Rect bounds; + RegionInstance h_instance = RegionInstance::NO_INST; }; // Stores everything necessary to query a BVH @@ -348,6 +349,8 @@ namespace Realm { virtual void execute(void) = 0; + static void shatter_rects(collapsed_space & inst_space, size_t &num_completed); + template static void collapse_multi_space(const std::vector& field_data, collapsed_space &out_space, Arena &my_arena, cudaStream_t stream); @@ -375,6 +378,8 @@ namespace Realm { template void send_output(RectDesc* d_rects, size_t total_rects, Arena &my_arena, const Container& ctr, IndexFn getIndex, MapFn getMap); + virtual bool is_image_microop() const { return false; } + bool exclusive = false; }; diff --git a/src/realm/deppart/partitions_gpu_impl.hpp b/src/realm/deppart/partitions_gpu_impl.hpp index 82abfd57d9..0827f1844c 100644 --- a/src/realm/deppart/partitions_gpu_impl.hpp +++ 
b/src/realm/deppart/partitions_gpu_impl.hpp @@ -40,10 +40,18 @@ //NVTX macros to only add ranges if defined. #ifdef REALM_USE_NVTX - #define NVTX_CAT(a,b) a##b +#include - #define NVTX_DEPPART(message) \ - nvtxScopedRange NVTX_CAT(nvtx_, message)("cuda", #message, 0) +inline int32_t next_nvtx_payload() { + static std::atomic counter{0}; + return counter.fetch_add(1, std::memory_order_relaxed); +} + +#define NVTX_CAT2(a, b) a##b +#define NVTX_CAT(a, b) NVTX_CAT2(a, b) + +#define NVTX_DEPPART(message) \ + nvtxScopedRange NVTX_CAT(nvtx_, __LINE__)("cuda", #message, next_nvtx_payload()) #else @@ -98,12 +106,93 @@ namespace Realm { return found; } + template + void GPUMicroOp::shatter_rects(collapsed_space & inst_space, size_t &num_completed) { + + NVTX_DEPPART(shatter_rects); + cudaStream_t stream = Cuda::get_task_cuda_stream(); + size_t new_size = (inst_space.entries_buffer[num_completed].bounds.volume() + 1) / 2; + assert(new_size > 0); + size_t num_new_entries = 0; + std::vector offsets(inst_space.num_children + 1); + std::vector new_offsets(inst_space.num_children + 1); + CUDA_CHECK(cudaMemcpyAsync(offsets.data(), inst_space.offsets, (inst_space.num_children + 1) * sizeof(size_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + for (size_t i = 0; i < inst_space.num_children; ++i) { + new_offsets[i] = num_new_entries; + if (offsets[i+1] <= num_completed) { + continue; + } + for (size_t j = offsets[i]; j < offsets[i+1]; ++j) { + if (j >= num_completed) { + num_new_entries += (inst_space.entries_buffer[j].bounds.volume() + new_size - 1) / new_size; + } + } + } + new_offsets[inst_space.num_children] = num_new_entries; + CUDA_CHECK(cudaMemcpyAsync(inst_space.offsets, new_offsets.data(), (inst_space.num_children + 1) * sizeof(size_t), cudaMemcpyHostToDevice, stream), stream); + RegionInstance new_entries_buffer = realm_malloc(num_new_entries * sizeof(SparsityMapEntry), inst_space.h_instance.get_location()); + 
SparsityMapEntry *new_entries_ptr = reinterpret_cast *>(new_entries_buffer.pointer_untyped(0, num_new_entries * sizeof(SparsityMapEntry))); + + size_t write_loc = 0; + for (size_t i = num_completed; i < inst_space.num_entries; i++) { + Rect bounds = inst_space.entries_buffer[i].bounds; + if (bounds.volume() <= new_size) { + new_entries_ptr[write_loc] = inst_space.entries_buffer[i]; + write_loc++; + continue; + } + size_t count = (bounds.volume() + new_size - 1) / new_size; + // split in the largest dimension available + int split_dim = 0; + T total = std::max(bounds.hi[0] - bounds.lo[0] + 1, T(0)); + if(N > 1) { + for(int d = 1; d < N; d++) { + T extent = std::max(bounds.hi[d] - bounds.lo[d] + 1, T(0)); + if(extent > total) { + total = extent; + split_dim = d; + } + } + } + T px = bounds.lo[split_dim]; + // have to divide before multiplying to avoid overflow + T base_span_size = total / count; + T base_span_rem = total - (base_span_size * count); + T leftover = 0; + for(size_t j = 0; j < count; j++) { + new_entries_ptr[write_loc] = inst_space.entries_buffer[i]; + T nx = px + (base_span_size - 1); + if(base_span_rem != 0) { + leftover += base_span_rem; + if(leftover >= T(count)) { + nx += 1; + leftover -= count; + } + } + new_entries_ptr[write_loc].bounds.lo[split_dim] = px; + new_entries_ptr[write_loc].bounds.hi[split_dim] = nx; + px = nx + 1; + write_loc++; + } + } + + num_completed = 0; + inst_space.entries_buffer = new_entries_ptr; + inst_space.num_entries = num_new_entries; + inst_space.h_instance.destroy(); + inst_space.h_instance = new_entries_buffer; + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + } + //Given a list of spaces, compacts them all into one collapsed_space template template void GPUMicroOp::collapse_multi_space(const std::vector& spaces, collapsed_space &out_space, Arena &my_arena, cudaStream_t stream) { + NVTX_DEPPART(collapse_multi_space); out_space.bounds = Rect::make_empty(); char *val = std::getenv("SHATTER_SIZE"); // or any env 
var @@ -197,6 +286,8 @@ namespace Realm { CUDA_CHECK(cudaMemcpyAsync(out_space.entries_buffer, h_entries, out_space.num_entries * sizeof(SparsityMapEntry), cudaMemcpyHostToDevice, stream), stream); CUDA_CHECK(cudaStreamSynchronize(stream), stream); h_instance.destroy(); + } else { + out_space.h_instance = h_instance; } CUDA_CHECK(cudaStreamSynchronize(stream), stream); @@ -206,6 +297,8 @@ namespace Realm { template void GPUMicroOp::collapse_parent_space(const IndexSpace& parent_space, collapsed_space &out_space, Arena &my_arena, cudaStream_t stream) { + + NVTX_DEPPART(collapse_parent_space); if (parent_space.dense()) { SparsityMapEntry entry; entry.bounds = parent_space.bounds; @@ -229,6 +322,7 @@ namespace Realm { template void GPUMicroOp::build_bvh(const collapsed_space &space, BVH &result, Arena &my_arena, cudaStream_t stream) { + NVTX_DEPPART(build_bvh); //We want to keep the entire BVH that we return in one instance for convenience. size_t indices_instance_size = space.num_entries * sizeof(uint64_t); size_t labels_instance_size = space.offsets == nullptr ? 
0 : space.num_entries * sizeof(size_t); @@ -329,6 +423,7 @@ namespace Realm { void GPUMicroOp::construct_input_rectlist(const collapsed_space &lhs, const collapsed_space &rhs, out_t* &d_valid_rects, size_t& out_size, uint32_t* counters, uint32_t* out_offsets, Arena &my_arena, cudaStream_t stream) { + NVTX_DEPPART(construct_input_rectlist); CUDA_CHECK(cudaMemsetAsync(counters, 0, (lhs.num_children) * sizeof(uint32_t), stream), stream); BVH my_bvh; @@ -388,6 +483,8 @@ namespace Realm { template void GPUMicroOp::volume_prefix_sum(const out_t* d_rects, size_t total_rects, size_t* &d_prefix_rects, size_t& num_pts, Arena &my_arena, cudaStream_t stream) { + + NVTX_DEPPART(volume_prefix_sum); d_prefix_rects = my_arena.alloc(total_rects+1); CUDA_CHECK(cudaMemsetAsync(d_prefix_rects, 0, sizeof(size_t), stream), stream); @@ -1477,7 +1574,7 @@ namespace Realm { template void GPUMicroOp::split_output(RectDesc* d_rects, size_t total_rects, std::vector &output_instances, std::vector &output_counts, Arena &my_arena) { - NVTX_DEPPART(send_output); + NVTX_DEPPART(split_output); cudaStream_t stream = Cuda::get_task_cuda_stream(); bool use_sysmem = false; @@ -1626,7 +1723,8 @@ namespace Realm { CUDA_CHECK(cudaMemcpyAsync(h_rects, final_rects + start, (end - start) * sizeof(Rect), cudaMemcpyDeviceToHost, stream), stream); CUDA_CHECK(cudaStreamSynchronize(stream), stream); span> h_rects_span(h_rects, end - start); - impl->contribute_dense_rect_list(h_rects_span, false); + bool disjoint = !this->is_image_microop(); + impl->contribute_dense_rect_list(h_rects_span, disjoint); h_rects_instance.destroy(); } else { impl->contribute_nothing(); diff --git a/src/realm/deppart/preimage.cc b/src/realm/deppart/preimage.cc index b25c8b2c41..4feaa585e4 100644 --- a/src/realm/deppart/preimage.cc +++ b/src/realm/deppart/preimage.cc @@ -43,6 +43,9 @@ namespace Realm { bool bvh = false; for (auto subspace : target_spaces) { source_entries += subspace.entries == 0 ? 
1 : subspace.entries; + if (subspace.entries > 1) { + bvh = true; + } } minimal_size += sizeof(Rect) * source_entries; if (this->dense()) { @@ -54,9 +57,9 @@ namespace Realm { minimal_size += (source_entries * sizeof(uint64_t)) + (source_entries * sizeof(size_t)) + - ((2*source_entries - 1) * sizeof(Rect)) + + ((2*source_entries - 1) * sizeof(Rect)) + (2 * (2*source_entries - 1) * sizeof(int)) + - sizeof(Rect) + + sizeof(Rect) + (2 * source_entries * sizeof(uint64_t)) + (source_entries * sizeof(uint64_t)); } @@ -72,7 +75,7 @@ namespace Realm { device_size = atoi(val); } minimal_size = max(minimal_size, device_size); - size_t optimal_size = is.bounds.volume() * sizeof(Rect) * target_spaces.size() + minimal_size; + size_t optimal_size = is.bounds.volume() * sizeof(Rect) * target_spaces.size() * 10 + minimal_size; std::vector affinities; unsigned best_bandwidth = 0; Processor best_proc = Processor::NO_PROC; @@ -216,6 +219,13 @@ namespace Realm { { TimeStamp ts("PreimageMicroOp::execute", true, &log_uop_timing); std::map *> rect_map; + if (is_ranged || N2 > 1) { + for (const IndexSpace& target : targets) { + if (!target.dense()) { + target.sparsity.impl()->request_bvh(); + } + } + } if(is_ranged) populate_bitmasks_ranges(rect_map); @@ -737,6 +747,14 @@ namespace Realm { TimeStamp ts("PreimageMicroOp::execute", true, &log_uop_timing); std::map *> rect_map; + if (N2 > 1) { + for (const IndexSpace& target : targets) { + if (!target.dense()) { + target.sparsity.impl()->request_bvh(); + } + } + } + populate_bitmasks(rect_map); #ifdef DEBUG_PARTITIONING std::cout << rect_map.size() << " non-empty preimages present in instance " diff --git a/src/realm/deppart/preimage_gpu_impl.hpp b/src/realm/deppart/preimage_gpu_impl.hpp index 3e464c582f..960e427beb 100644 --- a/src/realm/deppart/preimage_gpu_impl.hpp +++ b/src/realm/deppart/preimage_gpu_impl.hpp @@ -19,9 +19,9 @@ namespace Realm { size_t tile_size = buffer.get_layout()->bytes_used; std::cout << "Using tile size of " << 
tile_size << " bytes." << std::endl; - Arena buffer_arena(reinterpret_cast(AffineAccessor(buffer, 0).base), tile_size); + Arena buffer_arena(buffer.pointer_untyped(0, tile_size), tile_size); - NVTX_DEPPART(gpu_preimage); + NVTX_DEPPART(gpu_preimage_range); Memory sysmem; find_memory(sysmem, Memory::SYSTEM_MEM); @@ -85,7 +85,6 @@ namespace Realm { std::cout << "Preimage iteration " << count++ << ", completed " << num_completed << " / " << inst_space.num_entries << " entries." << std::endl; buffer_arena.start(); - std::cout << "Amount Used: " << buffer_arena.used() << std::endl; if (num_completed + curr_tile > inst_space.num_entries) { curr_tile = inst_space.num_entries - num_completed; } @@ -114,8 +113,6 @@ namespace Realm { size_t* d_prefix_rects; GPUMicroOp::volume_prefix_sum(d_valid_rects, num_valid_rects, d_prefix_rects, total_pts, buffer_arena, stream); - nvtx_range_push("cuda", "build target entries"); - PointDesc* d_points; size_t num_valid_points; @@ -258,14 +255,18 @@ namespace Realm { } catch (arena_oom&) { std::cout << "Caught arena_oom, reducing tile size from " << curr_tile << " to " << curr_tile / 2 << std::endl; - std::cout << buffer_arena.used() << " bytes used in arena." 
<< std::endl; curr_tile /= 2; if (curr_tile == 0) { - host_fallback = true; - if (num_output > 0) { - this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); + if (host_fallback) { + GPUMicroOp::shatter_rects(inst_space, num_completed); + curr_tile = 1; + } else { + host_fallback = true; + if (num_output > 0) { + this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); + } + curr_tile = tile_size / 2; } - curr_tile = tile_size / 2; } } } @@ -307,7 +308,7 @@ namespace Realm { if (entry_counts[idx] > 0) { Rect* h_rects = reinterpret_cast *>(AffineAccessor(h_instances[idx], 0).base); span> h_rects_span(h_rects, entry_counts[idx]); - impl->contribute_dense_rect_list(h_rects_span, false); + impl->contribute_dense_rect_list(h_rects_span, true); h_instances[idx].destroy(); } else { impl->contribute_nothing(); @@ -326,15 +327,15 @@ namespace Realm { size_t tile_size = buffer.get_layout()->bytes_used; std::cout << "Using tile size of " << tile_size << " bytes." << std::endl; - Arena buffer_arena(reinterpret_cast(AffineAccessor(buffer, 0).base), tile_size); - - NVTX_DEPPART(gpu_preimage); + Arena buffer_arena(buffer.pointer_untyped(0, tile_size), tile_size); Memory sysmem; find_memory(sysmem, Memory::SYSTEM_MEM); cudaStream_t stream = Cuda::get_task_cuda_stream(); + NVTX_DEPPART(gpu_preimage); + collapsed_space inst_space; // We combine all of our instances into one to batch work, tracking the offsets between instances. @@ -392,7 +393,6 @@ namespace Realm { std::cout << "Preimage iteration " << count++ << ", completed " << num_completed << " / " << inst_space.num_entries << " entries." 
<< std::endl; buffer_arena.start(); - std::cout << "Amount Used: " << buffer_arena.used() << std::endl; if (num_completed + curr_tile > inst_space.num_entries) { curr_tile = inst_space.num_entries - num_completed; } @@ -421,8 +421,6 @@ namespace Realm { size_t* d_prefix_rects; GPUMicroOp::volume_prefix_sum(d_valid_rects, num_valid_rects, d_prefix_rects, total_pts, buffer_arena, stream); - nvtx_range_push("cuda", "build target entries"); - PointDesc* d_points; size_t num_valid_points; @@ -565,14 +563,18 @@ namespace Realm { } catch (arena_oom&) { std::cout << "Caught arena_oom, reducing tile size from " << curr_tile << " to " << curr_tile / 2 << std::endl; - std::cout << buffer_arena.used() << " bytes used in arena." << std::endl; curr_tile /= 2; if (curr_tile == 0) { - host_fallback = true; - if (num_output > 0) { - this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); + if (host_fallback) { + GPUMicroOp::shatter_rects(inst_space, num_completed); + curr_tile = 1; + } else { + host_fallback = true; + if (num_output > 0) { + this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); + } + curr_tile = tile_size / 2; } - curr_tile = tile_size / 2; } } } @@ -614,7 +616,7 @@ namespace Realm { if (entry_counts[idx] > 0) { Rect* h_rects = reinterpret_cast *>(AffineAccessor(h_instances[idx], 0).base); span> h_rects_span(h_rects, entry_counts[idx]); - impl->contribute_dense_rect_list(h_rects_span, false); + impl->contribute_dense_rect_list(h_rects_span, true); h_instances[idx].destroy(); } else { impl->contribute_nothing(); diff --git a/src/realm/deppart/preimage_gpu_tmpl.cu b/src/realm/deppart/preimage_gpu_tmpl.cu index eb532a5a1d..be634fcc34 100644 --- a/src/realm/deppart/preimage_gpu_tmpl.cu +++ b/src/realm/deppart/preimage_gpu_tmpl.cu @@ -13,8 +13,6 @@ * limitations under the License. 
*/ -// per‐dimension instantiator for the GPU version of -// ImageMicroOp<…>::gpu_populate_bitmasks_ptrs #define REALM_TEMPLATES_ONLY #include "realm/deppart/preimage_gpu_kernels.hpp" @@ -48,14 +46,6 @@ namespace Realm { #define N1 INST_N1 #define N2 INST_N2 - // Replace MyBitmask with whatever bitmask‐type you actually use - // (it must have an `as_vector.rects` member that your code touches). - // - // This explicitly instantiates: - // template void - // ImageMicroOp::gpu_populate_bitmasks_ptrs( - // std::map&); - // #define DO_DOUBLE(T1,T2) \ template class GPUPreimageMicroOp; \ template class PreimageMicroOp; diff --git a/src/realm/deppart/sparsity_impl.cc b/src/realm/deppart/sparsity_impl.cc index b4938edb3b..a1a511b744 100644 --- a/src/realm/deppart/sparsity_impl.cc +++ b/src/realm/deppart/sparsity_impl.cc @@ -883,6 +883,334 @@ namespace Realm { } } + template + int SparsityMapPublicImpl::choose_bvh_split_axis( + const std::vector& entry_ids, + size_t lo, size_t hi) const + { + assert(lo < hi); + + Rect bbox = entries[entry_ids[lo]].bounds; + for(size_t i = lo + 1; i < hi; i++) + bbox = bbox.union_bbox(entries[entry_ids[i]].bounds); + + int split_axis = 0; + long double best_extent = + static_cast(bbox.hi[0]) - static_cast(bbox.lo[0]); + + for(int d = 1; d < N; d++) { + long double extent = + static_cast(bbox.hi[d]) - static_cast(bbox.lo[d]); + if(extent > best_extent) { + best_extent = extent; + split_axis = d; + } + } + + return split_axis; + } + + template +bool SparsityMapPublicImpl::bvh_centroid_less(int axis, + uint32_t a, + uint32_t b) const + { + const Rect& ra = entries[a].bounds; + const Rect& rb = entries[b].bounds; + + // comparing (lo + hi) is equivalent to comparing centroids along the axis + const auto sa = ra.lo[axis] + ra.hi[axis]; + const auto sb = rb.lo[axis] + rb.hi[axis]; + if(sa != sb) + return (sa < sb); + + // deterministic tie-break + for(int i = 0; i < N; i++) { + if(ra.lo[i] != rb.lo[i]) return (ra.lo[i] < rb.lo[i]); + if(ra.hi[i] 
!= rb.hi[i]) return (ra.hi[i] < rb.hi[i]); + } + + return (a < b); + } + + template + int SparsityMapPublicImpl::build_bvh_subtree(CPU_BVH& bvh, + std::vector& entry_ids, + size_t lo, + size_t hi) const + { + assert(lo < hi); + + // leaf: exactly one sparsity-map entry + if((hi - lo) == 1) { + const uint32_t entry_idx = entry_ids[lo]; + const uint32_t leaf_slot = static_cast(bvh.leaf_entries.size()); + bvh.leaf_entries.push_back(entry_idx); + + typename CPU_BVH::Node node; + node.bounds = entries[entry_idx].bounds; + node.left = -1; + node.right = -1; + node.begin = leaf_slot; + node.end = leaf_slot + 1; + + const int node_idx = static_cast(bvh.nodes.size()); + bvh.nodes.push_back(node); + return node_idx; + } + + const int split_axis = choose_bvh_split_axis(entry_ids, lo, hi); + const size_t mid = lo + ((hi - lo) >> 1); + + std::nth_element(entry_ids.begin() + lo, + entry_ids.begin() + mid, + entry_ids.begin() + hi, + [this, split_axis](uint32_t a, uint32_t b) { + return bvh_centroid_less(split_axis, a, b); + }); + + const int left_idx = build_bvh_subtree(bvh, entry_ids, lo, mid); + const int right_idx = build_bvh_subtree(bvh, entry_ids, mid, hi); + + typename CPU_BVH::Node node; + node.left = left_idx; + node.right = right_idx; + node.begin = bvh.nodes[left_idx].begin; + node.end = bvh.nodes[right_idx].end; + node.bounds = bvh.nodes[left_idx].bounds.union_bbox(bvh.nodes[right_idx].bounds); + + const int node_idx = static_cast(bvh.nodes.size()); + bvh.nodes.push_back(node); + return node_idx; + } + + template + void SparsityMapPublicImpl::request_bvh(void) + { + // fast path + if(bvh_valid.load_acquire()) + return; + + // the BVH indexes the entry list, so entries must already exist + if(!entries_valid.load_acquire()) + assert(false); + + if (from_gpu) { + auto gpu_entries = get_entries(); + entries = std::vector>(gpu_entries.data(), gpu_entries.data() + gpu_entries.size()); + } + + std::lock_guard lock(bvh_mutex); + + // somebody else may have built it while we 
were waiting + if(bvh_valid.load()) + return; + + CPU_BVH new_bvh; + new_bvh.clear(); + + const size_t count = entries.size(); + + // empty sparsity map: publish an empty-but-valid BVH + if(count == 0) { + entries_bvh = std::move(new_bvh); + bvh_valid.store_release(true); + return; + } + + // one leaf per sparsity-map entry + std::vector entry_ids(count); + for(uint32_t i = 0; i < count; i++) { + assert(!entries[i].sparsity.exists() && (entries[i].bitmap == 0)); + entry_ids[i] = i; + } + + // exact upper bounds for a binary tree with one entry per leaf + new_bvh.nodes.reserve((2 * count) - 1); + new_bvh.leaf_entries.reserve(count); + + new_bvh.root = build_bvh_subtree(new_bvh, entry_ids, 0, count); + + // publish only after construction is complete + entries_bvh = std::move(new_bvh); + bvh_valid.store_release(true); + } + + template bool SparsityMapPublicImpl::has_bvh() const + { + return bvh_valid.load_acquire(); + } + + + template + bool CPU_BVH::contains(const span>& entries, + const Point& p) const + { + if(!valid()) + return false; + + // Root bbox reject. + if(!nodes[root].bounds.contains(p)) + return false; + + std::vector stack; + stack.reserve(64); + stack.push_back(root); + + while(!stack.empty()) { + const int node_idx = stack.back(); + stack.pop_back(); + + const Node& node = nodes[node_idx]; + if(!node.bounds.contains(p)) + continue; + + if(node.is_leaf()) { + // Leaves currently correspond to exactly one entry, but use the range + // to keep the code compatible with future small-bucket leaves. + for(uint32_t i = node.begin; i < node.end; i++) { + const uint32_t entry_idx = leaf_entries[i]; + const SparsityMapEntry& entry = entries[entry_idx]; + + if(!entry.bounds.contains(p)) + continue; + + if(entry.sparsity.exists()) { + assert(0); + } else if(entry.bitmap != 0) { + assert(0); + } else { + return true; + } + } + } else { + // Push children whose bbox might still contain the point. 
+ const int left = node.left; + const int right = node.right; + + if((right >= 0) && nodes[right].bounds.contains(p)) + stack.push_back(right); + if((left >= 0) && nodes[left].bounds.contains(p)) + stack.push_back(left); + } + } + + return false; + } + + template + bool CPU_BVH::contains_any(const span>& entries, + const Rect& r) const + { + if(!valid()) + return false; + + // Root bbox reject. + if(!nodes[root].bounds.overlaps(r)) + return false; + + std::vector stack; + stack.reserve(64); + stack.push_back(root); + + while(!stack.empty()) { + const int node_idx = stack.back(); + stack.pop_back(); + + const Node& node = nodes[node_idx]; + if(!node.bounds.overlaps(r)) + continue; + + if(node.is_leaf()) { + for(uint32_t i = node.begin; i < node.end; i++) { + const uint32_t entry_idx = leaf_entries[i]; + const SparsityMapEntry& entry = entries[entry_idx]; + + if(!entry.bounds.overlaps(r)) + continue; + + if(entry.sparsity.exists()) { + assert(0); + } else if(entry.bitmap != 0) { + assert(0); + } else { + return true; + } + } + } else { + const int left = node.left; + const int right = node.right; + + if((right >= 0) && nodes[right].bounds.overlaps(r)) + stack.push_back(right); + if((left >= 0) && nodes[left].bounds.overlaps(r)) + stack.push_back(left); + } + } + + return false; + } + + template + bool CPU_BVH::contains_all(const span>& entries, + const Rect& r) const + { + if(!valid()) + return false; + + // Root bbox reject. 
+ if(!nodes[root].bounds.contains(r)) + return false; + + size_t total_volume = 0; + + std::vector stack; + stack.reserve(64); + stack.push_back(root); + + while(!stack.empty()) { + const int node_idx = stack.back(); + stack.pop_back(); + + const Node& node = nodes[node_idx]; + if(!node.bounds.overlaps(r)) + continue; + + if(node.is_leaf()) { + for(uint32_t i = node.begin; i < node.end; i++) { + const uint32_t entry_idx = leaf_entries[i]; + const SparsityMapEntry& entry = entries[entry_idx]; + + if(!entry.bounds.overlaps(r)) + continue; + + if(entry.sparsity.exists()) { + assert(0); + } else if(entry.bitmap != 0) { + assert(0); + } else { + Rect isect = entry.bounds.intersection(r); + total_volume += isect.volume(); + + // Early out as soon as we know we've covered enough. + if(total_volume >= r.volume()) + return true; + } + } + } else { + const int left = node.left; + const int right = node.right; + + if((right >= 0) && nodes[right].bounds.overlaps(r)) + stack.push_back(right); + if((left >= 0) && nodes[left].bounds.overlaps(r)) + stack.push_back(left); + } + } + + return (total_volume >= r.volume()); + } + //////////////////////////////////////////////////////////////////////// // // class SparsityMapImpl @@ -2036,7 +2364,8 @@ SparsityMapImpl::~SparsityMapImpl(void) #define DOIT(N, T) \ template class SparsityMapPublicImpl; \ template class SparsityMapImpl; \ - template class SparsityMap; + template class SparsityMap; \ + template struct CPU_BVH; FOREACH_NT(DOIT) }; // namespace Realm diff --git a/src/realm/indexspace.inl b/src/realm/indexspace.inl index d2c41e4c4e..b55e8b1aee 100644 --- a/src/realm/indexspace.inl +++ b/src/realm/indexspace.inl @@ -613,6 +613,11 @@ namespace Realm { } return true; } else { + + if (impl->has_bvh()) { + return impl->entries_bvh.contains(entries, p); + } + for(size_t i = 0; i < entries.size(); i++) { SparsityMapEntry entry = entries[i]; if(!entry.bounds.contains(p)) { @@ -639,30 +644,34 @@ namespace Realm { if(!bounds.contains(r)) 
return false; - if(!dense()) { - // test against sparsity map too - size_t total_volume = 0; - SparsityMapPublicImpl *impl = sparsity.impl(); - span> entries = impl->get_entries(); - for(size_t i = 0; i < entries.size(); i++) { - SparsityMapEntry entry = entries[i]; - if(!entry.bounds.overlaps(r)) continue; - if(entry.sparsity.exists()) { - assert(0); - } else if(entry.bitmap != 0) { - assert(0); - } else { - Rect isect = entry.bounds.intersection(r); - total_volume += isect.volume(); - } - } + if(dense()) { + return true; + } + // test against sparsity map too + size_t total_volume = 0; + SparsityMapPublicImpl *impl = sparsity.impl(); + span> entries = impl->get_entries(); - // did we miss anything? - if(total_volume < r.volume()) - return false; + if(impl->has_bvh()) { + return impl->entries_bvh.contains_all(entries, r); } - return true; + for(size_t i = 0; i < entries.size(); i++) { + SparsityMapEntry entry = entries[i]; + if(!entry.bounds.overlaps(r)) + continue; + if(entry.sparsity.exists()) { + assert(0); + } else if(entry.bitmap != 0) { + assert(0); + } else { + Rect isect = entry.bounds.intersection(r); + total_volume += isect.volume(); + } + } + + // did we miss anything? 
+ return (total_volume == r.volume()); } template @@ -672,26 +681,31 @@ namespace Realm { if(!bounds.overlaps(r)) return false; - if(!dense()) { - // test against sparsity map too - SparsityMapPublicImpl *impl = sparsity.impl(); - span> entries = impl->get_entries(); - for(size_t i = 0; i < entries.size(); i++) { - SparsityMapEntry entry = entries[i]; - if(!entry.bounds.overlaps(r)) continue; - if(entry.sparsity.exists()) { - assert(0); - } else if(entry.bitmap != 0) { - assert(0); - } else { - return true; - } + if(dense()) { + return true; + } + // test against sparsity map too + SparsityMapPublicImpl *impl = sparsity.impl(); + span> entries = impl->get_entries(); + + if(impl->has_bvh()) { + return impl->entries_bvh.contains_any(entries, r); + } + + for(size_t i = 0; i < entries.size(); i++) { + SparsityMapEntry entry = entries[i]; + if(!entry.bounds.overlaps(r)) + continue; + if(entry.sparsity.exists()) { + assert(0); + } else if(entry.bitmap != 0) { + assert(0); + } else { + return true; } - - return false; } - return true; + return false; } template diff --git a/src/realm/sparsity.h b/src/realm/sparsity.h index bf46284c6d..dc9fe74300 100644 --- a/src/realm/sparsity.h +++ b/src/realm/sparsity.h @@ -30,6 +30,7 @@ #include "realm/atomics.h" #include +#include #include /** @@ -153,6 +154,44 @@ namespace Realm { HierarchicalBitMap *bitmap; }; + template + struct CPU_BVH { + struct Node { + Rect bounds; + int left = -1; + int right = -1; + + // range in leaf_entries covered by this subtree + uint32_t begin = 0; + uint32_t end = 0; + + bool is_leaf() const { return left < 0; } + }; + + std::vector nodes; + std::vector leaf_entries; + int root = -1; + + bool valid() const { + return root >= 0; + } + + void clear() { + nodes.clear(); + leaf_entries.clear(); + root = -1; + } + + bool contains(const span>& entries, + const Point& p) const; + + bool contains_any(const span>& entries, + const Rect& r) const; + + bool contains_all(const span>& entries, + const Rect& r) 
const; + }; + template REALM_PUBLIC_API std::ostream &operator<<(std::ostream &os, const SparsityMapEntry &entry); @@ -173,6 +212,12 @@ namespace Realm { // cannot be constructed directly SparsityMapPublicImpl(void); + int choose_bvh_split_axis(const std::vector& entry_ids, + size_t lo, size_t hi) const; + bool bvh_centroid_less(int axis, uint32_t a, uint32_t b) const; + int build_bvh_subtree(CPU_BVH &bvh, std::vector &entry_ids, + size_t lo, size_t hi) const; + public: /** * Make this sparsity map valid. @@ -244,8 +289,27 @@ namespace Realm { bool compute_covering(const Rect &bounds, size_t max_rects, int max_overhead, std::vector> &covering); + /** + * If this sparsity map doesn't already have an acceleration structure, + * build a BVH over the entries. + */ + REALM_PUBLIC_API + void request_bvh(void); + + /** + * Determine whether this sparsity map has an acceleration structure. + * @return true if the sparsity map has a valid bvh, false otherwise + */ + bool has_bvh() const; + + CPU_BVH entries_bvh; + + + protected: - atomic entries_valid{false}, approx_valid{false}; + atomic entries_valid{false}, approx_valid{false}, bvh_valid{false}; + + std::mutex bvh_mutex; //BOTH RegionInstance and vector are returned as a span //only on can be valid (i.e. 
only finalize or gpu_finalize can be called, not both) diff --git a/tests/benchmark.cc b/tests/benchmark.cc index 9615a3bcbc..6b78151d68 100644 --- a/tests/benchmark.cc +++ b/tests/benchmark.cc @@ -42,16 +42,8 @@ enum TOP_LEVEL_TASK = Processor::TASK_ID_FIRST_AVAILABLE + 0, INIT_BYFIELD_DATA_TASK, INIT_IMAGE_DATA_TASK, -}; - -enum TestType { - BYFIELD = 0 -}; - -enum TransformType -{ - AFFINE = 0, - TRANSLATION = 1, + INIT_IMAGE_RANGE_DATA_TASK, + INIT_PREIMAGE_DATA_TASK }; namespace std { @@ -108,8 +100,6 @@ namespace { bool skip_check = false; int dimension1 = 1; int dimension2 = 1; - TestType test_type = BYFIELD; - TestInterface *testcfg = 0; }; // namespace @@ -149,6 +139,31 @@ Event alloc_piece(RegionInstance &result, size_t size, Memory location) { return RegionInstance::create_instance(result, location, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()); } +template +IndexSpace create_sparse_index_space(const Rect &bounds, size_t sparse_factor, + bool randomize, size_t idx) +{ + std::vector> points; + for(PointInRectIterator it(bounds); it.valid; it.step()) { + size_t flattened = idx * bounds.volume(); + size_t stride = 1; + for (int d = 0; d < N; d++) { + flattened += (it.p[d] - bounds.lo[d]) * stride; + stride *= (bounds.hi[d] - bounds.lo[d] + 1); + } + if(randomize) { + if(Philox_2x32<>::rand_int(random_seed, flattened, 0, sparse_factor) == 0) { + points.push_back(it.p); + } + } else { + if(flattened % sparse_factor == 0) { + points.push_back(it.p); + } + } + } + return IndexSpace(points, true); +} + /* * Byfield test - create a graph, partition it by * node subgraph id and then check that the partitioning @@ -438,7 +453,8 @@ class ImageTest : public TestInterface { // graph config parameters int num_nodes = 1000; int num_edges = 1000; - int num_sources = 4; + int sparse_factor = 4; + int num_spaces = 4; int num_pieces = 4; std::string filename; @@ -459,14 +475,18 @@ class ImageTest : public TestInterface { continue; } if(!strcmp(argv[i], 
"-s")) { - num_sources = atoi(argv[++i]); + num_spaces = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-f")) { + sparse_factor = atoi(argv[++i]); continue; } } - if (num_nodes <= 0 || num_pieces <= 0 || num_edges <= 0 || num_sources <= 0) { - log_app.error() << "Invalid config: nodes=" << num_nodes << " colors=" << num_edges << " pieces=" << num_pieces << " sources=" << num_sources << "\n"; + if (num_nodes <= 0 || num_pieces <= 0 || num_edges <= 0 || num_spaces <= 0) { + log_app.error() << "Invalid config: nodes=" << num_nodes << " colors=" << num_edges << " pieces=" << num_pieces << " sources=" << num_spaces << "\n"; exit(1); } } @@ -540,8 +560,8 @@ class ImageTest : public TestInterface { virtual void print_info(void) { - printf("Realm %dD -> %dD Image dependent partitioning test: %d nodes, %d edges, %d pieces ,%d sources\n", (int) N2, (int) N1, - (int)num_nodes, (int) num_edges, (int)num_pieces, (int) num_sources); + printf("Realm %dD -> %dD Image dependent partitioning test: %d nodes, %d edges, %d pieces ,%d sources, %d sparse factor\n", (int) N2, (int) N1, + (int)num_nodes, (int) num_edges, (int)num_pieces, (int) num_spaces, (int) sparse_factor); } virtual Event initialize_data(const std::vector &memories, @@ -615,8 +635,13 @@ class ImageTest : public TestInterface { // Ensure that the results are identical std::vector> sources(num_pieces); - for(int i = 0; i < num_sources; i++) - sources[i] = point_field_data[i % num_pieces].index_space; + for(int i = 0; i < num_spaces; i++) { + if (sparse_factor <= 1) { + sources[i] = point_field_data[i % num_pieces].index_space; + } else { + sources[i] = create_sparse_index_space(is_nodes.bounds, sparse_factor, random_colors, i); + } + } // We need a GPU memory for GPU partitioning Memory gpu_memory; @@ -648,7 +673,7 @@ class ImageTest : public TestInterface { } std::vector> image_inputs(num_pieces); - std::vector> image_subspaces(num_sources); + std::vector> image_subspaces(num_spaces); std::vector 
image_requirements(num_pieces); for (int i = 0; i < num_pieces; i++) { @@ -656,7 +681,7 @@ class ImageTest : public TestInterface { image_inputs[i].space = point_field_data_gpu[i].index_space; } - for (int i = 0; i < num_sources; i++) { + for (int i = 0; i < num_spaces; i++) { image_subspaces[i].space = sources[i]; image_subspaces[i].entries = sources[i].dense() ? 1 : sources[i].sparsity.impl()->get_entries().size(); } @@ -730,187 +755,873 @@ class ImageTest : public TestInterface { } }; -void top_level_task(const void *args, size_t arglen, const void *userdata, size_t userlen, - Processor p) -{ - int errors = 0; - - testcfg->print_info(); - - // find all the system memories - we'll stride our data across them - // for each memory, we'll need one CPU that can do the initialization of the data - std::vector sysmems; - std::vector procs; +template +class ImageRangeTest : public TestInterface { +public: + // graph config parameters + int num_nodes = 1000; + int num_edges = 1000; + int rect_size = 10; + int num_spaces = 4; + int num_pieces = 4; + int sparse_factor = 4; + std::string filename; - Machine machine = Machine::get_machine(); + ImageRangeTest(int argc, const char *argv[]) { - std::set all_memories; - machine.get_all_memories(all_memories); - for(std::set::const_iterator it = all_memories.begin(); - it != all_memories.end(); it++) { - Memory m = *it; + for(int i = 1; i < argc; i++) { - // skip memories with no capacity for creating instances - if(m.capacity() == 0) + if(!strcmp(argv[i], "-p")) { + num_pieces = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-n")) { + num_nodes = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-e")) { + num_edges = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-r")) { + rect_size = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-s")) { + num_spaces = atoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "-f")) { + sparse_factor = atoi(argv[++i]); continue; - - if(m.kind() == 
Memory::SYSTEM_MEM) { - sysmems.push_back(m); - std::set pset; - machine.get_shared_processors(m, pset); - Processor p = Processor::NO_PROC; - for(std::set::const_iterator it2 = pset.begin(); it2 != pset.end(); - it2++) { - if(it2->kind() == Processor::LOC_PROC) { - p = *it2; - break; - } - } - assert(p.exists()); - procs.push_back(p); - log_app.debug() << "System mem #" << (sysmems.size() - 1) << " = " - << *sysmems.rbegin() << " (" << *procs.rbegin() << ")"; } } - } - assert(sysmems.size() > 0); - { - Realm::TimeStamp ts("initialization", true, &log_app); - Event e = testcfg->initialize_data(sysmems, procs); - // wait for all initialization to be done - e.wait(); + if (num_nodes <= 0 || num_pieces <= 0 || num_edges <= 0 || num_spaces <= 0 || rect_size <= 0) { + log_app.error() << "Invalid config: nodes=" << num_nodes << " colors=" << num_edges << " pieces=" << num_pieces << " sources=" << num_spaces << " rect size=" << rect_size << "\n"; + exit(1); + } } - // now actual partitioning work - { - Realm::TimeStamp ts("dependent partitioning work", true, &log_app); - - Event e = testcfg->perform_partitioning(); - - e.wait(); - } + struct InitDataArgs { + int index; + RegionInstance ri_nodes; + }; - // dynamic checks (which would be eliminated by compiler) + enum PRNGStreams { - Realm::TimeStamp ts("dynamic checks", true, &log_app); - errors += testcfg->perform_dynamic_checks(); - } + NODE_SUBGRAPH_STREAM, + }; - if(!skip_check) { - log_app.print() << "checking correctness of partitioning"; - Realm::TimeStamp ts("verification", true, &log_app); - errors += testcfg->check_partitioning(); + // assign subgraph ids to nodes + void chase_rect(int idx, Rect& color) + { + for (int d = 0; d < N1; d++) { + if(random_colors) { + color.lo[d] = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_edges); + color.hi[d] = color.lo[d] + Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, 2 * rect_size); + } else { + color.lo[d] = (idx * num_edges / 
num_nodes) % num_edges; + color.hi[d] = color.lo[d] + rect_size; + } + } } - if(errors > 0) { - printf("Exiting with errors\n"); - exit(1); + static void init_data_task_wrapper(const void *args, size_t arglen, + const void *userdata, size_t userlen, Processor p) + { + ImageRangeTest *me = (ImageRangeTest *)testcfg; + me->init_data_task(args, arglen, p); } - printf("all done!\n"); -} - -// Constructor function-pointer type -using CtorFn = TestInterface* (*)(int, const char** argv); - -// ---- Byfield constructors ---- -template -static TestInterface* make_byfield(int argc, const char** argv) { - return new ByfieldTest(argc, argv); -} + //Each piece has a task to initialize its data + void init_data_task(const void *args, size_t arglen, Processor p) + { + const InitDataArgs &i_args = *(const InitDataArgs *)args; -static constexpr CtorFn BYFIELD_CTORS[3] = { - &make_byfield<1>, - &make_byfield<2>, - &make_byfield<3>, -}; + log_app.info() << "init task #" << i_args.index << " (ri_nodes=" << i_args.ri_nodes + << ")"; -// ---- Image constructors ---- -template -static TestInterface* make_image(int argc, const char** argv) { - return new ImageTest(argc, argv); -} + i_args.ri_nodes.fetch_metadata(p).wait(); -static constexpr CtorFn IMAGE_CTORS[3][3] = { - { &make_image<1,1>, &make_image<1,2>, &make_image<1,3> }, - { &make_image<2,1>, &make_image<2,2>, &make_image<2,3> }, - { &make_image<3,1>, &make_image<3,2>, &make_image<3,3> }, -}; + IndexSpace nodes_space = i_args.ri_nodes.template get_indexspace(); -using TaskWrapperFn = void (*)(const void*, size_t, const void*, size_t, Processor); + log_app.debug() << "N: " << is_nodes; -static constexpr TaskWrapperFn BYFIELD_INIT_TBL[3] = { - &ByfieldTest<1>::init_data_task_wrapper, - &ByfieldTest<2>::init_data_task_wrapper, - &ByfieldTest<3>::init_data_task_wrapper, -}; + //For each node in the graph, mark it with a random (or deterministic) subgraph id + { + AffineAccessor, N2> a_rect(i_args.ri_nodes, 0 /* offset */); -static 
constexpr TaskWrapperFn IMAGE_INIT_TBL[3][3] = { - { &ImageTest<1,1>::init_data_task_wrapper, &ImageTest<1,2>::init_data_task_wrapper, &ImageTest<1,3>::init_data_task_wrapper }, - { &ImageTest<2,1>::init_data_task_wrapper, &ImageTest<2,2>::init_data_task_wrapper, &ImageTest<2,3>::init_data_task_wrapper }, - { &ImageTest<3,1>::init_data_task_wrapper, &ImageTest<3,2>::init_data_task_wrapper, &ImageTest<3,3>::init_data_task_wrapper }, -}; + for (IndexSpaceIterator it(is_nodes); it.valid; it.step()) { + for (PointInRectIterator point(it.rect); point.valid; point.step()) { + int idx = 0; + int stride = 1; + for (int d = 0; d < N2; d++) { + idx += (point.p[d] - is_nodes.bounds.lo[d]) * stride; + stride *= (is_nodes.bounds.hi[d] - is_nodes.bounds.lo[d] + 1); + } + Rect destination; + chase_rect(idx, destination); + a_rect.write(point.p, destination); + } + } + } + } -int main(int argc, char **argv) -{ - Runtime rt; + IndexSpace is_nodes; + IndexSpace is_edges; + std::vector ri_nodes; + std::vector, Rect> > rect_field_data; - rt.init(&argc, &argv); + virtual void print_info(void) + { + printf("Realm %dD -> %dD Image Range dependent partitioning test: %d nodes, %d edges, %d pieces ,%d sources, %d rect size, %d sparse factor\n", (int) N2, (int) N1, + (int)num_nodes, (int) num_edges, (int)num_pieces, (int) num_spaces, (int) rect_size, (int) sparse_factor); + } - // parse global options - for(int i = 1; i < argc; i++) { - if(!strcmp(argv[i], "-seed")) { - random_seed = atoi(argv[++i]); - continue; + virtual Event initialize_data(const std::vector &memories, + const std::vector &procs) + { + // now create index space for nodes + Point node_lo, node_hi; + for (int d = 0; d < N2; d++) { + node_lo[d] = 0; + node_hi[d] = num_nodes - 1; } + is_nodes = Rect(node_lo, node_hi); - if(!strcmp(argv[i], "-random")) { - random_colors = true; - continue; + Point edge_lo, edge_hi; + for (int d = 0; d < N1; d++) { + edge_lo[d] = 0; + edge_hi[d] = num_edges - 1; } + is_edges = Rect(edge_lo, 
edge_hi); - if(!strcmp(argv[i], "-wait")) { - wait_on_events = true; - continue; - } + // equal partition is used to do initial population of edges and nodes + std::vector > ss_nodes_eq; - if(!strcmp(argv[i], "-show")) { - show_graph = true; - continue; - } + log_app.info() << "Creating equal subspaces\n"; - if(!strcmp(argv[i], "-nocheck")) { - skip_check = true; - continue; - } + is_nodes.create_equal_subspaces(num_pieces, 1, ss_nodes_eq, Realm::ProfilingRequestSet()).wait(); - if(!strcmp(argv[i], "-d1")) { - dimension1 = atoi(argv[++i]); - continue; - } + // create instances for each of these subspaces + std::vector node_fields; + node_fields.push_back(sizeof(Rect)); - if(!strcmp(argv[i], "-d2")) { - dimension2 = atoi(argv[++i]); - continue; - } + ri_nodes.resize(num_pieces); + rect_field_data.resize(num_pieces); - if(!strcmp(argv[i], "byfield")) { - if (dimension1 < 1 || dimension1 > 3) - assert(false && "invalid dimension"); + for(size_t i = 0; i < ss_nodes_eq.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, memories[i % memories.size()], ss_nodes_eq[i], + node_fields, 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + ri_nodes[i] = ri; - testcfg = BYFIELD_CTORS[dimension1 - 1](argc - i, const_cast(argv + i)); - break; + rect_field_data[i].index_space = ss_nodes_eq[i]; + rect_field_data[i].inst = ri_nodes[i]; + rect_field_data[i].field_offset = 0; } - if(!strcmp(argv[i], "image")) { + // fire off tasks to initialize data + std::set events; + for(int i = 0; i < num_pieces; i++) { + Processor p = procs[i % procs.size()]; + InitDataArgs args; + args.index = i; + args.ri_nodes = ri_nodes[i]; + Event e = p.spawn(INIT_IMAGE_RANGE_DATA_TASK, &args, sizeof(args)); + events.insert(e); + } + + return Event::merge_events(events); + } + + // the outputs of our partitioning will be: + // p_nodes - nodes partitioned by subgraph id (from GPU) + // p_nodes_cpu - nodes partitioned by subgraph id (from CPU) + + + std::vector > p_edges, p_garbage_edges, 
p_edges_cpu; + + virtual Event perform_partitioning(void) + { + // Partition nodes by subgraph id - do this twice, once on CPU and once on GPU + // Ensure that the results are identical + + std::vector> sources(num_spaces); + for(int i = 0; i < num_spaces; i++) { + if (sparse_factor <= 1) { + sources[i] = rect_field_data[i % num_pieces].index_space; + } else { + sources[i] = create_sparse_index_space(is_nodes.bounds, sparse_factor, random_colors, i); + } + } + + // We need a GPU memory for GPU partitioning + Memory gpu_memory; + bool found_gpu_memory = false; + Machine machine = Machine::get_machine(); + std::set all_memories; + machine.get_all_memories(all_memories); + for(Memory memory : all_memories) { + if(memory.kind() == Memory::GPU_FB_MEM) { + gpu_memory = memory; + found_gpu_memory = true; + break; + } + } + if (!found_gpu_memory) { + log_app.error() << "No GPU memory found for partitioning test\n"; + return Event::NO_EVENT; + } + + + std::vector node_fields; + node_fields.push_back(sizeof(Rect)); + + std::vector, Rect>> rect_field_data_gpu; + rect_field_data_gpu.resize(num_pieces); + + for (int i = 0; i < num_pieces; i++) { + copy_piece(rect_field_data[i], rect_field_data_gpu[i], node_fields, 0, gpu_memory).wait(); + } + + std::vector> image_inputs(num_pieces); + std::vector> image_subspaces(num_spaces); + std::vector image_requirements(num_pieces); + + for (int i = 0; i < num_pieces; i++) { + image_inputs[i].location = rect_field_data_gpu[i].inst.get_location(); + image_inputs[i].space = rect_field_data_gpu[i].index_space; + } + + for (int i = 0; i < num_spaces; i++) { + image_subspaces[i].space = sources[i]; + image_subspaces[i].entries = sources[i].dense() ? 
1 : sources[i].sparsity.impl()->get_entries().size(); + } + + is_edges.by_image_buffer_requirements(image_subspaces, image_inputs, image_requirements); + + for (int i = 0; i < num_pieces; i++) { + alloc_piece(rect_field_data_gpu[i].scratch_buffer, image_requirements[i].upper_bound, gpu_memory).wait(); + } + + wait_on_events = true; + log_app.info() << "warming up" << Clock::current_time_in_microseconds() << "\n"; + Event warmup = is_edges.create_subspaces_by_image(rect_field_data_gpu, + sources, + p_garbage_edges, + Realm::ProfilingRequestSet()); + warmup.wait(); + + Event gpu_call = is_edges.create_subspaces_by_image(rect_field_data_gpu, + sources, + p_edges, + Realm::ProfilingRequestSet()); + + Event cpu_call = is_edges.create_subspaces_by_image(rect_field_data, + sources, + p_edges_cpu, + Realm::ProfilingRequestSet()); + + return Event::merge_events({gpu_call, cpu_call}); + + } + + virtual int perform_dynamic_checks(void) + { + // Nothing to do here + return 0; + } + + virtual int check_partitioning(void) + { + int errors = 0; + + if (!p_edges.size()) { + return p_edges.size() == p_edges_cpu.size(); + } + + log_app.info() << "Checking correctness of partitioning " << "\n"; + + for(int i = 0; i < num_spaces; i++) { + for(IndexSpaceIterator it(p_edges[i]); it.valid; it.step()) { + for(PointInRectIterator point(it.rect); point.valid; point.step()) { + if (!p_edges_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU has extra image point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for(IndexSpaceIterator it(p_edges_cpu[i]); it.valid; it.step()) { + for(PointInRectIterator point(it.rect); point.valid; point.step()) { + if (!p_edges[i].contains(point.p)) { + log_app.error() << "Mismatch! 
GPU is missing image point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + + } + return errors; + } +}; + +template +class PreimageTest : public TestInterface { +public: + // graph config parameters + int num_nodes = 1000; + int num_edges = 1000; + int num_spaces = 4; + int num_pieces = 4; + int sparse_factor = 4; + std::string filename; + + PreimageTest(int argc, const char *argv[]) + { + for(int i = 1; i < argc; i++) { + + if(!strcmp(argv[i], "-p")) { + num_pieces = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-n")) { + num_nodes = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-e")) { + num_edges = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-s")) { + num_spaces = atoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "-f")) { + sparse_factor = atoi(argv[++i]); + continue; + } + } + + + if (num_nodes <= 0 || num_pieces <= 0 || num_edges <= 0 || num_spaces <= 0) { + log_app.error() << "Invalid config: nodes=" << num_nodes << " colors=" << num_edges << " pieces=" << num_pieces << " targets=" << num_spaces << "\n"; + exit(1); + } + } + + struct InitDataArgs { + int index; + RegionInstance ri_nodes; + }; + + enum PRNGStreams + { + NODE_SUBGRAPH_STREAM, + }; + + // assign subgraph ids to nodes + void chase_point(int idx, Point& color) + { + for (int d = 0; d < N2; d++) { + if(random_colors) + color[d] = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_edges); + else + color[d] = (idx * num_edges / num_nodes) % num_edges; + } + } + + static void init_data_task_wrapper(const void *args, size_t arglen, + const void *userdata, size_t userlen, Processor p) + { + PreimageTest *me = (PreimageTest *)testcfg; + me->init_data_task(args, arglen, p); + } + + //Each piece has a task to initialize its data + void init_data_task(const void *args, size_t arglen, Processor p) + { + const InitDataArgs &i_args = *(const InitDataArgs *)args; + + log_app.info() << "init task #" << i_args.index << " (ri_nodes=" << 
i_args.ri_nodes + << ")"; + + i_args.ri_nodes.fetch_metadata(p).wait(); + + IndexSpace nodes_space = i_args.ri_nodes.template get_indexspace(); + + log_app.debug() << "N: " << is_nodes; + + //For each node in the graph, mark it with a random (or deterministic) subgraph id + { + AffineAccessor, N1> a_point(i_args.ri_nodes, 0 /* offset */); + + for (IndexSpaceIterator it(is_nodes); it.valid; it.step()) { + for (PointInRectIterator point(it.rect); point.valid; point.step()) { + int idx = 0; + int stride = 1; + for (int d = 0; d < N1; d++) { + idx += (point.p[d] - is_nodes.bounds.lo[d]) * stride; + stride *= (is_nodes.bounds.hi[d] - is_nodes.bounds.lo[d] + 1); + } + Point destination; + chase_point(idx, destination); + a_point.write(point.p, destination); + } + } + } + } + + IndexSpace is_nodes; + IndexSpace is_edges; + std::vector ri_nodes; + std::vector, Point> > point_field_data; + + virtual void print_info(void) + { + printf("Realm %dD -> %dD Preimage dependent partitioning test: %d nodes, %d edges, %d pieces ,%d targets, %d sparse factor\n", (int) N1, (int) N2, + (int)num_nodes, (int) num_edges, (int)num_pieces, (int) num_spaces, (int) sparse_factor); + } + + virtual Event initialize_data(const std::vector &memories, + const std::vector &procs) + { + // now create index space for nodes + Point node_lo, node_hi; + for (int d = 0; d < N1; d++) { + node_lo[d] = 0; + node_hi[d] = num_nodes - 1; + } + is_nodes = Rect(node_lo, node_hi); + + Point edge_lo, edge_hi; + for (int d = 0; d < N2; d++) { + edge_lo[d] = 0; + edge_hi[d] = num_edges - 1; + } + is_edges = Rect(edge_lo, edge_hi); + + // equal partition is used to do initial population of edges and nodes + std::vector > ss_nodes_eq; + + log_app.info() << "Creating equal subspaces\n"; + + is_nodes.create_equal_subspaces(num_pieces, 1, ss_nodes_eq, Realm::ProfilingRequestSet()).wait(); + + // create instances for each of these subspaces + std::vector node_fields; + node_fields.push_back(sizeof(Point)); + + 
ri_nodes.resize(num_pieces); + point_field_data.resize(num_pieces); + + for(size_t i = 0; i < ss_nodes_eq.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, memories[i % memories.size()], ss_nodes_eq[i], + node_fields, 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + ri_nodes[i] = ri; + + point_field_data[i].index_space = ss_nodes_eq[i]; + point_field_data[i].inst = ri_nodes[i]; + point_field_data[i].field_offset = 0; + } + + // fire off tasks to initialize data + std::set events; + for(int i = 0; i < num_pieces; i++) { + Processor p = procs[i % procs.size()]; + InitDataArgs args; + args.index = i; + args.ri_nodes = ri_nodes[i]; + Event e = p.spawn(INIT_PREIMAGE_DATA_TASK, &args, sizeof(args)); + events.insert(e); + } + + return Event::merge_events(events); + } + + // the outputs of our partitioning will be: + // p_nodes - nodes partitioned by subgraph id (from GPU) + // p_nodes_cpu - nodes partitioned by subgraph id (from CPU) + + + std::vector > p_nodes, p_garbage_nodes, p_nodes_cpu; + + virtual Event perform_partitioning(void) + { + // Partition nodes by subgraph id - do this twice, once on CPU and once on GPU + // Ensure that the results are identical + + std::vector> targets; + if (sparse_factor <= 1) { + is_edges.create_equal_subspaces(num_spaces, 1, targets, Realm::ProfilingRequestSet()).wait(); + } else { + targets.resize(num_spaces); + for (int i = 0; i < num_spaces; i++) { + targets[i] = create_sparse_index_space(is_edges.bounds, sparse_factor, random_colors, i); + } + } + + // We need a GPU memory for GPU partitioning + Memory gpu_memory; + bool found_gpu_memory = false; + Machine machine = Machine::get_machine(); + std::set all_memories; + machine.get_all_memories(all_memories); + for(Memory memory : all_memories) { + if(memory.kind() == Memory::GPU_FB_MEM) { + gpu_memory = memory; + found_gpu_memory = true; + break; + } + } + if (!found_gpu_memory) { + log_app.error() << "No GPU memory found for partitioning test\n"; + return 
Event::NO_EVENT; + } + + + std::vector node_fields; + node_fields.push_back(sizeof(Point)); + + std::vector, Point>> point_field_data_gpu; + point_field_data_gpu.resize(num_pieces); + + for (int i = 0; i < num_pieces; i++) { + copy_piece(point_field_data[i], point_field_data_gpu[i], node_fields, 0, gpu_memory).wait(); + } + + std::vector> preimage_inputs(num_pieces); + std::vector> preimage_subspaces(num_spaces); + std::vector preimage_requirements(num_pieces); + + for (int i = 0; i < num_pieces; i++) { + preimage_inputs[i].location = point_field_data_gpu[i].inst.get_location(); + preimage_inputs[i].space = point_field_data_gpu[i].index_space; + } + + for (int i = 0; i < num_spaces; i++) { + preimage_subspaces[i].space = targets[i]; + preimage_subspaces[i].entries = targets[i].dense() ? 1 : targets[i].sparsity.impl()->get_entries().size(); + } + + is_nodes.by_preimage_buffer_requirements(preimage_subspaces, preimage_inputs, preimage_requirements); + + for (int i = 0; i < num_pieces; i++) { + alloc_piece(point_field_data_gpu[i].scratch_buffer, preimage_requirements[i].upper_bound, gpu_memory).wait(); + } + + wait_on_events = true; + log_app.info() << "warming up" << Clock::current_time_in_microseconds() << "\n"; + Event warmup = is_nodes.create_subspaces_by_preimage(point_field_data_gpu, + targets, + p_garbage_nodes, + Realm::ProfilingRequestSet()); + warmup.wait(); + + Event gpu_call = is_nodes.create_subspaces_by_preimage(point_field_data_gpu, + targets, + p_nodes, + Realm::ProfilingRequestSet()); + + gpu_call.wait(); + + long long start = Clock::current_time_in_microseconds(); + Event cpu_call = is_nodes.create_subspaces_by_preimage(point_field_data, + targets, + p_nodes_cpu, + Realm::ProfilingRequestSet()); + cpu_call.wait(); + std::cout << "CPU TIME: " << (Clock::current_time_in_microseconds() - start) / 1000 << " ms\n"; + + return Event::merge_events({gpu_call, cpu_call}); + + } + + virtual int perform_dynamic_checks(void) + { + // Nothing to do here + return 
0; + } + + virtual int check_partitioning(void) + { + int errors = 0; + + if (!p_nodes.size()) { + return p_nodes.size() != p_nodes_cpu.size(); + } + + log_app.info() << "Checking correctness of partitioning " << "\n"; + + for(int i = 0; i < num_spaces; i++) { + if (!p_nodes[i].dense() && (N1 > 1)) { + p_nodes[i].sparsity.impl()->request_bvh(); + if (!p_nodes_cpu[i].dense()) { + p_nodes_cpu[i].sparsity.impl()->request_bvh(); + } + } + for(IndexSpaceIterator it(p_nodes[i]); it.valid; it.step()) { + for(PointInRectIterator point(it.rect); point.valid; point.step()) { + if (!p_nodes_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU has extra image point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for(IndexSpaceIterator it(p_nodes_cpu[i]); it.valid; it.step()) { + for(PointInRectIterator point(it.rect); point.valid; point.step()) { + if (!p_nodes[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing image point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + + } + return errors; + } +}; + +void top_level_task(const void *args, size_t arglen, const void *userdata, size_t userlen, + Processor p) +{ + int errors = 0; + + testcfg->print_info(); + + // find all the system memories - we'll stride our data across them + // for each memory, we'll need one CPU that can do the initialization of the data + std::vector sysmems; + std::vector procs; + + Machine machine = Machine::get_machine(); + { + std::set all_memories; + machine.get_all_memories(all_memories); + for(std::set::const_iterator it = all_memories.begin(); + it != all_memories.end(); it++) { + Memory m = *it; + + // skip memories with no capacity for creating instances + if(m.capacity() == 0) + continue; + + if(m.kind() == Memory::SYSTEM_MEM) { + sysmems.push_back(m); + std::set pset; + machine.get_shared_processors(m, pset); + Processor p = Processor::NO_PROC; + for(std::set::const_iterator it2 = pset.begin(); it2 != pset.end(); + 
it2++) { + if(it2->kind() == Processor::LOC_PROC) { + p = *it2; + break; + } + } + assert(p.exists()); + procs.push_back(p); + log_app.debug() << "System mem #" << (sysmems.size() - 1) << " = " + << *sysmems.rbegin() << " (" << *procs.rbegin() << ")"; + } + } + } + assert(sysmems.size() > 0); + + { + Realm::TimeStamp ts("initialization", true, &log_app); + + Event e = testcfg->initialize_data(sysmems, procs); + // wait for all initialization to be done + e.wait(); + } + + // now actual partitioning work + { + Realm::TimeStamp ts("dependent partitioning work", true, &log_app); + + Event e = testcfg->perform_partitioning(); + + e.wait(); + } + + // dynamic checks (which would be eliminated by compiler) + { + Realm::TimeStamp ts("dynamic checks", true, &log_app); + errors += testcfg->perform_dynamic_checks(); + } + + if(!skip_check) { + log_app.print() << "checking correctness of partitioning"; + Realm::TimeStamp ts("verification", true, &log_app); + errors += testcfg->check_partitioning(); + } + + if(errors > 0) { + printf("Exiting with errors\n"); + exit(1); + } + + printf("all done!\n"); +} + +// Constructor function-pointer type +using CtorFn = TestInterface* (*)(int, const char** argv); + +// ---- Byfield constructors ---- +template +static TestInterface* make_byfield(int argc, const char** argv) { + return new ByfieldTest(argc, argv); +} + +static constexpr CtorFn BYFIELD_CTORS[3] = { + &make_byfield<1>, + &make_byfield<2>, + &make_byfield<3>, +}; + +// ---- Image constructors ---- +template +static TestInterface* make_image(int argc, const char** argv) { + return new ImageTest(argc, argv); +} + +static constexpr CtorFn IMAGE_CTORS[3][3] = { + { &make_image<1,1>, &make_image<1,2>, &make_image<1,3> }, + { &make_image<2,1>, &make_image<2,2>, &make_image<2,3> }, + { &make_image<3,1>, &make_image<3,2>, &make_image<3,3> }, +}; + +// ---- Image Range constructors ---- +template +static TestInterface* make_image_range(int argc, const char** argv) { + return new 
ImageRangeTest(argc, argv); +} + +static constexpr CtorFn IMAGE_RANGE_CTORS[3][3] = { + { &make_image_range<1,1>, &make_image_range<1,2>, &make_image_range<1,3> }, + { &make_image_range<2,1>, &make_image_range<2,2>, &make_image_range<2,3> }, + { &make_image_range<3,1>, &make_image_range<3,2>, &make_image_range<3,3> }, +}; + +// ---- Preimage constructors ---- +template +static TestInterface* make_preimage(int argc, const char** argv) { + return new PreimageTest(argc, argv); +} + +static constexpr CtorFn PREIMAGE_CTORS[3][3] = { + { &make_preimage<1,1>, &make_preimage<1,2>, &make_preimage<1,3> }, + { &make_preimage<2,1>, &make_preimage<2,2>, &make_preimage<2,3> }, + { &make_preimage<3,1>, &make_preimage<3,2>, &make_preimage<3,3> }, +}; + +using TaskWrapperFn = void (*)(const void*, size_t, const void*, size_t, Processor); + +static constexpr TaskWrapperFn BYFIELD_INIT_TBL[3] = { + &ByfieldTest<1>::init_data_task_wrapper, + &ByfieldTest<2>::init_data_task_wrapper, + &ByfieldTest<3>::init_data_task_wrapper, +}; + +static constexpr TaskWrapperFn IMAGE_INIT_TBL[3][3] = { + { &ImageTest<1,1>::init_data_task_wrapper, &ImageTest<1,2>::init_data_task_wrapper, &ImageTest<1,3>::init_data_task_wrapper }, + { &ImageTest<2,1>::init_data_task_wrapper, &ImageTest<2,2>::init_data_task_wrapper, &ImageTest<2,3>::init_data_task_wrapper }, + { &ImageTest<3,1>::init_data_task_wrapper, &ImageTest<3,2>::init_data_task_wrapper, &ImageTest<3,3>::init_data_task_wrapper }, +}; + +static constexpr TaskWrapperFn IMAGE_RANGE_INIT_TBL[3][3] = { + { &ImageRangeTest<1,1>::init_data_task_wrapper, &ImageRangeTest<1,2>::init_data_task_wrapper, &ImageRangeTest<1,3>::init_data_task_wrapper }, + { &ImageRangeTest<2,1>::init_data_task_wrapper, &ImageRangeTest<2,2>::init_data_task_wrapper, &ImageRangeTest<2,3>::init_data_task_wrapper }, + { &ImageRangeTest<3,1>::init_data_task_wrapper, &ImageRangeTest<3,2>::init_data_task_wrapper, &ImageRangeTest<3,3>::init_data_task_wrapper }, +}; + +static constexpr
TaskWrapperFn PREIMAGE_INIT_TBL[3][3] = { + { &PreimageTest<1,1>::init_data_task_wrapper, &PreimageTest<1,2>::init_data_task_wrapper, &PreimageTest<1,3>::init_data_task_wrapper }, + { &PreimageTest<2,1>::init_data_task_wrapper, &PreimageTest<2,2>::init_data_task_wrapper, &PreimageTest<2,3>::init_data_task_wrapper }, + { &PreimageTest<3,1>::init_data_task_wrapper, &PreimageTest<3,2>::init_data_task_wrapper, &PreimageTest<3,3>::init_data_task_wrapper }, +}; + +int main(int argc, char **argv) +{ + Runtime rt; + + rt.init(&argc, &argv); + + // parse global options + for(int i = 1; i < argc; i++) { + if(!strcmp(argv[i], "-seed")) { + random_seed = atoi(argv[++i]); + continue; + } + + if(!strcmp(argv[i], "-random")) { + random_colors = true; + continue; + } + + if(!strcmp(argv[i], "-wait")) { + wait_on_events = true; + continue; + } + + if(!strcmp(argv[i], "-show")) { + show_graph = true; + continue; + } + + if(!strcmp(argv[i], "-nocheck")) { + skip_check = true; + continue; + } + + if(!strcmp(argv[i], "-d1")) { + dimension1 = atoi(argv[++i]); + continue; + } + + if(!strcmp(argv[i], "-d2")) { + dimension2 = atoi(argv[++i]); + continue; + } + + if(!strcmp(argv[i], "byfield")) { + if (dimension1 < 1 || dimension1 > 3) + assert(false && "invalid dimension"); + + testcfg = BYFIELD_CTORS[dimension1 - 1](argc - i, const_cast(argv + i)); + break; + } + + if(!strcmp(argv[i], "image")) { if (dimension1 < 1 || dimension1 > 3 || dimension2 < 1 || dimension2 > 3) assert(false && "invalid dimension"); testcfg = IMAGE_CTORS[dimension1 - 1][dimension2 - 1](argc - i, const_cast(argv + i)); break; } + if(!strcmp(argv[i], "range")) { + if (dimension1 < 1 || dimension1 > 3 || dimension2 < 1 || dimension2 > 3) + assert(false && "invalid dimension"); + testcfg = IMAGE_RANGE_CTORS[dimension1 - 1][dimension2 - 1](argc - i, const_cast(argv + i)); + break; + } + + if(!strcmp(argv[i], "preimage")) { + if (dimension1 < 1 || dimension1 > 3 || dimension2 < 1 || dimension2 > 3) + assert(false && 
"invalid dimension"); + testcfg = PREIMAGE_CTORS[dimension1 - 1][dimension2 - 1](argc - i, const_cast(argv + i)); + break; + } + // printf("unknown parameter: %s\n", argv[i]); } @@ -926,6 +1637,8 @@ int main(int argc, char **argv) rt.register_task(INIT_BYFIELD_DATA_TASK, BYFIELD_INIT_TBL[dimension1 - 1]); rt.register_task(INIT_IMAGE_DATA_TASK, IMAGE_INIT_TBL[dimension1 - 1][dimension2 - 1]); + rt.register_task(INIT_IMAGE_RANGE_DATA_TASK, IMAGE_RANGE_INIT_TBL[dimension1 - 1][dimension2 - 1]); + rt.register_task(INIT_PREIMAGE_DATA_TASK, PREIMAGE_INIT_TBL[dimension1 - 1][dimension2 - 1]); signal(SIGALRM, sigalrm_handler); diff --git a/tests/unit_tests/sparsity_map_test.cc b/tests/unit_tests/sparsity_map_test.cc index ab673f7b27..a0fafbf834 100644 --- a/tests/unit_tests/sparsity_map_test.cc +++ b/tests/unit_tests/sparsity_map_test.cc @@ -284,7 +284,7 @@ void run_contribute_dense_case(const ContributeDenseRectTestData &test_case) impl->set_contributor_count(1); impl->contribute_dense_rect_list(test_case.rects, test_case.disjoint); - std::vector> entries = public_impl->get_entries(); + span> entries = public_impl->get_entries(); ASSERT_TRUE(public_impl->is_valid()); ASSERT_EQ(entries.size(), test_case.expected.size()); for(size_t i = 0; i < entries.size(); i++) { From 7a0c30c80460b6295f12e75da72c3e8b17ffca65 Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Tue, 10 Mar 2026 17:19:51 -0700 Subject: [PATCH 19/32] preparing to run on perlmutter --- src/realm/cuda/cuda_internal.h | 3 + src/realm/cuda/cuda_module.cc | 7 + src/realm/deppart/byfield.cc | 18 +- src/realm/deppart/byfield_gpu_impl.hpp | 13 +- src/realm/deppart/image.cc | 26 +- src/realm/deppart/image_gpu_impl.hpp | 23 +- src/realm/deppart/partitions.h | 41 +- src/realm/deppart/partitions_gpu_impl.hpp | 15 +- src/realm/deppart/preimage.cc | 19 +- src/realm/deppart/preimage_gpu_impl.hpp | 24 +- tests/benchmark.cc | 510 ++++++++++++++++++++-- 11 files changed, 592 insertions(+), 107 deletions(-) diff --git 
a/src/realm/cuda/cuda_internal.h b/src/realm/cuda/cuda_internal.h index 614710bfe1..13d127c12b 100644 --- a/src/realm/cuda/cuda_internal.h +++ b/src/realm/cuda/cuda_internal.h @@ -412,6 +412,7 @@ namespace Realm { get_null_task_stream(void) const; // needed by librealm_kokkos.so GPUStream *get_next_task_stream(bool create = false); GPUStream *get_next_d2d_stream(); + GPUStream *get_deppart_stream() const; void launch_batch_affine_fill_kernel(void *fill_info, size_t dim, size_t elemSize, size_t volume, GPUStream *stream); @@ -489,6 +490,8 @@ namespace Realm { GPUStream *host_to_device_stream = nullptr; GPUStream *device_to_host_stream = nullptr; GPUStream *device_to_device_stream = nullptr; + GPUStream *deppart_stream = nullptr; + std::vector device_to_device_streams; std::vector peer_to_peer_streams; // indexed by target std::vector task_streams; diff --git a/src/realm/cuda/cuda_module.cc b/src/realm/cuda/cuda_module.cc index 0147bc2b0d..ce84eb5704 100644 --- a/src/realm/cuda/cuda_module.cc +++ b/src/realm/cuda/cuda_module.cc @@ -1058,6 +1058,11 @@ namespace Realm { return device_to_device_streams[d2d_stream_index]; } + GPUStream *GPU::get_deppart_stream() const + { + return deppart_stream; + } + static void launch_kernel(const Realm::Cuda::GPU::GPUFuncInfo &func_info, void *params, size_t num_elems, GPUStream *stream) { @@ -2040,6 +2045,7 @@ namespace Realm { host_to_device_stream = new GPUStream(this, worker); device_to_host_stream = new GPUStream(this, worker); + deppart_stream = new GPUStream(this, worker); CUdevice dev; int numSMs; @@ -2164,6 +2170,7 @@ namespace Realm { // destroy streams delete host_to_device_stream; delete device_to_host_stream; + delete deppart_stream; delete_container_contents(device_to_device_streams); diff --git a/src/realm/deppart/byfield.cc b/src/realm/deppart/byfield.cc index 7c1fe148c1..06c936f0b2 100644 --- a/src/realm/deppart/byfield.cc +++ b/src/realm/deppart/byfield.cc @@ -23,6 +23,7 @@ #include "realm/deppart/rectlist.h" 
#include "realm/deppart/inst_helper.h" #include "realm/logging.h" +#include "realm/cuda/cuda_internal.h" namespace Realm { @@ -45,16 +46,8 @@ namespace Realm { device_size = atoi(val); } size_t optimal_size = is.bounds.volume() * 10 * sizeof(RectDesc); - std::vector affinities; - unsigned best_bandwidth = 0; Processor best_proc = Processor::NO_PROC; - Machine::get_machine().get_proc_mem_affinity(affinities, Processor::NO_PROC, mem); - for (auto affinity : affinities) { - if (affinity.bandwidth > best_bandwidth) { - best_bandwidth = affinity.bandwidth; - best_proc = affinity.p; - } - } + assert(choose_proc(best_proc, mem)); requirements[i].affinity_processor = best_proc; requirements[i].lower_bound = device_size; requirements[i].upper_bound = max(device_size, optimal_size); @@ -332,6 +325,13 @@ namespace Realm { bool _exclusive) : parent_space(_parent), field_data(_field_data) { this->exclusive = _exclusive; + Memory my_mem = field_data[0].inst.get_location(); + Processor best_proc; + assert(choose_proc(best_proc, my_mem)); + Cuda::GPUProcessor* gpu_proc = dynamic_cast(get_runtime()->get_processor_impl(best_proc)); + assert(gpu_proc); + this->gpu = gpu_proc->gpu; + this->stream = gpu_proc->gpu->get_deppart_stream(); } template diff --git a/src/realm/deppart/byfield_gpu_impl.hpp b/src/realm/deppart/byfield_gpu_impl.hpp index 849556a53d..8e1c953730 100644 --- a/src/realm/deppart/byfield_gpu_impl.hpp +++ b/src/realm/deppart/byfield_gpu_impl.hpp @@ -18,15 +18,19 @@ template void GPUByFieldMicroOp::execute() { + Cuda::AutoGPUContext agc(this->gpu); + // For profiling. NVTX_DEPPART(byfield_gpu); - cudaStream_t stream = Cuda::get_task_cuda_stream(); + CUstream stream = this->stream->get_stream(); collapsed_space inst_space; size_t tile_size = field_data[0].scratch_buffer.get_layout()->bytes_used; + //std::cout << "Using tile size of " << tile_size << " bytes." 
<< std::endl; + Arena buffer_arena(field_data[0].scratch_buffer.pointer_untyped(0, tile_size), tile_size); inst_space.offsets = buffer_arena.alloc(field_data.size() + 1); @@ -97,12 +101,13 @@ void GPUByFieldMicroOp::execute() size_t num_completed = 0; size_t curr_tile = tile_size / 2; int count = 0; + if (count) {} bool host_fallback = false; std::vector h_instances(colors.size(), RegionInstance::NO_INST); std::vector entry_counts(colors.size(), 0); while (num_completed < inst_space.num_entries) { try { - std::cout << "Byfield iteration " << count++ << ", completed " << num_completed << " / " << inst_space.num_entries << " entries." << std::endl; + //std::cout << "Byfield iteration " << count++ << ", completed " << num_completed << " / " << inst_space.num_entries << " entries." << std::endl; buffer_arena.start(); if (num_completed + curr_tile > inst_space.num_entries) { curr_tile = inst_space.num_entries - num_completed; @@ -202,11 +207,11 @@ void GPUByFieldMicroOp::execute() CUDA_CHECK(cudaStreamSynchronize(stream), stream); } catch (arena_oom&) { - std::cout << "Caught arena_oom, reducing tile size from " << curr_tile << " to " << curr_tile / 2 << std::endl; + //std::cout << "Caught arena_oom, reducing tile size from " << curr_tile << " to " << curr_tile / 2 << std::endl; curr_tile /= 2; if (curr_tile == 0) { if (host_fallback) { - GPUMicroOp::shatter_rects(inst_space, num_completed); + GPUMicroOp::shatter_rects(inst_space, num_completed, stream); curr_tile = 1; } else { host_fallback = true; diff --git a/src/realm/deppart/image.cc b/src/realm/deppart/image.cc index 8d37d81969..ec8cfb834d 100644 --- a/src/realm/deppart/image.cc +++ b/src/realm/deppart/image.cc @@ -24,6 +24,7 @@ #include "realm/deppart/inst_helper.h" #include "realm/deppart/preimage.h" #include "realm/logging.h" +#include "realm/cuda/cuda_internal.h" namespace Realm { @@ -74,16 +75,8 @@ namespace Realm { } minimal_size = max(minimal_size, device_size); size_t optimal_size = is.bounds.volume() * 
sizeof(Rect) * source_spaces.size() * 10 + minimal_size; - std::vector affinities; - unsigned best_bandwidth = 0; - Processor best_proc = Processor::NO_PROC; - Machine::get_machine().get_proc_mem_affinity(affinities, Processor::NO_PROC, mem); - for (auto affinity : affinities) { - if (affinity.bandwidth > best_bandwidth) { - best_bandwidth = affinity.bandwidth; - best_proc = affinity.p; - } - } + Processor best_proc; + assert(choose_proc(best_proc, mem)); requirements[i].affinity_processor = best_proc; requirements[i].lower_bound = minimal_size; requirements[i].upper_bound = optimal_size; @@ -948,7 +941,14 @@ namespace Realm { bool _exclusive) : parent_space(_parent), domain_transform(_domain_transform) { - this->exclusive = _exclusive; + this->exclusive = _exclusive; + Memory my_mem = domain_transform.ptr_data.empty() ? domain_transform.range_data[0].inst.get_location() : domain_transform.ptr_data[0].inst.get_location(); + Processor best_proc; + assert(choose_proc(best_proc, my_mem)); + Cuda::GPUProcessor* gpu_proc = dynamic_cast(get_runtime()->get_processor_impl(best_proc)); + assert(gpu_proc); + this->gpu = gpu_proc->gpu; + this->stream = gpu_proc->gpu->get_deppart_stream(); } template @@ -995,7 +995,9 @@ namespace Realm { template void GPUImageMicroOp::execute(void) { - TimeStamp ts("StructuredImageMicroOp::execute", true, &log_uop_timing); + TimeStamp ts("GPUImageMicroOp::execute", true, &log_uop_timing); + + Cuda::AutoGPUContext agc(this->gpu); if (domain_transform.ptr_data.size() > 0) { gpu_populate_ptrs(); } else { diff --git a/src/realm/deppart/image_gpu_impl.hpp b/src/realm/deppart/image_gpu_impl.hpp index 83f907d922..43682e06dd 100644 --- a/src/realm/deppart/image_gpu_impl.hpp +++ b/src/realm/deppart/image_gpu_impl.hpp @@ -48,10 +48,10 @@ void GPUImageMicroOp::gpu_populate_rngs() RegionInstance buffer = domain_transform.range_data[0].scratch_buffer; size_t tile_size = buffer.get_layout()->bytes_used; - std::cout << "Using tile size of " << tile_size << " 
bytes." << std::endl; + //std::cout << "Using tile size of " << tile_size << " bytes." << std::endl; Arena buffer_arena(buffer.pointer_untyped(0, tile_size), tile_size); - cudaStream_t stream = Cuda::get_task_cuda_stream(); + CUstream stream = this->stream->get_stream(); collapsed_space src_space; src_space.offsets = buffer_arena.alloc(sources.size()+1); @@ -98,13 +98,13 @@ void GPUImageMicroOp::gpu_populate_rngs() size_t num_completed = 0; size_t curr_tile = tile_size / 2; int count = 0; - + if (count) {} bool host_fallback = false; std::vector h_instances(sources.size(), RegionInstance::NO_INST); std::vector entry_counts(sources.size(), 0); while (num_completed < inst_space.num_entries) { try { - std::cout << "Image Range iteration " << count++ << ", completed " << num_completed << " / " << inst_space.num_entries << " entries." << std::endl; + //std::cout << "Image Range iteration " << count++ << ", completed " << num_completed << " / " << inst_space.num_entries << " entries." << std::endl; buffer_arena.start(); buffer_arena.flip_parity(); if (num_completed + curr_tile > inst_space.num_entries) { @@ -241,11 +241,11 @@ void GPUImageMicroOp::gpu_populate_rngs() CUDA_CHECK(cudaStreamSynchronize(stream), stream); } catch (arena_oom&) { - std::cout << "Caught arena_oom, reducing tile size from " << curr_tile << " to " << curr_tile / 2 << std::endl; + //std::cout << "Caught arena_oom, reducing tile size from " << curr_tile << " to " << curr_tile / 2 << std::endl; curr_tile /= 2; if (curr_tile == 0) { if (host_fallback) { - GPUMicroOp::shatter_rects(inst_space, num_completed); + GPUMicroOp::shatter_rects(inst_space, num_completed, stream); curr_tile = 1; } else { host_fallback = true; @@ -329,10 +329,10 @@ void GPUImageMicroOp::gpu_populate_ptrs() Memory sysmem; find_memory(sysmem, Memory::SYSTEM_MEM); - cudaStream_t stream = Cuda::get_task_cuda_stream(); + CUstream stream = this->stream->get_stream(); size_t tile_size = buffer.get_layout()->bytes_used; - std::cout << 
"Using tile size of " << tile_size << " bytes." << std::endl; + //std::cout << "Using tile size of " << tile_size << " bytes." << std::endl; Arena buffer_arena(buffer.pointer_untyped(0, tile_size), tile_size); collapsed_space src_space; @@ -385,12 +385,13 @@ void GPUImageMicroOp::gpu_populate_ptrs() size_t num_completed = 0; size_t curr_tile = tile_size / 2; int count = 0; + if (count) {} bool host_fallback = false; std::vector h_instances(sources.size(), RegionInstance::NO_INST); std::vector entry_counts(sources.size(), 0); while (num_completed < inst_space.num_entries) { try { - std::cout << "Image iteration " << count++ << ", completed " << num_completed << " / " << inst_space.num_entries << " entries." << std::endl; + //std::cout << "Image iteration " << count++ << ", completed " << num_completed << " / " << inst_space.num_entries << " entries." << std::endl; buffer_arena.start(); if (num_completed + curr_tile > inst_space.num_entries) { curr_tile = inst_space.num_entries - num_completed; @@ -513,11 +514,11 @@ void GPUImageMicroOp::gpu_populate_ptrs() CUDA_CHECK(cudaStreamSynchronize(stream), stream); } catch (arena_oom&) { - std::cout << "Caught arena_oom, reducing tile size from " << curr_tile << " to " << curr_tile / 2 << std::endl; + //std::cout << "Caught arena_oom, reducing tile size from " << curr_tile << " to " << curr_tile / 2 << std::endl; curr_tile /= 2; if (curr_tile == 0) { if (host_fallback) { - GPUMicroOp::shatter_rects(inst_space, num_completed); + GPUMicroOp::shatter_rects(inst_space, num_completed, stream); curr_tile = 1; } else { host_fallback = true; diff --git a/src/realm/deppart/partitions.h b/src/realm/deppart/partitions.h index 68e5b40084..9ccbde6b75 100644 --- a/src/realm/deppart/partitions.h +++ b/src/realm/deppart/partitions.h @@ -34,9 +34,14 @@ #include "realm/deppart/sparsity_impl.h" #include "realm/deppart/inst_helper.h" #include "realm/bgwork.h" +#ifdef REALM_USE_CUDA +#include "realm/cuda/cuda_module.h" struct CUstream_st; 
-typedef CUstream_st* cudaStream_t; +typedef CUstream_st* CUstream; + +#endif + namespace Realm { @@ -45,6 +50,10 @@ namespace Realm { #ifdef REALM_USE_CUDA + namespace Cuda { + class GPUStream; + } + template struct HiFlag { T hi; @@ -349,20 +358,20 @@ namespace Realm { virtual void execute(void) = 0; - static void shatter_rects(collapsed_space & inst_space, size_t &num_completed); + static void shatter_rects(collapsed_space & inst_space, size_t &num_completed, CUstream stream); template - static void collapse_multi_space(const std::vector& field_data, collapsed_space &out_space, Arena &my_arena, cudaStream_t stream); + static void collapse_multi_space(const std::vector& field_data, collapsed_space &out_space, Arena &my_arena, CUstream stream); - static void collapse_parent_space(const IndexSpace& parent_space, collapsed_space &out_space, Arena &my_arena, cudaStream_t stream); + static void collapse_parent_space(const IndexSpace& parent_space, collapsed_space &out_space, Arena &my_arena, CUstream stream); - static void build_bvh(const collapsed_space &space, BVH &bvh, Arena &my_arena, cudaStream_t stream); + static void build_bvh(const collapsed_space &space, BVH &bvh, Arena &my_arena, CUstream stream); template - static void construct_input_rectlist(const collapsed_space &lhs, const collapsed_space &rhs, out_t* &d_valid_rects, size_t& out_size, uint32_t* counters, uint32_t* out_offsets, Arena &my_arena, cudaStream_t stream); + static void construct_input_rectlist(const collapsed_space &lhs, const collapsed_space &rhs, out_t* &d_valid_rects, size_t& out_size, uint32_t* counters, uint32_t* out_offsets, Arena &my_arena, CUstream stream); template - static void volume_prefix_sum(const out_t* d_rects, size_t total_rects, size_t* &d_prefix_rects, size_t& num_pts, Arena &my_arena, cudaStream_t stream); + static void volume_prefix_sum(const out_t* d_rects, size_t total_rects, size_t* &d_prefix_rects, size_t& num_pts, Arena &my_arena, CUstream stream); template void 
complete_pipeline(PointDesc* d_points, size_t total_pts, RectDesc* &d_out_rects, size_t &out_rects, Arena &my_arena, const Container& ctr, IndexFn getIndex, MapFn getMap); @@ -381,6 +390,8 @@ namespace Realm { virtual bool is_image_microop() const { return false; } bool exclusive = false; + Cuda::GPU* gpu; + Cuda::GPUStream* stream; }; #endif @@ -490,6 +501,22 @@ namespace Realm { static ActiveMessageHandlerReg areg; }; + + // Finds the processor with the highest-bandwidth affinity to the given memory. Returns true on success, false otherwise. + inline bool choose_proc(Processor &best_proc, Memory location) + { + std::vector affinities; + unsigned best_bandwidth = 0; + best_proc = Processor::NO_PROC; + Machine::get_machine().get_proc_mem_affinity(affinities, Processor::NO_PROC, location); + for (auto affinity : affinities) { + if (affinity.bandwidth > best_bandwidth) { + best_bandwidth = affinity.bandwidth; + best_proc = affinity.p; + } + } + return best_proc != Processor::NO_PROC; + } + }; diff --git a/src/realm/deppart/partitions_gpu_impl.hpp b/src/realm/deppart/partitions_gpu_impl.hpp index 0827f1844c..d136c2138b 100644 --- a/src/realm/deppart/partitions_gpu_impl.hpp +++ b/src/realm/deppart/partitions_gpu_impl.hpp @@ -107,10 +107,9 @@ namespace Realm { } template - void GPUMicroOp::shatter_rects(collapsed_space & inst_space, size_t &num_completed) { + void GPUMicroOp::shatter_rects(collapsed_space & inst_space, size_t &num_completed, CUstream stream) { NVTX_DEPPART(shatter_rects); - cudaStream_t stream = Cuda::get_task_cuda_stream(); size_t new_size = (inst_space.entries_buffer[num_completed].bounds.volume() + 1) / 2; assert(new_size > 0); size_t num_new_entries = 0; @@ -189,7 +188,7 @@ namespace Realm { //Given a list of spaces, compacts them all into one collapsed_space template template - void GPUMicroOp::collapse_multi_space(const std::vector& field_data, collapsed_space &out_space, Arena &my_arena, cudaStream_t stream) + void GPUMicroOp::collapse_multi_space(const std::vector& spaces, collapsed_space
&out_space, Arena &my_arena, CUstream stream) { NVTX_DEPPART(collapse_multi_space); @@ -609,7 +608,7 @@ namespace Realm { return; } NVTX_DEPPART(complete_rect_pipeline); - cudaStream_t stream = Cuda::get_task_cuda_stream(); + CUstream stream = this->stream->get_stream(); Memory my_mem; assert(find_memory(my_mem, Memory::GPU_FB_MEM)); @@ -1300,7 +1299,7 @@ namespace Realm { { NVTX_DEPPART(complete1d_pipeline); - cudaStream_t stream = Cuda::get_task_cuda_stream(); + CUstream stream = this->stream->get_stream(); RectDesc* d_rects_in = d_rects; @@ -1454,7 +1453,7 @@ namespace Realm { } - cudaStream_t stream = Cuda::get_task_cuda_stream(); + CUstream stream = this->stream->get_stream(); size_t bytes_T = total_pts * sizeof(T); size_t bytes_S = total_pts * sizeof(size_t); @@ -1576,7 +1575,7 @@ namespace Realm { { NVTX_DEPPART(split_output); - cudaStream_t stream = Cuda::get_task_cuda_stream(); + CUstream stream = this->stream->get_stream(); bool use_sysmem = false; RegionInstance sys_instance = RegionInstance::NO_INST; @@ -1680,7 +1679,7 @@ namespace Realm { size_t prev = my_arena.mark(); - cudaStream_t stream = Cuda::get_task_cuda_stream(); + CUstream stream = this->stream->get_stream(); SparsityMapEntry* final_entries = my_arena.alloc>(total_rects); Rect* final_rects = my_arena.alloc>(total_rects); diff --git a/src/realm/deppart/preimage.cc b/src/realm/deppart/preimage.cc index 4feaa585e4..37bfef188a 100644 --- a/src/realm/deppart/preimage.cc +++ b/src/realm/deppart/preimage.cc @@ -26,6 +26,7 @@ #include "../logging.h" #include #include +#include "realm/cuda/cuda_internal.h" namespace Realm { @@ -76,16 +77,8 @@ namespace Realm { } minimal_size = max(minimal_size, device_size); size_t optimal_size = is.bounds.volume() * sizeof(Rect) * target_spaces.size() * 10 + minimal_size; - std::vector affinities; - unsigned best_bandwidth = 0; Processor best_proc = Processor::NO_PROC; - Machine::get_machine().get_proc_mem_affinity(affinities, Processor::NO_PROC, mem); - for (auto 
affinity : affinities) { - if (affinity.bandwidth > best_bandwidth) { - best_bandwidth = affinity.bandwidth; - best_proc = affinity.p; - } - } + assert(choose_proc(best_proc, mem)); requirements[i].affinity_processor = best_proc; requirements[i].lower_bound = minimal_size; requirements[i].upper_bound = optimal_size; @@ -825,6 +818,13 @@ namespace Realm { IndexSpace _parent_space, bool _exclusive) : domain_transform(_domain_transform), parent_space(_parent_space) { this->exclusive = _exclusive; + Memory my_mem = domain_transform.ptr_data.empty() ? domain_transform.range_data[0].inst.get_location() : domain_transform.ptr_data[0].inst.get_location(); + Processor best_proc; + assert(choose_proc(best_proc, my_mem)); + Cuda::GPUProcessor* gpu_proc = dynamic_cast(get_runtime()->get_processor_impl(best_proc)); + assert(gpu_proc); + this->gpu = gpu_proc->gpu; + this->stream = gpu_proc->gpu->get_deppart_stream(); } template @@ -841,6 +841,7 @@ namespace Realm { template void GPUPreimageMicroOp::execute(void) { TimeStamp ts("GPUPreimageMicroOp::execute", true, &log_uop_timing); + Cuda::AutoGPUContext agc(this->gpu); if (domain_transform.ptr_data.size() > 0) { gpu_populate_bitmasks(); } else if (domain_transform.range_data.size() > 0) { diff --git a/src/realm/deppart/preimage_gpu_impl.hpp b/src/realm/deppart/preimage_gpu_impl.hpp index 960e427beb..3a104f2e84 100644 --- a/src/realm/deppart/preimage_gpu_impl.hpp +++ b/src/realm/deppart/preimage_gpu_impl.hpp @@ -18,7 +18,7 @@ namespace Realm { RegionInstance buffer = domain_transform.range_data[0].scratch_buffer; size_t tile_size = buffer.get_layout()->bytes_used; - std::cout << "Using tile size of " << tile_size << " bytes." << std::endl; + //std::cout << "Using tile size of " << tile_size << " bytes." 
<< std::endl; Arena buffer_arena(buffer.pointer_untyped(0, tile_size), tile_size); NVTX_DEPPART(gpu_preimage_range); @@ -26,7 +26,7 @@ namespace Realm { Memory sysmem; find_memory(sysmem, Memory::SYSTEM_MEM); - cudaStream_t stream = Cuda::get_task_cuda_stream(); + CUstream stream = this->stream->get_stream(); collapsed_space inst_space; @@ -76,14 +76,14 @@ namespace Realm { size_t num_completed = 0; size_t curr_tile = tile_size / 2; int count = 0; - + if (count) {} bool host_fallback = false; std::vector h_instances(targets.size(), RegionInstance::NO_INST); std::vector entry_counts(targets.size(), 0); while (num_completed < inst_space.num_entries) { try { - std::cout << "Preimage iteration " << count++ << ", completed " << num_completed << " / " << inst_space.num_entries << " entries." << std::endl; + //std::cout << "Preimage iteration " << count++ << ", completed " << num_completed << " / " << inst_space.num_entries << " entries." << std::endl; buffer_arena.start(); if (num_completed + curr_tile > inst_space.num_entries) { curr_tile = inst_space.num_entries - num_completed; @@ -254,11 +254,11 @@ namespace Realm { CUDA_CHECK(cudaStreamSynchronize(stream), stream); } catch (arena_oom&) { - std::cout << "Caught arena_oom, reducing tile size from " << curr_tile << " to " << curr_tile / 2 << std::endl; + //std::cout << "Caught arena_oom, reducing tile size from " << curr_tile << " to " << curr_tile / 2 << std::endl; curr_tile /= 2; if (curr_tile == 0) { if (host_fallback) { - GPUMicroOp::shatter_rects(inst_space, num_completed); + GPUMicroOp::shatter_rects(inst_space, num_completed, stream); curr_tile = 1; } else { host_fallback = true; @@ -326,13 +326,13 @@ namespace Realm { RegionInstance buffer = domain_transform.ptr_data[0].scratch_buffer; size_t tile_size = buffer.get_layout()->bytes_used; - std::cout << "Using tile size of " << tile_size << " bytes." << std::endl; + //std::cout << "Using tile size of " << tile_size << " bytes." 
<< std::endl; Arena buffer_arena(buffer.pointer_untyped(0, tile_size), tile_size); Memory sysmem; find_memory(sysmem, Memory::SYSTEM_MEM); - cudaStream_t stream = Cuda::get_task_cuda_stream(); + CUstream stream = this->stream->get_stream(); NVTX_DEPPART(gpu_preimage); @@ -384,14 +384,14 @@ namespace Realm { size_t num_completed = 0; size_t curr_tile = tile_size / 2; int count = 0; - + if (count) {} bool host_fallback = false; std::vector h_instances(targets.size(), RegionInstance::NO_INST); std::vector entry_counts(targets.size(), 0); while (num_completed < inst_space.num_entries) { try { - std::cout << "Preimage iteration " << count++ << ", completed " << num_completed << " / " << inst_space.num_entries << " entries." << std::endl; + //std::cout << "Preimage iteration " << count++ << ", completed " << num_completed << " / " << inst_space.num_entries << " entries." << std::endl; buffer_arena.start(); if (num_completed + curr_tile > inst_space.num_entries) { curr_tile = inst_space.num_entries - num_completed; @@ -562,11 +562,11 @@ namespace Realm { CUDA_CHECK(cudaStreamSynchronize(stream), stream); } catch (arena_oom&) { - std::cout << "Caught arena_oom, reducing tile size from " << curr_tile << " to " << curr_tile / 2 << std::endl; + //std::cout << "Caught arena_oom, reducing tile size from " << curr_tile << " to " << curr_tile / 2 << std::endl; curr_tile /= 2; if (curr_tile == 0) { if (host_fallback) { - GPUMicroOp::shatter_rects(inst_space, num_completed); + GPUMicroOp::shatter_rects(inst_space, num_completed, stream); curr_tile = 1; } else { host_fallback = true; diff --git a/tests/benchmark.cc b/tests/benchmark.cc index 6b78151d68..177ceeb558 100644 --- a/tests/benchmark.cc +++ b/tests/benchmark.cc @@ -43,7 +43,8 @@ enum INIT_BYFIELD_DATA_TASK, INIT_IMAGE_DATA_TASK, INIT_IMAGE_RANGE_DATA_TASK, - INIT_PREIMAGE_DATA_TASK + INIT_PREIMAGE_DATA_TASK, + INIT_PREIMAGE_RANGE_DATA_TASK }; namespace std { @@ -100,6 +101,7 @@ namespace { bool skip_check = false; int 
dimension1 = 1; int dimension2 = 1; + std::string op; TestInterface *testcfg = 0; }; // namespace @@ -152,11 +154,11 @@ IndexSpace create_sparse_index_space(const Rect &bounds, size_t spar stride *= (bounds.hi[d] - bounds.lo[d] + 1); } if(randomize) { - if(Philox_2x32<>::rand_int(random_seed, flattened, 0, sparse_factor) == 0) { + if(Philox_2x32<>::rand_int(random_seed, flattened, 0, 100) < sparse_factor) { points.push_back(it.p); } } else { - if(flattened % sparse_factor == 0) { + if( (99 * flattened) % 100 < sparse_factor) { points.push_back(it.p); } } @@ -176,6 +178,7 @@ class ByfieldTest : public TestInterface { int num_nodes = 1000; int num_pieces = 4; int num_colors = 4; + size_t buffer_size = 100; std::string filename; ByfieldTest(int argc, const char *argv[]) @@ -194,11 +197,15 @@ class ByfieldTest : public TestInterface { num_colors = atoi(argv[++i]); continue; } + if(!strcmp(argv[i], "-b")) { + buffer_size = atoi(argv[++i]); + continue; + } } - if (num_nodes <= 0 || num_pieces <= 0 || num_colors <= 0) { - log_app.error() << "Invalid config: nodes=" << num_nodes << " colors=" << num_colors << " pieces=" << num_pieces << "\n"; + if (num_nodes <= 0 || num_pieces <= 0 || num_colors <= 0 || buffer_size <= 0 || buffer_size > 100) { + log_app.error() << "Invalid config: nodes=" << num_nodes << " colors=" << num_colors << " pieces=" << num_pieces << " buffer size=" << buffer_size << "\n"; exit(1); } } @@ -269,8 +276,8 @@ class ByfieldTest : public TestInterface { virtual void print_info(void) { - printf("Realm %dD Byfield dependent partitioning test: %d nodes, %d colors, %d pieces\n", (int) N, - (int)num_nodes, (int) num_colors, (int)num_pieces); + //printf("Realm %dD Byfield dependent partitioning test: %d nodes, %d colors, %d pieces, %lu tile size\n", (int) N, + //(int)num_nodes, (int) num_colors, (int)num_pieces, buffer_size); } virtual Event initialize_data(const std::vector &memories, @@ -380,11 +387,12 @@ class ByfieldTest : public TestInterface { 
is_colors.by_field_buffer_requirements(byfield_inputs, byfield_requirements); + for (int i = 0; i < num_pieces; i++) { - alloc_piece(piece_field_data_gpu[i].scratch_buffer, byfield_requirements[i].upper_bound, gpu_memory).wait(); + size_t alloc_size = byfield_requirements[i].lower_bound + (byfield_requirements[i].upper_bound - byfield_requirements[i].lower_bound) * buffer_size / 100; + alloc_piece(piece_field_data_gpu[i].scratch_buffer, alloc_size, gpu_memory).wait(); } - wait_on_events = true; log_app.info() << "warming up" << Clock::current_time_in_microseconds() << "\n"; Event warmup = is_colors.create_subspaces_by_field(piece_field_data_gpu, colors, @@ -392,16 +400,27 @@ class ByfieldTest : public TestInterface { Realm::ProfilingRequestSet()); warmup.wait(); + long long start_gpu = Clock::current_time_in_microseconds(); Event gpu_call = is_colors.create_subspaces_by_field(piece_field_data_gpu, colors, p_nodes, Realm::ProfilingRequestSet()); + gpu_call.wait(); + long long gpu_time = Clock::current_time_in_microseconds() - start_gpu; + long long start_cpu = Clock::current_time_in_microseconds(); + Event cpu_call = is_colors.create_subspaces_by_field(piece_id_field_data, colors, p_nodes_cpu, Realm::ProfilingRequestSet()); + cpu_call.wait(); + long long cpu_time = Clock::current_time_in_microseconds() - start_cpu; + + printf("RESULT,op=byfield,d1=%d,num_nodes=%d,buffer_size=%zu,gpu_us=%lld,cpu_us=%lld\n", + N, num_nodes, buffer_size, gpu_time, cpu_time); + return Event::merge_events({gpu_call, cpu_call}); } @@ -423,6 +442,12 @@ class ByfieldTest : public TestInterface { log_app.info() << "Checking correctness of partitioning " << "\n"; for(int i = 0; i < num_pieces; i++) { + if (!p_nodes[i].dense() && (N > 1)) { + p_nodes[i].sparsity.impl()->request_bvh(); + if (!p_nodes_cpu[i].dense()) { + p_nodes_cpu[i].sparsity.impl()->request_bvh(); + } + } for(IndexSpaceIterator it(p_nodes[i]); it.valid; it.step()) { for(PointInRectIterator point(it.rect); point.valid; 
point.step()) { if (!p_nodes_cpu[i].contains(point.p)) { @@ -453,9 +478,10 @@ class ImageTest : public TestInterface { // graph config parameters int num_nodes = 1000; int num_edges = 1000; - int sparse_factor = 4; + int sparse_factor = 50; int num_spaces = 4; int num_pieces = 4; + size_t buffer_size = 100; std::string filename; ImageTest(int argc, const char *argv[]) @@ -482,11 +508,15 @@ class ImageTest : public TestInterface { sparse_factor = atoi(argv[++i]); continue; } + if (!strcmp(argv[i], "-b")) { + buffer_size = atoi(argv[++i]); + continue; + } } if (num_nodes <= 0 || num_pieces <= 0 || num_edges <= 0 || num_spaces <= 0) { - log_app.error() << "Invalid config: nodes=" << num_nodes << " colors=" << num_edges << " pieces=" << num_pieces << " sources=" << num_spaces << "\n"; + log_app.error() << "Invalid config: nodes=" << num_nodes << " colors=" << num_edges << " pieces=" << num_pieces << " sources=" << num_spaces << " buffer size=" << buffer_size << "\n"; exit(1); } } @@ -560,8 +590,8 @@ class ImageTest : public TestInterface { virtual void print_info(void) { - printf("Realm %dD -> %dD Image dependent partitioning test: %d nodes, %d edges, %d pieces ,%d sources, %d sparse factor\n", (int) N2, (int) N1, - (int)num_nodes, (int) num_edges, (int)num_pieces, (int) num_spaces, (int) sparse_factor); + //printf("Realm %dD -> %dD Image dependent partitioning test: %d nodes, %d edges, %d pieces ,%d sources, %d sparse factor, %lu tile size\n", (int) N2, (int) N1, + //(int)num_nodes, (int) num_edges, (int)num_pieces, (int) num_spaces, (int) sparse_factor, buffer_size); } virtual Event initialize_data(const std::vector &memories, @@ -689,10 +719,10 @@ class ImageTest : public TestInterface { is_edges.by_image_buffer_requirements(image_subspaces, image_inputs, image_requirements); for (int i = 0; i < num_pieces; i++) { - alloc_piece(point_field_data_gpu[i].scratch_buffer, image_requirements[i].upper_bound, gpu_memory).wait(); + size_t alloc_size = 
image_requirements[i].lower_bound + (image_requirements[i].upper_bound - image_requirements[i].lower_bound) * buffer_size / 100; + alloc_piece(point_field_data_gpu[i].scratch_buffer, alloc_size, gpu_memory).wait(); } - wait_on_events = true; log_app.info() << "warming up" << Clock::current_time_in_microseconds() << "\n"; Event warmup = is_edges.create_subspaces_by_image(point_field_data_gpu, sources, @@ -705,11 +735,19 @@ class ImageTest : public TestInterface { p_edges, Realm::ProfilingRequestSet()); + if ( wait_on_events ) { + gpu_call.wait(); + } + Event cpu_call = is_edges.create_subspaces_by_image(point_field_data, sources, p_edges_cpu, Realm::ProfilingRequestSet()); + if ( wait_on_events ) { + cpu_call.wait(); + } + return Event::merge_events({gpu_call, cpu_call}); } @@ -731,6 +769,14 @@ class ImageTest : public TestInterface { log_app.info() << "Checking correctness of partitioning " << "\n"; for(int i = 0; i < num_pieces; i++) { + if (N1 > 1) { + if (!p_edges[i].dense()) { + p_edges[i].sparsity.impl()->request_bvh(); + } + if (!p_edges_cpu[i].dense()) { + p_edges_cpu[i].sparsity.impl()->request_bvh(); + } + } for(IndexSpaceIterator it(p_edges[i]); it.valid; it.step()) { for(PointInRectIterator point(it.rect); point.valid; point.step()) { if (!p_edges_cpu[i].contains(point.p)) { @@ -764,7 +810,8 @@ class ImageRangeTest : public TestInterface { int rect_size = 10; int num_spaces = 4; int num_pieces = 4; - int sparse_factor = 4; + int sparse_factor = 50; + size_t buffer_size = 100; std::string filename; ImageRangeTest(int argc, const char *argv[]) @@ -795,11 +842,15 @@ class ImageRangeTest : public TestInterface { sparse_factor = atoi(argv[++i]); continue; } + if (!strcmp(argv[i], "-b")) { + buffer_size = atoi(argv[++i]); + continue; + } } - if (num_nodes <= 0 || num_pieces <= 0 || num_edges <= 0 || num_spaces <= 0 || rect_size <= 0) { - log_app.error() << "Invalid config: nodes=" << num_nodes << " colors=" << num_edges << " pieces=" << num_pieces << " 
sources=" << num_spaces << " rect size=" << rect_size << "\n"; + if (num_nodes <= 0 || num_pieces <= 0 || num_edges <= 0 || num_spaces <= 0 || rect_size <= 0 || sparse_factor < 0 || sparse_factor > 100 || buffer_size < 0 || buffer_size > 100) { + log_app.error() << "Invalid config: nodes=" << num_nodes << " colors=" << num_edges << " pieces=" << num_pieces << " sources=" << num_spaces << " rect size=" << rect_size << " sparse factor=" << sparse_factor << " buffer_size=" << buffer_size << "\n"; exit(1); } } @@ -876,8 +927,8 @@ class ImageRangeTest : public TestInterface { virtual void print_info(void) { - printf("Realm %dD -> %dD Image Range dependent partitioning test: %d nodes, %d edges, %d pieces ,%d sources, %d rect size, %d sparse factor\n", (int) N2, (int) N1, - (int)num_nodes, (int) num_edges, (int)num_pieces, (int) num_spaces, (int) rect_size, (int) sparse_factor); + //printf("Realm %dD -> %dD Image Range dependent partitioning test: %d nodes, %d edges, %d pieces ,%d sources, %d rect size, %d sparse factor, %lu tile size\n", (int) N2, (int) N1, + // (int)num_nodes, (int) num_edges, (int)num_pieces, (int) num_spaces, (int) rect_size, (int) sparse_factor, buffer_size); } virtual Event initialize_data(const std::vector &memories, @@ -1005,10 +1056,10 @@ class ImageRangeTest : public TestInterface { is_edges.by_image_buffer_requirements(image_subspaces, image_inputs, image_requirements); for (int i = 0; i < num_pieces; i++) { - alloc_piece(rect_field_data_gpu[i].scratch_buffer, image_requirements[i].upper_bound, gpu_memory).wait(); + size_t alloc_size = image_requirements[i].lower_bound + (image_requirements[i].upper_bound - image_requirements[i].lower_bound) * buffer_size / 100; + alloc_piece(rect_field_data_gpu[i].scratch_buffer, alloc_size, gpu_memory).wait(); } - wait_on_events = true; log_app.info() << "warming up" << Clock::current_time_in_microseconds() << "\n"; Event warmup = is_edges.create_subspaces_by_image(rect_field_data_gpu, sources, @@ -1021,11 
+1072,19 @@ class ImageRangeTest : public TestInterface { p_edges, Realm::ProfilingRequestSet()); + if ( wait_on_events ) { + gpu_call.wait(); + } + Event cpu_call = is_edges.create_subspaces_by_image(rect_field_data, sources, p_edges_cpu, Realm::ProfilingRequestSet()); + if ( wait_on_events ) { + cpu_call.wait(); + } + return Event::merge_events({gpu_call, cpu_call}); } @@ -1047,6 +1106,16 @@ class ImageRangeTest : public TestInterface { log_app.info() << "Checking correctness of partitioning " << "\n"; for(int i = 0; i < num_spaces; i++) { + + if (N1 > 1) { + if (!p_edges[i].dense()) { + p_edges[i].sparsity.impl()->request_bvh(); + } + if (!p_edges_cpu[i].dense()) { + p_edges_cpu[i].sparsity.impl()->request_bvh(); + } + } + for(IndexSpaceIterator it(p_edges[i]); it.valid; it.step()) { for(PointInRectIterator point(it.rect); point.valid; point.step()) { if (!p_edges_cpu[i].contains(point.p)) { @@ -1079,7 +1148,8 @@ class PreimageTest : public TestInterface { int num_edges = 1000; int num_spaces = 4; int num_pieces = 4; - int sparse_factor = 4; + int sparse_factor = 50; + size_t buffer_size = 100; std::string filename; PreimageTest(int argc, const char *argv[]) @@ -1106,11 +1176,15 @@ class PreimageTest : public TestInterface { sparse_factor = atoi(argv[++i]); continue; } + if (!strcmp(argv[i], "-b")) { + buffer_size = atoi(argv[++i]); + continue; + } } - if (num_nodes <= 0 || num_pieces <= 0 || num_edges <= 0 || num_spaces <= 0) { - log_app.error() << "Invalid config: nodes=" << num_nodes << " colors=" << num_edges << " pieces=" << num_pieces << " targets=" << num_spaces << "\n"; + if (num_nodes <= 0 || num_pieces <= 0 || num_edges <= 0 || num_spaces <= 0 || sparse_factor < 0 || sparse_factor > 100 || buffer_size < 0 || buffer_size > 100) { + log_app.error() << "Invalid config: nodes=" << num_nodes << " colors=" << num_edges << " pieces=" << num_pieces << " targets=" << num_spaces << " sparse factor=" << sparse_factor << " buffer size=" << buffer_size << "\n"; 
exit(1); } } @@ -1184,8 +1258,8 @@ class PreimageTest : public TestInterface { virtual void print_info(void) { - printf("Realm %dD -> %dD Preimage dependent partitioning test: %d nodes, %d edges, %d pieces ,%d targets, %d sparse factor\n", (int) N1, (int) N2, - (int)num_nodes, (int) num_edges, (int)num_pieces, (int) num_spaces, (int) sparse_factor); + //printf("Realm %dD -> %dD Preimage dependent partitioning test: %d nodes, %d edges, %d pieces ,%d targets, %d sparse factor, %lu tile size\n", (int) N1, (int) N2, + //(int)num_nodes, (int) num_edges, (int)num_pieces, (int) num_spaces, (int) sparse_factor, buffer_size); } virtual Event initialize_data(const std::vector &memories, @@ -1314,10 +1388,10 @@ class PreimageTest : public TestInterface { is_nodes.by_preimage_buffer_requirements(preimage_subspaces, preimage_inputs, preimage_requirements); for (int i = 0; i < num_pieces; i++) { - alloc_piece(point_field_data_gpu[i].scratch_buffer, preimage_requirements[i].upper_bound, gpu_memory).wait(); + size_t alloc_size = preimage_requirements[i].lower_bound + (preimage_requirements[i].upper_bound - preimage_requirements[i].lower_bound) * buffer_size / 100; + alloc_piece(point_field_data_gpu[i].scratch_buffer, alloc_size, gpu_memory).wait(); } - wait_on_events = true; log_app.info() << "warming up" << Clock::current_time_in_microseconds() << "\n"; Event warmup = is_nodes.create_subspaces_by_preimage(point_field_data_gpu, targets, @@ -1325,21 +1399,24 @@ class PreimageTest : public TestInterface { Realm::ProfilingRequestSet()); warmup.wait(); + long long gpu_start = Clock::current_time_in_microseconds(); Event gpu_call = is_nodes.create_subspaces_by_preimage(point_field_data_gpu, targets, p_nodes, Realm::ProfilingRequestSet()); gpu_call.wait(); - - long long start = Clock::current_time_in_microseconds(); + long long gpu_us = Clock::current_time_in_microseconds() - gpu_start; + long long cpu_start = Clock::current_time_in_microseconds(); Event cpu_call = 
is_nodes.create_subspaces_by_preimage(point_field_data, targets, p_nodes_cpu, Realm::ProfilingRequestSet()); - cpu_call.wait(); - std::cout << "CPU TIME: " << (Clock::current_time_in_microseconds() - start) / 1000 << " ms\n"; + cpu_call.wait(); + long long cpu_us = Clock::current_time_in_microseconds() - cpu_start; + printf("RESULT,op=preimage,d1=%d,d2=%d,num_nodes=%d,num_edges=%d,sparse_factor=%d,buffer_size=%zu,gpu_us=%lld,cpu_us=%lld\n", + N1, N2, num_nodes, num_edges, sparse_factor, buffer_size, gpu_us, cpu_us); return Event::merge_events({gpu_call, cpu_call}); } @@ -1391,6 +1468,339 @@ class PreimageTest : public TestInterface { } }; +template +class PreimageRangeTest : public TestInterface { +public: + // graph config parameters + int num_nodes = 1000; + int num_edges = 1000; + int rect_size = 10; + int num_spaces = 4; + int num_pieces = 4; + int sparse_factor = 50; + size_t buffer_size = 100; + std::string filename; + + PreimageRangeTest(int argc, const char *argv[]) + { + for(int i = 1; i < argc; i++) { + + if(!strcmp(argv[i], "-p")) { + num_pieces = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-n")) { + num_nodes = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-e")) { + num_edges = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-r")) { + rect_size = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-s")) { + num_spaces = atoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "-f")) { + sparse_factor = atoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "-b")) { + buffer_size = atoi(argv[++i]); + continue; + } + } + + + if (num_nodes <= 0 || num_pieces <= 0 || num_edges <= 0 || num_spaces <= 0 || rect_size <= 0 || sparse_factor < 0 || sparse_factor > 100 || buffer_size < 0 || buffer_size > 100) { + log_app.error() << "Invalid config: nodes=" << num_nodes << " colors=" << num_edges << " pieces=" << num_pieces << " targets=" << num_spaces << " rect size=" << rect_size << " sparse factor=" << sparse_factor << " buffer 
size=" << buffer_size << "\n"; + exit(1); + } + } + + struct InitDataArgs { + int index; + RegionInstance ri_nodes; + }; + + enum PRNGStreams + { + NODE_SUBGRAPH_STREAM, + }; + + // assign subgraph ids to nodes + void chase_rect(int idx, Rect& color) + { + for (int d = 0; d < N2; d++) { + if(random_colors) { + color.lo[d] = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_edges); + color.hi[d] = color.lo[d] + Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, 2 * rect_size); + } else { + color.lo[d] = (idx * num_edges / num_nodes) % num_edges; + color.hi[d] = color.lo[d] + rect_size; + } + } + } + + static void init_data_task_wrapper(const void *args, size_t arglen, + const void *userdata, size_t userlen, Processor p) + { + PreimageRangeTest *me = (PreimageRangeTest *)testcfg; + me->init_data_task(args, arglen, p); + } + + //Each piece has a task to initialize its data + void init_data_task(const void *args, size_t arglen, Processor p) + { + const InitDataArgs &i_args = *(const InitDataArgs *)args; + + log_app.info() << "init task #" << i_args.index << " (ri_nodes=" << i_args.ri_nodes + << ")"; + + i_args.ri_nodes.fetch_metadata(p).wait(); + + IndexSpace nodes_space = i_args.ri_nodes.template get_indexspace(); + + log_app.debug() << "N: " << is_nodes; + + //For each node in the graph, mark it with a random (or deterministic) subgraph id + { + AffineAccessor, N1> a_rect(i_args.ri_nodes, 0 /* offset */); + + for (IndexSpaceIterator it(is_nodes); it.valid; it.step()) { + for (PointInRectIterator point(it.rect); point.valid; point.step()) { + int idx = 0; + int stride = 1; + for (int d = 0; d < N1; d++) { + idx += (point.p[d] - is_nodes.bounds.lo[d]) * stride; + stride *= (is_nodes.bounds.hi[d] - is_nodes.bounds.lo[d] + 1); + } + Rect destination; + chase_rect(idx, destination); + a_rect.write(point.p, destination); + } + } + } + } + + IndexSpace is_nodes; + IndexSpace is_edges; + std::vector ri_nodes; + std::vector, Rect> > 
rect_field_data; + + virtual void print_info(void) + { + printf("Realm %dD -> %dD Preimage Range dependent partitioning test: %d nodes, %d edges, %d pieces ,%d targets, %d rect size, %d sparse factor, %lu tile size\n", (int) N1, (int) N2, + (int)num_nodes, (int) num_edges, (int)num_pieces, (int) num_spaces, (int) rect_size, (int) sparse_factor, buffer_size); + } + + virtual Event initialize_data(const std::vector &memories, + const std::vector &procs) + { + // now create index space for nodes + Point node_lo, node_hi; + for (int d = 0; d < N1; d++) { + node_lo[d] = 0; + node_hi[d] = num_nodes - 1; + } + is_nodes = Rect(node_lo, node_hi); + + Point edge_lo, edge_hi; + for (int d = 0; d < N2; d++) { + edge_lo[d] = 0; + edge_hi[d] = num_edges - 1; + } + is_edges = Rect(edge_lo, edge_hi); + + // equal partition is used to do initial population of edges and nodes + std::vector > ss_nodes_eq; + + log_app.info() << "Creating equal subspaces\n"; + + is_nodes.create_equal_subspaces(num_pieces, 1, ss_nodes_eq, Realm::ProfilingRequestSet()).wait(); + + // create instances for each of these subspaces + std::vector node_fields; + node_fields.push_back(sizeof(Rect)); + + ri_nodes.resize(num_pieces); + rect_field_data.resize(num_pieces); + + for(size_t i = 0; i < ss_nodes_eq.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, memories[i % memories.size()], ss_nodes_eq[i], + node_fields, 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + ri_nodes[i] = ri; + + rect_field_data[i].index_space = ss_nodes_eq[i]; + rect_field_data[i].inst = ri_nodes[i]; + rect_field_data[i].field_offset = 0; + } + + // fire off tasks to initialize data + std::set events; + for(int i = 0; i < num_pieces; i++) { + Processor p = procs[i % procs.size()]; + InitDataArgs args; + args.index = i; + args.ri_nodes = ri_nodes[i]; + Event e = p.spawn(INIT_PREIMAGE_RANGE_DATA_TASK, &args, sizeof(args)); + events.insert(e); + } + + return Event::merge_events(events); + } + + // the outputs of 
our partitioning will be: + // p_nodes - nodes partitioned by subgraph id (from GPU) + // p_nodes_cpu - nodes partitioned by subgraph id (from CPU) + + std::vector > p_nodes, p_garbage_nodes, p_nodes_cpu; + + virtual Event perform_partitioning(void) + { + // Partition nodes by subgraph id - do this twice, once on CPU and once on GPU + // Ensure that the results are identical + + std::vector> targets; + if (sparse_factor <= 1) { + is_edges.create_equal_subspaces(num_spaces, 1, targets, Realm::ProfilingRequestSet()).wait(); + } else { + targets.resize(num_spaces); + for (int i = 0; i < num_spaces; i++) { + targets[i] = create_sparse_index_space(is_edges.bounds, sparse_factor, random_colors, i); + } + } + + // We need a GPU memory for GPU partitioning + Memory gpu_memory; + bool found_gpu_memory = false; + Machine machine = Machine::get_machine(); + std::set all_memories; + machine.get_all_memories(all_memories); + for(Memory memory : all_memories) { + if(memory.kind() == Memory::GPU_FB_MEM) { + gpu_memory = memory; + found_gpu_memory = true; + break; + } + } + if (!found_gpu_memory) { + log_app.error() << "No GPU memory found for partitioning test\n"; + return Event::NO_EVENT; + } + + + std::vector node_fields; + node_fields.push_back(sizeof(Rect)); + + std::vector, Rect>> rect_field_data_gpu; + rect_field_data_gpu.resize(num_pieces); + + for (int i = 0; i < num_pieces; i++) { + copy_piece(rect_field_data[i], rect_field_data_gpu[i], node_fields, 0, gpu_memory).wait(); + } + + std::vector> preimage_inputs(num_pieces); + std::vector> preimage_subspaces(num_spaces); + std::vector preimage_requirements(num_pieces); + + for (int i = 0; i < num_pieces; i++) { + preimage_inputs[i].location = rect_field_data_gpu[i].inst.get_location(); + preimage_inputs[i].space = rect_field_data_gpu[i].index_space; + } + + for (int i = 0; i < num_spaces; i++) { + preimage_subspaces[i].space = targets[i]; + preimage_subspaces[i].entries = targets[i].dense() ? 
1 : targets[i].sparsity.impl()->get_entries().size(); + } + + is_nodes.by_preimage_buffer_requirements(preimage_subspaces, preimage_inputs, preimage_requirements); + + for (int i = 0; i < num_pieces; i++) { + size_t alloc_size = preimage_requirements[i].lower_bound + (preimage_requirements[i].upper_bound - preimage_requirements[i].lower_bound) * buffer_size / 100; + alloc_piece(rect_field_data_gpu[i].scratch_buffer, alloc_size, gpu_memory).wait(); + } + + log_app.info() << "warming up" << Clock::current_time_in_microseconds() << "\n"; + Event warmup = is_nodes.create_subspaces_by_preimage(rect_field_data_gpu, + targets, + p_garbage_nodes, + Realm::ProfilingRequestSet()); + warmup.wait(); + + Event gpu_call = is_nodes.create_subspaces_by_preimage(rect_field_data_gpu, + targets, + p_nodes, + Realm::ProfilingRequestSet()); + + if ( wait_on_events ) { + gpu_call.wait(); + } + Event cpu_call = is_nodes.create_subspaces_by_preimage(rect_field_data, + targets, + p_nodes_cpu, + Realm::ProfilingRequestSet()); + + if ( wait_on_events ) { + cpu_call.wait(); + } + + return Event::merge_events({gpu_call, cpu_call}); + } + + virtual int perform_dynamic_checks(void) + { + // Nothing to do here + return 0; + } + + virtual int check_partitioning(void) + { + int errors = 0; + + if (!p_nodes.size()) { + return p_nodes.size() != p_nodes_cpu.size(); + } + + log_app.info() << "Checking correctness of partitioning " << "\n"; + + for(int i = 0; i < num_spaces; i++) { + if (!p_nodes[i].dense() && (N1 > 1)) { + p_nodes[i].sparsity.impl()->request_bvh(); + if (!p_nodes_cpu[i].dense()) { + p_nodes_cpu[i].sparsity.impl()->request_bvh(); + } + } + for(IndexSpaceIterator it(p_nodes[i]); it.valid; it.step()) { + for(PointInRectIterator point(it.rect); point.valid; point.step()) { + if (!p_nodes_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! 
GPU has extra image point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for(IndexSpaceIterator it(p_nodes_cpu[i]); it.valid; it.step()) { + for(PointInRectIterator point(it.rect); point.valid; point.step()) { + if (!p_nodes[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing image point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + + } + return errors; + } +}; + void top_level_task(const void *args, size_t arglen, const void *userdata, size_t userlen, Processor p) { @@ -1470,7 +1880,6 @@ void top_level_task(const void *args, size_t arglen, const void *userdata, size_ exit(1); } - printf("all done!\n"); } // Constructor function-pointer type @@ -1524,6 +1933,18 @@ static constexpr CtorFn PREIMAGE_CTORS[3][3] = { { &make_preimage<3,1>, &make_preimage<3,2>, &make_preimage<3,3> }, }; +// ---- Image constructors ---- +template +static TestInterface* make_preimage_range(int argc, const char** argv) { + return new PreimageRangeTest(argc, argv); +} + +static constexpr CtorFn PREIMAGE_RANGE_CTORS[3][3] = { + { &make_preimage_range<1,1>, &make_preimage_range<1,2>, &make_preimage_range<1,3> }, + { &make_preimage_range<2,1>, &make_preimage_range<2,2>, &make_preimage_range<2,3> }, + { &make_preimage_range<3,1>, &make_preimage_range<3,2>, &make_preimage_range<3,3> }, +}; + using TaskWrapperFn = void (*)(const void*, size_t, const void*, size_t, Processor); static constexpr TaskWrapperFn BYFIELD_INIT_TBL[3] = { @@ -1550,6 +1971,12 @@ static constexpr TaskWrapperFn PREIMAGE_INIT_TBL[3][3] = { { &PreimageTest<3,1>::init_data_task_wrapper, &PreimageTest<3,2>::init_data_task_wrapper, &PreimageTest<3,3>::init_data_task_wrapper }, }; +static constexpr TaskWrapperFn PREIMAGE_RANGE_INIT_TBL[3][3] = { + { &PreimageRangeTest<1,1>::init_data_task_wrapper, &PreimageRangeTest<1,2>::init_data_task_wrapper, &PreimageRangeTest<1,3>::init_data_task_wrapper }, + { &PreimageRangeTest<2,1>::init_data_task_wrapper, 
&PreimageRangeTest<2,2>::init_data_task_wrapper, &PreimageRangeTest<2,3>::init_data_task_wrapper }, + { &PreimageRangeTest<3,1>::init_data_task_wrapper, &PreimageRangeTest<3,2>::init_data_task_wrapper, &PreimageRangeTest<3,3>::init_data_task_wrapper }, +}; + int main(int argc, char **argv) { Runtime rt; @@ -1597,6 +2024,7 @@ int main(int argc, char **argv) if (dimension1 < 1 || dimension1 > 3) assert(false && "invalid dimension"); + op = "byfield"; testcfg = BYFIELD_CTORS[dimension1 - 1](argc - i, const_cast(argv + i)); break; } @@ -1604,13 +2032,15 @@ int main(int argc, char **argv) if(!strcmp(argv[i], "image")) { if (dimension1 < 1 || dimension1 > 3 || dimension2 < 1 || dimension2 > 3) assert(false && "invalid dimension"); + op = "image"; testcfg = IMAGE_CTORS[dimension1 - 1][dimension2 - 1](argc - i, const_cast(argv + i)); break; } - if(!strcmp(argv[i], "range")) { + if(!strcmp(argv[i], "irange")) { if (dimension1 < 1 || dimension1 > 3 || dimension2 < 1 || dimension2 > 3) assert(false && "invalid dimension"); + op = "irange"; testcfg = IMAGE_RANGE_CTORS[dimension1 - 1][dimension2 - 1](argc - i, const_cast(argv + i)); break; } @@ -1618,10 +2048,19 @@ int main(int argc, char **argv) if(!strcmp(argv[i], "preimage")) { if (dimension1 < 1 || dimension1 > 3 || dimension2 < 1 || dimension2 > 3) assert(false && "invalid dimension"); + op = "preimage"; testcfg = PREIMAGE_CTORS[dimension1 - 1][dimension2 - 1](argc - i, const_cast(argv + i)); break; } + if(!strcmp(argv[i], "prange")) { + if (dimension1 < 1 || dimension1 > 3 || dimension2 < 1 || dimension2 > 3) + assert(false && "invalid dimension"); + op = "prange"; + testcfg = PREIMAGE_RANGE_CTORS[dimension1 - 1][dimension2 - 1](argc - i, const_cast(argv + i)); + break; + } + // printf("unknown parameter: %s\n", argv[i]); } @@ -1639,6 +2078,7 @@ int main(int argc, char **argv) rt.register_task(INIT_IMAGE_DATA_TASK, IMAGE_INIT_TBL[dimension1 - 1][dimension2 - 1]); rt.register_task(INIT_IMAGE_RANGE_DATA_TASK, 
IMAGE_RANGE_INIT_TBL[dimension1 - 1][dimension2 - 1]); rt.register_task(INIT_PREIMAGE_DATA_TASK, PREIMAGE_INIT_TBL[dimension1 - 1][dimension2 - 1]); + rt.register_task(INIT_PREIMAGE_RANGE_DATA_TASK, PREIMAGE_RANGE_INIT_TBL[dimension1 - 1][dimension2 - 1]); signal(SIGALRM, sigalrm_handler); From 83cb1d67c98ec27cffd9f3300c21972e1255e788 Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Tue, 10 Mar 2026 21:47:53 -0700 Subject: [PATCH 20/32] trying full benchmark --- src/realm/deppart/byfield.cc | 2 +- tests/benchmark.cc | 34 +++++++++++++++++++--------------- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/src/realm/deppart/byfield.cc b/src/realm/deppart/byfield.cc index 06c936f0b2..78aceb2f92 100644 --- a/src/realm/deppart/byfield.cc +++ b/src/realm/deppart/byfield.cc @@ -45,7 +45,7 @@ namespace Realm { if (val) { device_size = atoi(val); } - size_t optimal_size = is.bounds.volume() * 10 * sizeof(RectDesc); + size_t optimal_size = is.bounds.volume() * 20 * sizeof(RectDesc); Processor best_proc = Processor::NO_PROC; assert(choose_proc(best_proc, mem)); requirements[i].affinity_processor = best_proc; diff --git a/tests/benchmark.cc b/tests/benchmark.cc index 177ceeb558..3259644270 100644 --- a/tests/benchmark.cc +++ b/tests/benchmark.cc @@ -113,6 +113,7 @@ Event copy_piece(FieldDataDescriptor src_data, FieldDataDescriptor src_data, FieldDataDescriptor src_fields = {src_field}; std::vector dst_fields = {dst_field}; @@ -664,7 +664,7 @@ class ImageTest : public TestInterface { // Partition nodes by subgraph id - do this twice, once on CPU and once on GPU // Ensure that the results are identical - std::vector> sources(num_pieces); + std::vector> sources(num_spaces); for(int i = 0; i < num_spaces; i++) { if (sparse_factor <= 1) { sources[i] = point_field_data[i % num_pieces].index_space; @@ -730,23 +730,24 @@ class ImageTest : public TestInterface { Realm::ProfilingRequestSet()); warmup.wait(); + long long start_gpu = Clock::current_time_in_microseconds(); 
Event gpu_call = is_edges.create_subspaces_by_image(point_field_data_gpu, sources, p_edges, Realm::ProfilingRequestSet()); - if ( wait_on_events ) { - gpu_call.wait(); - } - + gpu_call.wait(); + long long gpu_us = Clock::current_time_in_microseconds() - start_gpu; + long long start_cpu = Clock::current_time_in_microseconds(); Event cpu_call = is_edges.create_subspaces_by_image(point_field_data, sources, p_edges_cpu, Realm::ProfilingRequestSet()); - if ( wait_on_events ) { - cpu_call.wait(); - } + cpu_call.wait(); + long long cpu_us = Clock::current_time_in_microseconds() - start_cpu; + printf("RESULT,op=image,d1=%d,d2=%d,num_nodes=%d,num_edges=%d,num_spaces=%d,sparse_factor=%d,buffer_size=%zu,gpu_us=%lld,cpu_us=%lld\n", + N1, N2, num_nodes, num_edges, num_spaces, sparse_factor, buffer_size, gpu_us, cpu_us); return Event::merge_events({gpu_call, cpu_call}); @@ -1067,23 +1068,26 @@ class ImageRangeTest : public TestInterface { Realm::ProfilingRequestSet()); warmup.wait(); + long long start_gpu = Clock::current_time_in_microseconds(); Event gpu_call = is_edges.create_subspaces_by_image(rect_field_data_gpu, sources, p_edges, Realm::ProfilingRequestSet()); - if ( wait_on_events ) { - gpu_call.wait(); - } + gpu_call.wait(); + long long gpu_us = Clock::current_time_in_microseconds() - start_gpu; + long long start_cpu = Clock::current_time_in_microseconds(); Event cpu_call = is_edges.create_subspaces_by_image(rect_field_data, sources, p_edges_cpu, Realm::ProfilingRequestSet()); - if ( wait_on_events ) { - cpu_call.wait(); - } + cpu_call.wait(); + long long cpu_us = Clock::current_time_in_microseconds() - start_cpu; + + printf("RESULT,op=image,d1=%d,d2=%d,num_nodes=%d,num_edges=%d,num_spaces=%d,sparse_factor=%d,buffer_size=%zu,gpu_us=%lld,cpu_us=%lld\n", + N1, N2, num_nodes, num_edges, num_spaces, sparse_factor, buffer_size, gpu_us, cpu_us); return Event::merge_events({gpu_call, cpu_call}); From a55e5c69cf36fc20995cdf9691ac71d5718b4284 Mon Sep 17 00:00:00 2001 From: Rohan 
Chanani Date: Tue, 10 Mar 2026 23:09:49 -0700 Subject: [PATCH 21/32] bumped upper bounds --- src/realm/deppart/image.cc | 2 +- src/realm/deppart/preimage.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/realm/deppart/image.cc b/src/realm/deppart/image.cc index ec8cfb834d..c0656d4b59 100644 --- a/src/realm/deppart/image.cc +++ b/src/realm/deppart/image.cc @@ -74,7 +74,7 @@ namespace Realm { device_size = atoi(val); } minimal_size = max(minimal_size, device_size); - size_t optimal_size = is.bounds.volume() * sizeof(Rect) * source_spaces.size() * 10 + minimal_size; + size_t optimal_size = is.bounds.volume() * sizeof(Rect) * source_spaces.size() * 20 + minimal_size; Processor best_proc; assert(choose_proc(best_proc, mem)); requirements[i].affinity_processor = best_proc; diff --git a/src/realm/deppart/preimage.cc b/src/realm/deppart/preimage.cc index 37bfef188a..9ac7d85606 100644 --- a/src/realm/deppart/preimage.cc +++ b/src/realm/deppart/preimage.cc @@ -76,7 +76,7 @@ namespace Realm { device_size = atoi(val); } minimal_size = max(minimal_size, device_size); - size_t optimal_size = is.bounds.volume() * sizeof(Rect) * target_spaces.size() * 10 + minimal_size; + size_t optimal_size = is.bounds.volume() * sizeof(Rect) * target_spaces.size() * 20 + minimal_size; Processor best_proc = Processor::NO_PROC; assert(choose_proc(best_proc, mem)); requirements[i].affinity_processor = best_proc; From 669b69a303a0293308342606b68de48751240ba5 Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Wed, 11 Mar 2026 00:38:25 -0700 Subject: [PATCH 22/32] fixed construct input rectlist --- src/realm/deppart/partitions_gpu_impl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/realm/deppart/partitions_gpu_impl.hpp b/src/realm/deppart/partitions_gpu_impl.hpp index d136c2138b..90f3a9056d 100644 --- a/src/realm/deppart/partitions_gpu_impl.hpp +++ b/src/realm/deppart/partitions_gpu_impl.hpp @@ -426,7 +426,7 @@ namespace Realm { 
CUDA_CHECK(cudaMemsetAsync(counters, 0, (lhs.num_children) * sizeof(uint32_t), stream), stream); BVH my_bvh; - bool bvh_valid = rhs.num_children < rhs.num_entries; + bool bvh_valid = rhs.num_children < rhs.num_entries && lhs.num_children < lhs.num_entries; if (bvh_valid) { build_bvh(rhs, my_bvh, my_arena, stream); } From 17003b1bb2fa70faea534919d833392b7a766ead Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Wed, 11 Mar 2026 12:32:55 -0700 Subject: [PATCH 23/32] fixed overflow --- src/realm/deppart/image.cc | 4 +++- src/realm/deppart/image_gpu_impl.hpp | 6 +++++- src/realm/deppart/partitions_gpu_impl.hpp | 12 +++++++++++- tests/benchmark.cc | 4 +++- 4 files changed, 22 insertions(+), 4 deletions(-) diff --git a/src/realm/deppart/image.cc b/src/realm/deppart/image.cc index c0656d4b59..edc8ffc010 100644 --- a/src/realm/deppart/image.cc +++ b/src/realm/deppart/image.cc @@ -74,13 +74,15 @@ namespace Realm { device_size = atoi(val); } minimal_size = max(minimal_size, device_size); - size_t optimal_size = is.bounds.volume() * sizeof(Rect) * source_spaces.size() * 20 + minimal_size; + size_t optimal_size = is.bounds.volume() * sizeof(RectDesc) * source_spaces.size() * 20 + minimal_size; + optimal_size += 2 * (is.dense() ? 
1 : is.sparsity.impl()->get_entries().size()) * sizeof(Rect) * source_entries; Processor best_proc; assert(choose_proc(best_proc, mem)); requirements[i].affinity_processor = best_proc; requirements[i].lower_bound = minimal_size; requirements[i].upper_bound = optimal_size; requirements[i].minimum_alignment = 128; + std::cout << "UPPER BOUND IS " << optimal_size << std::endl; } else { requirements[i].affinity_processor = Processor::NO_PROC; requirements[i].lower_bound = 0; diff --git a/src/realm/deppart/image_gpu_impl.hpp b/src/realm/deppart/image_gpu_impl.hpp index 43682e06dd..48faad0585 100644 --- a/src/realm/deppart/image_gpu_impl.hpp +++ b/src/realm/deppart/image_gpu_impl.hpp @@ -332,7 +332,7 @@ void GPUImageMicroOp::gpu_populate_ptrs() CUstream stream = this->stream->get_stream(); size_t tile_size = buffer.get_layout()->bytes_used; - //std::cout << "Using tile size of " << tile_size << " bytes." << std::endl; + std::cout << "Using tile size of " << tile_size << " bytes." << std::endl; Arena buffer_arena(buffer.pointer_untyped(0, tile_size), tile_size); collapsed_space src_space; @@ -449,6 +449,10 @@ void GPUImageMicroOp::gpu_populate_ptrs() buffer_arena.flip_parity(); PointDesc* d_valid_points = buffer_arena.alloc>(num_valid_points); + buffer_arena.start(); + d_valid_points = buffer_arena.alloc>(num_valid_points); + + std::cout << "Tile has " << num_valid_rects << " valid rects and " << num_valid_points << " valid points." 
<< std::endl; CUDA_CHECK(cudaMemsetAsync(d_inst_counters, 0, (domain_transform.ptr_data.size()) * sizeof(uint32_t), stream), stream); diff --git a/src/realm/deppart/partitions_gpu_impl.hpp b/src/realm/deppart/partitions_gpu_impl.hpp index 90f3a9056d..722a4113df 100644 --- a/src/realm/deppart/partitions_gpu_impl.hpp +++ b/src/realm/deppart/partitions_gpu_impl.hpp @@ -426,7 +426,7 @@ namespace Realm { CUDA_CHECK(cudaMemsetAsync(counters, 0, (lhs.num_children) * sizeof(uint32_t), stream), stream); BVH my_bvh; - bool bvh_valid = rhs.num_children < rhs.num_entries && lhs.num_children < lhs.num_entries; + bool bvh_valid = rhs.num_children < rhs.num_entries && lhs.num_children < lhs.num_entries && lhs.num_entries > 1000; if (bvh_valid) { build_bvh(rhs, my_bvh, my_arena, stream); } @@ -1462,10 +1462,16 @@ namespace Realm { size_t max_aux_bytes = std::max({bytes_T, bytes_S, bytes_R}); size_t max_pg_bytes = std::max({bytes_p, bytes_S}); + std::cout << "COMPLETE PIPELINE HAS USED " << my_arena.used() << " bytes" << " out of " << my_arena.capacity() << std::endl; + std::cout << "TOTAL POINTS IS " << total_pts << std::endl; + + std::cout << "AUX BYTES: " << max_aux_bytes << std::endl; // Instance shared by coordinate keys, source keys, and rectangle outputs char* aux_ptr = my_arena.alloc(2 * max_aux_bytes); + std::cout << "PG BYTES: " << max_pg_bytes << std::endl; + //Instance shared by group ids (RLE) and intermediate points in sorting char* pg_ptr = my_arena.alloc(max_pg_bytes); @@ -1492,8 +1498,12 @@ namespace Realm { //Temporary storage instance shared by CUB operations. 
size_t temp_bytes = std::max({t1, t2, t3}); + + std::cout << "TEMP BYTES: " << temp_bytes << std::endl; void *temp_storage = my_arena.alloc(temp_bytes); + std::cout << "TOTAL BYTES: " << my_arena.used() + temp_bytes << std::endl; + //Sort along each dimension from LSB to MSB (0 to N-1) size_t use_bytes = temp_bytes; diff --git a/tests/benchmark.cc b/tests/benchmark.cc index 3259644270..cc3b17a634 100644 --- a/tests/benchmark.cc +++ b/tests/benchmark.cc @@ -137,7 +137,7 @@ Event alloc_piece(RegionInstance &result, size_t size, Memory location) { assert(location != Memory::NO_MEMORY); assert(size > 0); std::vector byte_fields = {sizeof(char)}; - IndexSpace<1> instance_index_space(Rect<1>(0, size-1)); + IndexSpace<1, long long> instance_index_space(Rect<1, long long>(0, size-1)); return RegionInstance::create_instance(result, location, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()); } @@ -720,6 +720,7 @@ class ImageTest : public TestInterface { for (int i = 0; i < num_pieces; i++) { size_t alloc_size = image_requirements[i].lower_bound + (image_requirements[i].upper_bound - image_requirements[i].lower_bound) * buffer_size / 100; + std::cout << "Allocating scratch buffer with size " << alloc_size << " for piece " << i << "\n"; alloc_piece(point_field_data_gpu[i].scratch_buffer, alloc_size, gpu_memory).wait(); } @@ -1058,6 +1059,7 @@ class ImageRangeTest : public TestInterface { for (int i = 0; i < num_pieces; i++) { size_t alloc_size = image_requirements[i].lower_bound + (image_requirements[i].upper_bound - image_requirements[i].lower_bound) * buffer_size / 100; + std::cout << "allocating buffer of size " << alloc_size << " for piece " << i << "\n"; alloc_piece(rect_field_data_gpu[i].scratch_buffer, alloc_size, gpu_memory).wait(); } From dc8d5743838ea06b342775527683b78ea242a7c6 Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Wed, 11 Mar 2026 12:34:37 -0700 Subject: [PATCH 24/32] fixed overflow --- src/realm/deppart/image.cc | 1 - 
src/realm/deppart/partitions_gpu_impl.hpp | 11 ----------- tests/benchmark.cc | 2 -- 3 files changed, 14 deletions(-) diff --git a/src/realm/deppart/image.cc b/src/realm/deppart/image.cc index edc8ffc010..9eaf7b8197 100644 --- a/src/realm/deppart/image.cc +++ b/src/realm/deppart/image.cc @@ -82,7 +82,6 @@ namespace Realm { requirements[i].lower_bound = minimal_size; requirements[i].upper_bound = optimal_size; requirements[i].minimum_alignment = 128; - std::cout << "UPPER BOUND IS " << optimal_size << std::endl; } else { requirements[i].affinity_processor = Processor::NO_PROC; requirements[i].lower_bound = 0; diff --git a/src/realm/deppart/partitions_gpu_impl.hpp b/src/realm/deppart/partitions_gpu_impl.hpp index 722a4113df..93cfc5582b 100644 --- a/src/realm/deppart/partitions_gpu_impl.hpp +++ b/src/realm/deppart/partitions_gpu_impl.hpp @@ -1462,16 +1462,9 @@ namespace Realm { size_t max_aux_bytes = std::max({bytes_T, bytes_S, bytes_R}); size_t max_pg_bytes = std::max({bytes_p, bytes_S}); - std::cout << "COMPLETE PIPELINE HAS USED " << my_arena.used() << " bytes" << " out of " << my_arena.capacity() << std::endl; - std::cout << "TOTAL POINTS IS " << total_pts << std::endl; - - std::cout << "AUX BYTES: " << max_aux_bytes << std::endl; - // Instance shared by coordinate keys, source keys, and rectangle outputs char* aux_ptr = my_arena.alloc(2 * max_aux_bytes); - std::cout << "PG BYTES: " << max_pg_bytes << std::endl; - //Instance shared by group ids (RLE) and intermediate points in sorting char* pg_ptr = my_arena.alloc(max_pg_bytes); @@ -1499,12 +1492,8 @@ namespace Realm { //Temporary storage instance shared by CUB operations. 
size_t temp_bytes = std::max({t1, t2, t3}); - std::cout << "TEMP BYTES: " << temp_bytes << std::endl; void *temp_storage = my_arena.alloc(temp_bytes); - std::cout << "TOTAL BYTES: " << my_arena.used() + temp_bytes << std::endl; - - //Sort along each dimension from LSB to MSB (0 to N-1) size_t use_bytes = temp_bytes; diff --git a/tests/benchmark.cc b/tests/benchmark.cc index cc3b17a634..b0bed444e1 100644 --- a/tests/benchmark.cc +++ b/tests/benchmark.cc @@ -720,7 +720,6 @@ class ImageTest : public TestInterface { for (int i = 0; i < num_pieces; i++) { size_t alloc_size = image_requirements[i].lower_bound + (image_requirements[i].upper_bound - image_requirements[i].lower_bound) * buffer_size / 100; - std::cout << "Allocating scratch buffer with size " << alloc_size << " for piece " << i << "\n"; alloc_piece(point_field_data_gpu[i].scratch_buffer, alloc_size, gpu_memory).wait(); } @@ -1059,7 +1058,6 @@ class ImageRangeTest : public TestInterface { for (int i = 0; i < num_pieces; i++) { size_t alloc_size = image_requirements[i].lower_bound + (image_requirements[i].upper_bound - image_requirements[i].lower_bound) * buffer_size / 100; - std::cout << "allocating buffer of size " << alloc_size << " for piece " << i << "\n"; alloc_piece(rect_field_data_gpu[i].scratch_buffer, alloc_size, gpu_memory).wait(); } From 0e836f0d98ad61e32ecc1f230a54ff1e9c102184 Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Wed, 11 Mar 2026 13:02:37 -0700 Subject: [PATCH 25/32] removed prints --- src/realm/deppart/image_gpu_impl.hpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/realm/deppart/image_gpu_impl.hpp b/src/realm/deppart/image_gpu_impl.hpp index 48faad0585..fa25ab5632 100644 --- a/src/realm/deppart/image_gpu_impl.hpp +++ b/src/realm/deppart/image_gpu_impl.hpp @@ -332,7 +332,7 @@ void GPUImageMicroOp::gpu_populate_ptrs() CUstream stream = this->stream->get_stream(); size_t tile_size = buffer.get_layout()->bytes_used; - std::cout << "Using tile size of " << 
tile_size << " bytes." << std::endl; + //std::cout << "Using tile size of " << tile_size << " bytes." << std::endl; Arena buffer_arena(buffer.pointer_untyped(0, tile_size), tile_size); collapsed_space src_space; @@ -452,8 +452,6 @@ void GPUImageMicroOp::gpu_populate_ptrs() buffer_arena.start(); d_valid_points = buffer_arena.alloc>(num_valid_points); - std::cout << "Tile has " << num_valid_rects << " valid rects and " << num_valid_points << " valid points." << std::endl; - CUDA_CHECK(cudaMemsetAsync(d_inst_counters, 0, (domain_transform.ptr_data.size()) * sizeof(uint32_t), stream), stream); image_gpuPopulateBitmasksPtrsKernel<<>>(d_accessors, d_valid_rects, collapsed_parent.entries_buffer, d_prefix_rects, d_inst_prefix, d_prefix_points, total_pts, num_valid_rects, domain_transform.ptr_data.size(), collapsed_parent.num_entries, d_inst_counters, d_valid_points); From 27771caa56b5ebc8d62364084cb6e250dbb67d1d Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Wed, 11 Mar 2026 17:18:48 -0700 Subject: [PATCH 26/32] picked better host memories --- src/realm/deppart/byfield_gpu_impl.hpp | 6 ++--- src/realm/deppart/image_gpu_impl.hpp | 14 +++++----- src/realm/deppart/partitions.h | 10 ++++--- src/realm/deppart/partitions_gpu_impl.hpp | 33 +++++++++++------------ src/realm/deppart/preimage_gpu_impl.hpp | 12 ++++----- 5 files changed, 39 insertions(+), 36 deletions(-) diff --git a/src/realm/deppart/byfield_gpu_impl.hpp b/src/realm/deppart/byfield_gpu_impl.hpp index 8e1c953730..4d59d30b54 100644 --- a/src/realm/deppart/byfield_gpu_impl.hpp +++ b/src/realm/deppart/byfield_gpu_impl.hpp @@ -31,7 +31,7 @@ void GPUByFieldMicroOp::execute() //std::cout << "Using tile size of " << tile_size << " bytes." 
<< std::endl; - Arena buffer_arena(field_data[0].scratch_buffer.pointer_untyped(0, tile_size), tile_size); + Arena buffer_arena(field_data[0].scratch_buffer); inst_space.offsets = buffer_arena.alloc(field_data.size() + 1); inst_space.num_children = field_data.size(); @@ -76,7 +76,7 @@ void GPUByFieldMicroOp::execute() Memory zcpy_mem; - assert(find_memory(zcpy_mem, Memory::Z_COPY_MEM)); + assert(find_memory(zcpy_mem, Memory::Z_COPY_MEM, buffer_arena.location)); // We need to pass the accessors to the GPU so it can read field values. RegionInstance accessors_instance = this->realm_malloc(field_data.size() * sizeof(AffineAccessor), zcpy_mem); @@ -94,7 +94,7 @@ void GPUByFieldMicroOp::execute() } Memory sysmem; - assert(find_memory(sysmem, Memory::SYSTEM_MEM)); + assert(find_memory(sysmem, Memory::SYSTEM_MEM, buffer_arena.location)); size_t num_output = 0; RectDesc* output_start = nullptr; diff --git a/src/realm/deppart/image_gpu_impl.hpp b/src/realm/deppart/image_gpu_impl.hpp index fa25ab5632..7bac9f9054 100644 --- a/src/realm/deppart/image_gpu_impl.hpp +++ b/src/realm/deppart/image_gpu_impl.hpp @@ -49,7 +49,7 @@ void GPUImageMicroOp::gpu_populate_rngs() RegionInstance buffer = domain_transform.range_data[0].scratch_buffer; size_t tile_size = buffer.get_layout()->bytes_used; //std::cout << "Using tile size of " << tile_size << " bytes." 
<< std::endl; - Arena buffer_arena(buffer.pointer_untyped(0, tile_size), tile_size); + Arena buffer_arena(buffer); CUstream stream = this->stream->get_stream(); @@ -81,7 +81,7 @@ void GPUImageMicroOp::gpu_populate_rngs() GPUMicroOp::collapse_parent_space(parent_space, collapsed_parent, buffer_arena, stream); Memory zcpy_mem; - assert(find_memory(zcpy_mem, Memory::Z_COPY_MEM)); + assert(find_memory(zcpy_mem, Memory::Z_COPY_MEM, buffer_arena.location)); RegionInstance accessors_instance = this->realm_malloc(domain_transform.range_data.size() * sizeof(AffineAccessor,N2,T2>), zcpy_mem); AffineAccessor,N2,T2>* d_accessors = reinterpret_cast,N2,T2>*>(AffineAccessor(accessors_instance, 0).base); for (size_t i = 0; i < domain_transform.range_data.size(); ++i) { @@ -326,14 +326,14 @@ void GPUImageMicroOp::gpu_populate_ptrs() NVTX_DEPPART(gpu_image); - Memory sysmem; - find_memory(sysmem, Memory::SYSTEM_MEM); - CUstream stream = this->stream->get_stream(); size_t tile_size = buffer.get_layout()->bytes_used; //std::cout << "Using tile size of " << tile_size << " bytes." 
<< std::endl; - Arena buffer_arena(buffer.pointer_untyped(0, tile_size), tile_size); + Arena buffer_arena(buffer); + + Memory sysmem; + assert(find_memory(sysmem, Memory::SYSTEM_MEM, buffer_arena.location)); collapsed_space src_space; src_space.offsets = buffer_arena.alloc(sources.size()+1); @@ -366,7 +366,7 @@ void GPUImageMicroOp::gpu_populate_ptrs() GPUMicroOp::collapse_parent_space(parent_space, collapsed_parent, buffer_arena, stream); Memory zcpy_mem; - assert(find_memory(zcpy_mem, Memory::Z_COPY_MEM)); + assert(find_memory(zcpy_mem, Memory::Z_COPY_MEM, buffer_arena.location)); RegionInstance accessors_instance = this->realm_malloc(domain_transform.ptr_data.size() * sizeof(AffineAccessor,N2,T2>), zcpy_mem); AffineAccessor,N2,T2>* d_accessors = reinterpret_cast,N2,T2>*>(AffineAccessor(accessors_instance, 0).base); for (size_t i = 0; i < domain_transform.ptr_data.size(); ++i) { diff --git a/src/realm/deppart/partitions.h b/src/realm/deppart/partitions.h index 9ccbde6b75..a3d0d3feb8 100644 --- a/src/realm/deppart/partitions.h +++ b/src/realm/deppart/partitions.h @@ -52,6 +52,7 @@ namespace Realm { namespace Cuda { class GPUStream; + class GPUProcessor; } template @@ -114,9 +115,10 @@ namespace Realm { public: using byte = std::byte; - Arena() noexcept : base_(nullptr), cap_(0), parity_(false), left_(0), right_(0), base_left_(0), base_right_(0) {} - Arena(void* buffer, size_t bytes) noexcept - : base_(reinterpret_cast(buffer)), cap_(bytes), parity_(false), left_(0), right_(0), base_left_(0), base_right_(0) {} + Arena() noexcept : location(Memory::NO_MEMORY), base_(nullptr), cap_(0), parity_(false), left_(0), right_(0), base_left_(0), base_right_(0) {} + Arena(void* buffer, size_t bytes, Memory location) noexcept + : location(location), base_(reinterpret_cast(buffer)), cap_(bytes), parity_(false), left_(0), right_(0), base_left_(0), base_right_(0) {} + Arena(RegionInstance buffer) : Arena(buffer.pointer_untyped(0, buffer.get_layout()->bytes_used), 
buffer.get_layout()->bytes_used, buffer.get_location()) {} size_t capacity() const noexcept { return cap_; } size_t used() const noexcept { return left_ + right_; } @@ -194,6 +196,8 @@ namespace Realm { parity_ = false; } + Memory location; + private: void* alloc_left_bytes(size_t bytes, size_t align = alignof(std::max_align_t)) { diff --git a/src/realm/deppart/partitions_gpu_impl.hpp b/src/realm/deppart/partitions_gpu_impl.hpp index 93cfc5582b..015a1b7726 100644 --- a/src/realm/deppart/partitions_gpu_impl.hpp +++ b/src/realm/deppart/partitions_gpu_impl.hpp @@ -90,20 +90,22 @@ namespace Realm { }; // Finds a memory of the specified kind. Returns true on success, false otherwise. - inline bool find_memory(Memory &output, Memory::Kind kind) + inline bool find_memory(Memory &output, Memory::Kind kind, Memory input = Memory::NO_MEMORY) { - bool found = false; - Machine machine = Machine::get_machine(); - std::set all_memories; - machine.get_all_memories(all_memories); - for(auto& memory : all_memories) { - if(memory.kind() == kind) { - output = memory; - found = true; - break; + std::vector affinities; + unsigned best_bandwidth = 0; + output = Memory::NO_MEMORY; + Machine::get_machine().get_mem_mem_affinity(affinities, input, Memory::NO_MEMORY); + for (auto affinity : affinities) { + if (affinity.m2.kind() != kind) { + continue; + } + if (affinity.bandwidth > best_bandwidth) { + best_bandwidth = affinity.bandwidth; + output = affinity.m2; } } - return found; + return output != Memory::NO_MEMORY; } template @@ -228,7 +230,7 @@ namespace Realm { //We copy into one contiguous host buffer, then copy to device Memory sysmem; - assert(find_memory(sysmem, Memory::SYSTEM_MEM)); + assert(find_memory(sysmem, Memory::SYSTEM_MEM, my_arena.location)); RegionInstance h_instance = realm_malloc(out_space.num_entries * sizeof(SparsityMapEntry), sysmem); @@ -610,9 +612,6 @@ namespace Realm { NVTX_DEPPART(complete_rect_pipeline); CUstream stream = this->stream->get_stream(); - Memory 
my_mem; - assert(find_memory(my_mem, Memory::GPU_FB_MEM)); - assert(!my_arena.get_parity()); size_t beginning = my_arena.mark(); @@ -1579,7 +1578,7 @@ namespace Realm { RegionInstance sys_instance = RegionInstance::NO_INST; Memory sysmem; - assert(find_memory(sysmem, Memory::SYSTEM_MEM)); + assert(find_memory(sysmem, Memory::SYSTEM_MEM, my_arena.location)); Rect* final_rects; std::vector d_starts_host(output_instances.size()), d_ends_host(output_instances.size()); @@ -1707,7 +1706,7 @@ namespace Realm { } Memory sysmem; - assert(find_memory(sysmem, Memory::SYSTEM_MEM)); + assert(find_memory(sysmem, Memory::SYSTEM_MEM, my_arena.location)); if (!this->exclusive) { for (auto const& elem : ctr) { size_t idx = getIndex(elem); diff --git a/src/realm/deppart/preimage_gpu_impl.hpp b/src/realm/deppart/preimage_gpu_impl.hpp index 3a104f2e84..2a93136921 100644 --- a/src/realm/deppart/preimage_gpu_impl.hpp +++ b/src/realm/deppart/preimage_gpu_impl.hpp @@ -19,12 +19,12 @@ namespace Realm { size_t tile_size = buffer.get_layout()->bytes_used; //std::cout << "Using tile size of " << tile_size << " bytes." 
<< std::endl; - Arena buffer_arena(buffer.pointer_untyped(0, tile_size), tile_size); + Arena buffer_arena(buffer); NVTX_DEPPART(gpu_preimage_range); Memory sysmem; - find_memory(sysmem, Memory::SYSTEM_MEM); + assert(find_memory(sysmem, Memory::SYSTEM_MEM, buffer_arena.location)); CUstream stream = this->stream->get_stream(); @@ -58,7 +58,7 @@ namespace Realm { GPUMicroOp::collapse_multi_space(targets, target_space, buffer_arena, stream); Memory zcpy_mem; - assert(find_memory(zcpy_mem, Memory::Z_COPY_MEM)); + assert(find_memory(zcpy_mem, Memory::Z_COPY_MEM, buffer_arena.location)); RegionInstance accessors_instance = this->realm_malloc(domain_transform.range_data.size() * sizeof(AffineAccessor,N,T>), zcpy_mem); AffineAccessor,N,T>* d_accessors = reinterpret_cast,N,T>*>(AffineAccessor(accessors_instance, 0).base); for (size_t i = 0; i < domain_transform.range_data.size(); ++i) { @@ -327,10 +327,10 @@ namespace Realm { size_t tile_size = buffer.get_layout()->bytes_used; //std::cout << "Using tile size of " << tile_size << " bytes." 
<< std::endl; - Arena buffer_arena(buffer.pointer_untyped(0, tile_size), tile_size); + Arena buffer_arena(buffer); Memory sysmem; - find_memory(sysmem, Memory::SYSTEM_MEM); + assert(find_memory(sysmem, Memory::SYSTEM_MEM, buffer_arena.location)); CUstream stream = this->stream->get_stream(); @@ -366,7 +366,7 @@ namespace Realm { GPUMicroOp::collapse_multi_space(targets, target_space, buffer_arena, stream); Memory zcpy_mem; - assert(find_memory(zcpy_mem, Memory::Z_COPY_MEM)); + assert(find_memory(zcpy_mem, Memory::Z_COPY_MEM, buffer_arena.location)); RegionInstance accessors_instance = this->realm_malloc(domain_transform.ptr_data.size() * sizeof(AffineAccessor,N,T>), zcpy_mem); AffineAccessor,N,T>* d_accessors = reinterpret_cast,N,T>*>(AffineAccessor(accessors_instance, 0).base); for (size_t i = 0; i < domain_transform.ptr_data.size(); ++i) { From 07e354acec9d013f91d10a7429115e6c96bfccef Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Thu, 19 Mar 2026 15:35:21 -0700 Subject: [PATCH 27/32] for flecsii --- src/realm/deppart/byfield.cc | 100 +++++- src/realm/deppart/byfield.h | 20 +- src/realm/deppart/byfield_gpu_impl.hpp | 40 ++- src/realm/deppart/image.cc | 205 ++++++++++-- src/realm/deppart/image.h | 19 ++ src/realm/deppart/image_gpu_impl.hpp | 58 ++-- src/realm/deppart/partitions.cc | 11 - src/realm/deppart/partitions.h | 9 +- src/realm/deppart/partitions_gpu_impl.hpp | 138 +++++--- src/realm/deppart/preimage.cc | 176 ++++++++++- src/realm/deppart/preimage.h | 14 +- src/realm/deppart/preimage_gpu_impl.hpp | 54 ++-- src/realm/deppart/sparsity_impl.cc | 364 +++++++++++++++++++++- src/realm/deppart/sparsity_impl.h | 17 +- src/realm/sparsity.h | 6 +- src/realm/sparsity.inl | 10 +- tests/benchmark.cc | 2 +- 17 files changed, 1047 insertions(+), 196 deletions(-) diff --git a/src/realm/deppart/byfield.cc b/src/realm/deppart/byfield.cc index 78aceb2f92..ed65533555 100644 --- a/src/realm/deppart/byfield.cc +++ b/src/realm/deppart/byfield.cc @@ -325,13 +325,61 @@ 
namespace Realm { bool _exclusive) : parent_space(_parent), field_data(_field_data) { this->exclusive = _exclusive; - Memory my_mem = field_data[0].inst.get_location(); - Processor best_proc; - assert(choose_proc(best_proc, my_mem)); - Cuda::GPUProcessor* gpu_proc = dynamic_cast(get_runtime()->get_processor_impl(best_proc)); - assert(gpu_proc); - this->gpu = gpu_proc->gpu; - this->stream = gpu_proc->gpu->get_deppart_stream(); + areg.force_instantiation(); + // GPU setup (this->gpu, this->stream) deferred to execute(), which runs on the + // correct node after dispatch() has forwarded to the instance owner if needed. + } + + template + template + GPUByFieldMicroOp::GPUByFieldMicroOp( + NodeID _requestor, AsyncMicroOp *_async_microop, S& s) + : GPUMicroOp(_requestor, _async_microop) + , parent_space() { + bool ok = true; + size_t n = 0; + ok = ok && (s >> parent_space); + ok = ok && (s >> this->exclusive); + ok = ok && (s >> n); + field_data.resize(n); + for(size_t i = 0; i < n && ok; i++) + ok = ok && (s >> field_data[i].index_space) && + (s >> field_data[i].inst) && + (s >> field_data[i].field_offset) && + (s >> field_data[i].scratch_buffer); + // Deserialize colors manually to avoid std::vector proxy issues + size_t nc = 0; + ok = ok && (s >> nc); + for(size_t i = 0; i < nc && ok; i++) { + FT c; + ok = ok && (s >> c); + if(ok) colors.push_back(c); + } + ok = ok && (s >> sparsity_outputs); + assert(ok); + (void)ok; + } + + template + template + bool GPUByFieldMicroOp::serialize_params(S& s) const { + bool ok = true; + ok = ok && (s << parent_space); + ok = ok && (s << this->exclusive); + ok = ok && (s << field_data.size()); + for(size_t i = 0; i < field_data.size() && ok; i++) + ok = ok && (s << field_data[i].index_space) && + (s << field_data[i].inst) && + (s << field_data[i].field_offset) && + (s << field_data[i].scratch_buffer); + // Serialize colors manually to avoid std::vector proxy issues + ok = ok && (s << colors.size()); + for(size_t i = 0; i < 
colors.size() && ok; i++) { + FT c = colors[i]; + ok = ok && (s << c); + } + ok = ok && (s << sparsity_outputs); + return ok; } template @@ -342,6 +390,17 @@ namespace Realm { void GPUByFieldMicroOp::dispatch( PartitioningOperation *op, bool inline_ok) { + // GPU by-field must execute on the node that owns the GPU memory + NodeID exec_node = ID(field_data[0].inst).instance_owner_node(); + if(this->exclusive) { + for(const auto& it : sparsity_outputs) + assert(NodeID(ID(it.second).sparsity_creator_node()) == exec_node); + } + if(exec_node != Network::my_node_id) { + PartitioningMicroOp::template forward_microop >(exec_node, op, this); + return; + } + // We have to register ourselves as a waiter on sparse inputs before dispatching. for (size_t i = 0; i < field_data.size(); i++) { @@ -367,6 +426,10 @@ namespace Realm { sparsity_outputs[_val] = _sparsity; } + template + ActiveMessageHandlerReg > > + GPUByFieldMicroOp::areg; + #endif @@ -383,12 +446,26 @@ namespace Realm { : PartitioningOperation(reqs, _finish_event, _finish_gen) , parent(_parent) , field_data(_field_data) + , exclusive_gpu_owner(exclusive_gpu_exec_node()) {} template ByFieldOperation::~ByFieldOperation(void) {} + template + NodeID ByFieldOperation::exclusive_gpu_exec_node(void) const + { + if(field_data.size() != 1) + return -1; + + Memory::Kind kind = field_data[0].inst.get_location().kind(); + if((kind != Memory::GPU_FB_MEM) && (kind != Memory::Z_COPY_MEM)) + return -1; + + return ID(field_data[0].inst).instance_owner_node(); + } + template IndexSpace ByFieldOperation::add_color(FT color) { @@ -401,8 +478,13 @@ namespace Realm { subspace.bounds = parent.bounds; // get a sparsity ID by round-robin'ing across the nodes that have field data - int target_node = ID(field_data[colors.size() % field_data.size()].inst).instance_owner_node(); - SparsityMap sparsity = get_runtime()->get_available_sparsity_impl(target_node)->me.convert >(); + int target_node = (exclusive_gpu_owner >= 0) ? 
+ exclusive_gpu_owner : + ID(field_data[colors.size() % field_data.size()].inst).instance_owner_node(); + if(exclusive_gpu_owner >= 0) + assert(target_node == exclusive_gpu_exec_node()); + SparsityMap sparsity = + create_deppart_output_sparsity(target_node).convert>(); subspace.sparsity = sparsity; colors.push_back(color); diff --git a/src/realm/deppart/byfield.h b/src/realm/deppart/byfield.h index cc21234f32..35b823552f 100644 --- a/src/realm/deppart/byfield.h +++ b/src/realm/deppart/byfield.h @@ -73,6 +73,10 @@ namespace Realm { template class GPUByFieldMicroOp : public GPUMicroOp { public: + static const int DIM = N; + typedef T IDXTYPE; + typedef FT FIELDTYPE; + GPUByFieldMicroOp( const IndexSpace &_parent, std::vector,FT> > _field_data, @@ -87,7 +91,18 @@ namespace Realm { void add_sparsity_output(FT _val, SparsityMap _sparsity); protected: - const IndexSpace parent_space; + friend struct RemoteMicroOpMessage >; + static ActiveMessageHandlerReg > > areg; + + friend class PartitioningMicroOp; + template + REALM_ATTR_WARN_UNUSED(bool serialize_params(S& s) const); + + // construct from received packet + template + GPUByFieldMicroOp(NodeID _requestor, AsyncMicroOp *_async_microop, S& s); + + IndexSpace parent_space; std::vector,FT> > field_data; std::vector colors; std::map > sparsity_outputs; @@ -112,10 +127,13 @@ namespace Realm { virtual void print(std::ostream& os) const; protected: + NodeID exclusive_gpu_exec_node(void) const; + IndexSpace parent; std::vector,FT> > field_data; std::vector colors; std::vector > subspaces; + int exclusive_gpu_owner; }; }; diff --git a/src/realm/deppart/byfield_gpu_impl.hpp b/src/realm/deppart/byfield_gpu_impl.hpp index 4d59d30b54..bf25f81f03 100644 --- a/src/realm/deppart/byfield_gpu_impl.hpp +++ b/src/realm/deppart/byfield_gpu_impl.hpp @@ -17,6 +17,20 @@ namespace Realm { template void GPUByFieldMicroOp::execute() { + // Resolve the local GPU processor now that we are guaranteed to be on the + // correct node (dispatch() 
forwarded us here if the instance was remote). + { + Memory my_mem = field_data[0].inst.get_location(); + Processor best_proc; + assert(choose_proc(best_proc, my_mem)); + Cuda::GPUProcessor *gpu_proc = + dynamic_cast(get_runtime()->get_processor_impl(best_proc)); + assert(gpu_proc); + this->gpu = gpu_proc->gpu; + this->stream = gpu_proc->gpu->get_deppart_stream(); + } + + Cuda::AutoGPUContext agc(this->gpu); @@ -75,15 +89,14 @@ void GPUByFieldMicroOp::execute() } - Memory zcpy_mem; - assert(find_memory(zcpy_mem, Memory::Z_COPY_MEM, buffer_arena.location)); - - // We need to pass the accessors to the GPU so it can read field values. - RegionInstance accessors_instance = this->realm_malloc(field_data.size() * sizeof(AffineAccessor), zcpy_mem); - AffineAccessor* d_accessors = reinterpret_cast*>(AffineAccessor(accessors_instance, 0).base); + std::vector> h_accessors(field_data.size()); for (size_t i = 0; i < field_data.size(); ++i) { - d_accessors[i] = AffineAccessor(field_data[i].inst, field_data[i].field_offset); + h_accessors[i] = AffineAccessor(field_data[i].inst, field_data[i].field_offset); } + AffineAccessor* d_accessors = buffer_arena.alloc>(field_data.size()); + CUDA_CHECK(cudaMemcpyAsync(d_accessors, h_accessors.data(), + field_data.size() * sizeof(AffineAccessor), + cudaMemcpyHostToDevice, stream), stream); buffer_arena.commit(false); @@ -103,7 +116,7 @@ void GPUByFieldMicroOp::execute() int count = 0; if (count) {} bool host_fallback = false; - std::vector h_instances(colors.size(), RegionInstance::NO_INST); + std::vector*> host_rect_buffers(colors.size(), nullptr); std::vector entry_counts(colors.size(), 0); while (num_completed < inst_space.num_entries) { try { @@ -167,7 +180,7 @@ void GPUByFieldMicroOp::execute() }); if (host_fallback) { - this->split_output(d_new_rects, num_new_rects, h_instances, entry_counts, buffer_arena); + this->split_output(d_new_rects, num_new_rects, host_rect_buffers, entry_counts, buffer_arena); } if (num_output==0 || 
host_fallback) { @@ -216,7 +229,7 @@ void GPUByFieldMicroOp::execute() } else { host_fallback = true; if (num_output > 0) { - this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); + this->split_output(output_start, num_output, host_rect_buffers, entry_counts, buffer_arena); } curr_tile = tile_size / 2; } @@ -248,7 +261,7 @@ void GPUByFieldMicroOp::execute() return kv.second; }); } catch (arena_oom&) { - this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); + this->split_output(output_start, num_output, host_rect_buffers, entry_counts, buffer_arena); host_fallback = true; } } @@ -261,10 +274,9 @@ void GPUByFieldMicroOp::execute() } size_t idx = color_indices.at(it.first); if (entry_counts[idx] > 0) { - Rect* h_rects = reinterpret_cast *>(AffineAccessor(h_instances[idx], 0).base); - span> h_rects_span(h_rects, entry_counts[idx]); + span> h_rects_span(host_rect_buffers[idx], entry_counts[idx]); impl->contribute_dense_rect_list(h_rects_span, true); - h_instances[idx].destroy(); + deppart_host_free(host_rect_buffers[idx]); } else { impl->contribute_nothing(); } diff --git a/src/realm/deppart/image.cc b/src/realm/deppart/image.cc index 9eaf7b8197..217543d147 100644 --- a/src/realm/deppart/image.cc +++ b/src/realm/deppart/image.cc @@ -25,6 +25,7 @@ #include "realm/deppart/preimage.h" #include "realm/logging.h" #include "realm/cuda/cuda_internal.h" +#include namespace Realm { @@ -500,12 +501,43 @@ namespace Realm { EventImpl::gen_t _finish_gen) : PartitioningOperation(reqs, _finish_event, _finish_gen), parent(_parent), - domain_transform(_domain_transform) {} + domain_transform(_domain_transform), + is_intersection(false), + exclusive_gpu_owner(exclusive_gpu_exec_node()) + {} template ImageOperation::~ImageOperation(void) {} + template + NodeID ImageOperation::exclusive_gpu_exec_node(void) const + { + size_t gpu_ptrs = 0, gpu_rects = 0, cpu_ptrs = 0, cpu_rects = 0; + for(size_t i = 0; i < 
domain_transform.ptr_data.size(); i++) { + Memory::Kind kind = domain_transform.ptr_data[i].inst.get_location().kind(); + if((kind == Memory::GPU_FB_MEM) || (kind == Memory::Z_COPY_MEM)) + gpu_ptrs++; + else + cpu_ptrs++; + } + for(size_t i = 0; i < domain_transform.range_data.size(); i++) { + Memory::Kind kind = domain_transform.range_data[i].inst.get_location().kind(); + if((kind == Memory::GPU_FB_MEM) || (kind == Memory::Z_COPY_MEM)) + gpu_rects++; + else + cpu_rects++; + } + size_t opcount = gpu_ptrs + gpu_rects + cpu_ptrs + cpu_rects; + if((gpu_ptrs + gpu_rects) == 0 || (opcount != 1)) + return -1; + if(gpu_ptrs == 1) + return ID(domain_transform.ptr_data[0].inst).instance_owner_node(); + if(gpu_rects == 1) + return ID(domain_transform.range_data[0].inst).instance_owner_node(); + return -1; + } + template IndexSpace ImageOperation::add_source(const IndexSpace& source) { @@ -520,17 +552,22 @@ namespace Realm { // if the source has a sparsity map, use the same node - otherwise // get a sparsity ID by round-robin'ing across the nodes that have field data int target_node = 0; - if(!source.dense()) + if(exclusive_gpu_owner >= 0) + target_node = exclusive_gpu_owner; + else if(!source.dense()) target_node = ID(source.sparsity).sparsity_creator_node(); else if(!domain_transform.ptr_data.empty()) target_node = ID(domain_transform.ptr_data[sources.size() % domain_transform.ptr_data.size()].inst).instance_owner_node(); else target_node = ID(domain_transform.range_data[sources.size() % domain_transform.range_data.size()].inst).instance_owner_node(); + if(exclusive_gpu_owner >= 0) { + assert(target_node == exclusive_gpu_exec_node()); + } - SparsityMap sparsity = get_runtime()->get_available_sparsity_impl(target_node)->me.convert >(); + SparsityMap sparsity = + create_deppart_output_sparsity(target_node).convert>(); image.sparsity = sparsity; - sources.push_back(source); images.push_back(sparsity); @@ -552,17 +589,22 @@ namespace Realm { // if the source has a sparsity map, 
use the same node - otherwise // get a sparsity ID by round-robin'ing across the nodes that have field data int target_node; - if(!source.dense()) + if(exclusive_gpu_owner >= 0) + target_node = exclusive_gpu_owner; + else if(!source.dense()) target_node = ID(source.sparsity).sparsity_creator_node(); else if(!domain_transform.ptr_data.empty()) target_node = ID(domain_transform.ptr_data[sources.size() % domain_transform.ptr_data.size()].inst).instance_owner_node(); else target_node = ID(domain_transform.range_data[sources.size() % domain_transform.range_data.size()].inst).instance_owner_node(); + if(exclusive_gpu_owner >= 0) { + assert(target_node == exclusive_gpu_exec_node()); + } - SparsityMap sparsity = get_runtime()->get_available_sparsity_impl(target_node)->me.convert >(); + SparsityMap sparsity = + create_deppart_output_sparsity(target_node).convert>(); image.sparsity = sparsity; - sources.push_back(source); diff_rhss.push_back(diff_rhs); images.push_back(sparsity); @@ -586,17 +628,22 @@ namespace Realm { // if the source has a sparsity map, use the same node - otherwise // get a sparsity ID by round-robin'ing across the nodes that have field data int target_node; - if(!source.dense()) + if(exclusive_gpu_owner >= 0) + target_node = exclusive_gpu_owner; + else if(!source.dense()) target_node = ID(source.sparsity).sparsity_creator_node(); else if(!domain_transform.ptr_data.empty()) target_node = ID(domain_transform.ptr_data[sources.size() % domain_transform.ptr_data.size()].inst).instance_owner_node(); else target_node = ID(domain_transform.range_data[sources.size() % domain_transform.range_data.size()].inst).instance_owner_node(); + if(exclusive_gpu_owner >= 0) { + assert(target_node == exclusive_gpu_exec_node()); + } - SparsityMap sparsity = get_runtime()->get_available_sparsity_impl(target_node)->me.convert >(); + SparsityMap sparsity = + create_deppart_output_sparsity(target_node).convert>(); image.sparsity = sparsity; - sources.push_back(source); 
diff_rhss.push_back(diff_rhs); images.push_back(sparsity); @@ -666,6 +713,15 @@ namespace Realm { } else { size_t opcount = cpu_ptr_data.size() + cpu_rect_data.size() + gpu_ptr_data.size() + gpu_rect_data.size(); bool exclusive = gpu_data && (opcount == 1); + if(exclusive) { + NodeID expected_owner = exclusive_gpu_exec_node(); + assert(exclusive_gpu_owner >= 0); + assert(NodeID(exclusive_gpu_owner) == expected_owner); + for(size_t i = 0; i < images.size(); i++) { + NodeID output_owner = NodeID(ID(images[i]).sparsity_creator_node()); + assert(output_owner == NodeID(exclusive_gpu_owner)); + } + } if (!exclusive) { // launch full cross-product of image micro ops right away for (size_t i = 0; i < sources.size(); i++) { @@ -702,6 +758,10 @@ namespace Realm { for (auto ptr_fdd : gpu_ptr_data) { // launch full cross-product of image micro ops right away assert(ptr_fdd.scratch_buffer != RegionInstance::NO_INST); + if(exclusive) { + NodeID microop_exec_node = ID(ptr_fdd.inst).instance_owner_node(); + assert(NodeID(exclusive_gpu_owner) == microop_exec_node); + } DomainTransform domain_transform_copy = domain_transform; domain_transform_copy.ptr_data = {ptr_fdd}; GPUImageMicroOp *micro_op = @@ -715,6 +775,10 @@ namespace Realm { for (auto rect_fdd : gpu_rect_data) { // launch full cross-product of image micro ops right away assert(rect_fdd.scratch_buffer != RegionInstance::NO_INST); + if(exclusive) { + NodeID microop_exec_node = ID(rect_fdd.inst).instance_owner_node(); + assert(NodeID(exclusive_gpu_owner) == microop_exec_node); + } DomainTransform domain_transform_copy = domain_transform; domain_transform_copy.range_data = {rect_fdd}; GPUImageMicroOp *micro_op = @@ -942,14 +1006,76 @@ namespace Realm { bool _exclusive) : parent_space(_parent), domain_transform(_domain_transform) { - this->exclusive = _exclusive; - Memory my_mem = domain_transform.ptr_data.empty() ? 
domain_transform.range_data[0].inst.get_location() : domain_transform.ptr_data[0].inst.get_location(); - Processor best_proc; - assert(choose_proc(best_proc, my_mem)); - Cuda::GPUProcessor* gpu_proc = dynamic_cast(get_runtime()->get_processor_impl(best_proc)); - assert(gpu_proc); - this->gpu = gpu_proc->gpu; - this->stream = gpu_proc->gpu->get_deppart_stream(); + this->exclusive = _exclusive; + areg.force_instantiation(); + // GPU setup (this->gpu, this->stream) deferred to execute(), which runs on the + // correct node after dispatch() has forwarded to the instance owner if needed. + } + + template + template + GPUImageMicroOp::GPUImageMicroOp( + NodeID _requestor, AsyncMicroOp *_async_microop, S& s) + : GPUMicroOp(_requestor, _async_microop) + { + bool ok = true; + bool use_ptr_data = false; + ok = ok && (s >> parent_space); + ok = ok && (s >> this->exclusive); + ok = ok && (s >> use_ptr_data); + if(use_ptr_data) { + domain_transform.type = DomainTransform::DomainTransformType::UNSTRUCTURED_PTR; + size_t n = 0; + ok = ok && (s >> n); + domain_transform.ptr_data.resize(n); + for(size_t i = 0; i < n && ok; i++) + ok = ok && (s >> domain_transform.ptr_data[i].index_space) && + (s >> domain_transform.ptr_data[i].inst) && + (s >> domain_transform.ptr_data[i].field_offset) && + (s >> domain_transform.ptr_data[i].scratch_buffer); + } else { + domain_transform.type = DomainTransform::DomainTransformType::UNSTRUCTURED_RANGE; + size_t n = 0; + ok = ok && (s >> n); + domain_transform.range_data.resize(n); + for(size_t i = 0; i < n && ok; i++) + ok = ok && (s >> domain_transform.range_data[i].index_space) && + (s >> domain_transform.range_data[i].inst) && + (s >> domain_transform.range_data[i].field_offset) && + (s >> domain_transform.range_data[i].scratch_buffer); + } + ok = ok && (s >> sources); + ok = ok && (s >> sparsity_outputs); + assert(ok); + (void)ok; + } + + template + template + bool GPUImageMicroOp::serialize_params(S& s) const { + bool ok = true; + bool 
use_ptr_data = !domain_transform.ptr_data.empty(); + ok = ok && (s << parent_space); + ok = ok && (s << this->exclusive); + ok = ok && (s << use_ptr_data); + if(use_ptr_data) { + ok = ok && (s << domain_transform.ptr_data.size()); + for(size_t i = 0; i < domain_transform.ptr_data.size() && ok; i++) + ok = ok && (s << domain_transform.ptr_data[i].index_space) && + (s << domain_transform.ptr_data[i].inst) && + (s << domain_transform.ptr_data[i].field_offset) && + (s << domain_transform.ptr_data[i].scratch_buffer); + } else { + ok = ok && (s << domain_transform.range_data.size()); + for(size_t i = 0; i < domain_transform.range_data.size() && ok; i++) + ok = ok && (s << domain_transform.range_data[i].index_space) && + (s << domain_transform.range_data[i].inst) && + (s << domain_transform.range_data[i].field_offset) && + (s << domain_transform.range_data[i].scratch_buffer); + } + ok = ok && (s << sources); + ok = ok && (s << sparsity_outputs); + return ok; } template @@ -959,6 +1085,20 @@ namespace Realm { void GPUImageMicroOp::dispatch( PartitioningOperation *op, bool inline_ok) { + // GPU image must execute on the node that owns the GPU memory + NodeID exec_node = domain_transform.ptr_data.empty() ? 
+ ID(domain_transform.range_data[0].inst).instance_owner_node() : + ID(domain_transform.ptr_data[0].inst).instance_owner_node(); + if(this->exclusive) { + for(size_t i = 0; i < sparsity_outputs.size(); i++) { + assert(NodeID(ID(sparsity_outputs[i]).sparsity_creator_node()) == exec_node); + } + } + if(exec_node != Network::my_node_id) { + PartitioningMicroOp::template forward_microop >(exec_node, op, this); + return; + } + for (size_t i = 0; i < domain_transform.ptr_data.size(); i++) { IndexSpace inst_space = domain_transform.ptr_data[i].index_space; if (!inst_space.dense()) { @@ -969,6 +1109,16 @@ namespace Realm { this->wait_count.fetch_add(1); } } + for (size_t i = 0; i < domain_transform.range_data.size(); i++) { + IndexSpace inst_space = domain_transform.range_data[i].index_space; + if (!inst_space.dense()) { + // it's safe to add the count after the registration only because we initialized + // the count to 2 instead of 1 + bool registered = SparsityMapImpl::lookup(inst_space.sparsity)->add_waiter(this, true /*precise*/); + if(registered) + this->wait_count.fetch_add(1); + } + } for (size_t i = 0; i < sources.size(); i++) { if (!sources[i].dense()) { @@ -994,10 +1144,29 @@ namespace Realm { sparsity_outputs.push_back(_sparsity); } + template + ActiveMessageHandlerReg > > + GPUImageMicroOp::areg; + template void GPUImageMicroOp::execute(void) { TimeStamp ts("GPUImageMicroOp::execute", true, &log_uop_timing); + // Resolve the local GPU processor now that we are guaranteed to be on the + // correct node (dispatch() forwarded us here if the instance was remote). + { + Memory my_mem = domain_transform.ptr_data.empty() ? 
+ domain_transform.range_data[0].inst.get_location() : + domain_transform.ptr_data[0].inst.get_location(); + Processor best_proc; + assert(choose_proc(best_proc, my_mem)); + Cuda::GPUProcessor *gpu_proc = + dynamic_cast(get_runtime()->get_processor_impl(best_proc)); + assert(gpu_proc); + this->gpu = gpu_proc->gpu; + this->stream = gpu_proc->gpu->get_deppart_stream(); + } + Cuda::AutoGPUContext agc(this->gpu); if (domain_transform.ptr_data.size() > 0) { gpu_populate_ptrs(); diff --git a/src/realm/deppart/image.h b/src/realm/deppart/image.h index 4eed6da566..fec4dc7651 100644 --- a/src/realm/deppart/image.h +++ b/src/realm/deppart/image.h @@ -116,12 +116,15 @@ namespace Realm { virtual void set_overlap_tester(void *tester); protected: + NodeID exclusive_gpu_exec_node(void) const; + IndexSpace parent; DomainTransform domain_transform; std::vector > sources; std::vector > diff_rhss; std::vector > images; bool is_intersection; + int exclusive_gpu_owner; }; template @@ -153,6 +156,11 @@ namespace Realm { template class GPUImageMicroOp : public GPUMicroOp { public: + static const int DIM = N; + typedef T IDXTYPE; + static const int DIM2 = N2; + typedef T2 IDXTYPE2; + GPUImageMicroOp( const IndexSpace &_parent, const DomainTransform &_domain_transform, @@ -174,6 +182,17 @@ namespace Realm { bool is_image_microop() const override { return true; } protected: + friend struct RemoteMicroOpMessage >; + static ActiveMessageHandlerReg > > areg; + + friend class PartitioningMicroOp; + template + REALM_ATTR_WARN_UNUSED(bool serialize_params(S& s) const); + + // construct from received packet + template + GPUImageMicroOp(NodeID _requestor, AsyncMicroOp *_async_microop, S& s); + IndexSpace parent_space; DomainTransform domain_transform; std::vector > sources; diff --git a/src/realm/deppart/image_gpu_impl.hpp b/src/realm/deppart/image_gpu_impl.hpp index 7bac9f9054..75401be42c 100644 --- a/src/realm/deppart/image_gpu_impl.hpp +++ b/src/realm/deppart/image_gpu_impl.hpp @@ -41,6 +41,7 @@ 
void GPUImageMicroOp::gpu_populate_rngs() { if (sources.size() == 0) { + assert(sparsity_outputs.empty()); return; } @@ -80,13 +81,15 @@ void GPUImageMicroOp::gpu_populate_rngs() // We collapse the parent space to undifferentiate between dense and sparse and match downstream APIs. GPUMicroOp::collapse_parent_space(parent_space, collapsed_parent, buffer_arena, stream); - Memory zcpy_mem; - assert(find_memory(zcpy_mem, Memory::Z_COPY_MEM, buffer_arena.location)); - RegionInstance accessors_instance = this->realm_malloc(domain_transform.range_data.size() * sizeof(AffineAccessor,N2,T2>), zcpy_mem); - AffineAccessor,N2,T2>* d_accessors = reinterpret_cast,N2,T2>*>(AffineAccessor(accessors_instance, 0).base); + std::vector,N2,T2>> h_accessors(domain_transform.range_data.size()); for (size_t i = 0; i < domain_transform.range_data.size(); ++i) { - d_accessors[i] = AffineAccessor,N2,T2>(domain_transform.range_data[i].inst, domain_transform.range_data[i].field_offset); + h_accessors[i] = AffineAccessor,N2,T2>(domain_transform.range_data[i].inst, domain_transform.range_data[i].field_offset); } + AffineAccessor,N2,T2>* d_accessors = + buffer_arena.alloc,N2,T2>>(domain_transform.range_data.size()); + CUDA_CHECK(cudaMemcpyAsync(d_accessors, h_accessors.data(), + domain_transform.range_data.size() * sizeof(AffineAccessor,N2,T2>), + cudaMemcpyHostToDevice, stream), stream); uint32_t* d_src_counters = buffer_arena.alloc(2 * sources.size() + 1); uint32_t* d_src_prefix = d_src_counters + sources.size(); @@ -100,7 +103,7 @@ void GPUImageMicroOp::gpu_populate_rngs() int count = 0; if (count) {} bool host_fallback = false; - std::vector h_instances(sources.size(), RegionInstance::NO_INST); + std::vector*> host_rect_buffers(sources.size(), nullptr); std::vector entry_counts(sources.size(), 0); while (num_completed < inst_space.num_entries) { try { @@ -197,7 +200,7 @@ void GPUImageMicroOp::gpu_populate_rngs() }); if (host_fallback) { - this->split_output(d_new_rects, num_new_rects, 
h_instances, entry_counts, buffer_arena); + this->split_output(d_new_rects, num_new_rects, host_rect_buffers, entry_counts, buffer_arena); } //Set our first set of output rectangles @@ -250,7 +253,7 @@ void GPUImageMicroOp::gpu_populate_rngs() } else { host_fallback = true; if (num_output > 0) { - this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); + this->split_output(output_start, num_output, host_rect_buffers, entry_counts, buffer_arena); } curr_tile = tile_size / 2; } @@ -282,7 +285,7 @@ void GPUImageMicroOp::gpu_populate_rngs() return elem; }); } catch (arena_oom&) { - this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); + this->split_output(output_start, num_output, host_rect_buffers, entry_counts, buffer_arena); host_fallback = true; } } @@ -294,10 +297,9 @@ void GPUImageMicroOp::gpu_populate_rngs() impl->set_contributor_count(1); } if (entry_counts[idx] > 0) { - Rect* h_rects = reinterpret_cast *>(AffineAccessor(h_instances[idx], 0).base); - span> h_rects_span(h_rects, entry_counts[idx]); + span> h_rects_span(host_rect_buffers[idx], entry_counts[idx]); impl->contribute_dense_rect_list(h_rects_span, false); - h_instances[idx].destroy(); + deppart_host_free(host_rect_buffers[idx]); } else { impl->contribute_nothing(); } @@ -319,6 +321,7 @@ template void GPUImageMicroOp::gpu_populate_ptrs() { if (sources.size() == 0) { + assert(sparsity_outputs.empty()); return; } @@ -365,13 +368,15 @@ void GPUImageMicroOp::gpu_populate_ptrs() // We collapse the parent space to undifferentiate between dense and sparse and match downstream APIs. 
GPUMicroOp::collapse_parent_space(parent_space, collapsed_parent, buffer_arena, stream); - Memory zcpy_mem; - assert(find_memory(zcpy_mem, Memory::Z_COPY_MEM, buffer_arena.location)); - RegionInstance accessors_instance = this->realm_malloc(domain_transform.ptr_data.size() * sizeof(AffineAccessor,N2,T2>), zcpy_mem); - AffineAccessor,N2,T2>* d_accessors = reinterpret_cast,N2,T2>*>(AffineAccessor(accessors_instance, 0).base); + std::vector,N2,T2>> h_accessors(domain_transform.ptr_data.size()); for (size_t i = 0; i < domain_transform.ptr_data.size(); ++i) { - d_accessors[i] = AffineAccessor,N2,T2>(domain_transform.ptr_data[i].inst, domain_transform.ptr_data[i].field_offset); + h_accessors[i] = AffineAccessor,N2,T2>(domain_transform.ptr_data[i].inst, domain_transform.ptr_data[i].field_offset); } + AffineAccessor,N2,T2>* d_accessors = + buffer_arena.alloc,N2,T2>>(domain_transform.ptr_data.size()); + CUDA_CHECK(cudaMemcpyAsync(d_accessors, h_accessors.data(), + domain_transform.ptr_data.size() * sizeof(AffineAccessor,N2,T2>), + cudaMemcpyHostToDevice, stream), stream); uint32_t* d_prefix_points = buffer_arena.alloc(domain_transform.ptr_data.size()+1); @@ -387,7 +392,7 @@ void GPUImageMicroOp::gpu_populate_ptrs() int count = 0; if (count) {} bool host_fallback = false; - std::vector h_instances(sources.size(), RegionInstance::NO_INST); + std::vector*> host_rect_buffers(sources.size(), nullptr); std::vector entry_counts(sources.size(), 0); while (num_completed < inst_space.num_entries) { try { @@ -475,9 +480,9 @@ void GPUImageMicroOp::gpu_populate_ptrs() return elem; }); - if (host_fallback) { - this->split_output(d_new_rects, num_new_rects, h_instances, entry_counts, buffer_arena); - } + if (host_fallback) { + this->split_output(d_new_rects, num_new_rects, host_rect_buffers, entry_counts, buffer_arena); + } if (num_output==0 || host_fallback) { num_output = num_new_rects; @@ -525,7 +530,7 @@ void GPUImageMicroOp::gpu_populate_ptrs() } else { host_fallback = true; if 
(num_output > 0) { - this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); + this->split_output(output_start, num_output, host_rect_buffers, entry_counts, buffer_arena); } curr_tile = tile_size / 2; } @@ -557,7 +562,7 @@ void GPUImageMicroOp::gpu_populate_ptrs() return elem; }); } catch (arena_oom&) { - this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); + this->split_output(output_start, num_output, host_rect_buffers, entry_counts, buffer_arena); host_fallback = true; } } @@ -569,14 +574,13 @@ void GPUImageMicroOp::gpu_populate_ptrs() impl->set_contributor_count(1); } if (entry_counts[idx] > 0) { - Rect* h_rects = reinterpret_cast *>(AffineAccessor(h_instances[idx], 0).base); - span> h_rects_span(h_rects, entry_counts[idx]); + span> h_rects_span(host_rect_buffers[idx], entry_counts[idx]); impl->contribute_dense_rect_list(h_rects_span, false); - h_instances[idx].destroy(); + deppart_host_free(host_rect_buffers[idx]); } else { impl->contribute_nothing(); } } } } -} \ No newline at end of file +} diff --git a/src/realm/deppart/partitions.cc b/src/realm/deppart/partitions.cc index 1c16670c47..b25ddd17a5 100644 --- a/src/realm/deppart/partitions.cc +++ b/src/realm/deppart/partitions.cc @@ -662,16 +662,6 @@ namespace Realm { } } - RegionInstance PartitioningMicroOp::realm_malloc(size_t size, Memory location) { - assert(location != Memory::NO_MEMORY); - assert(size > 0); - std::vector byte_fields = {sizeof(char)}; - IndexSpace<1> instance_index_space(Rect<1>(0, size-1)); - RegionInstance result; - RegionInstance::create_instance(result, location, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); - return result; - } - //////////////////////////////////////////////////////////////////////// // // class ComputeOverlapMicroOp @@ -1067,4 +1057,3 @@ namespace Realm { FOREACH_NTNT(DOIT2) }; - diff --git a/src/realm/deppart/partitions.h b/src/realm/deppart/partitions.h index 
a3d0d3feb8..0af8ec0673 100644 --- a/src/realm/deppart/partitions.h +++ b/src/realm/deppart/partitions.h @@ -91,7 +91,7 @@ namespace Realm { size_t* offsets; size_t num_children; Rect bounds; - RegionInstance h_instance = RegionInstance::NO_INST; + SparsityMapEntry* host_entries_owner = nullptr; }; // Stores everything necessary to query a BVH @@ -310,8 +310,6 @@ namespace Realm { template void sparsity_map_ready(SparsityMapImpl *sparsity, bool precise); - static RegionInstance realm_malloc(size_t size, Memory location = Memory::NO_MEMORY); - IntrusiveListLink uop_link; REALM_PMTA_DEFN(PartitioningMicroOp,IntrusiveListLink,uop_link); typedef IntrusiveList MicroOpList; @@ -358,6 +356,8 @@ namespace Realm { class GPUMicroOp : public PartitioningMicroOp { public: GPUMicroOp(void) = default; + GPUMicroOp(NodeID _requestor, AsyncMicroOp *_async_microop) + : PartitioningMicroOp(_requestor, _async_microop) {} virtual ~GPUMicroOp(void) = default; virtual void execute(void) = 0; @@ -386,7 +386,7 @@ namespace Realm { template void complete1d_pipeline(RectDesc* d_rects, size_t total_rects, RectDesc* &d_out_rects, size_t &out_rects, Arena &my_arena, const Container& ctr, IndexFn getIndex, MapFn getMap); - void split_output(RectDesc* d_rects, size_t total_rects, std::vector &output_instances, std::vector &output_counts, Arena &my_arena); + void split_output(RectDesc* d_rects, size_t total_rects, std::vector *> &output_instances, std::vector &output_counts, Arena &my_arena); template void send_output(RectDesc* d_rects, size_t total_rects, Arena &my_arena, const Container& ctr, IndexFn getIndex, MapFn getMap); @@ -527,4 +527,3 @@ namespace Realm { #include "realm/deppart/partitions.inl" #endif // REALM_PARTITIONS_H - diff --git a/src/realm/deppart/partitions_gpu_impl.hpp b/src/realm/deppart/partitions_gpu_impl.hpp index 015a1b7726..e293419b9a 100644 --- a/src/realm/deppart/partitions_gpu_impl.hpp +++ b/src/realm/deppart/partitions_gpu_impl.hpp @@ -36,6 +36,17 @@ #define 
COMPUTE_GRID(num_items) \ (((num_items) + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK) +#define CUDA_HOST_CHECK(call) \ + do { \ + cudaError_t err = (call); \ + if (err != cudaSuccess) { \ + std::cerr << "CUDA host error at " << __FILE__ << ":" << __LINE__ \ + << " '" #call "' failed with " \ + << cudaGetErrorString(err) << " (" << err << ")\n"; \ + assert(false); \ + } \ + } while (0) + //NVTX macros to only add ranges if defined. #ifdef REALM_USE_NVTX @@ -61,6 +72,21 @@ inline int32_t next_nvtx_payload() { namespace Realm { + template + inline T *deppart_host_alloc(size_t count, unsigned flags = cudaHostAllocPortable) + { + if(count == 0) return nullptr; + void *ptr = nullptr; + CUDA_HOST_CHECK(cudaHostAlloc(&ptr, count * sizeof(T), flags)); + return reinterpret_cast(ptr); + } + + inline void deppart_host_free(void *ptr) + { + if(ptr != nullptr) + CUDA_HOST_CHECK(cudaFreeHost(ptr)); + } + // Used by cub::DeviceReduce to compute bad GPU approximation. template struct UnionRectOp { @@ -105,7 +131,18 @@ namespace Realm { output = affinity.m2; } } - return output != Memory::NO_MEMORY; + if (output == Memory::NO_MEMORY) { + std::set memories; + Machine::get_machine().get_all_memories(memories); + for (auto mem : memories) { + if (mem.kind() == kind) { + output = mem; + return true; + } + } + return false; + } + return true; } template @@ -132,8 +169,7 @@ namespace Realm { } new_offsets[inst_space.num_children] = num_new_entries; CUDA_CHECK(cudaMemcpyAsync(inst_space.offsets, new_offsets.data(), (inst_space.num_children + 1) * sizeof(size_t), cudaMemcpyHostToDevice, stream), stream); - RegionInstance new_entries_buffer = realm_malloc(num_new_entries * sizeof(SparsityMapEntry), inst_space.h_instance.get_location()); - SparsityMapEntry *new_entries_ptr = reinterpret_cast *>(new_entries_buffer.pointer_untyped(0, num_new_entries * sizeof(SparsityMapEntry))); + SparsityMapEntry *new_entries_ptr = deppart_host_alloc>(num_new_entries); size_t write_loc = 0; for (size_t i = 
num_completed; i < inst_space.num_entries; i++) { @@ -181,8 +217,8 @@ namespace Realm { num_completed = 0; inst_space.entries_buffer = new_entries_ptr; inst_space.num_entries = num_new_entries; - inst_space.h_instance.destroy(); - inst_space.h_instance = new_entries_buffer; + deppart_host_free(inst_space.host_entries_owner); + inst_space.host_entries_owner = new_entries_ptr; CUDA_CHECK(cudaStreamSynchronize(stream), stream); } @@ -229,15 +265,11 @@ namespace Realm { space_offsets[spaces.size()] = out_space.num_entries; //We copy into one contiguous host buffer, then copy to device - Memory sysmem; - assert(find_memory(sysmem, Memory::SYSTEM_MEM, my_arena.location)); - - - RegionInstance h_instance = realm_malloc(out_space.num_entries * sizeof(SparsityMapEntry), sysmem); - SparsityMapEntry* h_entries = reinterpret_cast*>(AffineAccessor(h_instance, 0).base); + SparsityMapEntry* h_entries = deppart_host_alloc>(out_space.num_entries); if (my_arena.capacity()==0) { - out_space.entries_buffer = reinterpret_cast*>(AffineAccessor(h_instance, 0).base); + out_space.entries_buffer = h_entries; + out_space.host_entries_owner = h_entries; } else { out_space.entries_buffer = my_arena.alloc >(out_space.num_entries); } @@ -286,9 +318,7 @@ namespace Realm { if (my_arena.capacity() != 0) { CUDA_CHECK(cudaMemcpyAsync(out_space.entries_buffer, h_entries, out_space.num_entries * sizeof(SparsityMapEntry), cudaMemcpyHostToDevice, stream), stream); CUDA_CHECK(cudaStreamSynchronize(stream), stream); - h_instance.destroy(); - } else { - out_space.h_instance = h_instance; + deppart_host_free(h_entries); } CUDA_CHECK(cudaStreamSynchronize(stream), stream); @@ -1569,16 +1599,12 @@ namespace Realm { } template - void GPUMicroOp::split_output(RectDesc* d_rects, size_t total_rects, std::vector &output_instances, std::vector &output_counts, Arena &my_arena) + void GPUMicroOp::split_output(RectDesc* d_rects, size_t total_rects, std::vector *> &output_instances, std::vector &output_counts, Arena 
&my_arena) { NVTX_DEPPART(split_output); CUstream stream = this->stream->get_stream(); bool use_sysmem = false; - RegionInstance sys_instance = RegionInstance::NO_INST; - - Memory sysmem; - assert(find_memory(sysmem, Memory::SYSTEM_MEM, my_arena.location)); Rect* final_rects; std::vector d_starts_host(output_instances.size()), d_ends_host(output_instances.size()); @@ -1605,10 +1631,8 @@ namespace Realm { CUDA_CHECK(cudaStreamSynchronize(stream), stream); } catch (arena_oom&) { use_sysmem = true; - RegionInstance tmp_instance = this->realm_malloc(total_rects * sizeof(RectDesc), sysmem); - sys_instance = this->realm_malloc(total_rects * sizeof(Rect), sysmem); - RectDesc* h_tmp_rects = reinterpret_cast*>(tmp_instance.pointer_untyped(0, total_rects * sizeof(RectDesc))); - final_rects = reinterpret_cast*>(sys_instance.pointer_untyped(0, total_rects * sizeof(Rect))); + RectDesc* h_tmp_rects = deppart_host_alloc>(total_rects); + final_rects = deppart_host_alloc>(total_rects); CUDA_CHECK(cudaMemcpyAsync(h_tmp_rects, d_rects, total_rects * sizeof(RectDesc), cudaMemcpyDeviceToHost, stream), stream); CUDA_CHECK(cudaStreamSynchronize(stream), stream); for (size_t idx = 0; idx < total_rects; idx++ ) { @@ -1624,7 +1648,7 @@ namespace Realm { d_ends_host[h_tmp_rects[idx].src_idx] = idx+1; } } - tmp_instance.destroy(); + deppart_host_free(h_tmp_rects); } for (size_t i = 1; i < output_instances.size(); i++) { @@ -1639,12 +1663,10 @@ namespace Realm { size_t end = d_ends_host[i]; size_t start = d_starts_host[i]; if (end - start > 0) { - RegionInstance new_instance = this->realm_malloc(((end - start) + output_counts[i]) * sizeof(Rect), sysmem); - Rect* h_new_rects = reinterpret_cast*>(new_instance.pointer_untyped(0, ((end - start) + output_counts[i]) * sizeof(Rect))); + Rect* h_new_rects = deppart_host_alloc>((end - start) + output_counts[i]); if (output_counts[i] > 0) { - Rect* h_old_rects = reinterpret_cast*>(output_instances[i].pointer_untyped(0, output_counts[i] * sizeof(Rect))); 
- std::memcpy(h_new_rects, h_old_rects, output_counts[i] * sizeof(Rect)); - output_instances[i].destroy(); + std::memcpy(h_new_rects, output_instances[i], output_counts[i] * sizeof(Rect)); + deppart_host_free(output_instances[i]); } if (use_sysmem) { std::memcpy(h_new_rects + output_counts[i], final_rects + start, (end - start) * sizeof(Rect)); @@ -1652,13 +1674,13 @@ namespace Realm { CUDA_CHECK(cudaMemcpyAsync(h_new_rects + output_counts[i], final_rects + start, (end - start) * sizeof(Rect), cudaMemcpyDeviceToHost, stream), stream); CUDA_CHECK(cudaStreamSynchronize(stream), stream); } - output_instances[i] = new_instance; + output_instances[i] = h_new_rects; output_counts[i] += end - start; } } } if (use_sysmem) { - sys_instance.destroy(); + deppart_host_free(final_rects); } } @@ -1715,30 +1737,31 @@ namespace Realm { if (d_ends_host[idx] > d_starts_host[idx]) { size_t end = d_ends_host[idx]; size_t start = d_starts_host[idx]; - RegionInstance h_rects_instance = this->realm_malloc((end - start) * sizeof(Rect), sysmem); - Rect *h_rects = reinterpret_cast *>(AffineAccessor(h_rects_instance, 0).base); + Rect *h_rects = deppart_host_alloc>(end - start); CUDA_CHECK(cudaMemcpyAsync(h_rects, final_rects + start, (end - start) * sizeof(Rect), cudaMemcpyDeviceToHost, stream), stream); CUDA_CHECK(cudaStreamSynchronize(stream), stream); span> h_rects_span(h_rects, end - start); bool disjoint = !this->is_image_microop(); impl->contribute_dense_rect_list(h_rects_span, disjoint); - h_rects_instance.destroy(); + deppart_host_free(h_rects); } else { impl->contribute_nothing(); } } } else { + std::vector *> local_finalizations; //Use provided lambdas to iterate over sparsity output container (map or vector) for (auto const& elem : ctr) { size_t idx = getIndex(elem); auto mapOpj = getMap(elem); SparsityMapImpl *impl = SparsityMapImpl::lookup(mapOpj); + NodeID owner = ID(mapOpj).sparsity_creator_node(); + assert(owner == Network::my_node_id); if (d_ends_host[idx] > 
d_starts_host[idx]) { size_t end = d_ends_host[idx]; size_t start = d_starts_host[idx]; - RegionInstance entries = this->realm_malloc((end - start) * sizeof(SparsityMapEntry), sysmem); - SparsityMapEntry *h_entries = reinterpret_cast *>(AffineAccessor(entries, 0).base); + SparsityMapEntry *h_entries = deppart_host_alloc>(end - start); CUDA_CHECK(cudaMemcpyAsync(h_entries, final_entries + start, (end - start) * sizeof(SparsityMapEntry), cudaMemcpyDeviceToHost, stream), stream); Rect *approx_rects; @@ -1779,17 +1802,44 @@ namespace Realm { ); CUDA_CHECK(cudaStreamSynchronize(stream), stream); } - RegionInstance approx_entries = this->realm_malloc(num_approx * sizeof(Rect), sysmem); - SparsityMapEntry *h_approx_entries = reinterpret_cast *>(AffineAccessor(approx_entries, 0).base); + Rect *h_approx_entries = deppart_host_alloc>(num_approx); CUDA_CHECK(cudaMemcpyAsync(h_approx_entries, approx_rects, num_approx * sizeof(Rect), cudaMemcpyDeviceToHost, stream), stream); CUDA_CHECK(cudaStreamSynchronize(stream), stream); - impl->set_instance(entries, end - start); - impl->set_approx_instance(approx_entries, num_approx); + if(owner == Network::my_node_id) { + impl->set_gpu_entries(h_entries, end - start); + impl->set_gpu_approx_rects(h_approx_entries, num_approx); + local_finalizations.push_back(impl); + } else { + size_t payload_bytes = ((end - start) * sizeof(SparsityMapEntry)) + + (num_approx * sizeof(Rect)); + ActiveMessage::RemoteGpuFinalizeMessage> + amsg(owner, payload_bytes); + amsg->sparsity = mapOpj; + amsg->num_entries = end - start; + amsg->num_approx = num_approx; + amsg.add_payload(h_entries, (end - start) * sizeof(SparsityMapEntry), + PAYLOAD_COPY); + amsg.add_payload(h_approx_entries, num_approx * sizeof(Rect), + PAYLOAD_COPY); + amsg.commit(); + deppart_host_free(h_entries); + deppart_host_free(h_approx_entries); + } + } else { + if(owner == Network::my_node_id) { + local_finalizations.push_back(impl); + } else { + ActiveMessage::RemoteGpuFinalizeMessage> + 
amsg(owner); + amsg->sparsity = mapOpj; + amsg->num_entries = 0; + amsg->num_approx = 0; + amsg.commit(); + } } } CUDA_CHECK(cudaStreamSynchronize(stream), stream); - for (auto const& elem : ctr) { - SparsityMapImpl *impl = SparsityMapImpl::lookup(getMap(elem)); + for (SparsityMapImpl *impl : local_finalizations) { impl->gpu_finalize(); } } @@ -1797,4 +1847,4 @@ namespace Realm { } -} \ No newline at end of file +} diff --git a/src/realm/deppart/preimage.cc b/src/realm/deppart/preimage.cc index 9ac7d85606..e283a3ec47 100644 --- a/src/realm/deppart/preimage.cc +++ b/src/realm/deppart/preimage.cc @@ -341,7 +341,8 @@ namespace Realm { parent(_parent), domain_transform(_domain_transform), overlap_tester(0), - dummy_overlap_uop(0) { + dummy_overlap_uop(0), + exclusive_gpu_owner(exclusive_gpu_exec_node()) { areg.force_instantiation(); } @@ -351,6 +352,33 @@ namespace Realm { delete overlap_tester; } + template + NodeID PreimageOperation::exclusive_gpu_exec_node(void) const { + size_t gpu_ptrs = 0, gpu_rects = 0, cpu_ptrs = 0, cpu_rects = 0; + for(size_t i = 0; i < domain_transform.ptr_data.size(); i++) { + Memory::Kind kind = domain_transform.ptr_data[i].inst.get_location().kind(); + if((kind == Memory::GPU_FB_MEM) || (kind == Memory::Z_COPY_MEM)) + gpu_ptrs++; + else + cpu_ptrs++; + } + for(size_t i = 0; i < domain_transform.range_data.size(); i++) { + Memory::Kind kind = domain_transform.range_data[i].inst.get_location().kind(); + if((kind == Memory::GPU_FB_MEM) || (kind == Memory::Z_COPY_MEM)) + gpu_rects++; + else + cpu_rects++; + } + size_t opcount = gpu_ptrs + gpu_rects + cpu_ptrs + cpu_rects; + if((gpu_ptrs + gpu_rects) == 0 || (opcount != 1)) + return -1; + if(gpu_ptrs == 1) + return ID(domain_transform.ptr_data[0].inst).instance_owner_node(); + if(gpu_rects == 1) + return ID(domain_transform.range_data[0].inst).instance_owner_node(); + return -1; + } + template IndexSpace PreimageOperation::add_target(const IndexSpace &target) { // try to filter out obviously 
empty targets @@ -364,7 +392,9 @@ namespace Realm { // if the target has a sparsity map, use the same node - otherwise // get a sparsity ID by round-robin'ing across the nodes that have field data int target_node; - if (!target.dense()) + if (exclusive_gpu_owner >= 0) + target_node = exclusive_gpu_owner; + else if (!target.dense()) target_node = ID(target.sparsity).sparsity_creator_node(); else if (!domain_transform.ptr_data.empty()) target_node = @@ -378,8 +408,10 @@ namespace Realm { .range_data[targets.size() % domain_transform.range_data.size()] .inst) .instance_owner_node(); - SparsityMap sparsity = get_runtime()->get_available_sparsity_impl(target_node)->me.convert >(); + if (exclusive_gpu_owner >= 0) + assert(target_node == exclusive_gpu_exec_node()); + SparsityMap sparsity = + create_deppart_output_sparsity(target_node).convert>(); preimage.sparsity = sparsity; targets.push_back(target); @@ -818,13 +850,80 @@ namespace Realm { IndexSpace _parent_space, bool _exclusive) : domain_transform(_domain_transform), parent_space(_parent_space) { this->exclusive = _exclusive; - Memory my_mem = domain_transform.ptr_data.empty() ? domain_transform.range_data[0].inst.get_location() : domain_transform.ptr_data[0].inst.get_location(); - Processor best_proc; - assert(choose_proc(best_proc, my_mem)); - Cuda::GPUProcessor* gpu_proc = dynamic_cast(get_runtime()->get_processor_impl(best_proc)); - assert(gpu_proc); - this->gpu = gpu_proc->gpu; - this->stream = gpu_proc->gpu->get_deppart_stream(); + areg.force_instantiation(); + // GPU setup (this->gpu, this->stream) deferred to execute(), which runs on the + // correct node after dispatch() has forwarded to the instance owner if needed. 
+ } + + template + template + GPUPreimageMicroOp::GPUPreimageMicroOp( + NodeID _requestor, AsyncMicroOp *_async_microop, S& s) + : GPUMicroOp(_requestor, _async_microop) { + bool ok = true; + // domain_transform is always UNSTRUCTURED; only one of ptr_data/range_data + // is populated — a single bool distinguishes the two cases. + bool use_ptr_data = false; + ok = ok && (s >> use_ptr_data); + if(use_ptr_data) { + domain_transform.type = + DomainTransform::DomainTransformType::UNSTRUCTURED_PTR; + size_t np = 0; + ok = ok && (s >> np); + domain_transform.ptr_data.resize(np); + for(size_t i = 0; i < np && ok; i++) + ok = ok && (s >> domain_transform.ptr_data[i].index_space) && + (s >> domain_transform.ptr_data[i].inst) && + (s >> domain_transform.ptr_data[i].field_offset) && + (s >> domain_transform.ptr_data[i].scratch_buffer); + } else { + domain_transform.type = + DomainTransform::DomainTransformType::UNSTRUCTURED_RANGE; + size_t nr = 0; + ok = ok && (s >> nr); + domain_transform.range_data.resize(nr); + for(size_t i = 0; i < nr && ok; i++) + ok = ok && (s >> domain_transform.range_data[i].index_space) && + (s >> domain_transform.range_data[i].inst) && + (s >> domain_transform.range_data[i].field_offset) && + (s >> domain_transform.range_data[i].scratch_buffer); + } + ok = ok && (s >> parent_space); + ok = ok && (s >> this->exclusive); + ok = ok && (s >> targets); + ok = ok && (s >> sparsity_outputs); + assert(ok); + (void)ok; + } + + template + template + bool GPUPreimageMicroOp::serialize_params(S& s) const { + bool ok = true; + // domain_transform is always UNSTRUCTURED; only one of ptr_data/range_data + // is populated — a single bool distinguishes the two cases. 
+ bool use_ptr_data = !domain_transform.ptr_data.empty(); + ok = ok && (s << use_ptr_data); + if(use_ptr_data) { + ok = ok && (s << domain_transform.ptr_data.size()); + for(size_t i = 0; i < domain_transform.ptr_data.size() && ok; i++) + ok = ok && (s << domain_transform.ptr_data[i].index_space) && + (s << domain_transform.ptr_data[i].inst) && + (s << domain_transform.ptr_data[i].field_offset) && + (s << domain_transform.ptr_data[i].scratch_buffer); + } else { + ok = ok && (s << domain_transform.range_data.size()); + for(size_t i = 0; i < domain_transform.range_data.size() && ok; i++) + ok = ok && (s << domain_transform.range_data[i].index_space) && + (s << domain_transform.range_data[i].inst) && + (s << domain_transform.range_data[i].field_offset) && + (s << domain_transform.range_data[i].scratch_buffer); + } + ok = ok && (s << parent_space); + ok = ok && (s << this->exclusive); + ok = ok && (s << targets); + ok = ok && (s << sparsity_outputs); + return ok; } template @@ -841,6 +940,20 @@ namespace Realm { template void GPUPreimageMicroOp::execute(void) { TimeStamp ts("GPUPreimageMicroOp::execute", true, &log_uop_timing); + // Resolve the local GPU processor now that we are guaranteed to be on the + // correct node (dispatch() forwarded us here if the instance was remote). + { + Memory my_mem = domain_transform.ptr_data.empty() ? 
+ domain_transform.range_data[0].inst.get_location() : + domain_transform.ptr_data[0].inst.get_location(); + Processor best_proc; + assert(choose_proc(best_proc, my_mem)); + Cuda::GPUProcessor *gpu_proc = + dynamic_cast(get_runtime()->get_processor_impl(best_proc)); + assert(gpu_proc); + this->gpu = gpu_proc->gpu; + this->stream = gpu_proc->gpu->get_deppart_stream(); + } Cuda::AutoGPUContext agc(this->gpu); if (domain_transform.ptr_data.size() > 0) { gpu_populate_bitmasks(); @@ -852,6 +965,40 @@ namespace Realm { template void GPUPreimageMicroOp::dispatch( PartitioningOperation *op, bool inline_ok) { + // GPU preimage must execute on the node that owns the GPU memory + NodeID exec_node = domain_transform.ptr_data.empty() ? + ID(domain_transform.range_data[0].inst).instance_owner_node() : + ID(domain_transform.ptr_data[0].inst).instance_owner_node(); + if(this->exclusive) { + for(size_t i = 0; i < sparsity_outputs.size(); i++) + assert(NodeID(ID(sparsity_outputs[i]).sparsity_creator_node()) == exec_node); + } + if(exec_node != Network::my_node_id) { + PartitioningMicroOp::template forward_microop >(exec_node, op, this); + return; + } + + for (size_t i = 0; i < domain_transform.ptr_data.size(); i++) { + IndexSpace inst_space = domain_transform.ptr_data[i].index_space; + if (!inst_space.dense()) { + // it's safe to add the count after the registration only because we initialized + // the count to 2 instead of 1 + bool registered = SparsityMapImpl::lookup(inst_space.sparsity)->add_waiter(this, true /*precise*/); + if(registered) + this->wait_count.fetch_add(1); + } + } + for (size_t i = 0; i < domain_transform.range_data.size(); i++) { + IndexSpace inst_space = domain_transform.range_data[i].index_space; + if (!inst_space.dense()) { + // it's safe to add the count after the registration only because we initialized + // the count to 2 instead of 1 + bool registered = SparsityMapImpl::lookup(inst_space.sparsity)->add_waiter(this, true /*precise*/); + if(registered) + 
this->wait_count.fetch_add(1); + } + } + // need valid data for each target for (size_t i = 0; i < targets.size(); i++) { if (!targets[i].dense()) { @@ -874,8 +1021,13 @@ namespace Realm { this->finish_dispatch(op, inline_ok); } + + template + ActiveMessageHandlerReg > > + GPUPreimageMicroOp::areg; + #endif // instantiations of templates handled in preimage_tmpl.cc -}; // namespace Realm \ No newline at end of file +}; // namespace Realm diff --git a/src/realm/deppart/preimage.h b/src/realm/deppart/preimage.h index ed301ad51e..01032d2517 100644 --- a/src/realm/deppart/preimage.h +++ b/src/realm/deppart/preimage.h @@ -100,6 +100,7 @@ namespace Realm { protected: static ActiveMessageHandlerReg > > areg; + NodeID exclusive_gpu_exec_node(void) const; IndexSpace parent; DomainTransform domain_transform; @@ -111,6 +112,7 @@ namespace Realm { atomic remaining_sparse_images; std::vector > contrib_counts; AsyncMicroOp *dummy_overlap_uop; + int exclusive_gpu_owner; }; template @@ -175,6 +177,16 @@ namespace Realm { void dispatch(PartitioningOperation *op, bool inline_ok); protected: + friend struct RemoteMicroOpMessage >; + static ActiveMessageHandlerReg > > areg; + + friend class PartitioningMicroOp; + template + REALM_ATTR_WARN_UNUSED(bool serialize_params(S& s) const); + + // construct from received packet + template + GPUPreimageMicroOp(NodeID _requestor, AsyncMicroOp *_async_microop, S& s); void gpu_populate_ranges(); void gpu_populate_bitmasks(); @@ -189,4 +201,4 @@ namespace Realm { }; // namespace Realm -#endif // REALM_DEPPART_PREIMAGE_H \ No newline at end of file +#endif // REALM_DEPPART_PREIMAGE_H diff --git a/src/realm/deppart/preimage_gpu_impl.hpp b/src/realm/deppart/preimage_gpu_impl.hpp index 2a93136921..6934772fe4 100644 --- a/src/realm/deppart/preimage_gpu_impl.hpp +++ b/src/realm/deppart/preimage_gpu_impl.hpp @@ -12,6 +12,7 @@ namespace Realm { template void GPUPreimageMicroOp::gpu_populate_ranges() { if (targets.size() == 0) { + 
assert(sparsity_outputs.empty()); return; } @@ -57,13 +58,15 @@ namespace Realm { GPUMicroOp::collapse_multi_space(targets, target_space, buffer_arena, stream); - Memory zcpy_mem; - assert(find_memory(zcpy_mem, Memory::Z_COPY_MEM, buffer_arena.location)); - RegionInstance accessors_instance = this->realm_malloc(domain_transform.range_data.size() * sizeof(AffineAccessor,N,T>), zcpy_mem); - AffineAccessor,N,T>* d_accessors = reinterpret_cast,N,T>*>(AffineAccessor(accessors_instance, 0).base); + std::vector,N,T>> h_accessors(domain_transform.range_data.size()); for (size_t i = 0; i < domain_transform.range_data.size(); ++i) { - d_accessors[i] = AffineAccessor,N,T>(domain_transform.range_data[i].inst, domain_transform.range_data[i].field_offset); + h_accessors[i] = AffineAccessor,N,T>(domain_transform.range_data[i].inst, domain_transform.range_data[i].field_offset); } + AffineAccessor,N,T>* d_accessors = + buffer_arena.alloc,N,T>>(domain_transform.range_data.size()); + CUDA_CHECK(cudaMemcpyAsync(d_accessors, h_accessors.data(), + domain_transform.range_data.size() * sizeof(AffineAccessor,N,T>), + cudaMemcpyHostToDevice, stream), stream); uint32_t* d_target_counters = buffer_arena.alloc(2*targets.size() + 1); uint32_t* d_targets_prefix = d_target_counters + targets.size(); @@ -78,7 +81,7 @@ namespace Realm { int count = 0; if (count) {} bool host_fallback = false; - std::vector h_instances(targets.size(), RegionInstance::NO_INST); + std::vector*> host_rect_buffers(targets.size(), nullptr); std::vector entry_counts(targets.size(), 0); while (num_completed < inst_space.num_entries) { try { @@ -214,7 +217,7 @@ namespace Realm { }); if (host_fallback) { - this->split_output(d_new_rects, num_new_rects, h_instances, entry_counts, buffer_arena); + this->split_output(d_new_rects, num_new_rects, host_rect_buffers, entry_counts, buffer_arena); } if (num_output==0 || host_fallback) { @@ -263,7 +266,7 @@ namespace Realm { } else { host_fallback = true; if (num_output > 0) { - 
this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); + this->split_output(output_start, num_output, host_rect_buffers, entry_counts, buffer_arena); } curr_tile = tile_size / 2; } @@ -294,7 +297,7 @@ namespace Realm { return elem; }); } catch (arena_oom&) { - this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); + this->split_output(output_start, num_output, host_rect_buffers, entry_counts, buffer_arena); host_fallback = true; } } @@ -306,10 +309,9 @@ namespace Realm { impl->set_contributor_count(1); } if (entry_counts[idx] > 0) { - Rect* h_rects = reinterpret_cast *>(AffineAccessor(h_instances[idx], 0).base); - span> h_rects_span(h_rects, entry_counts[idx]); + span> h_rects_span(host_rect_buffers[idx], entry_counts[idx]); impl->contribute_dense_rect_list(h_rects_span, true); - h_instances[idx].destroy(); + deppart_host_free(host_rect_buffers[idx]); } else { impl->contribute_nothing(); } @@ -320,6 +322,7 @@ namespace Realm { template void GPUPreimageMicroOp::gpu_populate_bitmasks() { if (targets.size() == 0) { + assert(sparsity_outputs.empty()); return; } @@ -365,13 +368,15 @@ namespace Realm { GPUMicroOp::collapse_multi_space(targets, target_space, buffer_arena, stream); - Memory zcpy_mem; - assert(find_memory(zcpy_mem, Memory::Z_COPY_MEM, buffer_arena.location)); - RegionInstance accessors_instance = this->realm_malloc(domain_transform.ptr_data.size() * sizeof(AffineAccessor,N,T>), zcpy_mem); - AffineAccessor,N,T>* d_accessors = reinterpret_cast,N,T>*>(AffineAccessor(accessors_instance, 0).base); + std::vector,N,T>> h_accessors(domain_transform.ptr_data.size()); for (size_t i = 0; i < domain_transform.ptr_data.size(); ++i) { - d_accessors[i] = AffineAccessor,N,T>(domain_transform.ptr_data[i].inst, domain_transform.ptr_data[i].field_offset); + h_accessors[i] = AffineAccessor,N,T>(domain_transform.ptr_data[i].inst, domain_transform.ptr_data[i].field_offset); } + AffineAccessor,N,T>* d_accessors = + 
buffer_arena.alloc,N,T>>(domain_transform.ptr_data.size()); + CUDA_CHECK(cudaMemcpyAsync(d_accessors, h_accessors.data(), + domain_transform.ptr_data.size() * sizeof(AffineAccessor,N,T>), + cudaMemcpyHostToDevice, stream), stream); uint32_t* d_target_counters = buffer_arena.alloc(2*targets.size() + 1); uint32_t* d_targets_prefix = d_target_counters + targets.size(); @@ -386,7 +391,7 @@ namespace Realm { int count = 0; if (count) {} bool host_fallback = false; - std::vector h_instances(targets.size(), RegionInstance::NO_INST); + std::vector*> host_rect_buffers(targets.size(), nullptr); std::vector entry_counts(targets.size(), 0); while (num_completed < inst_space.num_entries) { try { @@ -522,7 +527,7 @@ namespace Realm { }); if (host_fallback) { - this->split_output(d_new_rects, num_new_rects, h_instances, entry_counts, buffer_arena); + this->split_output(d_new_rects, num_new_rects, host_rect_buffers, entry_counts, buffer_arena); } if (num_output==0 || host_fallback) { @@ -571,7 +576,7 @@ namespace Realm { } else { host_fallback = true; if (num_output > 0) { - this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); + this->split_output(output_start, num_output, host_rect_buffers, entry_counts, buffer_arena); } curr_tile = tile_size / 2; } @@ -602,7 +607,7 @@ namespace Realm { return elem; }); } catch (arena_oom&) { - this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); + this->split_output(output_start, num_output, host_rect_buffers, entry_counts, buffer_arena); host_fallback = true; } } @@ -614,14 +619,13 @@ namespace Realm { impl->set_contributor_count(1); } if (entry_counts[idx] > 0) { - Rect* h_rects = reinterpret_cast *>(AffineAccessor(h_instances[idx], 0).base); - span> h_rects_span(h_rects, entry_counts[idx]); + span> h_rects_span(host_rect_buffers[idx], entry_counts[idx]); impl->contribute_dense_rect_list(h_rects_span, true); - h_instances[idx].destroy(); + 
deppart_host_free(host_rect_buffers[idx]); } else { impl->contribute_nothing(); } } } } -} \ No newline at end of file +} diff --git a/src/realm/deppart/sparsity_impl.cc b/src/realm/deppart/sparsity_impl.cc index a1a511b744..20c655a62c 100644 --- a/src/realm/deppart/sparsity_impl.cc +++ b/src/realm/deppart/sparsity_impl.cc @@ -25,9 +25,181 @@ #include "realm/deppart/rectlist.h" #include "realm/deppart/inst_helper.h" #include "realm/logging.h" +#include "realm/machine.h" +#ifdef REALM_USE_CUDA +#include +#endif +#include +#include +#include +#include +#include +#include namespace Realm { + namespace { + struct PendingOutputSparsityAllocation { + std::mutex mutex; + std::condition_variable cv; + ID result{ID::ID_NULL}; + bool ready{false}; + }; + + atomic next_output_sparsity_request{1}; + std::mutex pending_output_sparsity_mutex; + std::unordered_map + pending_output_sparsity_allocations; + + struct OutputSparsityAllocationRequest { + uint64_t request_id; + + static void handle_message(NodeID sender, + const OutputSparsityAllocationRequest &msg, + const void *data, + size_t datalen); + }; + + struct OutputSparsityAllocationResponse { + uint64_t request_id; + ID sparsity; + + static void handle_message(NodeID sender, + const OutputSparsityAllocationResponse &msg, + const void *data, + size_t datalen); + }; + + ActiveMessageHandlerReg + output_sparsity_allocation_request_reg; + ActiveMessageHandlerReg + output_sparsity_allocation_response_reg; + + template + inline T *deppart_gpu_host_alloc(size_t count) + { + if(count == 0) return nullptr; +#ifdef REALM_USE_CUDA + void *ptr = nullptr; + cudaError_t err = cudaHostAlloc(&ptr, count * sizeof(T), cudaHostAllocPortable); + assert(err == cudaSuccess); + return reinterpret_cast(ptr); +#else + return static_cast(std::malloc(count * sizeof(T))); +#endif + } + + inline void deppart_gpu_host_free(void *ptr) + { + if(ptr == nullptr) return; +#ifdef REALM_USE_CUDA + cudaError_t err = cudaFreeHost(ptr); + assert(err == 
cudaSuccess); +#else + std::free(ptr); +#endif + } + + inline bool deppart_sparsity_trace_enabled(void) + { + static int enabled = -1; + if(enabled < 0) + enabled = (std::getenv("REALM_DEPPART_SPARSITY_TRACE") != nullptr) ? 1 : 0; + return (enabled == 1); + } + + inline void deppart_sparsity_trace(const char *tag, + ::realm_id_t sparsity, + NodeID owner, + NodeID node, + int remaining_contrib, + int total_pieces, + int remaining_pieces, + size_t extra0 = 0, + size_t extra1 = 0) + { + if(!deppart_sparsity_trace_enabled()) + return; + std::fprintf(stderr, + "[deppart-trace] %s map=%llx owner=%d node=%d rem_contrib=%d " + "total_pieces=%d rem_pieces=%d extra0=%zu extra1=%zu\n", + tag, + static_cast(sparsity), + owner, + node, + remaining_contrib, + total_pieces, + remaining_pieces, + extra0, + extra1); + std::fflush(stderr); + } + } + + ID create_deppart_output_sparsity(NodeID target_node) + { + if(target_node == Network::my_node_id) { + SparsityMapImplWrapper *wrap = + get_runtime()->get_available_sparsity_impl(target_node); + wrap->add_references(1); + return ID(wrap->me); + } + + PendingOutputSparsityAllocation pending; + uint64_t request_id = next_output_sparsity_request.fetch_add(1); + { + std::lock_guard lock(pending_output_sparsity_mutex); + pending_output_sparsity_allocations.emplace(request_id, &pending); + } + + ActiveMessage amsg(target_node); + amsg->request_id = request_id; + amsg.commit(); + + std::unique_lock lock(pending.mutex); + pending.cv.wait(lock, [&pending]() { return pending.ready; }); + return pending.result; + } + + void OutputSparsityAllocationRequest::handle_message( + NodeID sender, + const OutputSparsityAllocationRequest &msg, + const void *data, + size_t datalen) + { + SparsityMapImplWrapper *wrap = + get_runtime()->get_available_sparsity_impl(Network::my_node_id); + wrap->add_references(1); + + ActiveMessage amsg(sender); + amsg->request_id = msg.request_id; + amsg->sparsity = wrap->me; + amsg.commit(); + } + + void 
OutputSparsityAllocationResponse::handle_message( + NodeID sender, + const OutputSparsityAllocationResponse &msg, + const void *data, + size_t datalen) + { + PendingOutputSparsityAllocation *pending = nullptr; + { + std::lock_guard lock(pending_output_sparsity_mutex); + auto it = pending_output_sparsity_allocations.find(msg.request_id); + assert(it != pending_output_sparsity_allocations.end()); + pending = it->second; + pending_output_sparsity_allocations.erase(it); + } + + { + std::lock_guard lock(pending->mutex); + pending->result = msg.sparsity; + pending->ready = true; + } + pending->cv.notify_one(); + } + extern Logger log_part; //////////////////////////////////////////////////////////////////////// @@ -1233,13 +1405,8 @@ bool SparsityMapPublicImpl::bvh_centroid_less(int axis, template SparsityMapImpl::~SparsityMapImpl(void) { - //We are responsible for our instances - //if (this->entries_instance.exists()) { - // this->entries_instance.destroy(); - //} - //if (this->approx_instance.exists()) { - // this->approx_instance.destroy(); - //} + deppart_gpu_host_free(this->gpu_entries); + deppart_gpu_host_free(this->gpu_approx_rects); } template @@ -1324,6 +1491,14 @@ SparsityMapImpl::~SparsityMapImpl(void) template void SparsityMapImpl::set_contributor_count(int count) { + deppart_sparsity_trace("set_contributor_count.enter", + me.id, + ID(me).sparsity_creator_node(), + Network::my_node_id, + remaining_contributor_count.load(), + total_piece_count.load(), + remaining_piece_count.load(), + count); if(NodeID(ID(me).sparsity_creator_node()) == Network::my_node_id) { // increment the count atomically - if it brings the total up to 0 // (which covers count == 0), immediately propagate the total piece @@ -1341,8 +1516,23 @@ SparsityMapImpl::~SparsityMapImpl(void) } } else { // send the contributor count to the owner node - sparsity_comm->send_contribute(me, count, 0, false); + // NOTE: must use SetContribCountMessage, not send_contribute! 
+ // send_contribute arrives as contribute_raw_rects which DECREMENTS + // remaining_contributor_count by 1 (treating it as one contributor's piece), + // but set_contributor_count should INCREMENT by count. + ActiveMessage amsg(ID(me).sparsity_creator_node()); + amsg->sparsity = me; + amsg->count = count; + amsg.commit(); } + deppart_sparsity_trace("set_contributor_count.exit", + me.id, + ID(me).sparsity_creator_node(), + Network::my_node_id, + remaining_contributor_count.load(), + total_piece_count.load(), + remaining_piece_count.load(), + count); } template @@ -1410,6 +1600,13 @@ SparsityMapImpl::~SparsityMapImpl(void) template void SparsityMapImpl::contribute_nothing(void) { + deppart_sparsity_trace("contribute_nothing.enter", + me.id, + ID(me).sparsity_creator_node(), + Network::my_node_id, + remaining_contributor_count.load(), + total_piece_count.load(), + remaining_piece_count.load()); NodeID owner = ID(me).sparsity_creator_node(); if(owner != Network::my_node_id) { @@ -1432,6 +1629,13 @@ SparsityMapImpl::~SparsityMapImpl(void) if(have_all_pieces) finalize(); } + deppart_sparsity_trace("contribute_nothing.exit", + me.id, + ID(me).sparsity_creator_node(), + Network::my_node_id, + remaining_contributor_count.load(), + total_piece_count.load(), + remaining_piece_count.load()); } template @@ -1490,6 +1694,15 @@ SparsityMapImpl::~SparsityMapImpl(void) size_t piece_count, bool disjoint, size_t total_count) { + deppart_sparsity_trace("contribute_raw_rects.enter", + me.id, + ID(me).sparsity_creator_node(), + Network::my_node_id, + remaining_contributor_count.load(), + total_piece_count.load(), + remaining_piece_count.load(), + count, + piece_count); if(count > 0) { AutoLock<> al(mutex); @@ -1727,6 +1940,15 @@ SparsityMapImpl::~SparsityMapImpl(void) finalize(); } + deppart_sparsity_trace("contribute_raw_rects.exit", + me.id, + ID(me).sparsity_creator_node(), + Network::my_node_id, + remaining_contributor_count.load(), + total_piece_count.load(), + 
remaining_piece_count.load(), + count, + piece_count); } // adds a microop as a waiter for valid sparsity map data - returns true @@ -1735,6 +1957,14 @@ SparsityMapImpl::~SparsityMapImpl(void) template bool SparsityMapImpl::add_waiter(PartitioningMicroOp *uop, bool precise) { + deppart_sparsity_trace("add_waiter.enter", + me.id, + ID(me).sparsity_creator_node(), + Network::my_node_id, + remaining_contributor_count.load(), + total_piece_count.load(), + remaining_piece_count.load(), + precise ? 1 : 0); // early out if(precise ? this->entries_valid.load_acquire() : this->approx_valid.load_acquire()) @@ -1784,6 +2014,15 @@ SparsityMapImpl::~SparsityMapImpl(void) sparsity_comm->send_request(me, request_precise, request_approx); } + deppart_sparsity_trace("add_waiter.exit", + me.id, + ID(me).sparsity_creator_node(), + Network::my_node_id, + remaining_contributor_count.load(), + total_piece_count.load(), + remaining_piece_count.load(), + precise ? 1 : 0, + registered ? 1 : 0); return registered; } @@ -1827,6 +2066,15 @@ SparsityMapImpl::~SparsityMapImpl(void) void SparsityMapImpl::remote_data_reply(NodeID requestor, bool reply_precise, bool reply_approx) { + deppart_sparsity_trace("remote_data_reply.enter", + me.id, + ID(me).sparsity_creator_node(), + Network::my_node_id, + remaining_contributor_count.load(), + total_piece_count.load(), + remaining_piece_count.load(), + reply_precise ? 1 : 0, + reply_approx ? 1 : 0); if(reply_approx) { // TODO if(!this->approx_valid.load_acquire()) @@ -1879,6 +2127,15 @@ SparsityMapImpl::~SparsityMapImpl(void) sparsity_comm->send_contribute(requestor, me, num_pieces + 1, total_count, /*disjoint=*/true, rdata, bytes); } + deppart_sparsity_trace("remote_data_reply.exit", + me.id, + ID(me).sparsity_creator_node(), + Network::my_node_id, + remaining_contributor_count.load(), + total_piece_count.load(), + remaining_piece_count.load(), + reply_precise ? 1 : 0, + reply_approx ? 
1 : 0); } template @@ -2039,6 +2296,14 @@ SparsityMapImpl::~SparsityMapImpl(void) template void SparsityMapImpl::finalize(void) { + deppart_sparsity_trace("finalize.enter", + me.id, + ID(me).sparsity_creator_node(), + Network::my_node_id, + remaining_contributor_count.load(), + total_piece_count.load(), + remaining_piece_count.load(), + this->entries.size()); this->from_gpu = false; @@ -2180,6 +2445,15 @@ SparsityMapImpl::~SparsityMapImpl(void) if(trigger_precise.exists()) GenEventImpl::trigger(trigger_precise, false /*!poisoned*/); + deppart_sparsity_trace("finalize.exit", + me.id, + ID(me).sparsity_creator_node(), + Network::my_node_id, + remaining_contributor_count.load(), + total_piece_count.load(), + remaining_piece_count.load(), + this->entries.size()); + } @@ -2189,7 +2463,16 @@ SparsityMapImpl::~SparsityMapImpl(void) template void SparsityMapImpl::gpu_finalize(void) { - this->from_gpu = true; + deppart_sparsity_trace("gpu_finalize.enter", + me.id, + ID(me).sparsity_creator_node(), + Network::my_node_id, + remaining_contributor_count.load(), + total_piece_count.load(), + remaining_piece_count.load(), + this->num_entries, + this->num_approx); + this->from_gpu = ((this->gpu_entries != nullptr) || (this->gpu_approx_rects != nullptr)); if(true /*ID(me).sparsity_creator_node() == Network::my_node_id*/) { assert(!this->approx_valid.load()); @@ -2273,22 +2556,33 @@ SparsityMapImpl::~SparsityMapImpl(void) if(trigger_precise.exists()) GenEventImpl::trigger(trigger_precise, false /*!poisoned*/); + deppart_sparsity_trace("gpu_finalize.exit", + me.id, + ID(me).sparsity_creator_node(), + Network::my_node_id, + remaining_contributor_count.load(), + total_piece_count.load(), + remaining_piece_count.load(), + this->num_entries, + this->num_approx); } - //Allows a GPU deppart client to set the entries directly with a host region instance template - void SparsityMapImpl::set_instance(RegionInstance _entries_instance, size_t size) + void 
SparsityMapImpl::set_gpu_entries(SparsityMapEntry *entries, size_t size) { - this->entries_instance = _entries_instance; + deppart_gpu_host_free(this->gpu_entries); + this->gpu_entries = entries; + this->entries.clear(); this->num_entries = size; } - //Allows a GPU deppart client to set the approx rects directly with a host region instance template - void SparsityMapImpl::set_approx_instance(RegionInstance _approx_instance, size_t size) + void SparsityMapImpl::set_gpu_approx_rects(Rect *approx_rects, size_t size) { - this->approx_instance = _approx_instance; + deppart_gpu_host_free(this->gpu_approx_rects); + this->gpu_approx_rects = approx_rects; + this->approx_rects.clear(); this->num_approx = size; } @@ -2304,6 +2598,10 @@ SparsityMapImpl::~SparsityMapImpl(void) /*static*/ ActiveMessageHandlerReg< typename SparsityMapImpl::SetContribCountMessage> SparsityMapImpl::set_contrib_count_msg_reg; + template + /*static*/ ActiveMessageHandlerReg< + typename SparsityMapImpl::RemoteGpuFinalizeMessage> + SparsityMapImpl::remote_gpu_finalize_msg_reg; /*static*/ ActiveMessageHandlerReg< typename SparsityMapRefCounter::SparsityMapAddReferenceMessage> @@ -2361,6 +2659,42 @@ SparsityMapImpl::~SparsityMapImpl(void) SparsityMapImpl::lookup(msg.sparsity)->set_contributor_count(msg.count); } + //////////////////////////////////////////////////////////////////////// + // + // class SparsityMapImpl::RemoteGpuFinalizeMessage + + template + inline /*static*/ void SparsityMapImpl::RemoteGpuFinalizeMessage::handle_message( + NodeID sender, const SparsityMapImpl::RemoteGpuFinalizeMessage &msg, + const void *data, size_t datalen) + { + size_t expected = (msg.num_entries * sizeof(SparsityMapEntry)) + + (msg.num_approx * sizeof(Rect)); + assert(datalen == expected); + (void)sender; + + const char *payload = static_cast(data); + SparsityMapImpl *impl = SparsityMapImpl::lookup(msg.sparsity); + + if(msg.num_entries > 0) { + SparsityMapEntry *entries = deppart_gpu_host_alloc>(msg.num_entries); + 
std::memcpy(entries, payload, msg.num_entries * sizeof(SparsityMapEntry)); + impl->set_gpu_entries(entries, msg.num_entries); + payload += msg.num_entries * sizeof(SparsityMapEntry); + } else { + impl->set_gpu_entries(nullptr, 0); + } + + if(msg.num_approx > 0) { + Rect *approx = deppart_gpu_host_alloc>(msg.num_approx); + std::memcpy(approx, payload, msg.num_approx * sizeof(Rect)); + impl->set_gpu_approx_rects(approx, msg.num_approx); + } else { + impl->set_gpu_approx_rects(nullptr, 0); + } + impl->gpu_finalize(); + } + #define DOIT(N, T) \ template class SparsityMapPublicImpl; \ template class SparsityMapImpl; \ diff --git a/src/realm/deppart/sparsity_impl.h b/src/realm/deppart/sparsity_impl.h index f9656e65b6..aa94d7200f 100644 --- a/src/realm/deppart/sparsity_impl.h +++ b/src/realm/deppart/sparsity_impl.h @@ -33,6 +33,9 @@ namespace Realm { + REALM_INTERNAL_API_EXTERNAL_LINKAGE + ID create_deppart_output_sparsity(NodeID target_node); + class PartitioningMicroOp; /** @@ -139,8 +142,8 @@ namespace Realm { void remote_data_request(NodeID requestor, bool send_precise, bool send_approx); void remote_data_reply(NodeID requestor, bool send_precise, bool send_approx); - void set_instance(RegionInstance _entries_instance, size_t size); - void set_approx_instance(RegionInstance _approx_instance, size_t size); + void set_gpu_entries(SparsityMapEntry *entries, size_t size); + void set_gpu_approx_rects(Rect *approx_rects, size_t size); void gpu_finalize(void); SparsityMap me; @@ -174,12 +177,22 @@ namespace Realm { const void *data, size_t datalen); }; + struct RemoteGpuFinalizeMessage { + SparsityMap sparsity; + size_t num_entries; + size_t num_approx; + + static void handle_message(NodeID sender, const RemoteGpuFinalizeMessage &msg, + const void *data, size_t datalen); + }; + protected: void finalize(void); static ActiveMessageHandlerReg remote_sparsity_request_reg; static ActiveMessageHandlerReg remote_sparsity_contrib_reg; static ActiveMessageHandlerReg 
set_contrib_count_msg_reg; + static ActiveMessageHandlerReg remote_gpu_finalize_msg_reg; atomic remaining_contributor_count{0}; atomic total_piece_count{0}, remaining_piece_count{0}; diff --git a/src/realm/sparsity.h b/src/realm/sparsity.h index dc9fe74300..616f86f499 100644 --- a/src/realm/sparsity.h +++ b/src/realm/sparsity.h @@ -318,11 +318,11 @@ namespace Realm { std::vector > entries; std::vector > approx_rects; - //Stores rectangles for GPU deppart (allows fast copy after merged on GPU) - RegionInstance entries_instance = RegionInstance::NO_INST; + // Stores rectangles for GPU deppart in host buffers owned by the sparsity map. + SparsityMapEntry *gpu_entries = nullptr; size_t num_entries = 0; - RegionInstance approx_instance = RegionInstance::NO_INST; + Rect *gpu_approx_rects = nullptr; size_t num_approx = 0; //Tracks whether to use instance or vector diff --git a/src/realm/sparsity.inl b/src/realm/sparsity.inl index 7ff00ef552..60ffa41a70 100644 --- a/src/realm/sparsity.inl +++ b/src/realm/sparsity.inl @@ -91,10 +91,7 @@ namespace Realm { if (num_entries == 0) { return span>(); } - return span>( - reinterpret_cast *>(entries_instance.pointer_untyped( - 0, num_entries * sizeof(SparsityMapEntry))), - num_entries); + return span>(gpu_entries, num_entries); } else { return span>(entries.data(), entries.size()); } @@ -108,10 +105,7 @@ namespace Realm { if (num_approx == 0) { return span>(); } - return span>( - reinterpret_cast *>( - approx_instance.pointer_untyped(0, num_approx * sizeof(Rect))), - num_approx); + return span>(gpu_approx_rects, num_approx); } else { return span>(approx_rects.data(), approx_rects.size()); } diff --git a/tests/benchmark.cc b/tests/benchmark.cc index b0bed444e1..9277436a9f 100644 --- a/tests/benchmark.cc +++ b/tests/benchmark.cc @@ -1202,7 +1202,7 @@ class PreimageTest : public TestInterface { { NODE_SUBGRAPH_STREAM, }; - +ci // assign subgraph ids to nodes void chase_point(int idx, Point& color) { From 
15628d98ea7d460d6dd9650df2212f6f133c46be Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Tue, 24 Mar 2026 09:30:02 -0700 Subject: [PATCH 28/32] Export CPU_BVH for shared builds --- src/realm/sparsity.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/realm/sparsity.h b/src/realm/sparsity.h index 616f86f499..b16fce5ed7 100644 --- a/src/realm/sparsity.h +++ b/src/realm/sparsity.h @@ -155,7 +155,7 @@ namespace Realm { }; template - struct CPU_BVH { + struct REALM_INTERNAL_API_EXTERNAL_LINKAGE CPU_BVH { struct Node { Rect bounds; int left = -1; From 2301eb276dd7fe982f8cded734d3404b56f43388 Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Tue, 24 Mar 2026 09:45:47 -0700 Subject: [PATCH 29/32] Restore feature-gated source selection --- src/CMakeLists.txt | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c277a1b74d..df1132bbdb 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -69,7 +69,7 @@ set(REALM_SOURCES procset/procset_module.cc ) -if(TARGET CUDA::cuda_driver AND REALM_USE_CUDA) +if(REALM_USE_CUDA) list(APPEND REALM_SOURCES cuda/cuda_module.cc cuda/cuda_internal.cc cuda/cuda_access.cc) if(REALM_USE_NVTX) list(APPEND REALM_SOURCES nvtx.cc) @@ -77,15 +77,15 @@ if(TARGET CUDA::cuda_driver AND REALM_USE_CUDA) list(APPEND REALM_CUDA_SOURCES cuda/cuda_memcpy.cu) endif() -if(TARGET hip::host) +if(REALM_USE_HIP) list(APPEND REALM_SOURCES hip/hip_module.cc hip/hip_internal.cc hip/hip_access.cc) endif() -if(TARGET LLVM::LLVM) +if(REALM_USE_LLVM) list(APPEND REALM_SOURCES llvmjit/llvmjit_internal.cc llvmjit/llvmjit_module.cc) endif() -if(TARGET hdf5::hdf5) +if(REALM_USE_HDF5) list(APPEND REALM_SOURCES hdf5/hdf5_module.cc hdf5/hdf5_internal.cc hdf5/hdf5_access.cc) endif() @@ -100,11 +100,11 @@ if(REALM_USE_PREALM) list(APPEND REALM_SOURCES prealm/prealm.cc) endif() -if(TARGET Python3::Python) +if(REALM_USE_PYTHON) list(APPEND REALM_SOURCES 
python/python_module.cc python/python_source.cc) endif() -if(TARGET ucx::ucp) +if(REALM_USE_UCX) list( APPEND REALM_SOURCES @@ -119,12 +119,14 @@ if(TARGET ucx::ucp) ) endif() -if(TARGET GASNet::GASNet) - list(APPEND REALM_SOURCES gasnet1/gasnet1_module.cc gasnet1/gasnetmsg.cc) +if(REALM_USE_GASNETEX) + if(NOT REALM_ENABLE_GASNETEX_WRAPPER) + list(APPEND REALM_SOURCES gasnet1/gasnet1_module.cc gasnet1/gasnetmsg.cc) + endif() list(APPEND REALM_SOURCES gasnetex/gasnetex_module.cc gasnetex/gasnetex_internal.cc) endif() -if(TARGET MPI::MPI_CXX) +if(REALM_USE_MPI) list(APPEND REALM_SOURCES mpi/mpi_module.cc mpi/am_mpi.cc) endif() From 04df5861dd1131f969a70396772f0905329ae3b6 Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Wed, 25 Mar 2026 13:10:47 -0700 Subject: [PATCH 30/32] deppart: add pinned host pool and NVTX tracing --- src/realm/deppart/partitions.h | 25 +++++ src/realm/deppart/partitions_gpu_impl.hpp | 51 +++------ src/realm/deppart/sparsity_impl.cc | 129 ++++++++++++++++------ src/realm/deppart/sparsity_impl.h | 6 + 4 files changed, 146 insertions(+), 65 deletions(-) diff --git a/src/realm/deppart/partitions.h b/src/realm/deppart/partitions.h index 0af8ec0673..a6b3fe371f 100644 --- a/src/realm/deppart/partitions.h +++ b/src/realm/deppart/partitions.h @@ -42,6 +42,31 @@ typedef CUstream_st* CUstream; #endif +#ifdef REALM_USE_NVTX +#include "realm/nvtx.h" +#endif + +//NVTX macros to only add ranges if defined. 
+#ifdef REALM_USE_NVTX + +#include + +inline int32_t next_nvtx_payload() { + static std::atomic counter{0}; + return counter.fetch_add(1, std::memory_order_relaxed); +} + +#define NVTX_CAT2(a, b) a##b +#define NVTX_CAT(a, b) NVTX_CAT2(a, b) + +#define NVTX_DEPPART(message) \ + nvtxScopedRange NVTX_CAT(nvtx_, __LINE__)("cuda", #message, next_nvtx_payload()) + +#else + + #define NVTX_DEPPART(message) do { } while (0) + +#endif namespace Realm { diff --git a/src/realm/deppart/partitions_gpu_impl.hpp b/src/realm/deppart/partitions_gpu_impl.hpp index e293419b9a..e28195d550 100644 --- a/src/realm/deppart/partitions_gpu_impl.hpp +++ b/src/realm/deppart/partitions_gpu_impl.hpp @@ -1,9 +1,7 @@ #pragma once #include "deppart_config.h" #include "partitions.h" -#ifdef REALM_USE_NVTX -#include "realm/nvtx.h" -#endif + #include "realm/cuda/cuda_internal.h" #include "realm/deppart/partitions_gpu_kernels.hpp" #include @@ -47,44 +45,18 @@ } \ } while (0) - -//NVTX macros to only add ranges if defined. -#ifdef REALM_USE_NVTX - -#include - -inline int32_t next_nvtx_payload() { - static std::atomic counter{0}; - return counter.fetch_add(1, std::memory_order_relaxed); -} - -#define NVTX_CAT2(a, b) a##b -#define NVTX_CAT(a, b) NVTX_CAT2(a, b) - -#define NVTX_DEPPART(message) \ - nvtxScopedRange NVTX_CAT(nvtx_, __LINE__)("cuda", #message, next_nvtx_payload()) - -#else - - #define NVTX_DEPPART(message) do { } while (0) - -#endif - namespace Realm { template inline T *deppart_host_alloc(size_t count, unsigned flags = cudaHostAllocPortable) { - if(count == 0) return nullptr; - void *ptr = nullptr; - CUDA_HOST_CHECK(cudaHostAlloc(&ptr, count * sizeof(T), flags)); - return reinterpret_cast(ptr); + (void)flags; + return static_cast(deppart_pinned_host_alloc_bytes(count * sizeof(T))); } inline void deppart_host_free(void *ptr) { - if(ptr != nullptr) - CUDA_HOST_CHECK(cudaFreeHost(ptr)); + deppart_pinned_host_free(ptr); } // Used by cub::DeviceReduce to compute bad GPU approximation. 
@@ -1731,13 +1703,18 @@ namespace Realm { assert(find_memory(sysmem, Memory::SYSTEM_MEM, my_arena.location)); if (!this->exclusive) { for (auto const& elem : ctr) { + NVTX_DEPPART(cpu_finalize); size_t idx = getIndex(elem); auto mapOpj = getMap(elem); SparsityMapImpl *impl = SparsityMapImpl::lookup(mapOpj); if (d_ends_host[idx] > d_starts_host[idx]) { size_t end = d_ends_host[idx]; size_t start = d_starts_host[idx]; - Rect *h_rects = deppart_host_alloc>(end - start); + Rect * h_rects; + { + NVTX_DEPPART(rects_alloc); + h_rects = deppart_host_alloc>(end - start); + } CUDA_CHECK(cudaMemcpyAsync(h_rects, final_rects + start, (end - start) * sizeof(Rect), cudaMemcpyDeviceToHost, stream), stream); CUDA_CHECK(cudaStreamSynchronize(stream), stream); span> h_rects_span(h_rects, end - start); @@ -1753,6 +1730,7 @@ namespace Realm { //Use provided lambdas to iterate over sparsity output container (map or vector) for (auto const& elem : ctr) { + NVTX_DEPPART(gpu_finalize); size_t idx = getIndex(elem); auto mapOpj = getMap(elem); SparsityMapImpl *impl = SparsityMapImpl::lookup(mapOpj); @@ -1761,7 +1739,11 @@ namespace Realm { if (d_ends_host[idx] > d_starts_host[idx]) { size_t end = d_ends_host[idx]; size_t start = d_starts_host[idx]; - SparsityMapEntry *h_entries = deppart_host_alloc>(end - start); + SparsityMapEntry *h_entries; + { + NVTX_DEPPART(alloc_entries); + h_entries = deppart_host_alloc>(end - start); + } CUDA_CHECK(cudaMemcpyAsync(h_entries, final_entries + start, (end - start) * sizeof(SparsityMapEntry), cudaMemcpyDeviceToHost, stream), stream); Rect *approx_rects; @@ -1838,6 +1820,7 @@ namespace Realm { } } } + NVTX_DEPPART(cleanup); CUDA_CHECK(cudaStreamSynchronize(stream), stream); for (SparsityMapImpl *impl : local_finalizations) { impl->gpu_finalize(); diff --git a/src/realm/deppart/sparsity_impl.cc b/src/realm/deppart/sparsity_impl.cc index 20c655a62c..1314126d64 100644 --- a/src/realm/deppart/sparsity_impl.cc +++ b/src/realm/deppart/sparsity_impl.cc @@ -29,6 
+29,8 @@ #ifdef REALM_USE_CUDA #include #endif + + #include #include #include @@ -39,6 +41,77 @@ namespace Realm { namespace { + class DeppartPinnedHostPool { + public: + void *alloc(size_t bytes) + { + if(bytes == 0) + return nullptr; + + const size_t bucket_size = round_up(bytes); + void *ptr = nullptr; + { + std::lock_guard lock(mutex); + std::vector &bucket = free_blocks[bucket_size]; + if(!bucket.empty()) { + ptr = bucket.back(); + bucket.pop_back(); + } + } + + if(ptr == nullptr) { +#ifdef REALM_USE_CUDA + cudaError_t err = cudaHostAlloc(&ptr, bucket_size, cudaHostAllocPortable); + assert(err == cudaSuccess); +#else + ptr = std::malloc(bucket_size); + assert(ptr != nullptr); +#endif + } + + { + std::lock_guard lock(mutex); + live_blocks[ptr] = bucket_size; + } + return ptr; + } + + void release(void *ptr) + { + if(ptr == nullptr) + return; + + std::lock_guard lock(mutex); + auto it = live_blocks.find(ptr); + assert(it != live_blocks.end()); + free_blocks[it->second].push_back(ptr); + live_blocks.erase(it); + } + + private: + static size_t round_up(size_t bytes) + { + size_t rounded = 4096; + while((rounded < bytes) && (rounded < (size_t(1) << 30))) + rounded <<= 1; + if(rounded >= bytes) + return rounded; + + const size_t granularity = size_t(1) << 20; + return ((bytes + granularity - 1) / granularity) * granularity; + } + + std::mutex mutex; + std::unordered_map> free_blocks; + std::unordered_map live_blocks; + }; + + DeppartPinnedHostPool &get_deppart_pinned_host_pool(void) + { + static DeppartPinnedHostPool *pool = new DeppartPinnedHostPool(); + return *pool; + } + struct PendingOutputSparsityAllocation { std::mutex mutex; std::condition_variable cv; @@ -75,31 +148,6 @@ namespace Realm { ActiveMessageHandlerReg output_sparsity_allocation_response_reg; - template - inline T *deppart_gpu_host_alloc(size_t count) - { - if(count == 0) return nullptr; -#ifdef REALM_USE_CUDA - void *ptr = nullptr; - cudaError_t err = cudaHostAlloc(&ptr, count * sizeof(T), 
cudaHostAllocPortable); - assert(err == cudaSuccess); - return reinterpret_cast(ptr); -#else - return static_cast(std::malloc(count * sizeof(T))); -#endif - } - - inline void deppart_gpu_host_free(void *ptr) - { - if(ptr == nullptr) return; -#ifdef REALM_USE_CUDA - cudaError_t err = cudaFreeHost(ptr); - assert(err == cudaSuccess); -#else - std::free(ptr); -#endif - } - inline bool deppart_sparsity_trace_enabled(void) { static int enabled = -1; @@ -200,6 +248,16 @@ namespace Realm { pending->cv.notify_one(); } + void *deppart_pinned_host_alloc_bytes(size_t bytes) + { + return get_deppart_pinned_host_pool().alloc(bytes); + } + + void deppart_pinned_host_free(void *ptr) + { + get_deppart_pinned_host_pool().release(ptr); + } + extern Logger log_part; //////////////////////////////////////////////////////////////////////// @@ -1405,8 +1463,8 @@ bool SparsityMapPublicImpl::bvh_centroid_less(int axis, template SparsityMapImpl::~SparsityMapImpl(void) { - deppart_gpu_host_free(this->gpu_entries); - deppart_gpu_host_free(this->gpu_approx_rects); + deppart_pinned_host_free(this->gpu_entries); + deppart_pinned_host_free(this->gpu_approx_rects); } template @@ -1694,6 +1752,8 @@ SparsityMapImpl::~SparsityMapImpl(void) size_t piece_count, bool disjoint, size_t total_count) { + NVTX_DEPPART(contribute_raw_rects); + deppart_sparsity_trace("contribute_raw_rects.enter", me.id, ID(me).sparsity_creator_node(), @@ -2296,6 +2356,7 @@ SparsityMapImpl::~SparsityMapImpl(void) template void SparsityMapImpl::finalize(void) { + NVTX_DEPPART(finalize); deppart_sparsity_trace("finalize.enter", me.id, ID(me).sparsity_creator_node(), @@ -2504,6 +2565,7 @@ SparsityMapImpl::~SparsityMapImpl(void) Event trigger_approx = Event::NO_EVENT; std::vector precise_waiters_copy, approx_waiters_copy; { + NVTX_DEPPART(synchronization); AutoLock<> al(mutex); assert(!this->entries_valid.load()); @@ -2533,6 +2595,7 @@ SparsityMapImpl::~SparsityMapImpl(void) (*it)->sparsity_map_ready(this, false); 
if(!sendto_approx.empty()) { + NVTX_DEPPART(send_to_approx); for(NodeID i = 0; (i <= Network::max_node_id) && !sendto_approx.empty(); i++) if(sendto_approx.contains(i)) { bool also_precise = sendto_precise.contains(i); @@ -2544,6 +2607,7 @@ SparsityMapImpl::~SparsityMapImpl(void) } if(!sendto_precise.empty()) { + NVTX_DEPPART(sendto_precise); for(NodeID i = 0; (i <= Network::max_node_id) && !sendto_precise.empty(); i++) if(sendto_precise.contains(i)) { remote_data_reply(i, true, false); @@ -2571,7 +2635,7 @@ SparsityMapImpl::~SparsityMapImpl(void) template void SparsityMapImpl::set_gpu_entries(SparsityMapEntry *entries, size_t size) { - deppart_gpu_host_free(this->gpu_entries); + deppart_pinned_host_free(this->gpu_entries); this->gpu_entries = entries; this->entries.clear(); this->num_entries = size; @@ -2580,7 +2644,7 @@ SparsityMapImpl::~SparsityMapImpl(void) template void SparsityMapImpl::set_gpu_approx_rects(Rect *approx_rects, size_t size) { - deppart_gpu_host_free(this->gpu_approx_rects); + deppart_pinned_host_free(this->gpu_approx_rects); this->gpu_approx_rects = approx_rects; this->approx_rects.clear(); this->num_approx = size; @@ -2677,7 +2741,9 @@ SparsityMapImpl::~SparsityMapImpl(void) SparsityMapImpl *impl = SparsityMapImpl::lookup(msg.sparsity); if(msg.num_entries > 0) { - SparsityMapEntry *entries = deppart_gpu_host_alloc>(msg.num_entries); + SparsityMapEntry *entries = + static_cast *>(deppart_pinned_host_alloc_bytes( + msg.num_entries * sizeof(SparsityMapEntry))); std::memcpy(entries, payload, msg.num_entries * sizeof(SparsityMapEntry)); impl->set_gpu_entries(entries, msg.num_entries); payload += msg.num_entries * sizeof(SparsityMapEntry); @@ -2686,7 +2752,8 @@ SparsityMapImpl::~SparsityMapImpl(void) } if(msg.num_approx > 0) { - Rect *approx = deppart_gpu_host_alloc>(msg.num_approx); + Rect *approx = static_cast *>(deppart_pinned_host_alloc_bytes( + msg.num_approx * sizeof(Rect))); std::memcpy(approx, payload, msg.num_approx * sizeof(Rect)); 
impl->set_gpu_approx_rects(approx, msg.num_approx); } else { diff --git a/src/realm/deppart/sparsity_impl.h b/src/realm/deppart/sparsity_impl.h index aa94d7200f..f1a6a3756f 100644 --- a/src/realm/deppart/sparsity_impl.h +++ b/src/realm/deppart/sparsity_impl.h @@ -36,6 +36,12 @@ namespace Realm { REALM_INTERNAL_API_EXTERNAL_LINKAGE ID create_deppart_output_sparsity(NodeID target_node); + REALM_INTERNAL_API_EXTERNAL_LINKAGE + void *deppart_pinned_host_alloc_bytes(size_t bytes); + + REALM_INTERNAL_API_EXTERNAL_LINKAGE + void deppart_pinned_host_free(void *ptr); + class PartitioningMicroOp; /** From de04613843191192c26daec38093b905ea9e86e1 Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Wed, 8 Apr 2026 09:14:52 -0700 Subject: [PATCH 31/32] Revert "deppart: add pinned host pool and NVTX tracing" This reverts commit 04df5861dd1131f969a70396772f0905329ae3b6. --- src/realm/deppart/partitions.h | 25 ----- src/realm/deppart/partitions_gpu_impl.hpp | 51 ++++++--- src/realm/deppart/sparsity_impl.cc | 129 ++++++---------------- src/realm/deppart/sparsity_impl.h | 6 - 4 files changed, 65 insertions(+), 146 deletions(-) diff --git a/src/realm/deppart/partitions.h b/src/realm/deppart/partitions.h index a6b3fe371f..0af8ec0673 100644 --- a/src/realm/deppart/partitions.h +++ b/src/realm/deppart/partitions.h @@ -42,31 +42,6 @@ typedef CUstream_st* CUstream; #endif -#ifdef REALM_USE_NVTX -#include "realm/nvtx.h" -#endif - -//NVTX macros to only add ranges if defined. 
-#ifdef REALM_USE_NVTX - -#include - -inline int32_t next_nvtx_payload() { - static std::atomic counter{0}; - return counter.fetch_add(1, std::memory_order_relaxed); -} - -#define NVTX_CAT2(a, b) a##b -#define NVTX_CAT(a, b) NVTX_CAT2(a, b) - -#define NVTX_DEPPART(message) \ - nvtxScopedRange NVTX_CAT(nvtx_, __LINE__)("cuda", #message, next_nvtx_payload()) - -#else - - #define NVTX_DEPPART(message) do { } while (0) - -#endif namespace Realm { diff --git a/src/realm/deppart/partitions_gpu_impl.hpp b/src/realm/deppart/partitions_gpu_impl.hpp index e28195d550..e293419b9a 100644 --- a/src/realm/deppart/partitions_gpu_impl.hpp +++ b/src/realm/deppart/partitions_gpu_impl.hpp @@ -1,7 +1,9 @@ #pragma once #include "deppart_config.h" #include "partitions.h" - +#ifdef REALM_USE_NVTX +#include "realm/nvtx.h" +#endif #include "realm/cuda/cuda_internal.h" #include "realm/deppart/partitions_gpu_kernels.hpp" #include @@ -45,18 +47,44 @@ } \ } while (0) + +//NVTX macros to only add ranges if defined. +#ifdef REALM_USE_NVTX + +#include + +inline int32_t next_nvtx_payload() { + static std::atomic counter{0}; + return counter.fetch_add(1, std::memory_order_relaxed); +} + +#define NVTX_CAT2(a, b) a##b +#define NVTX_CAT(a, b) NVTX_CAT2(a, b) + +#define NVTX_DEPPART(message) \ + nvtxScopedRange NVTX_CAT(nvtx_, __LINE__)("cuda", #message, next_nvtx_payload()) + +#else + + #define NVTX_DEPPART(message) do { } while (0) + +#endif + namespace Realm { template inline T *deppart_host_alloc(size_t count, unsigned flags = cudaHostAllocPortable) { - (void)flags; - return static_cast(deppart_pinned_host_alloc_bytes(count * sizeof(T))); + if(count == 0) return nullptr; + void *ptr = nullptr; + CUDA_HOST_CHECK(cudaHostAlloc(&ptr, count * sizeof(T), flags)); + return reinterpret_cast(ptr); } inline void deppart_host_free(void *ptr) { - deppart_pinned_host_free(ptr); + if(ptr != nullptr) + CUDA_HOST_CHECK(cudaFreeHost(ptr)); } // Used by cub::DeviceReduce to compute bad GPU approximation. 
@@ -1703,18 +1731,13 @@ namespace Realm { assert(find_memory(sysmem, Memory::SYSTEM_MEM, my_arena.location)); if (!this->exclusive) { for (auto const& elem : ctr) { - NVTX_DEPPART(cpu_finalize); size_t idx = getIndex(elem); auto mapOpj = getMap(elem); SparsityMapImpl *impl = SparsityMapImpl::lookup(mapOpj); if (d_ends_host[idx] > d_starts_host[idx]) { size_t end = d_ends_host[idx]; size_t start = d_starts_host[idx]; - Rect * h_rects; - { - NVTX_DEPPART(rects_alloc); - h_rects = deppart_host_alloc>(end - start); - } + Rect *h_rects = deppart_host_alloc>(end - start); CUDA_CHECK(cudaMemcpyAsync(h_rects, final_rects + start, (end - start) * sizeof(Rect), cudaMemcpyDeviceToHost, stream), stream); CUDA_CHECK(cudaStreamSynchronize(stream), stream); span> h_rects_span(h_rects, end - start); @@ -1730,7 +1753,6 @@ namespace Realm { //Use provided lambdas to iterate over sparsity output container (map or vector) for (auto const& elem : ctr) { - NVTX_DEPPART(gpu_finalize); size_t idx = getIndex(elem); auto mapOpj = getMap(elem); SparsityMapImpl *impl = SparsityMapImpl::lookup(mapOpj); @@ -1739,11 +1761,7 @@ namespace Realm { if (d_ends_host[idx] > d_starts_host[idx]) { size_t end = d_ends_host[idx]; size_t start = d_starts_host[idx]; - SparsityMapEntry *h_entries; - { - NVTX_DEPPART(alloc_entries); - h_entries = deppart_host_alloc>(end - start); - } + SparsityMapEntry *h_entries = deppart_host_alloc>(end - start); CUDA_CHECK(cudaMemcpyAsync(h_entries, final_entries + start, (end - start) * sizeof(SparsityMapEntry), cudaMemcpyDeviceToHost, stream), stream); Rect *approx_rects; @@ -1820,7 +1838,6 @@ namespace Realm { } } } - NVTX_DEPPART(cleanup); CUDA_CHECK(cudaStreamSynchronize(stream), stream); for (SparsityMapImpl *impl : local_finalizations) { impl->gpu_finalize(); diff --git a/src/realm/deppart/sparsity_impl.cc b/src/realm/deppart/sparsity_impl.cc index 1314126d64..20c655a62c 100644 --- a/src/realm/deppart/sparsity_impl.cc +++ b/src/realm/deppart/sparsity_impl.cc @@ -29,8 
+29,6 @@ #ifdef REALM_USE_CUDA #include #endif - - #include #include #include @@ -41,77 +39,6 @@ namespace Realm { namespace { - class DeppartPinnedHostPool { - public: - void *alloc(size_t bytes) - { - if(bytes == 0) - return nullptr; - - const size_t bucket_size = round_up(bytes); - void *ptr = nullptr; - { - std::lock_guard lock(mutex); - std::vector &bucket = free_blocks[bucket_size]; - if(!bucket.empty()) { - ptr = bucket.back(); - bucket.pop_back(); - } - } - - if(ptr == nullptr) { -#ifdef REALM_USE_CUDA - cudaError_t err = cudaHostAlloc(&ptr, bucket_size, cudaHostAllocPortable); - assert(err == cudaSuccess); -#else - ptr = std::malloc(bucket_size); - assert(ptr != nullptr); -#endif - } - - { - std::lock_guard lock(mutex); - live_blocks[ptr] = bucket_size; - } - return ptr; - } - - void release(void *ptr) - { - if(ptr == nullptr) - return; - - std::lock_guard lock(mutex); - auto it = live_blocks.find(ptr); - assert(it != live_blocks.end()); - free_blocks[it->second].push_back(ptr); - live_blocks.erase(it); - } - - private: - static size_t round_up(size_t bytes) - { - size_t rounded = 4096; - while((rounded < bytes) && (rounded < (size_t(1) << 30))) - rounded <<= 1; - if(rounded >= bytes) - return rounded; - - const size_t granularity = size_t(1) << 20; - return ((bytes + granularity - 1) / granularity) * granularity; - } - - std::mutex mutex; - std::unordered_map> free_blocks; - std::unordered_map live_blocks; - }; - - DeppartPinnedHostPool &get_deppart_pinned_host_pool(void) - { - static DeppartPinnedHostPool *pool = new DeppartPinnedHostPool(); - return *pool; - } - struct PendingOutputSparsityAllocation { std::mutex mutex; std::condition_variable cv; @@ -148,6 +75,31 @@ namespace Realm { ActiveMessageHandlerReg output_sparsity_allocation_response_reg; + template + inline T *deppart_gpu_host_alloc(size_t count) + { + if(count == 0) return nullptr; +#ifdef REALM_USE_CUDA + void *ptr = nullptr; + cudaError_t err = cudaHostAlloc(&ptr, count * sizeof(T), 
cudaHostAllocPortable); + assert(err == cudaSuccess); + return reinterpret_cast(ptr); +#else + return static_cast(std::malloc(count * sizeof(T))); +#endif + } + + inline void deppart_gpu_host_free(void *ptr) + { + if(ptr == nullptr) return; +#ifdef REALM_USE_CUDA + cudaError_t err = cudaFreeHost(ptr); + assert(err == cudaSuccess); +#else + std::free(ptr); +#endif + } + inline bool deppart_sparsity_trace_enabled(void) { static int enabled = -1; @@ -248,16 +200,6 @@ namespace Realm { pending->cv.notify_one(); } - void *deppart_pinned_host_alloc_bytes(size_t bytes) - { - return get_deppart_pinned_host_pool().alloc(bytes); - } - - void deppart_pinned_host_free(void *ptr) - { - get_deppart_pinned_host_pool().release(ptr); - } - extern Logger log_part; //////////////////////////////////////////////////////////////////////// @@ -1463,8 +1405,8 @@ bool SparsityMapPublicImpl::bvh_centroid_less(int axis, template SparsityMapImpl::~SparsityMapImpl(void) { - deppart_pinned_host_free(this->gpu_entries); - deppart_pinned_host_free(this->gpu_approx_rects); + deppart_gpu_host_free(this->gpu_entries); + deppart_gpu_host_free(this->gpu_approx_rects); } template @@ -1752,8 +1694,6 @@ SparsityMapImpl::~SparsityMapImpl(void) size_t piece_count, bool disjoint, size_t total_count) { - NVTX_DEPPART(contribute_raw_rects); - deppart_sparsity_trace("contribute_raw_rects.enter", me.id, ID(me).sparsity_creator_node(), @@ -2356,7 +2296,6 @@ SparsityMapImpl::~SparsityMapImpl(void) template void SparsityMapImpl::finalize(void) { - NVTX_DEPPART(finalize); deppart_sparsity_trace("finalize.enter", me.id, ID(me).sparsity_creator_node(), @@ -2565,7 +2504,6 @@ SparsityMapImpl::~SparsityMapImpl(void) Event trigger_approx = Event::NO_EVENT; std::vector precise_waiters_copy, approx_waiters_copy; { - NVTX_DEPPART(synchronization); AutoLock<> al(mutex); assert(!this->entries_valid.load()); @@ -2595,7 +2533,6 @@ SparsityMapImpl::~SparsityMapImpl(void) (*it)->sparsity_map_ready(this, false); 
if(!sendto_approx.empty()) { - NVTX_DEPPART(send_to_approx); for(NodeID i = 0; (i <= Network::max_node_id) && !sendto_approx.empty(); i++) if(sendto_approx.contains(i)) { bool also_precise = sendto_precise.contains(i); @@ -2607,7 +2544,6 @@ SparsityMapImpl::~SparsityMapImpl(void) } if(!sendto_precise.empty()) { - NVTX_DEPPART(sendto_precise); for(NodeID i = 0; (i <= Network::max_node_id) && !sendto_precise.empty(); i++) if(sendto_precise.contains(i)) { remote_data_reply(i, true, false); @@ -2635,7 +2571,7 @@ SparsityMapImpl::~SparsityMapImpl(void) template void SparsityMapImpl::set_gpu_entries(SparsityMapEntry *entries, size_t size) { - deppart_pinned_host_free(this->gpu_entries); + deppart_gpu_host_free(this->gpu_entries); this->gpu_entries = entries; this->entries.clear(); this->num_entries = size; @@ -2644,7 +2580,7 @@ SparsityMapImpl::~SparsityMapImpl(void) template void SparsityMapImpl::set_gpu_approx_rects(Rect *approx_rects, size_t size) { - deppart_pinned_host_free(this->gpu_approx_rects); + deppart_gpu_host_free(this->gpu_approx_rects); this->gpu_approx_rects = approx_rects; this->approx_rects.clear(); this->num_approx = size; @@ -2741,9 +2677,7 @@ SparsityMapImpl::~SparsityMapImpl(void) SparsityMapImpl *impl = SparsityMapImpl::lookup(msg.sparsity); if(msg.num_entries > 0) { - SparsityMapEntry *entries = - static_cast *>(deppart_pinned_host_alloc_bytes( - msg.num_entries * sizeof(SparsityMapEntry))); + SparsityMapEntry *entries = deppart_gpu_host_alloc>(msg.num_entries); std::memcpy(entries, payload, msg.num_entries * sizeof(SparsityMapEntry)); impl->set_gpu_entries(entries, msg.num_entries); payload += msg.num_entries * sizeof(SparsityMapEntry); @@ -2752,8 +2686,7 @@ SparsityMapImpl::~SparsityMapImpl(void) } if(msg.num_approx > 0) { - Rect *approx = static_cast *>(deppart_pinned_host_alloc_bytes( - msg.num_approx * sizeof(Rect))); + Rect *approx = deppart_gpu_host_alloc>(msg.num_approx); std::memcpy(approx, payload, msg.num_approx * sizeof(Rect)); 
impl->set_gpu_approx_rects(approx, msg.num_approx); } else { diff --git a/src/realm/deppart/sparsity_impl.h b/src/realm/deppart/sparsity_impl.h index f1a6a3756f..aa94d7200f 100644 --- a/src/realm/deppart/sparsity_impl.h +++ b/src/realm/deppart/sparsity_impl.h @@ -36,12 +36,6 @@ namespace Realm { REALM_INTERNAL_API_EXTERNAL_LINKAGE ID create_deppart_output_sparsity(NodeID target_node); - REALM_INTERNAL_API_EXTERNAL_LINKAGE - void *deppart_pinned_host_alloc_bytes(size_t bytes); - - REALM_INTERNAL_API_EXTERNAL_LINKAGE - void deppart_pinned_host_free(void *ptr); - class PartitioningMicroOp; /** From be8544f845b9f025460dac4ec514433e29ec636d Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Wed, 8 Apr 2026 09:18:44 -0700 Subject: [PATCH 32/32] added .codex to gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index ae19316444..78cce53e0c 100644 --- a/.gitignore +++ b/.gitignore @@ -21,6 +21,7 @@ install/ .idea/ .vscode/ .cursor/ +.codex # clangd LSP cache .cache/