From 64c32c78dc8292bb16b4912a2b31f636be9cb15b Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Fri, 9 Jan 2026 12:14:21 -0800 Subject: [PATCH 01/32] cleaned up diff --- cmake/deppart_tmpl.cu.in | 20 + src/CMakeLists.txt | 36 +- src/realm/deppart/byfield.cc | 120 +- src/realm/deppart/byfield.h | 24 + src/realm/deppart/byfield_gpu_impl.hpp | 155 ++ src/realm/deppart/byfield_gpu_kernels.hpp | 57 + src/realm/deppart/byfield_gpu_tmpl.cu | 64 + src/realm/deppart/byfield_tmpl.cc | 15 +- src/realm/deppart/image.cc | 402 ++++- src/realm/deppart/image.h | 291 ++-- src/realm/deppart/image_gpu_impl.hpp | 446 +++++ src/realm/deppart/image_gpu_kernels.hpp | 167 ++ src/realm/deppart/image_gpu_tmpl.cu | 62 + src/realm/deppart/image_tmpl.cc | 7 +- src/realm/deppart/partitions.cc | 27 +- src/realm/deppart/partitions.h | 241 +++ src/realm/deppart/partitions_gpu.cu | 29 + src/realm/deppart/partitions_gpu_impl.hpp | 1604 +++++++++++++++++ src/realm/deppart/partitions_gpu_kernels.hpp | 811 +++++++++ src/realm/deppart/setops.cc | 26 +- src/realm/deppart/sparsity_impl.cc | 181 +- src/realm/deppart/sparsity_impl.h | 6 + src/realm/deppart/untemplated_gpu_kernels.cu | 119 ++ src/realm/indexspace.h | 20 + src/realm/indexspace.inl | 193 ++- src/realm/inst_layout.inl | 10 +- src/realm/sparsity.h | 23 +- src/realm/sparsity.inl | 32 +- tests/CMakeLists.txt | 4 + tests/deppart.cc | 1621 +++++++++++++++++- tests/gpu_deppart_1d.cc | 327 ++++ 31 files changed, 6742 insertions(+), 398 deletions(-) create mode 100644 cmake/deppart_tmpl.cu.in create mode 100644 src/realm/deppart/byfield_gpu_impl.hpp create mode 100644 src/realm/deppart/byfield_gpu_kernels.hpp create mode 100644 src/realm/deppart/byfield_gpu_tmpl.cu create mode 100644 src/realm/deppart/image_gpu_impl.hpp create mode 100644 src/realm/deppart/image_gpu_kernels.hpp create mode 100644 src/realm/deppart/image_gpu_tmpl.cu create mode 100644 src/realm/deppart/partitions_gpu.cu create mode 100644 src/realm/deppart/partitions_gpu_impl.hpp 
create mode 100644 src/realm/deppart/partitions_gpu_kernels.hpp create mode 100644 src/realm/deppart/untemplated_gpu_kernels.cu create mode 100644 tests/gpu_deppart_1d.cc diff --git a/cmake/deppart_tmpl.cu.in b/cmake/deppart_tmpl.cu.in new file mode 100644 index 0000000000..01978e21ac --- /dev/null +++ b/cmake/deppart_tmpl.cu.in @@ -0,0 +1,20 @@ +/* + * Copyright 2025 Stanford University, NVIDIA Corporation + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#cmakedefine INST_N1 @INST_N1@ +#cmakedefine INST_N2 @INST_N2@ +#include "@SRCFILE@_gpu_tmpl.cu" \ No newline at end of file diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 7054eb2e94..fd0b1fb81a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -38,7 +38,6 @@ set(REALM_SOURCES nodeset.cc operation.cc proc_impl.cc - realm_assert.cc repl_heap.cc rsrv_impl.cc runtime_impl.cc @@ -64,12 +63,13 @@ set(REALM_SOURCES deppart/partitions.cc deppart/setops.cc deppart/sparsity_impl.cc + deppart/untemplated_gpu_kernels.cu numa/numa_module.cc numa/numasysif.cc procset/procset_module.cc ) -if(REALM_USE_CUDA) +if(TARGET CUDA::cuda_driver AND REALM_USE_CUDA) list(APPEND REALM_SOURCES cuda/cuda_module.cc cuda/cuda_internal.cc cuda/cuda_access.cc) if(REALM_USE_NVTX) list(APPEND REALM_SOURCES nvtx.cc) @@ -77,15 +77,15 @@ if(REALM_USE_CUDA) list(APPEND REALM_CUDA_SOURCES cuda/cuda_memcpy.cu) endif() -if(REALM_USE_HIP) +if(TARGET hip::host) list(APPEND REALM_SOURCES hip/hip_module.cc hip/hip_internal.cc hip/hip_access.cc) endif() -if(REALM_USE_LLVM) +if(TARGET LLVM::LLVM) list(APPEND REALM_SOURCES llvmjit/llvmjit_internal.cc llvmjit/llvmjit_module.cc) endif() -if(REALM_USE_HDF5) +if(TARGET hdf5::hdf5) list(APPEND REALM_SOURCES hdf5/hdf5_module.cc hdf5/hdf5_internal.cc hdf5/hdf5_access.cc) endif() @@ -100,11 +100,11 @@ if(REALM_USE_PREALM) list(APPEND REALM_SOURCES prealm/prealm.cc) endif() -if(REALM_USE_PYTHON) +if(TARGET Python3::Python) list(APPEND REALM_SOURCES python/python_module.cc python/python_source.cc) endif() -if(REALM_USE_UCX) +if(TARGET ucx::ucp) list( APPEND REALM_SOURCES @@ -119,14 +119,12 @@ if(REALM_USE_UCX) ) endif() -if(REALM_USE_GASNETEX) - if (NOT REALM_ENABLE_GASNETEX_WRAPPER) - list(APPEND REALM_SOURCES gasnet1/gasnet1_module.cc gasnet1/gasnetmsg.cc) - endif() +if(TARGET GASNet::GASNet) + list(APPEND REALM_SOURCES gasnet1/gasnet1_module.cc gasnet1/gasnetmsg.cc) list(APPEND REALM_SOURCES gasnetex/gasnetex_module.cc 
gasnetex/gasnetex_internal.cc) endif() -if(REALM_USE_MPI) +if(TARGET MPI::MPI_CXX) list(APPEND REALM_SOURCES mpi/mpi_module.cc mpi/am_mpi.cc) endif() @@ -145,7 +143,7 @@ configure_file( @ONLY ) -# generate per-dimension object files for deppart stuff +# Generate per-dimension object files for CPU deppart. foreach(INST_N1 RANGE 1 ${REALM_MAX_DIM}) foreach(INST_N2 RANGE 1 ${REALM_MAX_DIM}) foreach(SRCFILE realm/deppart/image realm/deppart/preimage realm/deppart/byfield) @@ -157,6 +155,18 @@ foreach(INST_N1 RANGE 1 ${REALM_MAX_DIM}) endforeach() endforeach() +# Generate per-dimension object files for GPU deppart. +foreach(INST_N1 RANGE 1 ${REALM_MAX_DIM}) + foreach(INST_N2 RANGE 1 ${REALM_MAX_DIM}) + foreach(SRCFILE realm/deppart/byfield realm/deppart/image) + set(_result_file "${CMAKE_CURRENT_BINARY_DIR}/${SRCFILE}_gpu_${INST_N1}_${INST_N2}.cu") + # use cmake's configure_file for a portable way of creating wrapper source files + configure_file("${PROJECT_SOURCE_DIR}/cmake/deppart_tmpl.cu.in" "${_result_file}") + list(APPEND REALM_SOURCES "${_result_file}") + endforeach() + endforeach() +endforeach() + set(REALM_SOURCES ${REALM_SOURCES} PARENT_SCOPE diff --git a/src/realm/deppart/byfield.cc b/src/realm/deppart/byfield.cc index cc6a0d6cc4..51b106f519 100644 --- a/src/realm/deppart/byfield.cc +++ b/src/realm/deppart/byfield.cc @@ -277,8 +277,55 @@ namespace Realm { (void)ok; } - template - ActiveMessageHandlerReg > > ByFieldMicroOp::areg; + template + ActiveMessageHandlerReg > > ByFieldMicroOp::areg; + + + //////////////////////////////////////////////////////////////////////// + // + // class GPUByFieldMicroOp + + template + GPUByFieldMicroOp::GPUByFieldMicroOp( + const IndexSpace &_parent, + std::vector, FT> > _field_data, + bool _exclusive) + : parent_space(_parent), field_data(_field_data) { + this->exclusive = _exclusive; + } + + template + GPUByFieldMicroOp::~GPUByFieldMicroOp() { + } + + template + void GPUByFieldMicroOp::dispatch( + PartitioningOperation *op, 
bool inline_ok) { + + // We have to register ourselves as a waiter on sparse inputs before dispatching. + + for (size_t i = 0; i < field_data.size(); i++) { + IndexSpace inst_space = field_data[i].index_space; + if (!inst_space.dense()) { + bool registered = SparsityMapImpl::lookup(inst_space.sparsity)->add_waiter(this, true /*precise*/); + if (registered) + this->wait_count.fetch_add(1); + } + } + + if (!parent_space.dense()) { + bool registered = SparsityMapImpl::lookup(parent_space.sparsity)->add_waiter(this, true /*precise*/); + if (registered) this->wait_count.fetch_add(1); + } + this->finish_dispatch(op, inline_ok); + } + + template + void GPUByFieldMicroOp::add_sparsity_output( + FT _val, SparsityMap _sparsity) { + colors.push_back(_val); + sparsity_outputs[_val] = _sparsity; + } //////////////////////////////////////////////////////////////////////// @@ -322,21 +369,44 @@ namespace Realm { return subspace; } - template - void ByFieldOperation::execute(void) - { - for(size_t i = 0; i < subspaces.size(); i++) - SparsityMapImpl::lookup(subspaces[i])->set_contributor_count(field_data.size()); - - for(size_t i = 0; i < field_data.size(); i++) { - ByFieldMicroOp *uop = new ByFieldMicroOp(parent, - field_data[i].index_space, - field_data[i].inst, - field_data[i].field_offset); - for(size_t j = 0; j < colors.size(); j++) - uop->add_sparsity_output(colors[j], subspaces[j]); - //uop.set_value_set(colors); - uop->dispatch(this, true /* ok to run in this thread */); + template + void ByFieldOperation::execute(void) { + + + // If the field data is on the GPU, we need to launch a GPUByFieldMicroOp. + // Rather than one micro-op per field, we can do them all in one micro-op. + // Launching multiple GPU micro-ops just adds overhead, and + // there isn't enough work to need multiple GPUs. 
+ std::vector,FT> > gpu_field_data; + std::vector,FT> > cpu_field_data; + for (size_t i = 0; i < field_data.size(); i++) { + if (field_data[i].inst.get_location().kind() == Memory::GPU_FB_MEM) { + gpu_field_data.push_back(field_data[i]); + } else { + cpu_field_data.push_back(field_data[i]); + } + } + if (!cpu_field_data.empty()) { + for (size_t i = 0; i < subspaces.size(); i++) + SparsityMapImpl::lookup(subspaces[i])->set_contributor_count(cpu_field_data.size() + (gpu_field_data.empty() ? 0 : 1)); + for (size_t i = 0; i < cpu_field_data.size(); i++) { + ByFieldMicroOp *uop = new ByFieldMicroOp(parent, + cpu_field_data[i].index_space, + cpu_field_data[i].inst, + cpu_field_data[i].field_offset); + for (size_t j = 0; j < colors.size(); j++) + uop->add_sparsity_output(colors[j], subspaces[j]); + + uop->dispatch(this, true /* ok to run in this thread */); + } + } + if (!gpu_field_data.empty()) { + GPUByFieldMicroOp *uop = new GPUByFieldMicroOp(parent, gpu_field_data, cpu_field_data.empty()); + for (size_t i = 0; i < colors.size(); i++) { + uop->add_sparsity_output(colors[i], subspaces[i]); + } + uop->dispatch(this, false); + } } @@ -345,20 +415,4 @@ namespace Realm { { os << "ByFieldOperation(" << parent << ")"; } - -#define DOIT(N,T,F) \ - template class ByFieldMicroOp; \ - template class ByFieldOperation; \ - template ByFieldMicroOp::ByFieldMicroOp(NodeID, AsyncMicroOp *, Serialization::FixedBufferDeserializer&); \ - template Event IndexSpace::create_subspaces_by_field(const std::vector,F> >&, \ - const std::vector&, \ - std::vector >&, \ - const ProfilingRequestSet &, \ - Event) const; -#ifndef REALM_TEMPLATES_ONLY - FOREACH_NTF(DOIT) -#endif - - // instantiations of point/rect-field templates handled in byfield_tmpl.cc - }; diff --git a/src/realm/deppart/byfield.h b/src/realm/deppart/byfield.h index 1ff62b415e..92902efbd1 100644 --- a/src/realm/deppart/byfield.h +++ b/src/realm/deppart/byfield.h @@ -21,6 +21,7 @@ #define REALM_DEPPART_BYFIELD_H #include 
"realm/deppart/partitions.h" +#include "realm/deppart/rectlist.h" namespace Realm { @@ -67,6 +68,29 @@ namespace Realm { std::map > sparsity_outputs; }; + template + class GPUByFieldMicroOp : public GPUMicroOp { + public: + GPUByFieldMicroOp( + const IndexSpace &_parent, + std::vector,FT> > _field_data, + bool _exclusive); + + virtual ~GPUByFieldMicroOp(void); + + virtual void execute(void); + + void dispatch(PartitioningOperation *op, bool inline_ok); + + void add_sparsity_output(FT _val, SparsityMap _sparsity); + + protected: + const IndexSpace parent_space; + std::vector,FT> > field_data; + std::vector colors; + std::map > sparsity_outputs; + }; + template class ByFieldOperation : public PartitioningOperation { public: diff --git a/src/realm/deppart/byfield_gpu_impl.hpp b/src/realm/deppart/byfield_gpu_impl.hpp new file mode 100644 index 0000000000..f2aa8c3288 --- /dev/null +++ b/src/realm/deppart/byfield_gpu_impl.hpp @@ -0,0 +1,155 @@ +#pragma once +#include "realm/deppart/byfield.h" +#include "realm/deppart/byfield_gpu_kernels.hpp" +#include "realm/deppart/partitions_gpu_impl.hpp" +#include +#include "realm/nvtx.h" + +namespace Realm { + +/* + * Input (stored in MicroOp): Array of field instances, a parent index space, and a list of colors + * Output: A list of (potentially overlapping) points in original instances ∩ parent index space marked with their color, + * which it then sends off to complete_pipeline. + * Approach: Intersect all instance rectangles with parent rectangles in parallel. For surviving rectangles, use + * prefix sum + binary search to iterate over these in parallel and mark each point with its color. + */ +template +void GPUByFieldMicroOp::execute() +{ + + // For profiling. 
+ NVTX_DEPPART(byfield_gpu); + + cudaStream_t stream = Cuda::get_task_cuda_stream(); + + Memory my_mem = field_data[0].inst.get_location(); + + collapsed_space inst_space; + + const char* val = std::getenv("TILE_SIZE"); // or any env var + size_t tile_size = 100000000; //default + if (val) { + tile_size = atoi(val); + } + + RegionInstance fixed_buffer = this->realm_malloc(tile_size, my_mem); + Arena buffer_arena(reinterpret_cast(AffineAccessor(fixed_buffer, 0).base), tile_size); + + inst_space.offsets = buffer_arena.alloc(field_data.size() + 1); + inst_space.num_children = field_data.size(); + + GPUMicroOp::collapse_multi_space(field_data, inst_space, buffer_arena, stream); + + collapsed_space collapsed_parent; + + // We collapse the parent space to undifferentiate between dense and sparse and match downstream APIs. + GPUMicroOp::collapse_parent_space(parent_space, collapsed_parent, buffer_arena, stream); + + + // This is used for count + emit: first pass counts how many rectangles survive intersection, second pass uses the counter + // to figure out where to write each rectangle. + RegionInstance inst_counters_instance = this->realm_malloc((2*field_data.size() + 1) * sizeof(uint32_t), my_mem); + uint32_t* d_inst_counters = reinterpret_cast(AffineAccessor(inst_counters_instance, 0).base); + + // This will be a prefix sum over the counters, used first to figure out where to write in the emit phase, and second + // to track which instance each rectangle came from in the populate phase. + uint32_t* d_inst_prefix = d_inst_counters + field_data.size(); + size_t num_valid_rects = 0; + Rect* d_valid_rects; + + // Here we intersect the instance spaces with the parent space, and make sure we know which instance each resulting rectangle came from. + GPUMicroOp::template construct_input_rectlist>(inst_space, collapsed_parent, d_valid_rects, num_valid_rects, d_inst_counters, d_inst_prefix, buffer_arena, stream); + + + // Early out if we don't have any rectangles. 
+ if (num_valid_rects == 0) { + for (std::pair> it : sparsity_outputs) { + SparsityMapImpl *impl = SparsityMapImpl::lookup(it.second); + if (this->exclusive) { + impl->gpu_finalize(); + } else { + impl->contribute_nothing(); + } + } + inst_counters_instance.destroy(); + return; + } + + + // Prefix sum the valid rectangles by volume. + size_t total_pts; + + size_t* d_prefix_rects; + GPUMicroOp::volume_prefix_sum(d_valid_rects, num_valid_rects, d_prefix_rects, total_pts, buffer_arena, stream); + + // Now we have everything we need to actually populate our outputs. + RegionInstance points_instance = this->realm_malloc(total_pts * sizeof(PointDesc), my_mem); + PointDesc* d_points = reinterpret_cast*>(AffineAccessor(points_instance, 0).base); + + FT* d_colors; + RegionInstance colors_instance; + + + // Memcpying a boolean vector breaks things for some reason so we have this disgusting workaround. + if constexpr(std::is_same_v) { + std::vector flat_colors(colors.size()); + for (size_t i = 0; i < colors.size(); i++) { + flat_colors[i] = colors[i] ? 1 : 0; + } + colors_instance = this->realm_malloc(total_pts * sizeof(PointDesc), my_mem); + uint8_t* d_flat_colors = reinterpret_cast(AffineAccessor(colors_instance, 0).base); + CUDA_CHECK(cudaMemcpyAsync(d_flat_colors, flat_colors.data(), colors.size() * sizeof(uint8_t), cudaMemcpyHostToDevice, stream), stream); + d_colors = reinterpret_cast(d_flat_colors); + } else { + colors_instance = this->realm_malloc(colors.size() * sizeof(FT), my_mem); + d_colors = reinterpret_cast(AffineAccessor(colors_instance, 0).base); + CUDA_CHECK(cudaMemcpyAsync(d_colors, colors.data(), colors.size() * sizeof(FT), cudaMemcpyHostToDevice, stream), stream); + } + + + Memory zcpy_mem; + assert(find_memory(zcpy_mem, Memory::Z_COPY_MEM)); + + // We need to pass the accessors to the GPU so it can read field values. 
+ RegionInstance accessors_instance = this->realm_malloc(field_data.size() * sizeof(AffineAccessor), zcpy_mem); + AffineAccessor* d_accessors = reinterpret_cast*>(AffineAccessor(accessors_instance, 0).base); + for (size_t i = 0; i < field_data.size(); ++i) { + d_accessors[i] = AffineAccessor(field_data[i].inst, field_data[i].field_offset); + } + + + + // This is where the work is actually done - each thread figures out which points to read, reads it, marks a PointDesc with its color, and writes it out. + byfield_gpuPopulateBitmasksKernel<<>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, d_colors, total_pts, colors.size(), num_valid_rects, field_data.size(), d_points); + KERNEL_CHECK(stream); + + + // Map colors to their output index to match send output iterator. + std::map color_indices; + for (size_t i = 0; i < colors.size(); i++) { + color_indices[colors[i]] = i; + } + + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + colors_instance.destroy(); + accessors_instance.destroy(); + inst_counters_instance.destroy(); + + // Ship off the points for final processing. 
+ size_t out_rects = 0; + RectDesc* trash; + this->complete_pipeline(d_points, total_pts, trash, out_rects, buffer_arena, + /* the Container: */ sparsity_outputs, + /* getIndex: */ [&](auto const& kv){ + // elem is a SparsityMap from the vector + return color_indices.at(kv.first); + }, + /* getMap: */ [&](auto const& kv){ + // return the SparsityMap key itself + return kv.second; + }); + + points_instance.destroy(); +} +} diff --git a/src/realm/deppart/byfield_gpu_kernels.hpp b/src/realm/deppart/byfield_gpu_kernels.hpp new file mode 100644 index 0000000000..f1ec217f9b --- /dev/null +++ b/src/realm/deppart/byfield_gpu_kernels.hpp @@ -0,0 +1,57 @@ +#pragma once +#include "realm/deppart/byfield.h" +#include "realm/deppart/partitions_gpu_kernels.hpp" + +namespace Realm { + + +template < + int N, typename T, typename FT +> +__global__ +void byfield_gpuPopulateBitmasksKernel( + AffineAccessor* accessors, + Rect* rects, + size_t* prefix, + uint32_t* inst_prefix, + FT* d_colors, + size_t numPoints, + size_t numColors, + size_t numRects, + size_t num_insts, + PointDesc *d_points +) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= numPoints) return; + + // Binary search to find which rectangle this point belongs to. + uint32_t r = bsearch(prefix, numRects, idx); + + // Binary search to find which instance this rectangle belongs to. + size_t inst_idx = bsearch(inst_prefix, num_insts, r); + + // Now we know which rectangle we're in, figure out the point coordinates. + size_t offset = idx - prefix[r]; + Point p; + for (int k = N-1; k >= 0; --k) { + size_t dim = rects[r].hi[k] + 1 - rects[r].lo[k]; + p[k] = rects[r].lo[k] + (offset % dim); + offset /= dim; + } + + // Read the field value at that point. + FT ptr = accessors[inst_idx].read(p); + + // Find our color's idx and write output. 
+ PointDesc point_desc; + point_desc.point = p; + for (size_t i = 0; i < numColors; ++i) { + if (ptr == d_colors[i]) { + point_desc.src_idx = i; + break; + } + } + d_points[idx] = point_desc; +} + +} // namespace Realm \ No newline at end of file diff --git a/src/realm/deppart/byfield_gpu_tmpl.cu b/src/realm/deppart/byfield_gpu_tmpl.cu new file mode 100644 index 0000000000..807fc1ad0b --- /dev/null +++ b/src/realm/deppart/byfield_gpu_tmpl.cu @@ -0,0 +1,64 @@ +/* Copyright 2024 Stanford University, NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// per‐dimension instantiator for the GPU By Field Operation +// Mirrors CPU Approach (byfield_tmpl.cc) + +#define REALM_TEMPLATES_ONLY +#include "realm/deppart/byfield_gpu_impl.hpp" +#include "realm/deppart/inst_helper.h" + + +#ifndef INST_N1 + #error "INST_N1 must be defined before including byfield_gpu_tmpl.cu" +#endif +#ifndef INST_N2 + #error "INST_N2 must be defined before including byfield_gpu_tmpl.cu" +#endif + +#define FOREACH_TT(__func__) \ + __func__(int, int) \ + __func__(int, unsigned) \ + __func__(int, long long) \ + __func__(unsigned,int) \ + __func__(unsigned,unsigned) \ + __func__(unsigned,long long) \ + __func__(long long, int) \ + __func__(long long, unsigned) \ + __func__(long long, long long) + +#define FOREACH_T(__func__) \ + __func__(int) \ + __func__(unsigned) \ + __func__(long long) + +namespace Realm { + #define N1 INST_N1 + #define N2 INST_N2 + + #define ZP(N,T) Point + #define ZR(N,T) Rect + + #define DO_WITH_FT(N, T, FT) \ + template class ByFieldMicroOp; \ + template class GPUByFieldMicroOp; + + #define DOIT(T1,T2) \ + DO_WITH_FT(N1,T1,ZP(N2,T2)) + + FOREACH_TT(DOIT) + + FOREACH_NTF(DO_WITH_FT) +} // namespace Realm \ No newline at end of file diff --git a/src/realm/deppart/byfield_tmpl.cc b/src/realm/deppart/byfield_tmpl.cc index 7c58bc725b..38a95a040d 100644 --- a/src/realm/deppart/byfield_tmpl.cc +++ b/src/realm/deppart/byfield_tmpl.cc @@ -17,7 +17,7 @@ // per-dimension instantiator for byfield.cc -#define REALM_TEMPLATES_ONLY +#undef REALM_TEMPLATES_ONLY #include "./byfield.cc" #ifndef INST_N1 @@ -43,6 +43,19 @@ namespace Realm { #define N1 INST_N1 #define N2 INST_N2 +#define DOIT(N,T,F) \ + template class ByFieldMicroOp; \ + template class GPUByFieldMicroOp; \ + template class ByFieldOperation; \ + template ByFieldMicroOp::ByFieldMicroOp(NodeID, AsyncMicroOp *, Serialization::FixedBufferDeserializer&); \ + template Event IndexSpace::create_subspaces_by_field(const std::vector,F> >&, \ + const std::vector&, \ + std::vector 
>&, \ + const ProfilingRequestSet &, \ + Event) const; + +FOREACH_NTF(DOIT) + #define ZP(N,T) Point #define ZR(N,T) Rect #define DOIT2(T1,T2) \ diff --git a/src/realm/deppart/image.cc b/src/realm/deppart/image.cc index e598c22033..660d0f77ad 100644 --- a/src/realm/deppart/image.cc +++ b/src/realm/deppart/image.cc @@ -30,6 +30,77 @@ namespace Realm { extern Logger log_part; extern Logger log_uop_timing; + template + template + Event IndexSpace::gpu_subspaces_by_image( + const DomainTransform &domain_transform, + const std::vector> &sources, + std::vector> &images, const ProfilingRequestSet &reqs, + std::pair &sizes, RegionInstance buffer, Event wait_on) const { + // output vector should start out empty + assert(images.empty()); + + if (buffer==RegionInstance::NO_INST) { + size_t optimal_size = 0; + for (size_t i = 0; i < sources.size(); i++) { + optimal_size += 5 * sources[i].volume() * sizeof(RectDesc); + } + size_t minimal_size = 0; + size_t source_entries = 0; + bool bvh = false; + for (size_t i = 0; i < sources.size(); ++i) { + IndexSpace my_space = sources[i]; + if (my_space.dense()) { + source_entries += 1; + } else { + bvh = true; + source_entries += my_space.sparsity.impl()->get_entries().size(); + } + } + minimal_size += sizeof(Rect) * source_entries; + if (this->dense()) { + minimal_size += sizeof(Rect); + } else { + minimal_size += sizeof(Rect) * this->sparsity.impl()->get_entries().size(); + } + if (bvh) { + minimal_size += + (source_entries * sizeof(uint64_t)) + + (source_entries * sizeof(size_t)) + + ((2*source_entries - 1) * sizeof(Rect)) + + (2 * (2*source_entries - 1) * sizeof(int)) + + sizeof(Rect) + + (2 * source_entries * sizeof(uint64_t)) + + (source_entries * sizeof(uint64_t)); + } + sizes = std::make_pair(minimal_size, minimal_size + optimal_size); + return Event::NO_EVENT; + } + + GenEventImpl *finish_event = GenEventImpl::create_genevent(); + Event e = finish_event->current_event(); + + GPUImageOperation *op = new GPUImageOperation( + *this, 
domain_transform, reqs, sizes.first, buffer, finish_event, ID(e).event_generation()); + + size_t n = sources.size(); + images.resize(n); + for (size_t i = 0; i < n; i++) { + images[i] = op->add_source(sources[i]); + + if(!images[i].dense()) { + e = Event::merge_events( + {e, SparsityMapRefCounter(images[i].sparsity.id).add_references(1)}); + } + + log_dpops.info() << "image: " << *this << " src=" << sources[i] << " -> " + << images[i] << " (" << e << ")"; + } + + op->launch(wait_on); + return e; + } + template template Event IndexSpace::create_subspaces_by_image( @@ -495,23 +566,83 @@ namespace Realm { target_node = ID(source.sparsity).sparsity_creator_node(); else if(!domain_transform.ptr_data.empty()) - target_node = ID(domain_transform.ptr_data[sources.size() % domain_transform.ptr_data.size()].inst).instance_owner_node(); + target_node = ID(domain_transform.ptr_data[sources.size() % domain_transform.ptr_data.size()].inst).instance_owner_node(); else - target_node = ID(domain_transform.range_data[sources.size() % domain_transform.range_data.size()].inst).instance_owner_node(); + target_node = ID(domain_transform.range_data[sources.size() % domain_transform.range_data.size()].inst).instance_owner_node(); + + SparsityMap sparsity = get_runtime()->get_available_sparsity_impl(target_node)->me.convert >(); + image.sparsity = sparsity; + + sources.push_back(source); + diff_rhss.push_back(diff_rhs); + images.push_back(sparsity); + is_intersection = false; + + return image; + } + + template + IndexSpace ImageOperation::add_source_with_intersection(const IndexSpace& source, + const IndexSpace& diff_rhs) + { + // try to filter out obviously empty sources + if(parent.empty() || source.empty()) + return IndexSpace::make_empty(); + + // otherwise it'll be something smaller than the current parent + IndexSpace image; + image.bounds = parent.bounds; + + // if the source has a sparsity map, use the same node - otherwise + // get a sparsity ID by round-robin'ing across the nodes 
that have field data + int target_node; + if(!source.dense()) + target_node = ID(source.sparsity).sparsity_creator_node(); + else + if(!domain_transform.ptr_data.empty()) + target_node = ID(domain_transform.ptr_data[sources.size() % domain_transform.ptr_data.size()].inst).instance_owner_node(); + else + target_node = ID(domain_transform.range_data[sources.size() % domain_transform.range_data.size()].inst).instance_owner_node(); + SparsityMap sparsity = get_runtime()->get_available_sparsity_impl(target_node)->me.convert >(); image.sparsity = sparsity; sources.push_back(source); diff_rhss.push_back(diff_rhs); images.push_back(sparsity); + is_intersection = true; return image; } template void ImageOperation::execute(void) { - if (domain_transform.type == - DomainTransform::DomainTransformType::STRUCTURED) { + + std::vector,Point> > gpu_ptr_data; + std::vector,Point> > cpu_ptr_data; + std::vector,Rect> > gpu_rect_data; + std::vector,Rect> > cpu_rect_data; + for (size_t i = 0; i < domain_transform.ptr_data.size(); i++) { + if (domain_transform.ptr_data[i].inst.get_location().kind() == + Memory::GPU_FB_MEM) { + gpu_ptr_data.push_back(domain_transform.ptr_data[i]); + } else { + cpu_ptr_data.push_back(domain_transform.ptr_data[i]); + } + } + for (size_t i = 0; i < domain_transform.range_data.size(); i++) { + if (domain_transform.range_data[i].inst.get_location().kind() == + Memory::GPU_FB_MEM) { + gpu_rect_data.push_back(domain_transform.range_data[i]); + } else { + cpu_rect_data.push_back(domain_transform.range_data[i]); + } + } + bool gpu_data = !gpu_ptr_data.empty() || !gpu_rect_data.empty(); + bool cpu_data = !cpu_ptr_data.empty() || !cpu_rect_data.empty(); + if (domain_transform.type == + DomainTransform::DomainTransformType::STRUCTURED && !gpu_data) { + for (size_t i = 0; i < sources.size(); i++) { SparsityMapImpl::lookup(images[i])->set_contributor_count(1); } @@ -523,64 +654,89 @@ namespace Realm { for (size_t j = 0; j < sources.size(); j++) { 
micro_op->add_sparsity_output(sources[j], images[j]); } - micro_op->dispatch(this, /*inline_ok=*/true); - } else { - if (!DeppartConfig::cfg_disable_intersection_optimization) { - // build the overlap tester based on the field index spaces - they're more - // likely to be known and - // denser - ComputeOverlapMicroOp *uop = - new ComputeOverlapMicroOp(this); + } else if (!DeppartConfig::cfg_disable_intersection_optimization && !gpu_data) { + // build the overlap tester based on the field index spaces - they're more + // likely to be known and + // denser + ComputeOverlapMicroOp *uop = + new ComputeOverlapMicroOp(this); - for (size_t i = 0; i < domain_transform.ptr_data.size(); i++) - uop->add_input_space(domain_transform.ptr_data[i].index_space); + for (size_t i = 0; i < domain_transform.ptr_data.size(); i++) + uop->add_input_space(domain_transform.ptr_data[i].index_space); - for (size_t i = 0; i < domain_transform.range_data.size(); i++) - uop->add_input_space(domain_transform.range_data[i].index_space); + for (size_t i = 0; i < domain_transform.range_data.size(); i++) + uop->add_input_space(domain_transform.range_data[i].index_space); - // we will ask this uop to also prefetch the sources we will intersect test - // against it - for (size_t i = 0; i < sources.size(); i++) - uop->add_extra_dependency(sources[i]); + // we will ask this uop to also prefetch the sources we will intersect test + // against it + for (size_t i = 0; i < sources.size(); i++) + uop->add_extra_dependency(sources[i]); - uop->dispatch(this, true /* ok to run in this thread */); + uop->dispatch(this, true /* ok to run in this thread */); } else { - // launch full cross-product of image micro ops right away - for (size_t i = 0; i < sources.size(); i++) - SparsityMapImpl::lookup(images[i])->set_contributor_count( - domain_transform.ptr_data.size() + - domain_transform.range_data.size()); - - for (size_t i = 0; i < domain_transform.ptr_data.size(); i++) { - ImageMicroOp *uop = new ImageMicroOp( - 
parent, domain_transform.ptr_data[i].index_space, - domain_transform.ptr_data[i].inst, - domain_transform.ptr_data[i].field_offset, false /*ptrs*/); - for (size_t j = 0; j < sources.size(); j++) - if (diff_rhss.empty()) - uop->add_sparsity_output(sources[j], images[j]); - else - uop->add_sparsity_output_with_difference(sources[j], diff_rhss[j], - images[j]); - - uop->dispatch(this, true /* ok to run in this thread */); - } - - for (size_t i = 0; i < domain_transform.range_data.size(); i++) { - ImageMicroOp *uop = new ImageMicroOp( - parent, domain_transform.range_data[i].index_space, - domain_transform.range_data[i].inst, - domain_transform.range_data[i].field_offset, true /*ranges*/); - for (size_t j = 0; j < sources.size(); j++) - if (diff_rhss.empty()) - uop->add_sparsity_output(sources[j], images[j]); - else - uop->add_sparsity_output_with_difference(sources[j], diff_rhss[j], - images[j]); - - uop->dispatch(this, true /* ok to run in this thread */); - } + if (cpu_data) { + // launch full cross-product of image micro ops right away + for (size_t i = 0; i < sources.size(); i++) + SparsityMapImpl::lookup(images[i])->set_contributor_count( + cpu_ptr_data.size() + + cpu_rect_data.size() + (gpu_data ? 
1 : 0)); + + for (size_t i = 0; i < cpu_ptr_data.size(); i++) { + ImageMicroOp *uop = new ImageMicroOp( + parent, cpu_ptr_data[i].index_space, + cpu_ptr_data[i].inst, + cpu_ptr_data[i].field_offset, false /*ptrs*/); + for (size_t j = 0; j < sources.size(); j++) + if (diff_rhss.empty()) + uop->add_sparsity_output(sources[j], images[j]); + else + uop->add_sparsity_output_with_difference(sources[j], diff_rhss[j], + images[j]); + + uop->dispatch(this, true /* ok to run in this thread */); + } + + for (size_t i = 0; i < cpu_rect_data.size(); i++) { + ImageMicroOp *uop = new ImageMicroOp( + parent, cpu_rect_data[i].index_space, + cpu_rect_data[i].inst, + cpu_rect_data[i].field_offset, true /*ranges*/); + for (size_t j = 0; j < sources.size(); j++) + if (diff_rhss.empty()) + uop->add_sparsity_output(sources[j], images[j]); + else + uop->add_sparsity_output_with_difference(sources[j], diff_rhss[j], + images[j]); + + uop->dispatch(this, true /* ok to run in this thread */); + } + } + if (gpu_data) { + std::swap(domain_transform.ptr_data, gpu_ptr_data); + std::swap(domain_transform.range_data, gpu_rect_data); + const char* val = std::getenv("TILE_SIZE"); // or any env var + size_t tile_size = 100000000; //default + if (val) { + tile_size = atoi(val); + } + std::vector byte_fields = {sizeof(char)}; + IndexSpace<1> instance_index_space(Rect<1>(0, tile_size-1)); + RegionInstance buffer; + Memory my_mem; + if (domain_transform.ptr_data.size() > 0) { + my_mem = domain_transform.ptr_data[0].inst.get_location(); + } else { + my_mem = domain_transform.range_data[0].inst.get_location(); + } + RegionInstance::create_instance(buffer, my_mem, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); + GPUImageMicroOp *micro_op = + new GPUImageMicroOp( + parent, domain_transform, !cpu_data, tile_size, buffer); + for (size_t j = 0; j < sources.size(); j++) { + micro_op->add_sparsity_output(sources[j], images[j]); + } + micro_op->dispatch(this, true); } } } @@ -662,6 
+818,74 @@ namespace Realm { os << "ImageOperation(" << parent << ")"; } + //////////////////////////////////////////////////////////////////////// + // + // class GPUImageOperation + + template + GPUImageOperation::GPUImageOperation( + const IndexSpace &_parent, + const DomainTransform &_domain_transform, + const ProfilingRequestSet &reqs, size_t _buffer_size, RegionInstance _buffer, + GenEventImpl *_finish_event, EventImpl::gen_t _finish_gen) + : PartitioningOperation(reqs, _finish_event, _finish_gen), + parent(_parent), + domain_transform(_domain_transform), + buffer_size(_buffer_size), + buffer(_buffer) {} + + template + GPUImageOperation::~GPUImageOperation(void) + {} + + template + IndexSpace GPUImageOperation::add_source(const IndexSpace& source) + { + // try to filter out obviously empty sources + if(parent.empty() || source.empty()) + return IndexSpace::make_empty(); + + // otherwise it'll be something smaller than the current parent + IndexSpace image; + image.bounds = parent.bounds; + + // if the source has a sparsity map, use the same node - otherwise + // get a sparsity ID by round-robin'ing across the nodes that have field data + int target_node = 0; + if(!source.dense()) + target_node = ID(source.sparsity).sparsity_creator_node(); + else + if(!domain_transform.ptr_data.empty()) + target_node = ID(domain_transform.ptr_data[sources.size() % domain_transform.ptr_data.size()].inst).instance_owner_node(); + else + target_node = ID(domain_transform.range_data[sources.size() % domain_transform.range_data.size()].inst).instance_owner_node(); + + SparsityMap sparsity = get_runtime()->get_available_sparsity_impl(target_node)->me.convert >(); + image.sparsity = sparsity; + + sources.push_back(source); + images.push_back(sparsity); + + return image; + } + + template + void GPUImageOperation::execute(void) { + GPUImageMicroOp *micro_op = + new GPUImageMicroOp( + parent, domain_transform, true, buffer_size, buffer); + for (size_t j = 0; j < sources.size(); j++) { 
+ micro_op->add_sparsity_output(sources[j], images[j]); + } + micro_op->dispatch(this, true); + } + + template + void GPUImageOperation::print(std::ostream& os) const + { + os << "ImageOperation(" << parent << ")"; + } + //////////////////////////////////////////////////////////////////////// // // class StructuredImageMicroOp @@ -783,6 +1007,72 @@ namespace Realm { } } + //////////////////////////////////////////////////////////////////////// + // + // class StructuredImageMicroOp + + template + GPUImageMicroOp::GPUImageMicroOp( + const IndexSpace &_parent, + const DomainTransform &_domain_transform, + bool _exclusive, size_t _fixed_buffer_size, RegionInstance _buffer) + : parent_space(_parent), domain_transform(_domain_transform), fixed_buffer_size(_fixed_buffer_size), buffer(_buffer) + { + this->exclusive = _exclusive; + } + + template + GPUImageMicroOp::~GPUImageMicroOp() {} + + template + void GPUImageMicroOp::dispatch( + PartitioningOperation *op, bool inline_ok) { + + for (size_t i = 0; i < domain_transform.ptr_data.size(); i++) { + IndexSpace inst_space = domain_transform.ptr_data[i].index_space; + if (!inst_space.dense()) { + // it's safe to add the count after the registration only because we initialized + // the count to 2 instead of 1 + bool registered = SparsityMapImpl::lookup(inst_space.sparsity)->add_waiter(this, true /*precise*/); + if(registered) + this->wait_count.fetch_add(1); + } + } + + for (size_t i = 0; i < sources.size(); i++) { + if (!sources[i].dense()) { + bool registered = SparsityMapImpl::lookup(sources[i].sparsity) + ->add_waiter(this, true /*precise*/); + if (registered) this->wait_count.fetch_add(1); + } + } + + if (!parent_space.dense()) { + bool registered = SparsityMapImpl::lookup(parent_space.sparsity) + ->add_waiter(this, true /*precise*/); + if (registered) this->wait_count.fetch_add(1); + } + this->finish_dispatch(op, inline_ok); + } + + template + void GPUImageMicroOp::add_sparsity_output( + IndexSpace _source, SparsityMap 
_sparsity) { + sources.push_back(_source); + // TODO(apryakhin): Handle and test this sparsity ref-count path. + sparsity_outputs.push_back(_sparsity); + } + + template + void GPUImageMicroOp::execute(void) { + TimeStamp ts("StructuredImageMicroOp::execute", true, &log_uop_timing); + if (domain_transform.ptr_data.size() > 0) { + gpu_populate_ptrs(); + } else { + gpu_populate_rngs(); + } + } + //////////////////////////////////////////////////////////////////////// // instantiations of templates handled in image_tmpl.cc diff --git a/src/realm/deppart/image.h b/src/realm/deppart/image.h index 0e0fbfe03f..82b6393eb7 100644 --- a/src/realm/deppart/image.h +++ b/src/realm/deppart/image.h @@ -24,117 +24,188 @@ #include "realm/deppart/rectlist.h" namespace Realm { + template + class ImageMicroOp : public PartitioningMicroOp { + public: + static const int DIM = N; + typedef T IDXTYPE; + static const int DIM2 = N2; + typedef T2 IDXTYPE2; - template - class ImageMicroOp : public PartitioningMicroOp { - public: - static const int DIM = N; - typedef T IDXTYPE; - static const int DIM2 = N2; - typedef T2 IDXTYPE2; - - ImageMicroOp(IndexSpace _parent_space, IndexSpace _inst_space, - RegionInstance _inst, size_t _field_offset, bool _is_ranged); - - virtual ~ImageMicroOp(void); - - void add_sparsity_output(IndexSpace _source, SparsityMap _sparsity); - void add_sparsity_output_with_difference(IndexSpace _source, - IndexSpace _diff_rhs, - SparsityMap _sparsity); - void add_approx_output(int index, PartitioningOperation *op); - - virtual void execute(void); - - void dispatch(PartitioningOperation *op, bool inline_ok); - - protected: - friend struct RemoteMicroOpMessage >; - static ActiveMessageHandlerReg > > areg; - - friend class PartitioningMicroOp; - template - REALM_ATTR_WARN_UNUSED(bool serialize_params(S& s) const); - - // construct from received packet - template - ImageMicroOp(NodeID _requestor, AsyncMicroOp *_async_microop, S& s); - - template - void 
populate_bitmasks_ptrs(std::map& bitmasks); - - template - void populate_bitmasks_ranges(std::map& bitmasks); - - template - void populate_approx_bitmask_ptrs(BM& bitmask); - - template - void populate_approx_bitmask_ranges(BM& bitmask); - - IndexSpace parent_space; - IndexSpace inst_space; - RegionInstance inst; - size_t field_offset; - bool is_ranged; - std::vector > sources; - std::vector > diff_rhss; - std::vector > sparsity_outputs; - int approx_output_index; - intptr_t approx_output_op; - }; - - template - class ImageOperation : public PartitioningOperation { - public: - ImageOperation(const IndexSpace& _parent, - const DomainTransform& _domain_transform, - const ProfilingRequestSet& reqs, GenEventImpl* _finish_event, - EventImpl::gen_t _finish_gen); - - virtual ~ImageOperation(void); - - IndexSpace add_source(const IndexSpace& source); - IndexSpace add_source_with_difference( - const IndexSpace& source, const IndexSpace& diff_rhs); - - virtual void execute(void); - - virtual void print(std::ostream& os) const; - - virtual void set_overlap_tester(void* tester); - - protected: - IndexSpace parent; - DomainTransform domain_transform; - std::vector> sources; - std::vector> diff_rhss; - std::vector> images; - }; - - template - class StructuredImageMicroOp : public PartitioningMicroOp { - public: - StructuredImageMicroOp( - const IndexSpace& _parent, - const StructuredTransform& _transform); - - virtual ~StructuredImageMicroOp(void); - virtual void execute(void); - - virtual void populate(std::map*>& bitmasks); - - void dispatch(PartitioningOperation* op, bool inline_ok); - void add_sparsity_output(IndexSpace _source, - SparsityMap _sparsity); - - protected: - IndexSpace parent_space; - StructuredTransform transform; - std::vector> sources; - std::vector> sparsity_outputs; - }; + ImageMicroOp(IndexSpace _parent_space, IndexSpace _inst_space, + RegionInstance _inst, size_t _field_offset, bool _is_ranged); - }; // namespace Realm + virtual ~ImageMicroOp(void); + + 
void add_sparsity_output(IndexSpace _source, SparsityMap _sparsity); + + void add_sparsity_output_with_difference(IndexSpace _source, + IndexSpace _diff_rhs, + SparsityMap _sparsity); + + void add_sparsity_output_with_intersection(IndexSpace _source, + IndexSpace _diff_rhs, + SparsityMap _sparsity); + + void add_approx_output(int index, PartitioningOperation *op); + + virtual void execute(void); + + void dispatch(PartitioningOperation *op, bool inline_ok); + + protected: + friend struct RemoteMicroOpMessage >; + static ActiveMessageHandlerReg > > areg; + + friend class PartitioningMicroOp; + + template + REALM_ATTR_WARN_UNUSED(bool serialize_params(S& s) const); + + // construct from received packet + template + ImageMicroOp(NodeID _requestor, AsyncMicroOp *_async_microop, S &s); + + template + void populate_bitmasks_ptrs(std::map &bitmasks); + + template + void populate_bitmasks_ranges(std::map &bitmasks); + + template + void populate_approx_bitmask_ptrs(BM &bitmask); + + template + void populate_approx_bitmask_ranges(BM &bitmask); + + IndexSpace parent_space; + IndexSpace inst_space; + RegionInstance inst; + size_t field_offset; + bool is_ranged; + bool is_intersection; + std::vector > sources; + std::vector > diff_rhss; + std::vector > sparsity_outputs; + int approx_output_index; + intptr_t approx_output_op; + }; + + template + class ImageOperation : public PartitioningOperation { + public: + ImageOperation(const IndexSpace &_parent, + const DomainTransform &_domain_transform, + const ProfilingRequestSet &reqs, GenEventImpl *_finish_event, + EventImpl::gen_t _finish_gen); + + virtual ~ImageOperation(void); + + IndexSpace add_source(const IndexSpace &source); + + IndexSpace add_source_with_difference( + const IndexSpace &source, const IndexSpace &diff_rhs); + + IndexSpace add_source_with_intersection( + const IndexSpace &source, const IndexSpace &diff_rhs); + + virtual void execute(void); + + virtual void print(std::ostream &os) const; + + virtual void 
set_overlap_tester(void *tester); + + protected: + IndexSpace parent; + DomainTransform domain_transform; + std::vector > sources; + std::vector > diff_rhss; + std::vector > images; + bool is_intersection; + }; + + template + class StructuredImageMicroOp : public PartitioningMicroOp { + public: + StructuredImageMicroOp( + const IndexSpace &_parent, + const StructuredTransform &_transform); + + virtual ~StructuredImageMicroOp(void); + + virtual void execute(void); + + virtual void populate(std::map *> &bitmasks); + + void dispatch(PartitioningOperation *op, bool inline_ok); + + void add_sparsity_output(IndexSpace _source, + SparsityMap _sparsity); + + protected: + IndexSpace parent_space; + StructuredTransform transform; + std::vector > sources; + std::vector > sparsity_outputs; + }; + + template + class GPUImageOperation : public PartitioningOperation { + public: + GPUImageOperation(const IndexSpace &_parent, + const DomainTransform &_domain_transform, + const ProfilingRequestSet &reqs, + size_t _buffer_size, + RegionInstance _buffer, + GenEventImpl *_finish_event, + EventImpl::gen_t _finish_gen); + + virtual ~GPUImageOperation(void); + + IndexSpace add_source(const IndexSpace &source); + + virtual void execute(void); + + virtual void print(std::ostream &os) const; + + protected: + IndexSpace parent; + DomainTransform domain_transform; + std::vector > sources; + std::vector > images; + size_t buffer_size; + RegionInstance buffer; + }; + + template + class GPUImageMicroOp : public GPUMicroOp { + public: + GPUImageMicroOp( + const IndexSpace &_parent, + const DomainTransform &_domain_transform, + bool _exclusive, size_t fixed_buffer_size = 0, RegionInstance buffer = RegionInstance::NO_INST); + + virtual ~GPUImageMicroOp(void); + + virtual void execute(void); + + virtual void gpu_populate_ptrs(); + + virtual void gpu_populate_rngs(); + + void dispatch(PartitioningOperation *op, bool inline_ok); + + void add_sparsity_output(IndexSpace _source, + SparsityMap _sparsity); 
+ + protected: + IndexSpace parent_space; + DomainTransform domain_transform; + std::vector > sources; + std::vector > sparsity_outputs; + size_t fixed_buffer_size; + RegionInstance buffer; + }; +}; // namespace Realm #endif // REALM_DEPPART_IMAGE_H diff --git a/src/realm/deppart/image_gpu_impl.hpp b/src/realm/deppart/image_gpu_impl.hpp new file mode 100644 index 0000000000..6abb27c043 --- /dev/null +++ b/src/realm/deppart/image_gpu_impl.hpp @@ -0,0 +1,446 @@ +#pragma once +#include "realm/deppart/image.h" +#include "realm/deppart/image_gpu_kernels.hpp" +#include "realm/deppart/partitions_gpu_impl.hpp" +#include +#include +#include +#include "realm/nvtx.h" + +namespace Realm { + +//TODO: INTERSECTING INPUT/OUTPUT RECTS CAN BE DONE WITH BVH IF BECOME EXPENSIVE + +template +struct RectDescVolumeOp { + __device__ __forceinline__ + size_t operator()(const RectDesc& rd) const { + return rd.rect.volume(); + } +}; + +template +struct SparsityMapEntryVolumeOp { + __device__ __forceinline__ + size_t operator()(const SparsityMapEntry& entry) const { + return entry.bounds.volume(); + } +}; + + /* + * Input (stored in MicroOp): Array of field instances, a parent index space, and a list of source index spaces + * Output: A list of (potentially overlapping) rectangles that result from chasing all the pointers in the source index spaces + * through the provided instances and emitting only those that intersect the parent index space labeled by which source they came from, + * which are then sent off to complete_rect_pipeline. + * Approach: Intersect all instance rectangles with source rectangles in parallel. Prefix sum + binary search to iterate over these in + * parallel and chase all the pointers in the source rectangles to their corresponding rectangle. Finally, intersect the output rectangles + * with the parent rectangles in parallel. 
+ */ +template +void GPUImageMicroOp::gpu_populate_rngs() +{ + + if (sources.size() == 0) { + return; + } + + NVTX_DEPPART(gpu_image); + + Memory my_mem = domain_transform.range_data[0].inst.get_location(); + + cudaStream_t stream = Cuda::get_task_cuda_stream(); + + const char* val = std::getenv("TILE_SIZE"); // or any env var + size_t tile_size = 100000000; //default + if (val) { + tile_size = atoi(val); + } + + RegionInstance fixed_buffer = this->realm_malloc(tile_size, my_mem); + Arena buffer_arena(reinterpret_cast(AffineAccessor(fixed_buffer, 0).base), tile_size); + + collapsed_space src_space; + RegionInstance offsets_instance = this->realm_malloc((sources.size()+1) * sizeof(size_t), my_mem); + src_space.offsets = reinterpret_cast(AffineAccessor(offsets_instance, 0).base); + src_space.num_children = sources.size(); + + GPUMicroOp::collapse_multi_space(sources, src_space, buffer_arena, stream); + + collapsed_space inst_space; + + // We combine all of our instances into one to batch work, tracking the offsets between instances. + RegionInstance inst_offsets_instance = this->realm_malloc((domain_transform.range_data.size() + 1) * sizeof(size_t), my_mem); + inst_space.offsets = reinterpret_cast(AffineAccessor(inst_offsets_instance, 0).base); + inst_space.num_children = domain_transform.range_data.size(); + + GPUMicroOp::collapse_multi_space(domain_transform.range_data, inst_space, buffer_arena, stream); + + // This is used for count + emit: first pass counts how many rectangles survive intersection, second pass uses the counter + // to figure out where to write each rectangle. 
+ RegionInstance inst_counters_instance = this->realm_malloc((2*domain_transform.range_data.size() + 1) * sizeof(uint32_t), my_mem); + uint32_t* d_inst_counters = reinterpret_cast(AffineAccessor(inst_counters_instance, 0).base); + + // This will be a prefix sum over the counters, used first to figure out where to write in the emit phase, and second + // to track which instance each rectangle came from in the populate phase. + uint32_t* d_inst_prefix = d_inst_counters + domain_transform.range_data.size(); + RegionInstance valid_rects_instance; + size_t num_valid_rects; + RectDesc* d_valid_rects; + + // Here we intersect the instance spaces with the parent space, and make sure we know which instance each resulting rectangle came from. + GPUMicroOp::template construct_input_rectlist>(inst_space, src_space, d_valid_rects, num_valid_rects, d_inst_counters, d_inst_prefix, buffer_arena, stream); + inst_offsets_instance.destroy(); + + if (num_valid_rects == 0) { + for (SparsityMap it : sparsity_outputs) { + SparsityMapImpl *impl = SparsityMapImpl::lookup(it); + if (this->exclusive) { + impl->gpu_finalize(); + } else { + impl->contribute_nothing(); + } + } + valid_rects_instance.destroy(); + inst_counters_instance.destroy(); + return; + } + + // Prefix sum the valid rectangles by volume. 
+ size_t* d_prefix_rects; + size_t total_pts; + + GPUMicroOp::volume_prefix_sum(d_valid_rects, num_valid_rects, d_prefix_rects, total_pts, buffer_arena, stream); + + RegionInstance rngs_instance = this->realm_malloc(total_pts * sizeof(RectDesc), my_mem); + RectDesc* d_rngs = reinterpret_cast*>(AffineAccessor(rngs_instance, 0).base); + + + Memory zcpy_mem; + assert(find_memory(zcpy_mem, Memory::Z_COPY_MEM)); + RegionInstance accessors_instance = this->realm_malloc(domain_transform.range_data.size() * sizeof(AffineAccessor,N2,T2>), zcpy_mem); + AffineAccessor,N2,T2>* d_accessors = reinterpret_cast,N2,T2>*>(AffineAccessor(accessors_instance, 0).base); + for (size_t i = 0; i < domain_transform.range_data.size(); ++i) { + d_accessors[i] = AffineAccessor,N2,T2>(domain_transform.range_data[i].inst, domain_transform.range_data[i].field_offset); + } + + image_gpuPopulateBitmasksRngsKernel<<>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, total_pts, num_valid_rects, domain_transform.range_data.size(), d_rngs); + KERNEL_CHECK(stream); + + RegionInstance parent_entries_instance; + collapsed_space collapsed_parent; + + // We collapse the parent space to undifferentiate between dense and sparse and match downstream APIs. 
+ GPUMicroOp::collapse_parent_space(parent_space, collapsed_parent, buffer_arena, stream); + + + RegionInstance src_counters_instance = this->realm_malloc(sources.size() * sizeof(uint32_t), my_mem); + uint32_t* d_src_counters = reinterpret_cast(AffineAccessor(src_counters_instance, 0).base); + CUDA_CHECK(cudaMemsetAsync(d_src_counters, 0, sources.size() * sizeof(uint32_t), stream), stream); + + + //Finally, we do another two pass count + emit to intersect with the parent rectangles + image_intersect_output<<>>(collapsed_parent.entries_buffer, d_rngs, nullptr, collapsed_parent.num_entries, total_pts, d_src_counters, nullptr); + KERNEL_CHECK(stream); + + std::vector h_src_counters(sources.size()+1); + h_src_counters[0] = 0; // prefix sum starts at 0 + CUDA_CHECK(cudaMemcpyAsync(h_src_counters.data()+1, d_src_counters, sources.size() * sizeof(uint32_t), cudaMemcpyDeviceToHost, stream), stream); + + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + valid_rects_instance.destroy(); + accessors_instance.destroy(); + + for (size_t i = 0; i < sources.size(); ++i) { + h_src_counters[i+1] += h_src_counters[i]; + } + + size_t num_valid_output = h_src_counters[sources.size()]; + + if (num_valid_output == 0) { + for (SparsityMap it : sparsity_outputs) { + SparsityMapImpl *impl = SparsityMapImpl::lookup(it); + if (this->exclusive) { + impl->gpu_finalize(); + } else { + impl->contribute_nothing(); + } + } + parent_entries_instance.destroy(); + src_counters_instance.destroy(); + rngs_instance.destroy(); + return; + } + + + RegionInstance valid_intersect_instance = this->realm_malloc(num_valid_output * sizeof(RectDesc), my_mem); + RectDesc* d_valid_intersect = reinterpret_cast*>(AffineAccessor(valid_intersect_instance, 0).base); + + RegionInstance src_prefix_instance = this->realm_malloc((sources.size() + 1) * sizeof(uint32_t), my_mem); + uint32_t* d_src_prefix = reinterpret_cast(AffineAccessor(src_prefix_instance, 0).base); + CUDA_CHECK(cudaMemcpyAsync(d_src_prefix, 
h_src_counters.data(), (sources.size() + 1) * sizeof(size_t), cudaMemcpyHostToDevice, stream), stream); + + CUDA_CHECK(cudaMemsetAsync(d_src_counters, 0, sources.size() * sizeof(uint32_t), stream), stream); + + image_intersect_output<<>>(collapsed_parent.entries_buffer, d_rngs, d_src_prefix, collapsed_parent.num_entries, total_pts, d_src_counters, d_valid_intersect); + KERNEL_CHECK(stream); + + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + src_prefix_instance.destroy(); + parent_entries_instance.destroy(); + src_counters_instance.destroy(); + rngs_instance.destroy(); + + size_t out_rects = 0; + RectDesc* trash; + this->complete_rect_pipeline(d_valid_intersect, num_valid_output, trash, out_rects, buffer_arena, + /* the Container: */ sparsity_outputs, + /* getIndex: */ [&](auto const& elem){ + // elem is a SparsityMap from the vector + return size_t(&elem - sparsity_outputs.data()); + }, + /* getMap: */ [&](auto const& elem){ + // return the SparsityMap key itself + return elem; + }); + + valid_intersect_instance.destroy(); + +} + + /* + * Input (stored in MicroOp): Array of field instances, a parent index space, and a list of source index spaces + * Output: A list of (potentially overlapping) points that result from chasing all the pointers in the source index spaces + * through the provided instances and emitting only points in the parent index space labeled by which source they came from, + * which are then sent off to complete_pipeline. + * Approach: Intersect all instance rectangles with source rectangles in parallel. Prefix sum + binary search to iterate over these in + * parallel and chase all the pointers in the source rectangles to their corresponding point. Here, the pointer chasing is also a count + emit, + * where only points that are in the parent index space are counted/emitted. 
+ */ +template +void GPUImageMicroOp::gpu_populate_ptrs() +{ + if (sources.size() == 0) { + return; + } + + NVTX_DEPPART(gpu_image); + + Memory sysmem; + find_memory(sysmem, Memory::SYSTEM_MEM); + + cudaStream_t stream = Cuda::get_task_cuda_stream(); + + size_t tile_size = fixed_buffer_size; + RegionInstance fixed_buffer = buffer; + Arena buffer_arena(reinterpret_cast(AffineAccessor(fixed_buffer, 0).base), tile_size); + + collapsed_space src_space; + src_space.offsets = buffer_arena.alloc(sources.size()+1); + src_space.num_children = sources.size(); + + GPUMicroOp::collapse_multi_space(sources, src_space, buffer_arena, stream); + + collapsed_space inst_space; + + // We combine all of our instances into one to batch work, tracking the offsets between instances. + inst_space.offsets = buffer_arena.alloc(domain_transform.ptr_data.size()+1); + inst_space.num_children = domain_transform.ptr_data.size(); + + Arena no; + GPUMicroOp::collapse_multi_space(domain_transform.ptr_data, inst_space, no, stream); + + // This is used for count + emit: first pass counts how many rectangles survive intersection, second pass uses the counter + // to figure out where to write each rectangle. + uint32_t* d_inst_counters = buffer_arena.alloc(2*domain_transform.ptr_data.size()+1); + + + // This will be a prefix sum over the counters, used first to figure out where to write in the emit phase, and second + // to track which instance each rectangle came from in the populate phase. + uint32_t* d_inst_prefix = d_inst_counters + domain_transform.ptr_data.size(); + size_t num_valid_rects = tile_size; + + //Uniform for all tiles + collapsed_space collapsed_parent; + + // We collapse the parent space to undifferentiate between dense and sparse and match downstream APIs. 
+ GPUMicroOp::collapse_parent_space(parent_space, collapsed_parent, buffer_arena, stream); + + Memory zcpy_mem; + assert(find_memory(zcpy_mem, Memory::Z_COPY_MEM)); + RegionInstance accessors_instance = this->realm_malloc(domain_transform.ptr_data.size() * sizeof(AffineAccessor,N2,T2>), zcpy_mem); + AffineAccessor,N2,T2>* d_accessors = reinterpret_cast,N2,T2>*>(AffineAccessor(accessors_instance, 0).base); + for (size_t i = 0; i < domain_transform.ptr_data.size(); ++i) { + d_accessors[i] = AffineAccessor,N2,T2>(domain_transform.ptr_data[i].inst, domain_transform.ptr_data[i].field_offset); + } + + uint32_t* d_prefix_points = buffer_arena.alloc(domain_transform.ptr_data.size()+1); + + buffer_arena.commit(false); + + size_t left = buffer_arena.used(); + + //Here we iterate over the tiles + size_t num_output = 0; + RectDesc* output_start = nullptr; + size_t num_completed = 0; + size_t curr_tile = tile_size / 2; + int count = 0; + while (num_completed < inst_space.num_entries) { + try { + std::cout << "Tile iteration " << count++ << ", completed " << num_completed << " / " << inst_space.num_entries << " entries." 
<< std::endl; + buffer_arena.start(); + std::cout << "Amount Used: " << buffer_arena.used() << std::endl; + std::cout << "Expected Amount Used: " << left + num_output * sizeof(RectDesc) << std::endl; + if (num_completed + curr_tile > inst_space.num_entries) { + curr_tile = inst_space.num_entries - num_completed; + } + collapsed_space inst_space_tile = inst_space; + inst_space_tile.num_entries = curr_tile; + inst_space_tile.entries_buffer = buffer_arena.alloc>(curr_tile); + CUDA_CHECK(cudaMemcpyAsync(inst_space_tile.entries_buffer, inst_space.entries_buffer + num_completed, curr_tile * sizeof(SparsityMapEntry), cudaMemcpyHostToDevice, stream), stream); + + RectDesc* d_valid_rects; + GPUMicroOp::template construct_input_rectlist>(inst_space_tile, src_space, d_valid_rects, num_valid_rects, d_inst_counters, d_inst_prefix, buffer_arena, stream); + + if (num_valid_rects == std::numeric_limits::max()) { + curr_tile /= 2; + continue; + } + + if (num_valid_rects == 0) { + num_completed += curr_tile; + curr_tile = tile_size / 2; + subtract_const<<>>(inst_space.offsets, domain_transform.ptr_data.size()+1, curr_tile); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + continue; + } + + // Prefix sum the valid rectangles by volume. 
+ size_t* d_prefix_rects; + size_t total_pts; + + GPUMicroOp::volume_prefix_sum(d_valid_rects, num_valid_rects, d_prefix_rects, total_pts, buffer_arena, stream); + + CUDA_CHECK(cudaMemsetAsync(d_inst_counters, 0, (domain_transform.ptr_data.size()) * sizeof(uint32_t), stream), stream); + + //We do a two pass count + emit to chase all the pointers in parallel and check for membership in the parent index space + image_gpuPopulateBitmasksPtrsKernel<<>>(d_accessors, d_valid_rects, collapsed_parent.entries_buffer, d_prefix_rects, d_inst_prefix, nullptr, total_pts, num_valid_rects, domain_transform.ptr_data.size(), collapsed_parent.num_entries, d_inst_counters, nullptr); + KERNEL_CHECK(stream); + + std::vector h_inst_counters(domain_transform.ptr_data.size()+1); + h_inst_counters[0] = 0; // prefix sum starts at 0 + CUDA_CHECK(cudaMemcpyAsync(h_inst_counters.data()+1, d_inst_counters, domain_transform.ptr_data.size() * sizeof(uint32_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + for (size_t i = 0; i < domain_transform.ptr_data.size(); ++i) { + h_inst_counters[i+1] += h_inst_counters[i]; + } + + size_t num_valid_points = h_inst_counters[domain_transform.ptr_data.size()]; + + if (num_valid_points == 0) { + num_completed += curr_tile; + curr_tile = tile_size / 2; + subtract_const<<>>(inst_space.offsets, domain_transform.ptr_data.size()+1, curr_tile); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + continue; + } + + CUDA_CHECK(cudaMemcpyAsync(d_prefix_points, h_inst_counters.data(), (domain_transform.ptr_data.size() + 1) * sizeof(uint32_t), cudaMemcpyHostToDevice, stream), stream); + + buffer_arena.flip_parity(); + PointDesc* d_valid_points = buffer_arena.alloc>(num_valid_points); + + CUDA_CHECK(cudaMemsetAsync(d_inst_counters, 0, (domain_transform.ptr_data.size()) * sizeof(uint32_t), stream), stream); + + image_gpuPopulateBitmasksPtrsKernel<<>>(d_accessors, d_valid_rects, collapsed_parent.entries_buffer, 
d_prefix_rects, d_inst_prefix, d_prefix_points, total_pts, num_valid_rects, domain_transform.ptr_data.size(), collapsed_parent.num_entries, d_inst_counters, d_valid_points); + KERNEL_CHECK(stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + + size_t num_new_rects = 1; + assert(!buffer_arena.get_parity()); + RectDesc* d_new_rects; + + //Send it off for processing + this->complete_pipeline(d_valid_points, num_valid_points, d_new_rects, num_new_rects, buffer_arena, + /* the Container: */ sparsity_outputs, + /* getIndex: */ [&](auto const& elem){ + // elem is a SparsityMap from the vector + return size_t(&elem - sparsity_outputs.data()); + }, + /* getMap: */ [&](auto const& elem){ + // return the SparsityMap key itself + return elem; + }); + + if (num_output==0) { + buffer_arena.flip_parity(); + buffer_arena.reset(true); + output_start = buffer_arena.alloc>(num_new_rects); + buffer_arena.commit(true); + CUDA_CHECK(cudaMemcpyAsync(output_start, d_new_rects, num_new_rects * sizeof(RectDesc), cudaMemcpyDeviceToDevice, stream), stream); + num_output = num_new_rects; + num_completed += curr_tile; + subtract_const<<>>(inst_space.offsets, domain_transform.ptr_data.size()+1, curr_tile); + curr_tile = tile_size / 2; + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + continue; + } + + RectDesc* d_old_rects = buffer_arena.alloc>(num_output); + assert(d_old_rects == d_new_rects + num_new_rects); + CUDA_CHECK(cudaMemcpyAsync(d_old_rects, output_start, num_output * sizeof(RectDesc), cudaMemcpyDeviceToDevice, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + size_t num_final_rects = 1; + + //Send it off for processing + this->complete_rect_pipeline(d_new_rects, num_output + num_new_rects, output_start, num_final_rects, buffer_arena, + /* the Container: */ sparsity_outputs, + /* getIndex: */ [&](auto const& elem){ + // elem is a SparsityMap from the vector + return size_t(&elem - sparsity_outputs.data()); + }, + /* getMap: */ [&](auto const& 
elem){ + // return the SparsityMap key itself + return elem; + }); + num_completed += curr_tile; + num_output = num_final_rects; + subtract_const<<>>(inst_space.offsets, domain_transform.ptr_data.size()+1, curr_tile); + curr_tile = tile_size / 2; + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + } + catch (arena_oom&) { + std::cout << "Caught arena_oom, reducing tile size from " << curr_tile << " to " << curr_tile / 2 << std::endl; + std::cout << buffer_arena.used() << " bytes used in arena." << std::endl; + curr_tile /= 2; + if (curr_tile == 0) { + throw; + } + } + } + + if (num_output == 0) { + for (SparsityMap it : sparsity_outputs) { + SparsityMapImpl *impl = SparsityMapImpl::lookup(it); + if (this->exclusive) { + impl->gpu_finalize(); + } else { + impl->contribute_nothing(); + } + } + return; + } + this->send_output(output_start, num_output, buffer_arena, sparsity_outputs, + /* getIndex: */ [&](auto const& elem){ + // elem is a SparsityMap from the vector + return size_t(&elem - sparsity_outputs.data()); + }, + /* getMap: */ [&](auto const& elem){ + // return the SparsityMap key itself + return elem; + }); +} +} \ No newline at end of file diff --git a/src/realm/deppart/image_gpu_kernels.hpp b/src/realm/deppart/image_gpu_kernels.hpp new file mode 100644 index 0000000000..146d4e781f --- /dev/null +++ b/src/realm/deppart/image_gpu_kernels.hpp @@ -0,0 +1,167 @@ +#pragma once +#include "realm/deppart/image.h" + +namespace Realm { + +//Device helper to check parent space for membership +//TODO: if expensive, may benefit from BVH +template +__device__ bool image_isInIndexSpace( + const Point& p, + const SparsityMapEntry* parent_entries, + size_t numRects) +{ + // for each rectangle, check all dims… + for(size_t i = 0; i < numRects; ++i) { + const auto &r = parent_entries[i].bounds; + bool inside = true; + #pragma unroll + for(int d = 0; d < N; ++d) { + if(p[d] < r.lo[d] || p[d] > r.hi[d]) { + inside = false; + break; + } + } + if(inside) return true; + } + 
return false; +} + +//Count + emit to chase pointers and check for membership in parent space +template < + int N, typename T, + int N2, typename T2 +> +__global__ +void image_gpuPopulateBitmasksPtrsKernel( + AffineAccessor,N2,T2> *accessors, + RectDesc* rects, + SparsityMapEntry* parent_entries, + size_t* prefix, + uint32_t *inst_offsets, + uint32_t *d_inst_prefix, + size_t numPoints, + size_t numRects, + size_t num_insts, + size_t numParentRects, + uint32_t* d_inst_counters, + PointDesc *d_points +) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= numPoints) return; + size_t low = 0, high = numRects; + while (low < high) { + size_t mid = (low + high) >> 1; + if (prefix[mid+1] <= idx) low = mid + 1; + else high = mid; + } + size_t r = low; + bool found = false; + size_t inst_idx; + for (inst_idx = 0; inst_idx < num_insts; ++inst_idx) { + if (inst_offsets[inst_idx] <= r && inst_offsets[inst_idx+1] > r) { + found = true; + break; + } + } + assert(found); + size_t offset = idx - prefix[r]; + Point p; + for (int k = N2-1; k >= 0; --k) { + size_t dim = rects[r].rect.hi[k] + 1 - rects[r].rect.lo[k]; + p[k] = rects[r].rect.lo[k] + (offset % dim); + offset /= dim; + } + Point ptr = accessors[inst_idx].read(p); + if (image_isInIndexSpace(ptr, parent_entries, numParentRects)) { + uint32_t local = atomicAdd(&d_inst_counters[inst_idx], 1); + if (d_points != nullptr) { + uint32_t out_idx = d_inst_prefix[inst_idx] + local; + PointDesc point_desc; + point_desc.src_idx = rects[r].src_idx; + point_desc.point = ptr; + d_points[out_idx] = point_desc; + } + } + +} + +//Same as image_intersect_input, but for output rectangles and parent entries +//rather than input rectangles and parent rectangles + template +__global__ void image_intersect_output( + const SparsityMapEntry* d_parent_entries, + const RectDesc* d_output_rngs, + const uint32_t* d_src_prefix, + size_t numParentRects, + size_t numOutputRects, + uint32_t* d_src_counters, + RectDesc* d_rects +) { + size_t 
idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= numParentRects * numOutputRects) return; + size_t idx_x = idx % numParentRects; + size_t idx_y = idx / numParentRects; + const auto parent_entry = d_parent_entries[idx_x]; + const auto output_entry = d_output_rngs[idx_y]; + RectDesc rect_output; + rect_output.rect = parent_entry.bounds.intersection(output_entry.rect); + if (!rect_output.rect.empty()) { + uint32_t local = atomicAdd(&d_src_counters[output_entry.src_idx], 1); + if (d_rects != nullptr) { + rect_output.src_idx = output_entry.src_idx; + size_t out_idx = d_src_prefix[output_entry.src_idx] + local; + d_rects[out_idx] = rect_output; + } + } +} + +//Single pass function to chase pointers to rectangles. + template < + int N, typename T, + int N2, typename T2 +> +__global__ +void image_gpuPopulateBitmasksRngsKernel( + AffineAccessor,N2,T2> *accessors, + RectDesc* rects, + size_t* prefix, + uint32_t *inst_offsets, + size_t numPoints, + size_t numRects, + size_t num_insts, + RectDesc *d_rects +) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= numPoints) return; + size_t low = 0, high = numRects; + while (low < high) { + size_t mid = (low + high) >> 1; + if (prefix[mid+1] <= idx) low = mid + 1; + else high = mid; + } + size_t r = low; + bool found = false; + size_t inst_idx; + for (inst_idx = 0; inst_idx < num_insts; ++inst_idx) { + if (inst_offsets[inst_idx] <= r && inst_offsets[inst_idx+1] > r) { + found = true; + break; + } + } + assert(found); + size_t offset = idx - prefix[r]; + Point p; + for (int k = N2-1; k >= 0; --k) { + size_t dim = rects[r].rect.hi[k] + 1 - rects[r].rect.lo[k]; + p[k] = rects[r].rect.lo[k] + (offset % dim); + offset /= dim; + } + Rect rng = accessors[inst_idx].read(p); + RectDesc rect_desc; + rect_desc.src_idx = rects[r].src_idx; + rect_desc.rect = rng; + d_rects[idx] = rect_desc; +} + +} // namespace Realm \ No newline at end of file diff --git a/src/realm/deppart/image_gpu_tmpl.cu 
b/src/realm/deppart/image_gpu_tmpl.cu new file mode 100644 index 0000000000..6af4dcde61 --- /dev/null +++ b/src/realm/deppart/image_gpu_tmpl.cu @@ -0,0 +1,62 @@ +/* Copyright 2024 Stanford University, NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// per‐dimension instantiator for the GPU Image Operation +// Mirrors CPU Approach (image_tmpl.cc) + + +#include "realm/deppart/image_gpu_kernels.hpp" +#include "realm/deppart/image_gpu_impl.hpp" +#include "realm/deppart/inst_helper.h" + +#ifndef INST_N1 + #error "INST_N1 must be defined before including image_gpu_tmpl.cu" +#endif +#ifndef INST_N2 + #error "INST_N2 must be defined before including image_gpu_tmpl.cu" +#endif + +#define FOREACH_TT(__func__) \ + __func__(int, int) \ + __func__(int, unsigned) \ + __func__(int, long long) \ + __func__(unsigned,int) \ + __func__(unsigned,unsigned) \ + __func__(unsigned,long long) \ + __func__(long long, int) \ + __func__(long long, unsigned) \ + __func__(long long, long long) + +#define FOREACH_T(__func__) \ + __func__(int) \ + __func__(unsigned) \ + __func__(long long) + +namespace Realm { + #define N1 INST_N1 + #define N2 INST_N2 + + + #define DO_DOUBLE(T1,T2) \ + template class ImageMicroOp; \ + template class GPUImageMicroOp; + + FOREACH_TT(DO_DOUBLE) + + #undef DO_DOUBLE + #undef N1 + #undef N2 + +} // namespace Realm \ No newline at end of file diff --git a/src/realm/deppart/image_tmpl.cc b/src/realm/deppart/image_tmpl.cc index 
578a78226b..c12dfdb138 100644 --- a/src/realm/deppart/image_tmpl.cc +++ b/src/realm/deppart/image_tmpl.cc @@ -46,13 +46,18 @@ namespace Realm { #define DOIT(T1,T2) \ template class StructuredImageMicroOp; \ - template class ImageMicroOp; \ + template class ImageMicroOp; \ + template class GPUImageMicroOp; \ template class ImageOperation; \ template ImageMicroOp::ImageMicroOp(NodeID, AsyncMicroOp *, Serialization::FixedBufferDeserializer&); \ template Event IndexSpace::create_subspaces_by_image( \ const DomainTransform &, const std::vector > &, \ std::vector > &, const ProfilingRequestSet &, Event) \ const; \ + template Event IndexSpace::gpu_subspaces_by_image( \ + const DomainTransform &, const std::vector > &, \ + std::vector > &, const ProfilingRequestSet &, std::pair &, \ + RegionInstance, Event) const; \ template Event IndexSpace::create_subspaces_by_image_with_difference( \ const DomainTransform &, \ const std::vector >&, \ diff --git a/src/realm/deppart/partitions.cc b/src/realm/deppart/partitions.cc index b023f468fc..f342519f71 100644 --- a/src/realm/deppart/partitions.cc +++ b/src/realm/deppart/partitions.cc @@ -71,7 +71,7 @@ namespace Realm { size_t start, size_t count, size_t volume, IndexSpace *results, size_t first_result, size_t last_result, - const std::vector >& entries) + const span >& entries) { // should never be here with empty bounds assert(!bounds.empty()); @@ -111,13 +111,11 @@ namespace Realm { size_t lo_volume[N]; for(int i = 0; i < N; i++) lo_volume[i] = 0; - for(typename std::vector >::const_iterator it = entries.begin(); - it != entries.end(); - it++) { + for(size_t j = 0; j < entries.size(); j++) { for(int i = 0; i < N; i++) - lo_volume[i] += it->bounds.intersection(lo_half[i]).volume(); + lo_volume[i] += entries[j].bounds.intersection(lo_half[i]).volume(); } - // now compute how many subspaces would fall in each half and the + // now compute how many subspaces would fall in each half and the // inefficiency of the split size_t 
lo_count[N], inefficiency[N]; for(int i = 0; i < N; i++) { @@ -233,7 +231,7 @@ namespace Realm { // TODO: sparse case where we have to wait SparsityMapPublicImpl *impl = sparsity.impl(); assert(impl->is_valid()); - const std::vector >& entries = impl->get_entries(); + const span >& entries = impl->get_entries(); // initially every subspace will be a copy of this one, and then // we'll decompose the bounds subspace = *this; @@ -307,7 +305,7 @@ namespace Realm { // TODO: sparse case where we have to wait SparsityMapPublicImpl *impl = sparsity.impl(); assert(impl->is_valid()); - const std::vector >& entries = impl->get_entries(); + span> entries = impl->get_entries(); // initially every subspace will be a copy of this one, and then // we'll decompose the bounds subspaces.resize(count, *this); @@ -498,7 +496,7 @@ namespace Realm { template class RectListAdapter { public: - RectListAdapter(const std::vector >& _rects) + RectListAdapter(const span >& _rects) : rects(_rects.empty() ? 0 : &_rects[0]), count(_rects.size()) {} RectListAdapter(const Rect<1,T> *_rects, size_t _count) : rects(_rects), count(_count) {} @@ -583,7 +581,6 @@ namespace Realm { os << "AsyncMicroOp(" << (void *)uop << ")"; } - //////////////////////////////////////////////////////////////////////// // // class PartitioningMicroOp @@ -666,6 +663,16 @@ namespace Realm { } } + RegionInstance PartitioningMicroOp::realm_malloc(size_t size, Memory location) { + assert(location != Memory::NO_MEMORY); + assert(size > 0); + std::vector byte_fields = {sizeof(char)}; + IndexSpace<1> instance_index_space(Rect<1>(0, size-1)); + RegionInstance result; + RegionInstance::create_instance(result, location, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); + return result; + } + //////////////////////////////////////////////////////////////////////// // // class ComputeOverlapMicroOp diff --git a/src/realm/deppart/partitions.h b/src/realm/deppart/partitions.h index 7bb68c3630..4ec4560984 100644 
--- a/src/realm/deppart/partitions.h +++ b/src/realm/deppart/partitions.h @@ -35,11 +35,211 @@ #include "realm/deppart/inst_helper.h" #include "realm/bgwork.h" +struct CUstream_st; +typedef CUstream_st* cudaStream_t; + namespace Realm { class PartitioningMicroOp; class PartitioningOperation; + template + constexpr std::string_view type_name() { + #if defined(__clang__) + std::string_view p = __PRETTY_FUNCTION__; + return {p.data() + 34, p.size() - 34 - 1}; + #elif defined(__GNUC__) + std::string_view p = __PRETTY_FUNCTION__; + return {p.data() + 49, p.size() - 49 - 1}; + #elif defined(_MSC_VER) + std::string_view p = __FUNCSIG__; + return {p.data() + 84, p.size() - 84 - 7}; + #else + return "unknown"; + #endif + } + + template + struct HiFlag { + T hi; + uint8_t head; + }; + + struct DeltaFlag { + int32_t delta; + uint8_t head; + }; + + // Data representations for GPU micro-ops + // src idx tracks which subspace each rect/point + // belongs to and allows multiple subspaces to be + // computed together in a micro-op + template + struct RectDesc { + Rect rect; + size_t src_idx; + }; + + template + struct PointDesc { + Point point; + size_t src_idx; + }; + + // Combines one or multiple index spaces into a single struct + // If multiple, offsets tracks transitions between spaces + template + struct collapsed_space { + SparsityMapEntry* entries_buffer; + size_t num_entries; + size_t* offsets; + size_t num_children; + Rect bounds; + }; + + // Stores everything necessary to query a BVH + // Used with GPUMicroOp::build_bvh + template + struct BVH { + int root; + size_t num_leaves; + Rect* boxes; + uint64_t* indices; + size_t* labels; + int* childLeft; + int* childRight; + }; + + struct arena_oom : std::bad_alloc { + const char* what() const noexcept override { return "arena_oom"; } + }; + + class Arena { + public: + using byte = std::byte; + + Arena() noexcept : base_(nullptr), cap_(0), parity_(false), left_(0), right_(0), base_left_(0), base_right_(0) {} + Arena(void* 
buffer, size_t bytes) noexcept + : base_(reinterpret_cast(buffer)), cap_(bytes), parity_(false), left_(0), right_(0), base_left_(0), base_right_(0) {} + + size_t capacity() const noexcept { return cap_; } + size_t used() const noexcept { return left_ + right_; } + + size_t mark() const noexcept { + return parity_ ? right_ : left_; + } + + void rollback(size_t mark) noexcept { + if (parity_) { + right_ = mark; + } else { + left_ = mark; + } + } + + template + T* alloc(size_t count = 1) { + try { + if (parity_) { + return alloc_right(count); + } else { + return alloc_left(count); + } + } catch (arena_oom&) { + std::cout << "Arena OOM: requested " << count << " of " << type_name() + << " capacity " << cap_ << " bytes, " + << " used " << used() << " bytes, " + << " left " << (cap_ - left_ - right_) << " bytes.\n"; + throw arena_oom{}; + } + } + + void flip_parity(void) noexcept { + if (parity_) { + // switching from right to left + left_ = base_left_; + } else { + // switching from left to right + right_ = base_right_; + } + parity_ = !parity_; + } + + void commit(bool parity) noexcept { + if (parity) { + base_right_ = right_; + } else { + base_left_ = left_; + } + } + + void reset(bool parity) noexcept { + if (parity) { + base_right_ = 0; + right_ = 0; + } else { + base_left_ = 0; + left_ = 0; + } + } + + bool get_parity(void) const noexcept { + return parity_; + } + + void start(void) noexcept { + left_ = base_left_; + right_ = base_right_; + parity_ = false; + } + + private: + + void* alloc_left_bytes(size_t bytes, size_t align = alignof(std::max_align_t)) { + const size_t aligned = align_up(left_, align); + if (aligned + bytes + right_ > cap_) throw arena_oom{}; + void* p = base_ + aligned; + left_ = aligned + bytes; + return p; + } + + void* alloc_right_bytes(size_t bytes, size_t align = alignof(std::max_align_t)) { + if (bytes + right_ > cap_) throw arena_oom{}; + const size_t aligned = align_down(cap_ - right_ - bytes, align); + if (aligned < left_) throw 
arena_oom{}; + void *p = base_ + aligned; + right_ = cap_ - aligned; + return p; + } + + template + T* alloc_left(size_t count = 1) { + static_assert(!std::is_void_v, "alloc is invalid"); + return reinterpret_cast(alloc_left_bytes(sizeof(T) * count, alignof(T))); + } + + template + T* alloc_right(size_t count = 1) { + static_assert(!std::is_void_v, "alloc is invalid"); + return reinterpret_cast(alloc_right_bytes(sizeof(T) * count, alignof(T))); + } + + static size_t align_up(size_t x, size_t a) noexcept { + return (x + (a - 1)) & ~(a - 1); + } + + static size_t align_down(size_t x, size_t a) noexcept { + return x & ~(a - 1); + } + + byte* base_; + size_t cap_; + bool parity_; + size_t left_; + size_t right_; + size_t base_left_; + size_t base_right_; + }; template class OverlapTester { @@ -108,6 +308,8 @@ namespace Realm { template void sparsity_map_ready(SparsityMapImpl *sparsity, bool precise); + static RegionInstance realm_malloc(size_t size, Memory location = Memory::NO_MEMORY); + IntrusiveListLink uop_link; REALM_PMTA_DEFN(PartitioningMicroOp,IntrusiveListLink,uop_link); typedef IntrusiveList MicroOpList; @@ -147,6 +349,45 @@ namespace Realm { std::vector *> extra_deps; }; + //The parent class for all GPU partitioning micro-ops. 
Provides output utility functions + + template + class GPUMicroOp : public PartitioningMicroOp { + public: + GPUMicroOp(void) = default; + virtual ~GPUMicroOp(void) = default; + + virtual void execute(void) = 0; + + template + static void collapse_multi_space(const std::vector& field_data, collapsed_space &out_space, Arena &my_arena, cudaStream_t stream); + + static void collapse_parent_space(const IndexSpace& parent_space, collapsed_space &out_space, Arena &my_arena, cudaStream_t stream); + + static void build_bvh(const collapsed_space &space, BVH &bvh, Arena &my_arena, cudaStream_t stream); + + template + static void construct_input_rectlist(const collapsed_space &lhs, const collapsed_space &rhs, out_t* &d_valid_rects, size_t& out_size, uint32_t* counters, uint32_t* out_offsets, Arena &my_arena, cudaStream_t stream); + + template + static void volume_prefix_sum(const out_t* d_rects, size_t total_rects, size_t* &d_prefix_rects, size_t& num_pts, Arena &my_arena, cudaStream_t stream); + + template + void complete_pipeline(PointDesc* d_points, size_t total_pts, RectDesc* &d_out_rects, size_t &out_rects, Arena &my_arena, const Container& ctr, IndexFn getIndex, MapFn getMap); + + template + void complete_rect_pipeline(RectDesc* d_rects, size_t total_rects, RectDesc* &d_out_rects, size_t &out_rects, Arena &my_arena, const Container& ctr, IndexFn getIndex, MapFn getMap); + + template + void complete1d_pipeline(RectDesc* d_rects, size_t total_rects, RectDesc* &d_out_rects, size_t &out_rects, Arena &my_arena, const Container& ctr, IndexFn getIndex, MapFn getMap); + + template + void send_output(RectDesc* d_rects, size_t total_rects, Arena &my_arena, const Container& ctr, IndexFn getIndex, MapFn getMap); + + bool exclusive = false; + + }; + //////////////////////////////////////// // diff --git a/src/realm/deppart/partitions_gpu.cu b/src/realm/deppart/partitions_gpu.cu new file mode 100644 index 0000000000..b842e93f58 --- /dev/null +++ b/src/realm/deppart/partitions_gpu.cu 
@@ -0,0 +1,29 @@ +/* Copyright 2024 Stanford University, NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// per-dimension instantiator for the shared GPU partitioning +// micro-op utilities (GPUMicroOp) + + +#include "realm/deppart/partitions_gpu_impl.hpp" +#include "realm/deppart/inst_helper.h" + +namespace Realm { + #define DOIT(N,T) \ + template class GPUMicroOp; + + FOREACH_NT(DOIT) + +} // namespace Realm \ No newline at end of file diff --git a/src/realm/deppart/partitions_gpu_impl.hpp b/src/realm/deppart/partitions_gpu_impl.hpp new file mode 100644 index 0000000000..678102b56f --- /dev/null +++ b/src/realm/deppart/partitions_gpu_impl.hpp @@ -0,0 +1,1604 @@ +#pragma once +#include "deppart_config.h" +#include "partitions.h" +#ifdef REALM_USE_NVTX +#include "realm/nvtx.h" +#endif +#include "realm/cuda/cuda_internal.h" +#include "realm/deppart/partitions_gpu_kernels.hpp" +#include + +//CUDA ERROR CHECKING MACROS + +#define CUDA_CHECK(call, stream) \ + do { \ + cudaError_t err = (call); \ + if (err != cudaSuccess) { \ + std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__ \ + << " '" #call "' failed with " \ + << cudaGetErrorString(err) << " (" << err << ")\n"; \ + assert(false); \ + } \ + } while (0) + +#define KERNEL_CHECK(stream) \ + do { \ + cudaError_t err = cudaGetLastError(); \ + if (err != cudaSuccess) { \ + std::cerr << "Kernel launch failed at " << __FILE__ << ":" << __LINE__ \ + << ": " <<
cudaGetErrorString(err) << "\n"; \ + assert(false); \ + } \ + } while (0) + +#define THREADS_PER_BLOCK 256 + +#define COMPUTE_GRID(num_items) \ + (((num_items) + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK) + + +//NVTX macros to only add ranges if defined. +#ifdef REALM_USE_NVTX + + #define NVTX_CAT(a,b) a##b + + #define NVTX_DEPPART(message) \ + nvtxScopedRange NVTX_CAT(nvtx_, message)("cuda", #message, 0) + +#else + + #define NVTX_DEPPART(message) do { } while (0) + +#endif + +namespace Realm { + + // Used by cub::DeviceReduce to compute bad GPU approximation. + template + struct UnionRectOp { + __host__ __device__ + Rect operator()(const Rect& a, + const Rect& b) const { + Rect r; + for(int d=0; d b.hi[d] ? a.hi[d] : b.hi[d]; + } + return r; + } + }; + + // Used to compute prefix sum by volume for an array of Rects or RectDescs. + template + struct RectVolumeOp { + __device__ __forceinline__ + size_t operator()(const out_t& r) const { + if constexpr (std::is_same_v, out_t>) { + return r.volume(); + } else { + return r.rect.volume(); + } + } + }; + + // Finds a memory of the specified kind. Returns true on success, false otherwise. 
+ inline bool find_memory(Memory &output, Memory::Kind kind) + { + bool found = false; + Machine machine = Machine::get_machine(); + std::set all_memories; + machine.get_all_memories(all_memories); + for(auto& memory : all_memories) { + if(memory.kind() == kind) { + output = memory; + found = true; + break; + } + } + return found; + } + + //Given a list of spaces, compacts them all into one collapsed_space + template + template + void GPUMicroOp::collapse_multi_space(const std::vector& spaces, collapsed_space &out_space, Arena &my_arena, cudaStream_t stream) + { + + char *val = std::getenv("SHATTER_SIZE"); // or any env var + int shatter_size = 1; //default + if (val) { + shatter_size = atoi(val); + } + // We need space_offsets to preserve which space each rectangle came from + std::vector space_offsets(spaces.size() + 1); + + // Determine size of allocation for combined rects. + out_space.num_entries = 0; + + for (size_t i = 0; i < spaces.size(); ++i) { + space_offsets[i] = out_space.num_entries; + IndexSpace my_space; + if constexpr (std::is_same_v>) { + my_space = spaces[i]; + } else { + my_space = spaces[i].index_space; + } + if (my_space.dense()) { + if constexpr (std::is_same_v>) { + out_space.num_entries += 1; + } else { + out_space.num_entries += shatter_size; + } + } else { + out_space.num_entries += my_space.sparsity.impl()->get_entries().size(); + } + } + space_offsets[spaces.size()] = out_space.num_entries; + + //We copy into one contiguous host buffer, then copy to device + Memory sysmem; + assert(find_memory(sysmem, Memory::SYSTEM_MEM)); + + + RegionInstance h_instance = realm_malloc(out_space.num_entries * sizeof(SparsityMapEntry), sysmem); + SparsityMapEntry* h_entries = reinterpret_cast*>(AffineAccessor(h_instance, 0).base); + + if (my_arena.capacity()==0) { + out_space.entries_buffer = reinterpret_cast*>(AffineAccessor(h_instance, 0).base); + } else { + out_space.entries_buffer = my_arena.alloc >(out_space.num_entries); + } + + + //Now we fill the 
host array with all rectangles + size_t pos = 0; + for (size_t i = 0; i < spaces.size(); ++i) { + IndexSpace my_space; + if constexpr (std::is_same_v>) { + my_space = spaces[i]; + } else { + my_space = spaces[i].index_space; + } + if (my_space.dense()) { + if constexpr (std::is_same_v>) { + SparsityMapEntry entry; + entry.bounds = my_space.bounds; + memcpy(h_entries + pos, &entry, sizeof(SparsityMapEntry)); + ++pos; + } else { + std::vector > tmp(shatter_size); + int ppt = (my_space.bounds.hi[0] - my_space.bounds.lo[0]+1) / shatter_size; + for (int i = 0; i < shatter_size; ++i) { + Rect new_rect = my_space.bounds; + new_rect.lo[0] = my_space.bounds.lo[0] + i * ppt; + new_rect.hi[0] = (i == shatter_size - 1) ? my_space.bounds.hi[0] : (new_rect.lo[0] + ppt - 1); + SparsityMapEntry entry; + entry.bounds = new_rect; + entry.sparsity.id = 0; + entry.bitmap = 0; + tmp[i] = entry; + } + memcpy(h_entries + pos, tmp.data(), tmp.size() * sizeof(SparsityMapEntry)); + pos += shatter_size; + } + } else { + span> tmp = my_space.sparsity.impl()->get_entries(); + memcpy(h_entries + pos, tmp.data(), tmp.size() * sizeof(SparsityMapEntry)); + pos += tmp.size(); + } + } + + //Now we copy our entries and offsets to the device + CUDA_CHECK(cudaMemcpyAsync(out_space.offsets, space_offsets.data(), (spaces.size() + 1) * sizeof(size_t), cudaMemcpyHostToDevice, stream), stream); + if (my_arena.capacity() != 0) { + CUDA_CHECK(cudaMemcpyAsync(out_space.entries_buffer, h_entries, out_space.num_entries * sizeof(SparsityMapEntry), cudaMemcpyHostToDevice, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + h_instance.destroy(); + } + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + } + + // Only real work here is getting dense/sparse into a single collapsed_space. 
+ template + void GPUMicroOp::collapse_parent_space(const IndexSpace& parent_space, collapsed_space &out_space, Arena &my_arena, cudaStream_t stream) + { + if (parent_space.dense()) { + SparsityMapEntry entry; + entry.bounds = parent_space.bounds; + out_space.entries_buffer = my_arena.alloc>(1); + out_space.num_entries = 1; + CUDA_CHECK(cudaMemcpyAsync(out_space.entries_buffer, &entry, sizeof(SparsityMapEntry), cudaMemcpyHostToDevice, stream), stream); + } else { + span> tmp = parent_space.sparsity.impl()->get_entries(); + out_space.num_entries = tmp.size(); + out_space.entries_buffer = my_arena.alloc>(tmp.size()); + out_space.bounds = parent_space.bounds; + CUDA_CHECK(cudaMemcpyAsync(out_space.entries_buffer, tmp.data(), tmp.size() * sizeof(SparsityMapEntry), cudaMemcpyHostToDevice, stream), stream); + } + out_space.offsets = nullptr; + out_space.num_children = 1; + } + + // Given a collapsed space, builds a (potentially marked) bvh over that space. + // Based on Tero Karras' Maximizing Parallelism in the Construction of BVHs, Octrees, and k-d Trees + template + void GPUMicroOp::build_bvh(const collapsed_space &space, BVH &result, Arena &my_arena, cudaStream_t stream) + { + + //We want to keep the entire BVH that we return in one instance for convenience. + size_t indices_instance_size = space.num_entries * sizeof(uint64_t); + size_t labels_instance_size = space.offsets == nullptr ? 0 : space.num_entries * sizeof(size_t); + size_t boxes_instance_size = (2*space.num_entries - 1) * sizeof(Rect); + size_t child_instance_size = (2*space.num_entries - 1) * sizeof(int); + + size_t total_instance_size = indices_instance_size + labels_instance_size + boxes_instance_size + 2 * child_instance_size; + char* bvh_ptr = my_arena.alloc(total_instance_size); + + result.num_leaves = space.num_entries; + + size_t curr_idx = 0; + result.indices = reinterpret_cast(bvh_ptr + curr_idx); + curr_idx += indices_instance_size; + result.labels = space.offsets == nullptr ? 
nullptr : reinterpret_cast(bvh_ptr + curr_idx); + curr_idx += labels_instance_size; + result.boxes = reinterpret_cast*>(bvh_ptr + curr_idx); + curr_idx += boxes_instance_size; + result.childLeft = reinterpret_cast(bvh_ptr + curr_idx); + curr_idx += child_instance_size; + result.childRight = reinterpret_cast(bvh_ptr + curr_idx); + + size_t prev = my_arena.mark(); + + // Bounds used for morton code computation. + Rect* d_global_bounds = my_arena.alloc>(1); + CUDA_CHECK(cudaMemcpyAsync(d_global_bounds, &space.bounds, sizeof(Rect), cudaMemcpyHostToDevice, stream), stream); + + // These are intermediate instances we'll destroy before returning. + char* d_morton_visit = my_arena.alloc(2 * space.num_entries * max(sizeof(uint64_t), sizeof(int))); + uint64_t* d_morton_codes = reinterpret_cast(d_morton_visit); + + size_t intermed = my_arena.mark(); + + uint64_t* d_indices_in = my_arena.alloc(space.num_entries); + + // We compute morton codes for each leaf and sort, labeling if necessary. + bvh_build_morton_codes<<>>(space.entries_buffer, space.offsets, d_global_bounds, space.num_entries, space.num_children, d_morton_codes, d_indices_in, result.labels); + KERNEL_CHECK(stream); + + uint64_t* d_morton_codes_out = d_morton_codes + space.num_entries; + uint64_t* d_indices_out = result.indices; + + void *bvh_temp = nullptr; + size_t bvh_temp_bytes = 0; + cub::DeviceRadixSort::SortPairs(bvh_temp, bvh_temp_bytes, d_morton_codes, d_morton_codes_out, d_indices_in, + d_indices_out, space.num_entries, 0, 64, stream); + bvh_temp = reinterpret_cast(my_arena.alloc(bvh_temp_bytes)); + cub::DeviceRadixSort::SortPairs(bvh_temp, bvh_temp_bytes, d_morton_codes, d_morton_codes_out, d_indices_in, + d_indices_out, space.num_entries, 0, 64, stream); + + std::swap(d_morton_codes, d_morton_codes_out); + + my_arena.rollback(intermed); + + + // Another temporary instance. 
+ int* d_parent = my_arena.alloc(2*space.num_entries - 1); + CUDA_CHECK(cudaMemsetAsync(d_parent, -1, (2*space.num_entries - 1) * sizeof(int), stream), stream); + + // Here's where we actually build the BVH + int n = (int) space.num_entries; + bvh_build_radix_tree_kernel<<< COMPUTE_GRID(space.num_entries - 1), THREADS_PER_BLOCK, 0, stream>>>(d_morton_codes, result.indices, n, result.childLeft, result.childRight, d_parent); + KERNEL_CHECK(stream); + + // Figure out which node didn't get its parent set. + int* d_root = my_arena.alloc(1); + + CUDA_CHECK(cudaMemsetAsync(d_root, -1, sizeof(int), stream), stream); + + bvh_build_root_kernel<<< COMPUTE_GRID(2 * space.num_entries - 1), THREADS_PER_BLOCK, 0, stream>>>(d_root, d_parent, space.num_entries); + KERNEL_CHECK(stream); + + CUDA_CHECK(cudaMemcpyAsync(&result.root, d_root, sizeof(int), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + // Now we materialize the tree into something the client can query. + bvh_init_leaf_boxes_kernel<<>>(space.entries_buffer, result.indices, space.num_entries, result.boxes); + KERNEL_CHECK(stream); + + int* d_visitCount = reinterpret_cast(d_morton_visit); + CUDA_CHECK(cudaMemsetAsync(d_visitCount, 0, (2*space.num_entries - 1) * sizeof(int), stream), stream); + + bvh_merge_internal_boxes_kernel < N, T ><<< COMPUTE_GRID(space.num_entries), THREADS_PER_BLOCK, 0, stream>>>(space.num_entries, result.childLeft, result.childRight, d_parent, result.boxes, d_visitCount); + KERNEL_CHECK(stream); + + // Cleanup. + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + my_arena.rollback(prev); + + } + + // Intersects two collapsed spaces, where lhs is always instances and rhs is either parent or sources/targets. + // If rhs is sources/targets, we mark the intersected rectangles by where they came from. + // If the intersection is costly, we accelerate with a BVH. 
+ template + template + void GPUMicroOp::construct_input_rectlist(const collapsed_space &lhs, const collapsed_space &rhs, out_t* &d_valid_rects, size_t& out_size, uint32_t* counters, uint32_t* out_offsets, Arena &my_arena, cudaStream_t stream) + { + + CUDA_CHECK(cudaMemsetAsync(counters, 0, (lhs.num_children) * sizeof(uint32_t), stream), stream); + + BVH my_bvh; + bool bvh_valid = rhs.num_children < rhs.num_entries; + if (bvh_valid) { + build_bvh(rhs, my_bvh, my_arena, stream); + } + + // First pass: figure out how many rectangles survive intersection. + if (!bvh_valid) { + intersect_input_rects<<>>(lhs.entries_buffer, rhs.entries_buffer, lhs.offsets, nullptr, rhs.offsets, lhs.num_entries, rhs.num_entries, lhs.num_children, rhs.num_children, counters, nullptr); + } else { + query_input_bvh<<>>(lhs.entries_buffer, lhs.offsets, my_bvh.root, my_bvh.childLeft, my_bvh.childRight, my_bvh.indices, my_bvh.labels, my_bvh.boxes, lhs.num_entries, my_bvh.num_leaves, lhs.num_children, nullptr, counters, nullptr); + } + KERNEL_CHECK(stream); + + + // Prefix sum over instances (small enough to keep on host). + std::vector h_inst_counters(lhs.num_children+1); + h_inst_counters[0] = 0; // prefix sum starts at 0 + CUDA_CHECK(cudaMemcpyAsync(h_inst_counters.data()+1, counters, lhs.num_children * sizeof(uint32_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + for (size_t i = 0; i < lhs.num_children; ++i) { + h_inst_counters[i+1] += h_inst_counters[i]; + } + + out_size = h_inst_counters[lhs.num_children]; + + if (out_size==0) { + return; + } + + //Moving on... + my_arena.flip_parity(); + + // Non-empty rectangles from the intersection. + d_valid_rects = my_arena.alloc(out_size); + + // Where each instance should start writing its rectangles. + CUDA_CHECK(cudaMemcpyAsync(out_offsets, h_inst_counters.data(), (lhs.num_children + 1) * sizeof(uint32_t), cudaMemcpyHostToDevice, stream), stream); + + // Reset counters. 
+ CUDA_CHECK(cudaMemsetAsync(counters, 0, lhs.num_children * sizeof(uint32_t), stream), stream); + + // Second pass: recompute intersection, but this time write to output. + if (!bvh_valid) { + intersect_input_rects<<>>(lhs.entries_buffer, rhs.entries_buffer, lhs.offsets, out_offsets, rhs.offsets, lhs.num_entries, rhs.num_entries, lhs.num_children, rhs.num_children, counters, d_valid_rects); + } else { + query_input_bvh<<>>(lhs.entries_buffer, lhs.offsets, my_bvh.root, my_bvh.childLeft, my_bvh.childRight, my_bvh.indices, my_bvh.labels, my_bvh.boxes, lhs.num_entries, my_bvh.num_leaves, lhs.num_children, out_offsets, counters, d_valid_rects); + } + KERNEL_CHECK(stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + } + + // Prefix sum an array of Rects or RectDescs by volume. + template + template + void GPUMicroOp::volume_prefix_sum(const out_t* d_rects, size_t total_rects, size_t* &d_prefix_rects, size_t& num_pts, Arena &my_arena, cudaStream_t stream) + { + d_prefix_rects = my_arena.alloc(total_rects+1); + CUDA_CHECK(cudaMemsetAsync(d_prefix_rects, 0, sizeof(size_t), stream), stream); + + size_t prev = my_arena.mark(); + + // Build the CUB transform‐iterator. 
+ using VolIter = cub::TransformInputIterator< + size_t, // output type + RectVolumeOp, // functor + const out_t* // underlying input iterator + >; + VolIter d_volumes(d_rects, RectVolumeOp()); + + void* d_temp = nullptr; + size_t rect_temp_bytes = 0; + cub::DeviceScan::InclusiveSum( + /* d_temp_storage */ nullptr, + /* temp_bytes */ rect_temp_bytes, + /* d_in */ d_volumes, + /* d_out */ d_prefix_rects + 1, // shift by one so prefix[1]..prefix[n] + /* num_items */ total_rects, stream); + + d_temp = reinterpret_cast(my_arena.alloc(rect_temp_bytes)); + cub::DeviceScan::InclusiveSum( + /* d_temp_storage */ d_temp, + /* temp_bytes */ rect_temp_bytes, + /* d_in */ d_volumes, + /* d_out */ d_prefix_rects + 1, + /* num_items */ total_rects, stream); + + + //Number of points across all rectangles (also our total output count). + CUDA_CHECK(cudaMemcpyAsync(&num_pts, &d_prefix_rects[total_rects], sizeof(size_t), cudaMemcpyDeviceToHost, stream), stream); + + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + my_arena.rollback(prev); + } + + template + struct SegmentedMax { + __device__ __forceinline__ + HiFlag operator()(HiFlag a, HiFlag b) const { + // if b.head==1, start new segment at b; otherwise merge with running max + return b.head + ? b + : HiFlag{ a.hi > b.hi ? a.hi : b.hi , a.head }; + } + }; + + struct SegmentedSum { + __device__ __forceinline__ + DeltaFlag operator()(DeltaFlag a, DeltaFlag b) const { + // if b.head==1, start new segment at b; otherwise merge with running sum + return b.head + ? b + : DeltaFlag{ a.delta + b.delta , a.head }; + } + }; + + struct CustomSum + { + template + __device__ __forceinline__ + T operator()(const T &a, const T &b) const { + return b+a; + } + }; + + + /* + * Input: An array of rectangles (potentially overlapping) with associated + * src indices, where all the rectangles with a given src idx together represent an exact covering + * of the partitioning output for that index. 
+ * Output: A disjoint, coalesced array of rectangles sorted by src idx that it then sends off + * to the send output function, which constructs the final sparsity map. + * Approach: The difficult part is constructing a disjoint covering. To do so, collect all the corners from all the + * rectangles as the unique "boundaries" for each dimension and mark them with the parity for the number of dimensions + * in which they are the hi+1 coord (we add 1 to make intervals half-open). This means that if you prefix sum in each dimension, + * for any given rectangle anything internal will sum to 1, and anything external will sum to 0. To understand the intuition, + * see the illustration below for the rectangle [(0,0), (2,2)] + * Corners: (0,0), (0,3), (3,0), (3,3) + * Parities: 0 hi-> +1, 1 hi -> -1, 1 hi -> -1, 2 hi -> +1 + * Computation: + * Initial Markings + * 0 1 2 3 4 ... + * 0 +1 -1 + * 1 + * 2 + * 3 -1 +1 + * 4 + * ... + * Prefix sum by Y + * 0 1 2 3 4 ... + * 0 +1 -1 + * 1 1 -1 + * 2 1 -1 + * 3 0 0 + * 4 0 0 + * ... + * Prefix sum by X + * 0 1 2 3 4 ... + * 0 +1 1 1 0 0 ... + * 1 1 1 1 0 0 ... + * 2 1 1 1 0 0 ... + * 3 0 0 0 0 0 ... + * 4 0 0 0 0 0 ... + * ... + * Note that all the points in the rectangle end up labeled 1, and all the points outside labeled 0. In the actual computation, we use segments + * rather than points, where a segment accounts for all points between two consecutive boundaries. Because a prefix sum is a linear operator, when + * we extend the computation above to multiple overlapping rectangles, you end up with included segments labeled with a count of how many rectangles include them, + * and excluded segments labeled with 0. Thus, for the last dimension, we emit all segments with sums > 0 as disjoint output rectangles. We can then dump these + * into the sort + coalesce pipeline. 
+ */ + template + template + void GPUMicroOp::complete_rect_pipeline(RectDesc* d_rects, size_t total_rects, RectDesc* &d_out_rects, size_t &out_rects, Arena &my_arena, const Container& ctr, IndexFn getIndex, MapFn getMap) + { + + //1D case is much simpler + if (N==1) { + this->complete1d_pipeline(d_rects, total_rects, d_out_rects, out_rects, my_arena, ctr, getIndex, getMap); + return; + } + NVTX_DEPPART(complete_rect_pipeline); + cudaStream_t stream = Cuda::get_task_cuda_stream(); + + Memory my_mem; + bool found = find_memory(my_mem, Memory::GPU_FB_MEM); + assert(found); + + RegionInstance srcs_instance = this->realm_malloc(4*total_rects*sizeof(int32_t), my_mem); + RegionInstance crds_instance = this->realm_malloc(4*total_rects*sizeof(T), my_mem); + RegionInstance heads_instance = this->realm_malloc(2*total_rects * sizeof(uint8_t), my_mem); + RegionInstance sum_instance = this->realm_malloc(2*total_rects * sizeof(size_t), my_mem); + + RegionInstance B_src_inst[N]; + RegionInstance B_coord_inst[N]; + + size_t *B_starts[N]; + size_t *B_ends[N]; + + T* B_coord[N]; + size_t B_size[N]; + + RegionInstance B_ptrs_instance = this->realm_malloc(2 * N * sizeof(size_t*), my_mem); + size_t** B_start_ptrs = reinterpret_cast(AffineAccessor(B_ptrs_instance, 0).base); + size_t** B_end_ptrs = reinterpret_cast(AffineAccessor(B_ptrs_instance, 0).base) + N; + + RegionInstance B_coord_ptrs_instance = this->realm_malloc(N * sizeof(T*), my_mem); + T** B_coord_ptrs = reinterpret_cast(AffineAccessor(B_coord_ptrs_instance, 0).base); + + int threads_per_block = 256; + size_t grid_size = (total_rects + threads_per_block - 1) / threads_per_block; + + RegionInstance tmp_instance; + size_t orig_tmp = 0; + void *tmp_storage = nullptr; + + //Our first step is to find all the unique "boundaries" in each dimension (lo coord or hi+1 coord) + { + NVTX_DEPPART(mark_endpoints); + for (int d = 0; d < N; ++d) { + + //We need the coordinates to be sorted by our curent dim and separated by src idx + 
grid_size = (total_rects + threads_per_block - 1) / threads_per_block; + uint32_t* d_srcs_in = reinterpret_cast(AffineAccessor(srcs_instance, 0).base); + uint32_t* d_srcs_out = reinterpret_cast(AffineAccessor(srcs_instance, 0).base) + 2* total_rects; + T* d_coord_keys_in = reinterpret_cast(AffineAccessor(crds_instance,0).base); + T* d_coord_keys_out = reinterpret_cast(AffineAccessor(crds_instance,0).base) + 2 * total_rects; + mark_endpoints<<>>(d_rects, total_rects, d, d_srcs_in, d_coord_keys_in); + KERNEL_CHECK(stream); + size_t temp_bytes; + cub::DeviceRadixSort::SortPairs(nullptr, temp_bytes, + d_coord_keys_in, d_coord_keys_out, + d_srcs_in, d_srcs_out, + 2 * total_rects, 0, 8*sizeof(T), stream); + if (temp_bytes > orig_tmp) { + if (orig_tmp > 0) { + tmp_instance.destroy(); + } + tmp_instance = this->realm_malloc(temp_bytes, my_mem); + orig_tmp = temp_bytes; + tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); + } + cub::DeviceRadixSort::SortPairs(tmp_storage, temp_bytes, + d_coord_keys_in, d_coord_keys_out, + d_srcs_in, d_srcs_out, + 2 * total_rects, 0, 8*sizeof(T), stream); + std::swap(d_srcs_in, d_srcs_out); + std::swap(d_coord_keys_in, d_coord_keys_out); + cub::DeviceRadixSort::SortPairs(nullptr, temp_bytes, + d_srcs_in, d_srcs_out, + d_coord_keys_in, d_coord_keys_out, + 2 * total_rects, 0, 8*sizeof(uint32_t), stream); + if (temp_bytes > orig_tmp) { + if (orig_tmp > 0) { + tmp_instance.destroy(); + } + tmp_instance = this->realm_malloc(temp_bytes, my_mem); + orig_tmp = temp_bytes; + tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); + } + cub::DeviceRadixSort::SortPairs(tmp_storage, temp_bytes, + d_srcs_in, d_srcs_out, + d_coord_keys_in, d_coord_keys_out, + 2 * total_rects, 0, 8*sizeof(uint32_t), stream); + + //Now mark the unique keys + grid_size = (2*total_rects + threads_per_block - 1) / threads_per_block; + uint8_t * d_heads = reinterpret_cast(AffineAccessor(heads_instance, 0).base); + size_t *d_output = 
reinterpret_cast(AffineAccessor(sum_instance, 0).base); + mark_heads<<>>(d_srcs_out, d_coord_keys_out, 2 * total_rects, d_heads); + KERNEL_CHECK(stream); + + cub::DeviceScan::ExclusiveSum(nullptr, temp_bytes, d_heads, d_output, 2 * total_rects, stream); + if (temp_bytes > orig_tmp) { + if (orig_tmp > 0) { + tmp_instance.destroy(); + } + tmp_instance = this->realm_malloc(temp_bytes, my_mem); + orig_tmp = temp_bytes; + tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); + } + cub::DeviceScan::ExclusiveSum(tmp_storage, temp_bytes, d_heads, d_output, 2 * total_rects, stream); + + size_t num_unique; + uint8_t last_bit; + CUDA_CHECK(cudaMemcpyAsync(&num_unique, &d_output[2*total_rects-1], sizeof(size_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaMemcpyAsync(&last_bit, &d_heads[2*total_rects-1], sizeof(uint8_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + num_unique += last_bit; + + //Collect all the data we'll need later for this dimension - starts/ends by src, unique boundaries, unique boundaries count + B_coord_inst[d] = this->realm_malloc(num_unique * sizeof(T), my_mem); + B_src_inst[d] = this->realm_malloc(2*ctr.size() * sizeof(size_t), my_mem); + B_starts[d] = reinterpret_cast(AffineAccessor(B_src_inst[d], 0).base); + B_ends[d] = reinterpret_cast(AffineAccessor(B_src_inst[d], 0).base) + ctr.size(); + B_coord[d] = reinterpret_cast(AffineAccessor(B_coord_inst[d], 0).base); + B_size[d] = num_unique; + CUDA_CHECK(cudaMemsetAsync(B_starts[d], 0, ctr.size() * sizeof(size_t), stream), stream); + CUDA_CHECK(cudaMemsetAsync(B_ends[d], 0, ctr.size() * sizeof(size_t), stream), stream); + scatter_unique<<>>(d_srcs_out, d_coord_keys_out, d_output, d_heads, 2 * total_rects, B_starts[d], B_ends[d], B_coord[d]); + KERNEL_CHECK(stream); + std::vector d_starts_host(ctr.size()), d_ends_host(ctr.size()); + CUDA_CHECK(cudaMemcpyAsync(d_starts_host.data(), B_starts[d], ctr.size() * sizeof(size_t), 
cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaMemcpyAsync(d_ends_host.data(), B_ends[d], ctr.size() * sizeof(size_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + for (size_t i = 1; i < ctr.size(); i++) { + if (d_starts_host[i] < d_ends_host[i-1]) { + d_starts_host[i] = d_ends_host[i-1]; + d_ends_host[i] = d_ends_host[i-1]; + } + } + CUDA_CHECK(cudaMemcpyAsync(B_starts[d], d_starts_host.data(), ctr.size() * sizeof(size_t), cudaMemcpyHostToDevice, stream), stream); + CUDA_CHECK(cudaMemcpyAsync(B_ends[d], d_ends_host.data(), ctr.size() * sizeof(size_t), cudaMemcpyHostToDevice, stream), stream); + } + + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + } + srcs_instance.destroy(); + crds_instance.destroy(); + heads_instance.destroy(); + sum_instance.destroy(); + + + //We need the arrays themselves on the device + CUDA_CHECK(cudaMemcpyAsync(B_coord_ptrs, B_coord, N * sizeof(T*), cudaMemcpyHostToDevice, stream), stream); + CUDA_CHECK(cudaMemcpyAsync(B_start_ptrs, B_starts, N * sizeof(size_t*), cudaMemcpyHostToDevice, stream), stream); + CUDA_CHECK(cudaMemcpyAsync(B_end_ptrs, B_ends, N * sizeof(size_t*), cudaMemcpyHostToDevice, stream), stream); + + //Next up, we generate all the corners of all the rectangles and mark them by parity + size_t num_corners = (1 << N); + RegionInstance corners_instance = this->realm_malloc(2 * num_corners * total_rects * sizeof(CornerDesc), my_mem); + CornerDesc* d_corners_in = reinterpret_cast*>(AffineAccessor(corners_instance, 0).base); + CornerDesc* d_corners_out = reinterpret_cast*>(AffineAccessor(corners_instance, 0).base) + num_corners * total_rects; + + populate_corners<<>>(d_rects, total_rects, d_corners_in); + KERNEL_CHECK(stream); + + + // We have a LOT of bookkeeping to do + std::set RLE_alloc_events; + + size_t alloc_size_1 = std::max({sizeof(size_t), sizeof(T), sizeof(int32_t), sizeof(DeltaFlag)}); + + RegionInstance shared_instance = this->realm_malloc(2 * 
num_corners * total_rects * alloc_size_1, my_mem); + + RegionInstance flags_instance = this->realm_malloc(num_corners * total_rects * sizeof(uint8_t), my_mem); + + RegionInstance exc_sum_instance = this->realm_malloc(num_corners * total_rects * sizeof(size_t), my_mem); + + size_t per_elem_size = 2*alloc_size_1 + sizeof(uint8_t) + sizeof(size_t); + + size_t* d_src_keys_in = reinterpret_cast(AffineAccessor(shared_instance, 0).base); + size_t* d_src_keys_out = reinterpret_cast(AffineAccessor(shared_instance, 0).base) + num_corners * total_rects; + T* d_coord_keys_in = reinterpret_cast(AffineAccessor(shared_instance, 0).base); + T* d_coord_keys_out = reinterpret_cast(AffineAccessor(shared_instance, 0).base) + num_corners * total_rects; + int32_t* d_deltas = reinterpret_cast(AffineAccessor(shared_instance, 0).base); + int32_t* d_deltas_out = reinterpret_cast(AffineAccessor(shared_instance, 0).base) + num_corners * total_rects; + DeltaFlag* d_delta_flags_in = reinterpret_cast(AffineAccessor(shared_instance, 0).base); + DeltaFlag* d_delta_flags_out = reinterpret_cast(AffineAccessor(shared_instance, 0).base) + num_corners * total_rects; + uint8_t* d_flags = reinterpret_cast(AffineAccessor(flags_instance, 0).base); + size_t* d_exc_sum = reinterpret_cast(AffineAccessor(exc_sum_instance, 0).base); + + RegionInstance seg_bound_instance; + size_t* seg_starts; + size_t* seg_ends; + + RegionInstance seg_counters; + uint32_t* d_seg_counters; + + RegionInstance seg_counters_out; + uint32_t* d_seg_counters_out; + + grid_size = (num_corners * total_rects + threads_per_block - 1) / threads_per_block; + + //We need to reduce duplicate corners by their parity, so we sort to get duplicates next to each other and then reduce by key + { + NVTX_DEPPART(sort_corners); + for (int dim = 0; dim < N; dim++) { + build_coord_key<<>>(d_coord_keys_in, d_corners_in, num_corners * total_rects, dim); + KERNEL_CHECK(stream); + size_t temp_bytes; + cub::DeviceRadixSort::SortPairs(nullptr, temp_bytes, + 
d_coord_keys_in, d_coord_keys_out, + d_corners_in, d_corners_out, + num_corners * total_rects, 0, 8*sizeof(T), stream); + if (temp_bytes > orig_tmp) { + tmp_instance.destroy(); + tmp_instance = this->realm_malloc(temp_bytes, my_mem); + orig_tmp = temp_bytes; + tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); + } + cub::DeviceRadixSort::SortPairs(tmp_storage, temp_bytes, + d_coord_keys_in, d_coord_keys_out, + d_corners_in, d_corners_out, + num_corners * total_rects, 0, 8*sizeof(T), stream); + + std::swap(d_corners_in, d_corners_out); + + } + } + + size_t temp_bytes; + build_src_key<<>>(d_src_keys_in, d_corners_in, num_corners * total_rects); + KERNEL_CHECK(stream); + cub::DeviceRadixSort::SortPairs(nullptr, temp_bytes, + d_src_keys_in, d_src_keys_out, + d_corners_in, d_corners_out, + num_corners * total_rects, 0, 8*sizeof(size_t), stream); + if (temp_bytes > orig_tmp) { + tmp_instance.destroy(); + tmp_instance = this->realm_malloc(temp_bytes, my_mem); + orig_tmp = temp_bytes; + tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); + } + cub::DeviceRadixSort::SortPairs(tmp_storage, temp_bytes, + d_src_keys_in, d_src_keys_out, + d_corners_in, d_corners_out, + num_corners * total_rects, 0, 8*sizeof(size_t), stream); + + std::swap(d_corners_in, d_corners_out); + get_delta<<>>(d_deltas, d_corners_in, num_corners * total_rects); + KERNEL_CHECK(stream); + + RegionInstance num_runs_instance = this->realm_malloc(sizeof(int), my_mem); + int* d_num_runs = reinterpret_cast(AffineAccessor(num_runs_instance, 0).base); + + //See above, we have custom equality and reduction operators for CornerDesc + CustomSum red_op; + cub::DeviceReduce::ReduceByKey( + nullptr, temp_bytes, + d_corners_in, d_corners_out, + d_deltas, d_deltas_out, + d_num_runs, + red_op, + /*num_items=*/(int) (num_corners * total_rects), + /*stream=*/stream); + + if (temp_bytes > orig_tmp) { + tmp_instance.destroy(); + tmp_instance = this->realm_malloc(temp_bytes, my_mem); + 
orig_tmp = temp_bytes; + tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); + } + cub::DeviceReduce::ReduceByKey( + tmp_storage, temp_bytes, + d_corners_in, d_corners_out, + d_deltas, d_deltas_out, + d_num_runs, + red_op, + /*num_items=*/(int) (num_corners * total_rects), + /*stream=*/stream); + + int num_unique_corners; + CUDA_CHECK(cudaMemcpyAsync(&num_unique_corners, d_num_runs, sizeof(int), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + num_runs_instance.destroy(); + + grid_size = (num_unique_corners + threads_per_block - 1) / threads_per_block; + set_delta<<>>(d_deltas_out, d_corners_out, num_unique_corners); + KERNEL_CHECK(stream); + + std::swap(d_corners_out, d_corners_in); + + size_t num_intermediate = num_unique_corners; + size_t num_segments; + + //This is where the real work is done. In each dimension, we do a segmented prefix sum of the parity markings keyed on (src idx, {every dim but d}) for all active segments. + // Then, for each unique boundary b in dim d, for each segment s keyed on (src idx, {every dim but d}), we evaluate s's prefix sum value at b. If nonzero, we emit a segment + // for s between b and the next boundary in d with all the other coords set to s's coords. These become the active segments for the next pass. In the last pass (d = 0), rather + // than emitting segments, we emit rectangles for all segments with nonzero prefix sums (in fact they must also be nonnegative - recall the model is > 0 for included, 0 for excluded + // by the end). + { + NVTX_DEPPART(collapse_higher_dims); + for (int d = N-1; d >= 0; d--) { + grid_size = (num_intermediate + threads_per_block - 1) / threads_per_block; + + //Our least significant sort is by d. 
+ build_coord_key<<>>(d_coord_keys_in, d_corners_in, num_intermediate, d); + KERNEL_CHECK(stream); + size_t temp_bytes; + cub::DeviceRadixSort::SortPairs(nullptr, temp_bytes, + d_coord_keys_in, d_coord_keys_out, + d_corners_in, d_corners_out, + num_intermediate, 0, 8*sizeof(T), stream); + if (temp_bytes > orig_tmp) { + tmp_instance.destroy(); + tmp_instance = this->realm_malloc(temp_bytes, my_mem); + orig_tmp = temp_bytes; + tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); + } + cub::DeviceRadixSort::SortPairs(tmp_storage, temp_bytes, + d_coord_keys_in, d_coord_keys_out, + d_corners_in, d_corners_out, + num_intermediate, 0, 8*sizeof(T), stream); + + std::swap(d_corners_in, d_corners_out); + + //We need to key segments on every dimension but d and src idx, so we do a series of stable sorts to get there + for (int dim = 0; dim < N; dim++) { + if (dim == d) { + continue; + } + build_coord_key<<>>(d_coord_keys_in, d_corners_in, num_intermediate, dim); + KERNEL_CHECK(stream); + size_t temp_bytes; + cub::DeviceRadixSort::SortPairs(nullptr, temp_bytes, + d_coord_keys_in, d_coord_keys_out, + d_corners_in, d_corners_out, + num_intermediate, 0, 8*sizeof(T), stream); + if (temp_bytes > orig_tmp) { + tmp_instance.destroy(); + tmp_instance = this->realm_malloc(temp_bytes, my_mem); + orig_tmp = temp_bytes; + tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); + } + cub::DeviceRadixSort::SortPairs(tmp_storage, temp_bytes, + d_coord_keys_in, d_coord_keys_out, + d_corners_in, d_corners_out, + num_intermediate, 0, 8*sizeof(T), stream); + + std::swap(d_corners_in, d_corners_out); + + } + + build_src_key<<>>(d_src_keys_in, d_corners_in, num_intermediate); + KERNEL_CHECK(stream); + cub::DeviceRadixSort::SortPairs(nullptr, temp_bytes, + d_src_keys_in, d_src_keys_out, + d_corners_in, d_corners_out, + num_intermediate, 0, 8*sizeof(size_t), stream); + if (temp_bytes > orig_tmp) { + tmp_instance.destroy(); + tmp_instance = 
this->realm_malloc(temp_bytes, my_mem); + orig_tmp = temp_bytes; + tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); + } + cub::DeviceRadixSort::SortPairs(tmp_storage, temp_bytes, + d_src_keys_in, d_src_keys_out, + d_corners_in, d_corners_out, + num_intermediate, 0, 8*sizeof(size_t), stream); + + std::swap(d_corners_in, d_corners_out); + + //This serves 2 purposes + // 1) Our segmented prefix sum needs to know where to start and stop + // 2) We need to know how many unique segments (keyed on (src_idx, {every dimension but d}) we have + mark_deltas_heads<<>>(d_corners_in, num_intermediate, d, d_flags, d_delta_flags_in); + KERNEL_CHECK(stream); + + cub::DeviceScan::InclusiveSum(nullptr, temp_bytes, d_flags, d_exc_sum, num_intermediate, stream); + if (temp_bytes > orig_tmp) { + if (orig_tmp > 0) { + tmp_instance.destroy(); + } + tmp_instance = this->realm_malloc(temp_bytes, my_mem); + orig_tmp = temp_bytes; + tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); + } + cub::DeviceScan::InclusiveSum(tmp_storage, temp_bytes, d_flags, d_exc_sum, num_intermediate, stream); + + CUDA_CHECK(cudaMemcpyAsync(&num_segments, &d_exc_sum[num_intermediate-1], sizeof(size_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + //Mark the beginning and end of each segment for our kernel to use in binary search + seg_bound_instance = this->realm_malloc(2 * num_segments * sizeof(size_t), my_mem); + seg_starts = reinterpret_cast(AffineAccessor(seg_bound_instance, 0).base); + seg_ends = reinterpret_cast(AffineAccessor(seg_bound_instance, 0).base) + num_segments; + + seg_boundaries<<>>(d_flags, d_exc_sum, num_intermediate, seg_starts, seg_ends); + KERNEL_CHECK(stream); + + //Segmented prefix sum using our flags constructed above + cub::DeviceScan::InclusiveScan( + /*d_temp=*/ nullptr, + /*bytes=*/ temp_bytes, + /*in=*/ d_delta_flags_in, + /*out=*/ d_delta_flags_out, + /*op=*/ SegmentedSum(), + /*num_items=*/ 
num_intermediate, + /*stream=*/ stream + ); + + if (temp_bytes > orig_tmp) { + tmp_instance.destroy(); + tmp_instance = this->realm_malloc(temp_bytes, my_mem); + orig_tmp = temp_bytes; + tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); + } + + cub::DeviceScan::InclusiveScan( + /*d_temp=*/ tmp_storage, + /*bytes=*/ temp_bytes, + /*in=*/ d_delta_flags_in, + /*out=*/ d_delta_flags_out, + /*op=*/ SegmentedSum(), + /*num_items=*/ num_intermediate, + /*stream=*/ stream + ); + + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + //Per usual, we do a count + emit pass to track active segments and limit memory usage. If the evaluated prefix sum for a boundary within a segment + //is 0, we can skip it because it won't contribute anything to future sums and also won't be emitted. + seg_counters = this->realm_malloc(num_segments * sizeof(uint32_t), my_mem); + d_seg_counters = reinterpret_cast(AffineAccessor(seg_counters, 0).base); + CUDA_CHECK(cudaMemsetAsync(d_seg_counters, 0, num_segments * sizeof(uint32_t), stream), stream); + + grid_size = ((num_segments*B_size[d]) + threads_per_block - 1) / threads_per_block; + count_segments<<>>(d_delta_flags_out, seg_starts, seg_ends, B_starts[d], B_ends[d], d_corners_in, B_coord[d], B_size[d], num_segments, d, d_seg_counters); + KERNEL_CHECK(stream); + + seg_counters_out = this->realm_malloc(num_segments * sizeof(uint32_t), my_mem); + d_seg_counters_out = reinterpret_cast(AffineAccessor(seg_counters_out, 0).base); + + cub::DeviceScan::ExclusiveSum(nullptr, temp_bytes, d_seg_counters, d_seg_counters_out, num_segments, stream); + if (temp_bytes > orig_tmp) { + if (orig_tmp > 0) { + tmp_instance.destroy(); + } + tmp_instance = this->realm_malloc(temp_bytes, my_mem); + orig_tmp = temp_bytes; + tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); + } + cub::DeviceScan::ExclusiveSum(tmp_storage, temp_bytes, d_seg_counters, d_seg_counters_out, num_segments, stream); + + uint32_t next_round; + uint32_t 
last_count; + CUDA_CHECK(cudaMemcpyAsync(&next_round, &d_seg_counters_out[num_segments-1], sizeof(uint32_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaMemcpyAsync(&last_count, &d_seg_counters[num_segments-1], sizeof(uint32_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + next_round += last_count; + if (out_rects > 0 && (next_round + last_count) * per_elem_size > out_rects) { + shared_instance.destroy(); + flags_instance.destroy(); + exc_sum_instance.destroy(); + seg_bound_instance.destroy(); + seg_counters.destroy(); + seg_counters_out.destroy(); + corners_instance.destroy(); + out_rects = std::numeric_limits::max(); + return; + } + + num_intermediate = next_round; + + //In this case we exit out to emit rectangles rather than segments + if (d==0) { + break; + } + + RegionInstance next_corners_instance = this->realm_malloc(2 * next_round * sizeof(CornerDesc), my_mem); + CornerDesc* d_next_corners = reinterpret_cast*>(AffineAccessor(next_corners_instance, 0).base); + CUDA_CHECK(cudaMemsetAsync(d_seg_counters, 0, num_segments*sizeof(uint32_t), stream), stream); + + write_segments<<>>(d_delta_flags_out, seg_starts, seg_ends, B_starts[d], B_ends[d], d_corners_in, B_coord[d], d_seg_counters_out, B_size[d], num_segments, d, d_seg_counters, d_next_corners); + KERNEL_CHECK(stream); + + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + corners_instance.destroy(); + corners_instance = next_corners_instance; + d_corners_in = d_next_corners; + d_corners_out = d_next_corners + next_round; + + //The segment count in each iter is not monotonic, so we have to realloc each time + + shared_instance.destroy(); + flags_instance.destroy(); + exc_sum_instance.destroy(); + seg_bound_instance.destroy(); + seg_counters.destroy(); + seg_counters_out.destroy(); + + shared_instance = this->realm_malloc(2 * num_intermediate * alloc_size_1, my_mem); + flags_instance = this->realm_malloc(num_intermediate * sizeof(uint8_t), 
my_mem); + exc_sum_instance = this->realm_malloc(num_intermediate * sizeof(size_t), my_mem); + + d_src_keys_in = reinterpret_cast(AffineAccessor(shared_instance, 0).base); + d_src_keys_out = reinterpret_cast(AffineAccessor(shared_instance, 0).base) + num_intermediate; + d_coord_keys_in = reinterpret_cast(AffineAccessor(shared_instance, 0).base); + d_coord_keys_out = reinterpret_cast(AffineAccessor(shared_instance, 0).base) + num_intermediate; + d_deltas = reinterpret_cast(AffineAccessor(shared_instance, 0).base); + d_deltas_out = reinterpret_cast(AffineAccessor(shared_instance, 0).base) + num_intermediate; + + d_flags = reinterpret_cast(AffineAccessor(flags_instance, 0).base); + d_exc_sum = reinterpret_cast(AffineAccessor(exc_sum_instance, 0).base); + d_delta_flags_in = reinterpret_cast(AffineAccessor(shared_instance, 0).base); + d_delta_flags_out = reinterpret_cast(AffineAccessor(shared_instance, 0).base) + num_intermediate; + + } + } + + + //For our last dim, we emit rectangles rather than segments. These rectangles are a disjoint, precise covering of the original set. 
+ RegionInstance rects_out_instance = this->realm_malloc(2 * num_intermediate * sizeof(RectDesc), my_mem); + RectDesc* d_rects_out = reinterpret_cast*>(AffineAccessor(rects_out_instance, 0).base); + RectDesc* d_rects_in = reinterpret_cast*>(AffineAccessor(rects_out_instance, 0).base) + num_intermediate; + CUDA_CHECK(cudaMemsetAsync(d_seg_counters, 0, num_segments*sizeof(uint32_t), stream), stream); + + write_segments<<>>(d_delta_flags_out, seg_starts, seg_ends, B_start_ptrs, B_end_ptrs, d_corners_in, B_coord_ptrs, d_seg_counters_out, B_size[0], num_segments, d_seg_counters, d_rects_out); + KERNEL_CHECK(stream); + + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + //Don't need these anymore + flags_instance.destroy(); + exc_sum_instance.destroy(); + seg_bound_instance.destroy(); + seg_counters.destroy(); + seg_counters_out.destroy(); + corners_instance.destroy(); + for (int d = 0; d < N; d++) { + B_coord_inst[d].destroy(); + B_src_inst[d].destroy(); + } + B_ptrs_instance.destroy(); + B_coord_ptrs_instance.destroy(); + + std::swap(d_rects_out, d_rects_in); + + shared_instance.destroy(); + size_t alloc_size_2 = max(sizeof(size_t), sizeof(T)); + + shared_instance = this->realm_malloc(2 * num_intermediate * alloc_size_2, my_mem); + + d_src_keys_in = reinterpret_cast(AffineAccessor(shared_instance, 0).base); + d_src_keys_out = reinterpret_cast(AffineAccessor(shared_instance, 0).base) + num_intermediate; + d_coord_keys_in = reinterpret_cast(AffineAccessor(shared_instance, 0).base); + d_coord_keys_out = reinterpret_cast(AffineAccessor(shared_instance, 0).base) + num_intermediate; + + RegionInstance break_points_instance = this->realm_malloc(num_intermediate * sizeof(uint8_t), my_mem); + uint8_t* break_points = reinterpret_cast(AffineAccessor(break_points_instance, 0).base); + + size_t* group_ids = reinterpret_cast(AffineAccessor(shared_instance, 0).base); + + //Now that we have disjoint rectangles, we can do our usual sort and coalesce pass + size_t last = INT_MAX; + 
{ + NVTX_DEPPART(compact_disjoint_rects); + while (last > num_intermediate) { + last = num_intermediate; + + bool done = false; + for (int dim = 1; !done; dim++) { + if (dim == N) { + dim = 0; // wrap around to 0 + done = true; + } + grid_size = (num_intermediate + threads_per_block - 1) / threads_per_block; + + build_lo_key<<>>(d_coord_keys_in, d_rects_in, num_intermediate, dim); + KERNEL_CHECK(stream); + size_t temp_bytes; + cub::DeviceRadixSort::SortPairs(nullptr, temp_bytes, + d_coord_keys_in, d_coord_keys_out, + d_rects_in, d_rects_out, + num_intermediate, 0, 8*sizeof(T), stream); + if (temp_bytes > orig_tmp) { + tmp_instance.destroy(); + tmp_instance = this->realm_malloc(temp_bytes, my_mem); + orig_tmp = temp_bytes; + tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); + } + cub::DeviceRadixSort::SortPairs(tmp_storage, temp_bytes, + d_coord_keys_in, d_coord_keys_out, + d_rects_in, d_rects_out, + num_intermediate, 0, 8*sizeof(T), stream); + + std::swap(d_rects_in, d_rects_out); + for (int d = 0; d < N; d++) { + if (d == dim) { + continue; + } + build_hi_key<<>>(d_coord_keys_in, d_rects_in, num_intermediate, d); + KERNEL_CHECK(stream); + size_t temp_bytes; + cub::DeviceRadixSort::SortPairs(nullptr, temp_bytes, + d_coord_keys_in, d_coord_keys_out, + d_rects_in, d_rects_out, + num_intermediate, 0, 8*sizeof(T), stream); + if (temp_bytes > orig_tmp) { + tmp_instance.destroy(); + tmp_instance = this->realm_malloc(temp_bytes, my_mem); + orig_tmp = temp_bytes; + tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); + } + cub::DeviceRadixSort::SortPairs(tmp_storage, temp_bytes, + d_coord_keys_in, d_coord_keys_out, + d_rects_in, d_rects_out, + num_intermediate, 0, 8*sizeof(T), stream); + + std::swap(d_rects_in, d_rects_out); + build_lo_key<<>>(d_coord_keys_in, d_rects_in, num_intermediate, d); + KERNEL_CHECK(stream); + cub::DeviceRadixSort::SortPairs(nullptr, temp_bytes, + d_coord_keys_in, d_coord_keys_out, + d_rects_in, d_rects_out, + 
num_intermediate, 0, 8*sizeof(T), stream); + if (temp_bytes > orig_tmp) { + tmp_instance.destroy(); + tmp_instance = this->realm_malloc(temp_bytes, my_mem); + orig_tmp = temp_bytes; + tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); + } + cub::DeviceRadixSort::SortPairs(tmp_storage, temp_bytes, + d_coord_keys_in, d_coord_keys_out, + d_rects_in, d_rects_out, + num_intermediate, 0, 8*sizeof(T), stream); + + std::swap(d_rects_in, d_rects_out); + + } + + build_src_key<<>>(d_src_keys_in, d_rects_in, num_intermediate); + KERNEL_CHECK(stream); + cub::DeviceRadixSort::SortPairs(nullptr, temp_bytes, + d_src_keys_in, d_src_keys_out, + d_rects_in, d_rects_out, + num_intermediate, 0, 8*sizeof(size_t), stream); + if (temp_bytes > orig_tmp) { + tmp_instance.destroy(); + tmp_instance = this->realm_malloc(temp_bytes, my_mem); + orig_tmp = temp_bytes; + tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); + } + cub::DeviceRadixSort::SortPairs(tmp_storage, temp_bytes, + d_src_keys_in, d_src_keys_out, + d_rects_in, d_rects_out, + num_intermediate, 0, 8*sizeof(size_t), stream); + + std::swap(d_rects_in, d_rects_out); + + mark_breaks_dim<<>>(d_rects_in, break_points, num_intermediate, dim); + KERNEL_CHECK(stream); + + cub::DeviceScan::InclusiveSum(nullptr, temp_bytes, break_points, group_ids, num_intermediate, stream); + + if (temp_bytes > orig_tmp) { + tmp_instance.destroy(); + tmp_instance = this->realm_malloc(temp_bytes, my_mem); + orig_tmp = temp_bytes; + tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); + } + + cub::DeviceScan::InclusiveSum(tmp_storage, temp_bytes, break_points, group_ids, num_intermediate, stream); + + size_t last_grp; + CUDA_CHECK(cudaMemcpyAsync(&last_grp, &group_ids[num_intermediate-1], sizeof(size_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + init_rects_dim<<>>(d_rects_in, break_points, group_ids, d_rects_out, num_intermediate, dim); + 
KERNEL_CHECK(stream); + + num_intermediate = last_grp; + std::swap(d_rects_in, d_rects_out); + } + } + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + } + + heads_instance.destroy(); + shared_instance.destroy(); + tmp_instance.destroy(); + + //And... we're done + if (out_rects > 0) { + d_out_rects = d_rects_in; + out_rects = num_intermediate; + } else { + this->send_output(d_rects_in, num_intermediate, my_arena, ctr, getIndex, getMap); + rects_out_instance.destroy(); + } + + } + + /* + * Input: An array of 1D rectangles (potentially overlapping) with associated + * src indices, where all the rectangles with a given src idx together represent an exact covering + * of the partitioning output for that index. + * Output: A disjoint, coalesced array of rectangles sorted by src idx that it then sends off + * to the send output function, which constructs the final sparsity map. + * Approach: The canonical 1D rectangle merge, in parallel. Sort the rectangles by (src_idx, lo). Then + * prefix max by hi segmented by src_idx to find overlapping rectangles. Then, RLE by starting a new rectangle + * when in a new src or lo > current max hi and merging otherwise. 
+ */ + template + template + void GPUMicroOp::complete1d_pipeline(RectDesc* d_rects, size_t total_rects, RectDesc* &d_out_rects, size_t &out_rects, Arena &my_arena, const Container& ctr, IndexFn getIndex, MapFn getMap) + { + + NVTX_DEPPART(complete1d_pipeline); + cudaStream_t stream = Cuda::get_task_cuda_stream(); + + RectDesc* d_rects_in = d_rects; + + size_t bytes_T = total_rects * sizeof(T); + size_t bytes_S = total_rects * sizeof(size_t); + size_t bytes_HF = total_rects * sizeof(HiFlag); + size_t max_bytes = std::max({bytes_T, bytes_HF, bytes_S}); + + char* aux_ptr = my_arena.alloc(2 * max_bytes); + + uint8_t* break_points = my_arena.alloc(total_rects); + size_t* group_ids = my_arena.alloc(total_rects); + + T* d_keys_in = reinterpret_cast(aux_ptr); + T* d_keys_out = reinterpret_cast(aux_ptr + max_bytes); + + size_t* d_src_keys_in = reinterpret_cast(aux_ptr); + size_t* d_src_keys_out = reinterpret_cast(aux_ptr + max_bytes); + + HiFlag* d_hi_flags_in = reinterpret_cast*>(aux_ptr); + HiFlag* d_hi_flags_out = reinterpret_cast*>(aux_ptr + max_bytes); + + size_t num_intermediate = total_rects; + + const size_t prev = my_arena.mark(); + RectDesc* d_rects_out = my_arena.alloc>(total_rects); + + size_t t1=0, t2 = 0, t3 = 0, t4 = 0; + cub::DeviceRadixSort::SortPairs(nullptr, t1, + d_keys_in, d_keys_out, d_rects_in, d_rects_out, num_intermediate, + 0, 8*sizeof(T), stream); + // exclusive scan + cub::DeviceScan::ExclusiveScan(nullptr, t2, + d_hi_flags_in, d_hi_flags_out, + SegmentedMax(), HiFlag{std::numeric_limits::min(), 0}, + num_intermediate, stream); + // inclusive sum + cub::DeviceScan::InclusiveSum(nullptr, t3, + break_points, group_ids, + num_intermediate, stream); + + cub::DeviceRadixSort::SortPairs(nullptr, t4, d_src_keys_in, d_src_keys_out, d_rects_in, d_rects_out, num_intermediate, 0, 8*sizeof(size_t), stream); + + size_t temp_bytes = std::max({t1, t2, t3, t4}); + size_t use_bytes = temp_bytes; + void *temp_storage = my_arena.alloc(temp_bytes); + + int 
threads_per_block = 256; + size_t grid_size = (num_intermediate + threads_per_block - 1) / threads_per_block; + + //Sort the rectangles keyed by (src, lo) + { + NVTX_DEPPART(sort_rects); + + build_lo_key<<>>(d_keys_in, d_rects_in, num_intermediate, 0); + KERNEL_CHECK(stream); + cub::DeviceRadixSort::SortPairs(temp_storage, use_bytes, d_keys_in, d_keys_out, d_rects_in, d_rects_out, num_intermediate, 0, 8*sizeof(T), stream); + std::swap(d_rects_in, d_rects_out); + + build_src_key<<>>(d_src_keys_in, d_rects_in, num_intermediate); + KERNEL_CHECK(stream); + + use_bytes = temp_bytes; + cub::DeviceRadixSort::SortPairs(temp_storage, use_bytes, d_src_keys_in, d_src_keys_out, d_rects_in, d_rects_out, num_intermediate, 0, 8*sizeof(size_t), stream); + std::swap(d_rects_in, d_rects_out); + } + + //Prefix max by hi segmented by src, then RLE to merge. + { + NVTX_DEPPART(run_length_encode); + build_hi_flag<<>>(d_hi_flags_in, d_rects_in, num_intermediate, 0); + KERNEL_CHECK(stream); + + + use_bytes = temp_bytes; + cub::DeviceScan::ExclusiveScan( + /*d_temp=*/ temp_storage, + /*bytes=*/ use_bytes, + /*in=*/ d_hi_flags_in, + /*out=*/ d_hi_flags_out, + /*op=*/ SegmentedMax(), + HiFlag{std::numeric_limits::min(), 0}, + /*num_items=*/ num_intermediate, + /*stream=*/ stream + ); + + threads_per_block = 256; + grid_size = (num_intermediate + threads_per_block - 1) / threads_per_block; + mark_breaks_dim<<>>(d_hi_flags_in, d_hi_flags_out, d_rects_in, break_points, num_intermediate, 0); + KERNEL_CHECK(stream); + use_bytes = temp_bytes; + cub::DeviceScan::InclusiveSum(temp_storage, use_bytes, break_points, group_ids, num_intermediate, stream); + + size_t last_grp; + CUDA_CHECK(cudaMemcpyAsync(&last_grp, &group_ids[num_intermediate-1], sizeof(size_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + my_arena.rollback(prev); + my_arena.flip_parity(); + assert(my_arena.get_parity()); + my_arena.reset(true); + d_rects_out = 
my_arena.alloc>(last_grp); + my_arena.commit(true); + + init_rects_dim<<>>(d_rects_in, d_hi_flags_out, break_points, group_ids, d_rects_out, num_intermediate, 0); + KERNEL_CHECK(stream); + + num_intermediate = last_grp; + std::swap(d_rects_in, d_rects_out); + + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + } + + if (out_rects > 0) { + d_out_rects = d_rects_in; + out_rects = num_intermediate; + } else { + this->send_output(d_rects_in, num_intermediate, my_arena, ctr, getIndex, getMap); + } + } + + /* + * Input: An array of points (potentially with duplicates) with associated + * src indices, where all the points with a given src idx together represent an exact covering + * of the partitioning output for that index. + * Output: A disjoint, coalesced array of rectangles sorted by src idx that it then sends off + * to the send output function, which constructs the final sparsity map. + * Approach: Sort the points by (x0,x1,...,xN-1,src) (right is MSB). Convert them to singleton rects. + * Run-length encode along each dimension (N-1...0). 
+ */ + template + template + void GPUMicroOp::complete_pipeline(PointDesc* d_points, size_t total_pts, RectDesc* &d_out_rects, size_t &out_rects, Arena &my_arena, const Container& ctr, IndexFn getIndex, MapFn getMap) + { + + NVTX_DEPPART(complete_pipeline); + + size_t prev = my_arena.mark(); + + cudaStream_t stream = Cuda::get_task_cuda_stream(); + + size_t bytes_T = total_pts * sizeof(T); + size_t bytes_S = total_pts * sizeof(size_t); + size_t bytes_R = total_pts * sizeof(RectDesc); + size_t bytes_p = total_pts * sizeof(PointDesc); + size_t max_aux_bytes = std::max({bytes_T, bytes_S, bytes_R}); + size_t max_pg_bytes = std::max({bytes_p, bytes_S}); + + + // Instance shared by coordinate keys, source keys, and rectangle outputs + char* aux_ptr = my_arena.alloc(2 * max_aux_bytes); + + //Instance shared by group ids (RLE) and intermediate points in sorting + char* pg_ptr = my_arena.alloc(max_pg_bytes); + + uint8_t* break_points = my_arena.alloc(total_pts); + + T* d_keys_in = reinterpret_cast(aux_ptr); + T* d_keys_out = reinterpret_cast(aux_ptr + max_aux_bytes); + + PointDesc* d_points_in = d_points; + PointDesc* d_points_out = reinterpret_cast*>(pg_ptr); + + size_t* group_ids = reinterpret_cast(pg_ptr); + + RectDesc* d_rects_in = reinterpret_cast*>(aux_ptr); + RectDesc *d_rects_out = reinterpret_cast*>(aux_ptr + max_aux_bytes); + + size_t* d_src_keys_in = reinterpret_cast(aux_ptr); + size_t* d_src_keys_out = reinterpret_cast(aux_ptr + max_aux_bytes); + + size_t t1=0, t2=0, t3=0; + cub::DeviceRadixSort::SortPairs(nullptr, t1, d_keys_in, d_keys_out, d_points_in, d_points_out, total_pts, 0, 8*sizeof(T), stream); + cub::DeviceRadixSort::SortPairs(nullptr, t2, d_src_keys_in, d_src_keys_out, d_points_in, d_points_out, total_pts, 0, 8*sizeof(size_t), stream); + cub::DeviceScan::InclusiveSum(nullptr, t3, break_points, group_ids, total_pts, stream); + + //Temporary storage instance shared by CUB operations. 
+ size_t temp_bytes = std::max({t1, t2, t3}); + void *temp_storage = my_arena.alloc(temp_bytes); + + + //Sort along each dimension from LSB to MSB (0 to N-1) + size_t use_bytes = temp_bytes; + + { + NVTX_DEPPART(sort_valid_points); + for (int dim = 0; dim < N; ++dim) { + build_coord_key<<>>(d_keys_in, d_points_in, total_pts, dim); + KERNEL_CHECK(stream); + cub::DeviceRadixSort::SortPairs(temp_storage, use_bytes, d_keys_in, d_keys_out, d_points_in, d_points_out, total_pts, 0, 8*sizeof(T), stream); + std::swap(d_keys_in, d_keys_out); + std::swap(d_points_in, d_points_out); + } + + //Sort by source index now to keep individual partitions separate + build_src_key<<>>(d_src_keys_in, d_points_in, total_pts); + KERNEL_CHECK(stream); + use_bytes = temp_bytes; + cub::DeviceRadixSort::SortPairs(temp_storage, use_bytes, d_src_keys_in, d_src_keys_out, d_points_in, d_points_out, total_pts, 0, 8*sizeof(size_t), stream); + } + + + points_to_rects<<>>(d_points_out, d_rects_in, total_pts); + KERNEL_CHECK(stream); + + size_t num_intermediate = total_pts; + + { + NVTX_DEPPART(run_length_encode); + + for (int dim = N-1; dim >= 0; --dim) { + + // Step 1: Mark rectangle starts + // e.g. [1, 2, 4, 5, 6, 8] -> [1, 0, 1, 0, 0, 1] + mark_breaks_dim<<>>(d_rects_in, break_points, num_intermediate, dim); + KERNEL_CHECK(stream); + + // Step 2: Inclusive scan of break points to get group ids + // e.g. 
[1, 0, 1, 0, 0, 1] -> [1, 1, 2, 2, 2, 3] + use_bytes = temp_bytes; + cub::DeviceScan::InclusiveSum(temp_storage, use_bytes, break_points, group_ids, num_intermediate, stream); + + //Determine new number of intermediate rectangles + size_t last_grp; + CUDA_CHECK(cudaMemcpyAsync(&last_grp, &group_ids[num_intermediate-1], sizeof(size_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + //Step 3: Write output rectangles, where rect starts write lo and rect ends write hi + init_rects_dim<<>>(d_rects_in, break_points, group_ids, d_rects_out, num_intermediate, dim); + KERNEL_CHECK(stream); + + num_intermediate = last_grp; + std::swap(d_rects_in, d_rects_out); + } + my_arena.rollback(prev); + d_out_rects = my_arena.alloc>(num_intermediate); + CUDA_CHECK(cudaMemcpyAsync(d_out_rects, d_rects_in, num_intermediate * sizeof(RectDesc), cudaMemcpyDeviceToDevice, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + } + + if (out_rects==1) { + out_rects = num_intermediate; + } else { + this->send_output(d_rects_in, num_intermediate, my_arena, ctr, getIndex, getMap); + my_arena.rollback(prev); + } + } + + /* + * Input: An array of disjoint rectangles sorted by src idx. + * Output: Fills the sparsity output for each src with a host region instance + * containing the entries/approx entries and calls gpu_finalize on the SparsityMapImpl. 
+ * Approach: Segments the rectangles by their src idx and copies them back to the host, + */ + + template + template + void GPUMicroOp::send_output(RectDesc* d_rects, size_t total_rects, Arena &my_arena, const Container& ctr, IndexFn getIndex, MapFn getMap) + { + NVTX_DEPPART(send_output); + + size_t prev = my_arena.mark(); + + cudaStream_t stream = Cuda::get_task_cuda_stream(); + + std::set output_allocs; + + SparsityMapEntry* final_entries = my_arena.alloc>(total_rects); + Rect* final_rects = my_arena.alloc>(total_rects); + + size_t* d_starts = my_arena.alloc(2 * ctr.size()); + size_t* d_ends = d_starts + ctr.size(); + + CUDA_CHECK(cudaMemsetAsync(d_starts, 0, ctr.size()*sizeof(size_t),stream), stream); + CUDA_CHECK(cudaMemsetAsync(d_ends, 0, ctr.size()*sizeof(size_t),stream), stream); + + + //Convert RectDesc to SparsityMapEntry and determine where each src's rectangles start and end. + build_final_output<<>>(d_rects, final_entries, final_rects, d_starts, d_ends, total_rects); + KERNEL_CHECK(stream); + + + //Copy starts and ends back to host and handle empty partitions + std::vector d_starts_host(ctr.size()), d_ends_host(ctr.size()); + CUDA_CHECK(cudaMemcpyAsync(d_starts_host.data(), d_starts, ctr.size() * sizeof(size_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaMemcpyAsync(d_ends_host.data(), d_ends, ctr.size() * sizeof(size_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + for (size_t i = 1; i < ctr.size(); i++) { + if (d_starts_host[i] < d_ends_host[i-1]) { + d_starts_host[i] = d_ends_host[i-1]; + d_ends_host[i] = d_ends_host[i-1]; + } + } + + if (!this->exclusive) { + for (auto const& elem : ctr) { + size_t idx = getIndex(elem); + auto mapOpj = getMap(elem); + SparsityMapImpl *impl = SparsityMapImpl::lookup(mapOpj); + if (d_ends_host[idx] > d_starts_host[idx]) { + size_t end = d_ends_host[idx]; + size_t start = d_starts_host[idx]; + std::vector> h_rects(end - start); + 
CUDA_CHECK(cudaMemcpyAsync(h_rects.data(), final_rects + start, (end - start) * sizeof(Rect), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + impl->contribute_dense_rect_list(h_rects, true); + } else { + impl->contribute_nothing(); + } + } + } else { + Memory sysmem; + assert(find_memory(sysmem, Memory::SYSTEM_MEM)); + + //Use provided lambdas to iterate over sparsity output container (map or vector) + for (auto const& elem : ctr) { + size_t idx = getIndex(elem); + auto mapOpj = getMap(elem); + SparsityMapImpl *impl = SparsityMapImpl::lookup(mapOpj); + if (d_ends_host[idx] > d_starts_host[idx]) { + size_t end = d_ends_host[idx]; + size_t start = d_starts_host[idx]; + RegionInstance entries = this->realm_malloc((end - start) * sizeof(SparsityMapEntry), sysmem); + SparsityMapEntry *h_entries = reinterpret_cast *>(AffineAccessor(entries, 0).base); + CUDA_CHECK(cudaMemcpyAsync(h_entries, final_entries + start, (end - start) * sizeof(SparsityMapEntry), cudaMemcpyDeviceToHost, stream), stream); + + Rect *approx_rects; + size_t num_approx; + if (end - start <= ((size_t) DeppartConfig::cfg_max_rects_in_approximation)) { + approx_rects = final_rects + start; + num_approx = end - start; + } else { + //TODO: Maybe add a better GPU approx here when given more rectangles + //Use CUB to compute a bad approx on the GPU (union of all rectangles) + approx_rects = my_arena.alloc>(1); + num_approx = 1; + void* d_temp = nullptr; + size_t temp_sz = 0; + Rect identity_rect; + for(int d=0; d::max(); + identity_rect.hi[d] = std::numeric_limits::min(); + } + cub::DeviceReduce::Reduce( + d_temp, temp_sz, + final_rects + start, + approx_rects, + (end - start), + UnionRectOp(), + identity_rect, + stream + ); + d_temp = reinterpret_cast(my_arena.alloc(temp_sz)); + cub::DeviceReduce::Reduce( + d_temp, temp_sz, + final_rects + start, + approx_rects, + end - start, + UnionRectOp(), + identity_rect, + stream + ); + 
CUDA_CHECK(cudaStreamSynchronize(stream), stream); + } + RegionInstance approx_entries = this->realm_malloc(num_approx * sizeof(Rect), sysmem); + SparsityMapEntry *h_approx_entries = reinterpret_cast *>(AffineAccessor(approx_entries, 0).base); + CUDA_CHECK(cudaMemcpyAsync(h_approx_entries, approx_rects, num_approx * sizeof(Rect), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + impl->set_instance(entries, end - start); + impl->set_approx_instance(approx_entries, num_approx); + } + } + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + for (auto const& elem : ctr) { + SparsityMapImpl *impl = SparsityMapImpl::lookup(getMap(elem)); + impl->gpu_finalize(); + } + } + my_arena.rollback(prev); + } + + +} \ No newline at end of file diff --git a/src/realm/deppart/partitions_gpu_kernels.hpp b/src/realm/deppart/partitions_gpu_kernels.hpp new file mode 100644 index 0000000000..f3c1dd514e --- /dev/null +++ b/src/realm/deppart/partitions_gpu_kernels.hpp @@ -0,0 +1,811 @@ +#pragma once +#include "realm/deppart/partitions.h" + +namespace Realm { + +template +__device__ __forceinline__ size_t bsearch(const T* arr, size_t len, T val) { + size_t low = 0, high = len; + while (low < high) { + size_t mid = low + ((high - low) >> 1); + if (arr[mid + 1] <= val) + low = mid + 1; + else + high = mid; + } + return low; +} + +template +__global__ void subtract_const( + T* d_data, + size_t num_elems, + T value +) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= num_elems) return; + d_data[idx] = d_data[idx] <= value ? 0 : d_data[idx] - value; +} + +// Intersect all instance rectangles with all parent rectangles in parallel. +// Used for both count and emit depending on whether the output array is null. 
+ +template +__global__ void intersect_input_rects( + const SparsityMapEntry* d_lhs_entries, + const SparsityMapEntry* d_rhs_entries, + const size_t *d_lhs_offsets, + const uint32_t *d_lhs_prefix, + const size_t* d_rhs_offsets, + size_t numLHSRects, + size_t numRHSRects, + size_t numLHSChildren, + size_t numRHSChildren, + uint32_t *d_lhs_counters, + out_t* d_rects +) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= numLHSRects * numRHSRects) return; + size_t idx_x = idx % numRHSRects; + size_t idx_y = idx / numRHSRects; + assert(idx_x < numRHSRects); + assert(idx_y < numLHSRects); + const SparsityMapEntry rhs_entry = d_rhs_entries[idx_x]; + const SparsityMapEntry lhs_entry = d_lhs_entries[idx_y]; + Rect rect_output = lhs_entry.bounds.intersection(rhs_entry.bounds); + if (rect_output.empty()) { + return; + } + size_t lhs_idx = bsearch(d_lhs_offsets, numLHSChildren, idx_y); + uint32_t local = atomicAdd(&d_lhs_counters[lhs_idx], 1); + if (d_rects != nullptr) { + // If d_rects is not null, we write the output rect + uint32_t out_idx = d_lhs_prefix[lhs_idx] + local; + if constexpr (std::is_same_v>) { + d_rects[out_idx].src_idx = bsearch(d_rhs_offsets, numRHSChildren, idx_x); + d_rects[out_idx].rect = rect_output; + } else { + d_rects[out_idx] = rect_output; + } + } +} + +template +__device__ __forceinline__ uint64_t bvh_morton_code(const Rect& rect, + const Rect& globalBounds) { + // bits per axis (floor) + constexpr int bits = 64 / N; + constexpr uint64_t maxQ = (bits == 64 ? 
~0ULL + : (1ULL << bits) - 1); + + uint64_t coords[N]; +#pragma unroll + for(int d = 0; d < N; ++d) { + // 1) compute centroid in dimension d + float center = 0.5f * (float(rect.lo[d]) + float(rect.hi[d]) + 1.0f); + + // 2) normalize into [0,1] using globalBounds + float span = float(globalBounds.hi[d] + 1 - globalBounds.lo[d]); + float norm = (center - float(globalBounds.lo[d])) / span; + + // 3) quantize to [0 … maxQ] + uint64_t q = uint64_t(norm * float(maxQ) + 0.5f); + coords[d] = (q > maxQ ? maxQ : q); + } + + // 4) interleave bits MSB→LSB across all dims + uint64_t code = 0; + for(int b = bits - 1; b >= 0; --b) { +#pragma unroll + for(int d = 0; d < N; ++d) { + code = (code << 1) | ((coords[d] >> b) & 1ULL); + } + } + + return code; +} + +template +__global__ void bvh_build_morton_codes( + const SparsityMapEntry* d_targets_entries, + const size_t* d_offsets_rects, + const Rect* d_global_bounds, + size_t total_rects, + size_t num_targets, + uint64_t* d_morton_codes, + uint64_t* d_indices, + uint64_t* d_targets_indices) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= total_rects) return; + const auto &entry = d_targets_entries[idx]; + d_morton_codes[idx] = bvh_morton_code(entry.bounds, *d_global_bounds); + d_indices[idx] = idx; + if (d_offsets_rects != nullptr) { + d_targets_indices[idx] = bsearch(d_offsets_rects, num_targets, idx); + } +} + + __global__ +void bvh_build_radix_tree_kernel( + const uint64_t *morton, // [n] + const uint64_t *leafIdx, // [n] (unused here but kept for symmetry) + int n, + int *childLeft, // [2n−1] + int *childRight, // [2n−1] + int *parent); // [2n−1], pre‐initialized to −1 + +__global__ +void bvh_build_root_kernel( + int *root, + int *parent, + size_t total_rects); + +template +__global__ +void bvh_init_leaf_boxes_kernel( + const SparsityMapEntry *rects, // [G] all flattened Rects + const uint64_t *leafIdx, // [n] maps leaf→orig Rect index + size_t total_rects, + Rect *boxes) // [(2n−1)] +{ + int k = 
blockIdx.x*blockDim.x + threadIdx.x; + if (k >= total_rects) return; + + size_t orig = leafIdx[k]; + boxes[k + total_rects - 1] = rects[orig].bounds; +} + +template +__global__ +void bvh_merge_internal_boxes_kernel( + size_t total_rects, + const int *childLeft, // [(2n−1)] + const int *childRight, // [(2n−1)] + const int *parent, // [(2n−1)] + Rect *boxes, // [(2n−1)×N] + int *visitCount) // [(2n−1)] initialized to zero +{ + int leaf = blockIdx.x*blockDim.x + threadIdx.x; + if (leaf >= total_rects) return; + + int cur = leaf + total_rects - 1; + int p = parent[cur]; + + while(p >= 0) { + // increment visit count; the second arrival merges + int prev = atomicAdd(&visitCount[p], 1); + if (prev == 1) { + // both children ready, do the merge + int c0 = childLeft[p], c1 = childRight[p]; + boxes[p] = boxes[c0].union_bbox(boxes[c1]); + // climb + cur = p; + p = parent[cur]; + } else { + // first child arrived, wait for sibling + break; + } + } +} + +template +__global__ +void query_input_bvh( + SparsityMapEntry* queries, + size_t* d_query_offsets, + int root, + int *childLeft, + int *childRight, + uint64_t *indices, + uint64_t *labels, + Rect *boxes, + size_t numQueries, + size_t numBoxes, + size_t numLHSChildren, + uint32_t* d_inst_prefix, + uint32_t* d_inst_counters, + out_t *d_rects +) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= numQueries) return; + Rect in_rect = queries[idx].bounds; + size_t lhs_idx = bsearch(d_query_offsets, numLHSChildren, idx); + + constexpr int MAX_STACK = 64; // max stack size for BVH traversal + int stack[MAX_STACK]; + int sp = 0; + + // start at the root + stack[sp++] = -1; + int node = root; + do + { + + int left = childLeft[node]; + int right = childRight[node]; + + bool overlapL = boxes[left].overlaps(in_rect); + bool overlapR = boxes[right].overlaps(in_rect); + + if (overlapL && left >= numBoxes - 1) { + uint64_t rect_idx = indices[left - (numBoxes - 1)]; + uint32_t local = atomicAdd(&d_inst_counters[lhs_idx], 1); 
+ if (d_rects != nullptr) { + uint32_t out_idx = d_inst_prefix[lhs_idx] + local; + Rect out_rect = boxes[left].intersection(in_rect); + if constexpr (std::is_same_v>) { + d_rects[out_idx].rect = out_rect; + d_rects[out_idx].src_idx = labels[rect_idx]; + } else { + d_rects[out_idx] = out_rect; + } + } + } + if (overlapR && right >= numBoxes - 1) { + uint64_t rect_idx = indices[right - (numBoxes - 1)]; + uint32_t local = atomicAdd(&d_inst_counters[lhs_idx], 1); + if (d_rects != nullptr) { + uint32_t out_idx = d_inst_prefix[lhs_idx] + local; + Rect out_rect = boxes[right].intersection(in_rect); + if constexpr (std::is_same_v>) { + d_rects[out_idx].rect = out_rect; + d_rects[out_idx].src_idx = labels[rect_idx]; + } else { + d_rects[out_idx] = out_rect; + } + } + } + + bool traverseL = overlapL && left < numBoxes - 1; + bool traverseR = overlapR && right < numBoxes - 1; + + if (!traverseL && !traverseR) { + node = stack[--sp]; + } else { + node = (traverseL ? left : right); + if (traverseL && traverseR) { + stack[sp++] = right; + } + } + } while (node != -1); +} + +template +struct CornerDesc { + uint32_t src_idx; + T coord[N]; + int32_t delta; + + // Equality for ReduceByKey: compare key fields only (src_idx, coords) + __host__ __device__ __forceinline__ + bool operator==(const CornerDesc& rhs) const { + if (src_idx != rhs.src_idx) return false; + for (int d = 0; d < N; ++d) + if (coord[d] != rhs.coord[d]) return false; + return true; + } +}; + +template +__global__ void mark_endpoints(const RectDesc* d_rects, + size_t M, + int dim, + uint32_t* d_src_keys, + T* d_crd_keys) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if(i >= M) return; + d_src_keys[2*i] = d_rects[i].src_idx; + d_src_keys[2*i+1] = d_rects[i].src_idx; + d_crd_keys[2*i] = d_rects[i].rect.lo[dim]; + d_crd_keys[2*i+1] = d_rects[i].rect.hi[dim] + 1; +} + +template +__global__ void mark_heads(const uint32_t* d_src_keys, + const T* d_crd_keys, + size_t M, + uint8_t* d_heads) { + size_t i = 
blockIdx.x * blockDim.x + threadIdx.x; + if(i >= M) return; + if (i==0) d_heads[0] = 1; + else { + d_heads[i] = d_src_keys[i] != d_src_keys[i-1] || d_crd_keys[i] != d_crd_keys[i-1]; + } +} + +template +__global__ void seg_boundaries(const uint8_t* d_flags, + const T* d_exc_sum, + size_t M, + size_t *d_starts, + size_t *d_ends) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if(i >= M) return; + if (d_flags[i]) { + d_starts[d_exc_sum[i]-1] = i; + } + if (i== M-1 || d_flags[i+1]) { + d_ends[d_exc_sum[i]-1] = i + 1; + } +} + +template +__global__ void scatter_unique(const uint32_t* d_src_keys, + const T* d_crd_keys, + const size_t* d_output, + const uint8_t* d_heads, + size_t M, + size_t *d_starts, + size_t *d_ends, + T* d_boundaries) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if(i >= M) return; + size_t u = d_output[i] - (d_heads[i] ? 0 : 1); + d_boundaries[u] = d_crd_keys[i]; + if (i == 0 || d_src_keys[i] != d_src_keys[i-1]) { + d_starts[d_src_keys[i]] = u; + } + if (i== M-1 || d_src_keys[i] != d_src_keys[i+1]) { + d_ends[d_src_keys[i]] = u + 1; + } +} + +template +__global__ void mark_deltas_heads(const CornerDesc* d_corners, + size_t M, + int dim, + uint8_t* d_heads, + DeltaFlag* d_deltas) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if(i >= M) return; + uint8_t head = 1; + if (i>0) { + head = 0; + for (int j = 0; j < N; j++) { + if (j== dim) continue; + if (d_corners[i].coord[j] != d_corners[i-1].coord[j]) { + head = 1; + break; + } + } + head = head || d_corners[i].src_idx != d_corners[i-1].src_idx; + } + d_heads[i] = head; + d_deltas[i].delta = d_corners[i].delta; + d_deltas[i].head = head; +} + +// For each segment and each boundary, determine whether to emit a new subsegment +template +__global__ void count_segments(const DeltaFlag* d_delta_flags, + const size_t *d_segment_starts, + const size_t *d_segment_ends, + const size_t *d_boundary_starts, + const size_t *d_boundary_ends, + const CornerDesc* d_corners, + const T* 
d_boundaries, + size_t num_boundaries, + size_t num_segments, + int dim, + uint32_t *seg_counters) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if(i >= num_segments * num_boundaries) return; + size_t bnd_idx = i % num_boundaries; + size_t seg_idx = i / num_boundaries; + int my_src = d_corners[d_segment_starts[seg_idx]].src_idx; + + //No boundaries for this src + if (d_boundary_starts[my_src]>= d_boundary_ends[my_src]) return; + + //This boundary is not a subsegment start for this segment's src + if (bnd_idx < d_boundary_starts[my_src] || bnd_idx >= d_boundary_ends[my_src]-1) return; + + //Binary search the segment to find the first subsegment whose start is > boundary + size_t low = d_segment_starts[seg_idx]; + size_t high = d_segment_ends[seg_idx]; + while (low < high) { + int mid = (low + high) / 2; + if (d_corners[mid].coord[dim] <= d_boundaries[bnd_idx]) { + low = mid + 1; + } else { + high = mid; + } + } + + //The prefix sum for this boundary within this segment is the delta of the corner just before it (if any) + int my_delta = (low == d_segment_starts[seg_idx] ? 0 : d_delta_flags[low-1].delta); + + //We emit if it's non-zero, and strengthen the requirement to > 0 for dim 0. 
+ if (my_delta != 0 && (dim !=0 || my_delta > 0)) { + atomicAdd(&seg_counters[seg_idx], 1); + } +} + +//Do the same computation as above, but this time emit the actual subsegment +template +__global__ void write_segments(const DeltaFlag* d_delta_flags, + const size_t *d_segment_starts, + const size_t *d_segment_ends, + const size_t *d_boundary_starts, + const size_t *d_boundary_ends, + const CornerDesc* d_corners, + const T* d_boundaries, + const uint32_t *seg_offsets, + size_t num_boundaries, + size_t num_segments, + int dim, + uint32_t *seg_counters, + CornerDesc* d_out_corners) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if(i >= num_segments * num_boundaries) return; + size_t bnd_idx = i % num_boundaries; + size_t seg_idx = i / num_boundaries; + int my_src = d_corners[d_segment_starts[seg_idx]].src_idx; + if (d_boundary_starts[my_src]>= d_boundary_ends[my_src]) return; + if (bnd_idx < d_boundary_starts[my_src] || bnd_idx >= d_boundary_ends[my_src]-1) return; + size_t low = d_segment_starts[seg_idx]; + size_t high = d_segment_ends[seg_idx]; + while (low < high) { + int mid = (low + high) / 2; + if (d_corners[mid].coord[dim] <= d_boundaries[bnd_idx]) { + low = mid + 1; + } else { + high = mid; + } + } + int my_delta = (low == d_segment_starts[seg_idx] ? 
0 : d_delta_flags[low-1].delta); + + //To emit, we keep everything the same except the current dim - set that to the boundary value + if (my_delta != 0 && (dim !=0 || my_delta > 0)) { + uint32_t my_idx = seg_offsets[seg_idx] + atomicAdd(&seg_counters[seg_idx], 1); + CornerDesc my_corner = d_corners[low-1]; + my_corner.coord[dim] = d_boundaries[bnd_idx]; + my_corner.delta = my_delta; + d_out_corners[my_idx] = my_corner; + } +} + +//Again, do the same computation as above, but this time emit the actual rectangle +template +__global__ void write_segments(const DeltaFlag* d_delta_flags, + const size_t *d_segment_starts, + const size_t *d_segment_ends, + size_t **d_boundary_starts, + size_t **d_boundary_ends, + const CornerDesc* d_corners, + T** d_boundaries, + const uint32_t *seg_offsets, + size_t num_boundaries, + size_t num_segments, + uint32_t *seg_counters, + RectDesc* d_out_rects) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if(i >= num_segments * num_boundaries) return; + size_t bnd_idx = i % num_boundaries; + size_t seg_idx = i / num_boundaries; + int my_src = d_corners[d_segment_starts[seg_idx]].src_idx; + if (d_boundary_starts[0][my_src]>= d_boundary_ends[0][my_src]) return; + if (bnd_idx < d_boundary_starts[0][my_src] || bnd_idx >= d_boundary_ends[0][my_src]-1) return; + + size_t low = d_segment_starts[seg_idx]; + size_t high = d_segment_ends[seg_idx]; + while (low < high) { + int mid = (low + high) / 2; + if (d_corners[mid].coord[0] <= d_boundaries[0][bnd_idx]) { + low = mid + 1; + } else { + high = mid; + } + } + int my_delta = (low == d_segment_starts[seg_idx] ? 
0 : d_delta_flags[low-1].delta); + if (my_delta==0) return; + int my_corner_idx = low - 1; + uint32_t my_idx = seg_offsets[seg_idx] + atomicAdd(&seg_counters[seg_idx], 1); + RectDesc my_output; + my_output.src_idx = my_src; + my_output.rect.lo[0] = d_boundaries[0][bnd_idx]; + + //Remember we marked each boundary as hi+1, so need to revert + my_output.rect.hi[0] = d_boundaries[0][bnd_idx+1] - 1; + + //For every other dimension, map segment -> rect by finding the two boundaries that surround the segment's corner + for (int d = 1; d < N; d++) { + low = d_boundary_starts[d][my_src]; + high = d_boundary_ends[d][my_src]; + while (low < high) { + int mid = (low + high) / 2; + if (d_boundaries[d][mid] <= d_corners[my_corner_idx].coord[d]) { + low = mid + 1; + } else { + high = mid; + } + } + my_output.rect.lo[d] = d_boundaries[d][low-1]; + my_output.rect.hi[d] = d_boundaries[d][low] - 1; + } + d_out_rects[my_idx] = my_output; +} + + template + __global__ void populate_corners(const RectDesc* __restrict__ d_rects, + size_t M, + CornerDesc* __restrict__ d_corners) +{ + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= M) return; + + const auto& r = d_rects[i]; // assumes r.rect.lo[d], r.rect.hi[d], r.src_idx + const uint32_t src = r.src_idx; + + const size_t corners_per_rect = size_t(1) << N; + const size_t base = i * corners_per_rect; + + // emit 2^N corners. Each 1 in the mask -> use hi[d]+1, each 0 -> use lo[d] + for (unsigned mask = 0; mask < corners_per_rect; ++mask) { + CornerDesc c; + c.src_idx = src; + // sign = +1 for even popcount(mask), -1 for odd + c.delta = (__popc(mask) & 1) ? -1 : +1; + + #pragma unroll + for (int d = 0; d < N; ++d) { + const T lo = r.rect.lo[d]; + const T hip1 = r.rect.hi[d] + T(1); // half-open (hi+1) + c.coord[d] = ( (mask & (1u << d)) ? 
hip1 : lo ); + } + + d_corners[base + mask] = c; + } +} + + +template +__global__ void build_coord_key(T* d_keys, + const PointDesc* d_pts, + size_t M, + int dim) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if(i < M) d_keys[i] = d_pts[i].point[dim]; +} + + +template +__global__ void build_coord_key(T* d_keys, + const CornerDesc* d_corners, + size_t M, + int dim) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if(i < M) d_keys[i] = d_corners[i].coord[dim]; +} + +template +__global__ void get_delta(int32_t* d_deltas, + const CornerDesc* d_corners, + size_t M) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if(i < M) d_deltas[i] = d_corners[i].delta; +} + +template +__global__ void set_delta(const int32_t* d_deltas, + CornerDesc* d_corners, + size_t M) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if(i < M) d_corners[i].delta = d_deltas[i]; +} + + + template +__global__ void build_lo_key(T* d_keys, + const RectDesc* d_rects, + size_t M, + int dim) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if(i < M) d_keys[i] = d_rects[i].rect.lo[dim]; +} + + template +__global__ void build_hi_key(T* d_keys, + const RectDesc* d_rects, + size_t M, + int dim) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if(i < M) d_keys[i] = d_rects[i].rect.hi[dim]; +} + + template +__global__ void build_hi_flag(HiFlag* d_flags, + const RectDesc* d_rects, + size_t M, + int dim) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if(i >= M) return; + d_flags[i].hi = d_rects[i].rect.hi[dim]; + d_flags[i].head = i==0 || d_rects[i].src_idx != d_rects[i-1].src_idx; +} + + template +__global__ void build_src_key(size_t* d_keys, + const RectDesc* d_rects, + size_t M) { + size_t i = blockIdx.x*blockDim.x + threadIdx.x; + if(i < M) d_keys[i] = d_rects[i].src_idx; +} + + template +__global__ void build_src_key(size_t* d_keys, + const CornerDesc *d_corners, + size_t M) { + size_t i = blockIdx.x*blockDim.x + threadIdx.x; + if(i < M) d_keys[i] = 
d_corners[i].src_idx; +} + +template +__global__ void build_src_key(size_t* d_keys, + const PointDesc* d_pts, + size_t M) { + size_t i = blockIdx.x*blockDim.x + threadIdx.x; + if(i < M) d_keys[i] = d_pts[i].src_idx; +} + + +template +__global__ +void points_to_rects(const PointDesc* pts, + RectDesc* rects, + size_t M) +{ + size_t i = blockIdx.x*blockDim.x + threadIdx.x; + if(i >= M) return; + rects[i].src_idx = pts[i].src_idx; + rects[i].rect.lo = pts[i].point; + rects[i].rect.hi = pts[i].point; +} + +// 1) mark breaks on RectDesc array at pass d +// Starts a new rectangle if src or lo/hi in any dimension but d doesn't match, +// or if dim d doesn't match or advance by +1 +//NOTE: ONLY WORKS IF WE STARTED WITH DISJOINT RECTANGLES +template +__global__ +void mark_breaks_dim(const RectDesc* in, + uint8_t* brk, + size_t M, + int d) +{ + size_t i = blockIdx.x*blockDim.x + threadIdx.x; + if(i >= M) return; + if(i == 0) { brk[0] = 1; return; } + + const auto &p = in[i].rect, &q = in[i-1].rect; + bool split = (in[i].src_idx != in[i-1].src_idx); + + // more‐significant dims 0..d-1 must match [lo,hi] + #pragma unroll + for(int k = 0; k < d && !split; ++k) + if(p.lo[k] != q.lo[k] || p.hi[k] != q.hi[k]) split = true; + + // already‐processed dims d+1..N-1 must match [lo,hi] + #pragma unroll + for(int k = d+1; k < N && !split; ++k) + if((p.lo[k] != q.lo[k]) || (p.hi[k] != q.hi[k])) + split = true; + + // current dim d must equal or advance by +1 in lo + if(!split && (p.lo[d] != (q.hi[d] + 1)) && (p.lo[d] != q.lo[d])) + split = true; + + brk[i] = split ? 
1 : 0; +} + +//1) Mark breaks for 1D rectangle merge - if low > hi + 1, must start new rect + template +__global__ +void mark_breaks_dim(const HiFlag* hi_flag_in, + const HiFlag* hi_flag_out, + const RectDesc* in, + uint8_t* brk, + size_t M, + int d) +{ + size_t i = blockIdx.x*blockDim.x + threadIdx.x; + if(i >= M) return; + brk[i] = hi_flag_in[i].head || in[i].rect.lo[d] > hi_flag_out[i].hi + 1; +} + +// 2) Write output rectangles for ND disjoint rects RLE +// Starts write lo, ends write hi, everyone else no-ops +template +__global__ +void init_rects_dim(const RectDesc* in, + const uint8_t* brk, + const size_t* gid, + RectDesc* out, + size_t M, + int d) +{ + size_t i = blockIdx.x*blockDim.x + threadIdx.x; + if(i >= M) return; + + bool is_end = (i == M-1) || (gid[i+1] != gid[i]); + if (!brk[i] && !is_end) return; + + size_t g = gid[i] - 1; // zero-based rectangle index + const Rect &r = in[i].rect; + out[g].src_idx = in[i].src_idx; + + #pragma unroll + for(int k = 0; k < N; ++k) { + if (brk[i]) { + out[g].rect.lo[k] = r.lo[k]; + } + if (is_end) { + out[g].rect.hi[k] = r.hi[k]; + } + } +} + + // 2) Write output rectangles for 1D rects RLE + // Starts write lo, ends write max(hi, prefix max hi) because the max was exclusive + template + __global__ + void init_rects_dim(const RectDesc* in, + const HiFlag *hi_flag_out, + const uint8_t* brk, + const size_t* gid, + RectDesc* out, + size_t M, + int d) +{ + size_t i = blockIdx.x*blockDim.x + threadIdx.x; + if(i >= M) return; + + bool is_end = (i == M-1) || (gid[i+1] != gid[i]); + if (!brk[i] && !is_end) return; + + size_t g = gid[i] - 1; // zero-based + const auto &r = in[i].rect; + out[g].src_idx = in[i].src_idx; + + // copy dims ≠ d +#pragma unroll + for(int k = 0; k < N; ++k) { + if (brk[i]) { + out[g].rect.lo[k] = r.lo[k]; + } + if (k != d || (brk[i] && is_end)) { + out[g].rect.hi[k] = r.hi[k]; + } else if (is_end) { + out[g].rect.hi[k] = r.hi[k] > hi_flag_out[i].hi ? 
r.hi[k] : hi_flag_out[i].hi; + } + } +} + +//Convert RectDesc to sparsity output and determine [d_start[i], d_end[i]) for each src i +template +__global__ +void build_final_output(const RectDesc* d_rects, + SparsityMapEntry* d_entries_out, + Rect* d_rects_out, + size_t* d_starts, + size_t* d_ends, + size_t numRects) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= numRects) return; + d_rects_out[idx] = d_rects[idx].rect; + d_entries_out[idx].bounds = d_rects[idx].rect; + d_entries_out[idx].sparsity.id = 0; + d_entries_out[idx].bitmap = 0; + + //Checks if we're the first value for a given src + if (idx == 0 || d_rects[idx].src_idx != d_rects[idx-1].src_idx) { + d_starts[d_rects[idx].src_idx] = idx; + } + + //Checks if we're the last value for a given src + if (idx == numRects-1 || d_rects[idx].src_idx != d_rects[idx+1].src_idx) { + d_ends[d_rects[idx].src_idx] = idx+1; + } +} + +} // namespace Realm \ No newline at end of file diff --git a/src/realm/deppart/setops.cc b/src/realm/deppart/setops.cc index 2ab367f13a..d8cdbc902d 100644 --- a/src/realm/deppart/setops.cc +++ b/src/realm/deppart/setops.cc @@ -1073,15 +1073,14 @@ namespace Realm { bitmask.add_rect(it->bounds); } else { SparsityMapImpl *impl = SparsityMapImpl::lookup(it->sparsity); - const std::vector >& entries = impl->get_entries(); - for(typename std::vector >::const_iterator it2 = entries.begin(); - it2 != entries.end(); - it2++) { - Rect isect = it->bounds.intersection(it2->bounds); + span> entries = impl->get_entries(); + for(size_t i = 0; i < entries.size(); i++) { + SparsityMapEntry entry = entries[i]; + Rect isect = it->bounds.intersection(entry.bounds); if(isect.empty()) continue; - assert(!it2->sparsity.exists()); - assert(it2->bitmap == 0); + assert(!entry.sparsity.exists()); + assert(entry.bitmap == 0); bitmask.add_rect(isect); } } @@ -1440,15 +1439,14 @@ namespace Realm { todo.push_back(lhs.bounds); } else { SparsityMapImpl *l_impl = SparsityMapImpl::lookup(lhs.sparsity); - 
const std::vector >& entries = l_impl->get_entries(); - for(typename std::vector >::const_iterator it = entries.begin(); - it != entries.end(); - it++) { - Rect isect = lhs.bounds.intersection(it->bounds); + span> entries = l_impl->get_entries(); + for(size_t i = 0; i < entries.size(); i++) { + SparsityMapEntry entry = entries[i]; + Rect isect = lhs.bounds.intersection(entry.bounds); if(isect.empty()) continue; - assert(!it->sparsity.exists()); - assert(it->bitmap == 0); + assert(!entry.sparsity.exists()); + assert(entry.bitmap == 0); todo.push_back(isect); } } diff --git a/src/realm/deppart/sparsity_impl.cc b/src/realm/deppart/sparsity_impl.cc index e1cf66c2c9..c674a98b32 100644 --- a/src/realm/deppart/sparsity_impl.cc +++ b/src/realm/deppart/sparsity_impl.cc @@ -353,6 +353,7 @@ namespace Realm { if(map_impl.compare_exchange(impl, new_impl)) { map_deleter = [](void *map_impl) { + delete static_cast *>(map_impl); }; return new_impl; @@ -416,36 +417,30 @@ namespace Realm { // full cross-product test for now - for larger rectangle lists, consider // an acceleration structure? 
if(approx) { - const std::vector> &rects1 = get_approx_rects(); - const std::vector> &rects2 = other->get_approx_rects(); - for(typename std::vector>::const_iterator it1 = rects1.begin(); - it1 != rects1.end(); it1++) { - Rect isect = it1->intersection(bounds); + span> rects1 = get_approx_rects(); + span> rects2 = other->get_approx_rects(); + for(size_t i = 0; i < rects1.size(); i++) { + Rect isect = rects1[i].intersection(bounds); if(isect.empty()) continue; - for(typename std::vector>::const_iterator it2 = rects2.begin(); - it2 != rects2.end(); it2++) { - if(it2->overlaps(isect)) + for(size_t j = 0; j < rects2.size(); j++) { + if(rects2[j].overlaps(isect)) return true; } } } else { - const std::vector> &entries1 = get_entries(); - const std::vector> &entries2 = other->get_entries(); - for(typename std::vector>::const_iterator it1 = - entries1.begin(); - it1 != entries1.end(); it1++) { - Rect isect = it1->bounds.intersection(bounds); + span> entries1 = get_entries(); + span> entries2 = other->get_entries(); + for(size_t i = 0; i < entries1.size(); i++) { + Rect isect = entries1[i].bounds.intersection(bounds); if(isect.empty()) continue; - for(typename std::vector>::const_iterator it2 = - entries2.begin(); - it2 != entries2.end(); it2++) { - if(!it2->bounds.overlaps(isect)) + for(size_t j = 0; j < entries2.size(); j++) { + if(!entries2[j].bounds.overlaps(isect)) continue; // TODO: handle further sparsity in either side - assert(!it1->sparsity.exists() && (it1->bitmap == 0) && - !it2->sparsity.exists() && (it2->bitmap == 0)); + assert(!entries1[i].sparsity.exists() && (entries1[i].bitmap == 0) && + !entries2[j].sparsity.exists() && (entries2[j].bitmap == 0)); return true; } } @@ -907,6 +902,18 @@ namespace Realm { , sparsity_comm(_sparsity_comm) {} +template +SparsityMapImpl::~SparsityMapImpl(void) +{ + //We are responsible for our instances + //if (this->entries_instance.exists()) { + // this->entries_instance.destroy(); + //} + //if 
(this->approx_instance.exists()) { + // this->approx_instance.destroy(); + //} +} + template inline /*static*/ SparsityMapImpl * SparsityMapImpl::lookup(SparsityMap sparsity) @@ -1192,8 +1199,7 @@ namespace Realm { old_data.swap(this->entries); size_t i = 0; size_t n = 0; - typename std::vector>::const_iterator old_it = - old_data.begin(); + typename std::vector>::iterator old_it = old_data.begin(); while((i < count) && (old_it != old_data.end())) { if(rects[i].hi[0] < (old_it->bounds.lo[0] - 1)) { this->entries.resize(n + 1); @@ -1494,17 +1500,16 @@ namespace Realm { assert(false); // scan the entry list, sending bitmaps first and making a list of rects std::vector> rects; - for(typename std::vector>::const_iterator it = - this->entries.begin(); - it != this->entries.end(); it++) { - if(it->bitmap) { + for(size_t i = 0; i < this->get_entries().size(); i++) { + const SparsityMapEntry &entry = this->get_entries()[i]; + if(entry.bitmap) { // TODO: send bitmap assert(0); - } else if(it->sparsity.exists()) { + } else if(entry.sparsity.exists()) { // TODO: ? 
assert(0); } else { - rects.push_back(it->bounds); + rects.push_back(entry.bounds); } } @@ -1557,7 +1562,7 @@ namespace Realm { }; template - static void compute_approximation(const std::vector> &entries, + static void compute_approximation(const span> &entries, std::vector> &approx_rects, int max_rects) { size_t n = entries.size(); @@ -1579,7 +1584,7 @@ namespace Realm { } template - static void compute_approximation(const std::vector> &entries, + static void compute_approximation(const span> &entries, std::vector> &approx_rects, int max_rects) { int n = entries.size(); @@ -1693,6 +1698,9 @@ namespace Realm { template void SparsityMapImpl::finalize(void) { + + this->from_gpu = false; + // in order to organize the data a little better and handle common coalescing // cases, we do N sort/merging passes, with each dimension appearing last // in the sort order at least once (so that we can merge in that dimension) @@ -1748,7 +1756,7 @@ namespace Realm { // now that we've got our entries nice and tidy, build a bounded approximation of them if(true /*ID(me).sparsity_creator_node() == Network::my_node_id*/) { assert(!this->approx_valid.load()); - compute_approximation(this->entries, this->approx_rects, + compute_approximation(span>(this->entries.data(), this->entries.size()), this->approx_rects, DeppartConfig::cfg_max_rects_in_approximation); this->approx_valid.store_release(true); } @@ -1830,6 +1838,117 @@ namespace Realm { if(trigger_precise.exists()) GenEventImpl::trigger(trigger_precise, false /*!poisoned*/); + + } + + + //Here, we copy everything the CPU finalize does except manipulating the entries further + //and we indicate that the sparsity map was constructed from the cpu + + template + void SparsityMapImpl::gpu_finalize(void) + { + this->from_gpu = true; + + if(true /*ID(me).sparsity_creator_node() == Network::my_node_id*/) { + assert(!this->approx_valid.load()); + this->approx_valid.store_release(true); + } + + { + LoggerMessage msg = log_part.info(); + 
if(msg.is_active()) { + msg << "finalizing " << me << "(" << this << "), " << this->entries.size() + << " entries"; + for(size_t i = 0; i < this->entries.size(); i++) + msg << "\n [" << i << "]: bounds=" << this->entries[i].bounds + << " sparsity=" << this->entries[i].sparsity + << " bitmap=" << this->entries[i].bitmap; + } + } + +#ifdef DEBUG_PARTITIONING + std::cout << "finalizing " << this << ", " << this->entries.size() << " entries" + << std::endl; + for(size_t i = 0; i < this->entries.size(); i++) + std::cout << " [" << i << "]: bounds=" << this->entries[i].bounds + << " sparsity=" << this->entries[i].sparsity + << " bitmap=" << this->entries[i].bitmap << std::endl; +#endif + NodeSet sendto_precise, sendto_approx; + Event trigger_precise = Event::NO_EVENT; + Event trigger_approx = Event::NO_EVENT; + std::vector precise_waiters_copy, approx_waiters_copy; + { + AutoLock<> al(mutex); + + assert(!this->entries_valid.load()); + this->entries_valid.store_release(true); + + precise_requested = false; + if(precise_ready_event.exists()) { + trigger_precise = precise_ready_event; + precise_ready_event = Event::NO_EVENT; + } + + precise_waiters_copy.swap(precise_waiters); + approx_waiters_copy.swap(approx_waiters); + + remote_precise_waiters.swap(sendto_precise); + remote_approx_waiters.swap(sendto_approx); + } + + for(std::vector::const_iterator it = + precise_waiters_copy.begin(); + it != precise_waiters_copy.end(); it++) + (*it)->sparsity_map_ready(this, true); + + for(std::vector::const_iterator it = + approx_waiters_copy.begin(); + it != approx_waiters_copy.end(); it++) + (*it)->sparsity_map_ready(this, false); + + if(!sendto_approx.empty()) { + for(NodeID i = 0; (i <= Network::max_node_id) && !sendto_approx.empty(); i++) + if(sendto_approx.contains(i)) { + bool also_precise = sendto_precise.contains(i); + if(also_precise) + sendto_precise.remove(i); + remote_data_reply(i, also_precise, true); + sendto_approx.remove(i); + } + } + + if(!sendto_precise.empty()) { + 
for(NodeID i = 0; (i <= Network::max_node_id) && !sendto_precise.empty(); i++) + if(sendto_precise.contains(i)) { + remote_data_reply(i, true, false); + sendto_precise.remove(i); + } + } + + if(trigger_approx.exists()) + GenEventImpl::trigger(trigger_approx, false /*!poisoned*/); + + if(trigger_precise.exists()) + GenEventImpl::trigger(trigger_precise, false /*!poisoned*/); + } + + + //Allows a GPU deppart client to set the entries directly with a host region instance + template + void SparsityMapImpl::set_instance(RegionInstance _entries_instance, size_t size) + { + this->entries_instance = _entries_instance; + this->num_entries = size; + } + + //Allows a GPU deppart client to set the approx rects directly with a host region instance + template + void SparsityMapImpl::set_approx_instance(RegionInstance _approx_instance, size_t size) + { + this->approx_instance = _approx_instance; + this->num_approx = size; } template diff --git a/src/realm/deppart/sparsity_impl.h b/src/realm/deppart/sparsity_impl.h index 4a3ed14349..2618f4decc 100644 --- a/src/realm/deppart/sparsity_impl.h +++ b/src/realm/deppart/sparsity_impl.h @@ -109,6 +109,8 @@ namespace Realm { SparsityMapImpl(SparsityMap _me, NodeSet &subscribers, SparsityMapCommunicator *_sparsity_comm); + ~SparsityMapImpl(); + // actual implementation - SparsityMapPublicImpl's version just calls this one Event make_valid(bool precise = true); @@ -136,6 +138,10 @@ namespace Realm { void remote_data_request(NodeID requestor, bool send_precise, bool send_approx); void remote_data_reply(NodeID requestor, bool send_precise, bool send_approx); + void set_instance(RegionInstance _entries_instance, size_t size); + void set_approx_instance(RegionInstance _approx_instance, size_t size); + void gpu_finalize(void); + SparsityMap me; struct RemoteSparsityRequest { diff --git a/src/realm/deppart/untemplated_gpu_kernels.cu b/src/realm/deppart/untemplated_gpu_kernels.cu new file mode 100644 index 0000000000..a45e8f8962 --- /dev/null +++ 
b/src/realm/deppart/untemplated_gpu_kernels.cu @@ -0,0 +1,119 @@ +#include "realm/deppart/partitions.h" + +namespace Realm { + +__device__ __forceinline__ +int bvh_common_prefix(const uint64_t *morton, const uint64_t *leafIdx, int i, int j, int n) { + if (j < 0 || j >= n) return -1; + uint64_t x = morton[i] ^ morton[j]; + uint64_t y = leafIdx[i] ^ leafIdx[j]; + if (x == 0) { + return 64 + __clzll(y); + } + return __clzll(x); +} + +__global__ +void bvh_build_radix_tree_kernel( + const uint64_t *morton, // [n] + const uint64_t *leafIdx, // [n] (unused here but kept for symmetry) + int n, + int *childLeft, // [2n−1] + int *childRight, // [2n−1] + int *parent) // [2n−1], pre‐initialized to −1 +{ + int idx = blockIdx.x*blockDim.x + threadIdx.x; + int i = idx; + if (i >= n-1) return; // we only build n−1 internal nodes + + int left, right; + int dL = bvh_common_prefix(morton, leafIdx, i, i-1, n); + int dR = bvh_common_prefix(morton, leafIdx, i, i+1, n); + int d = (dR > dL ? +1 : -1); + int deltaMin = (dR > dL ? 
dL : dR); + + // 3) find j by exponential + binary search + int l_max = 2; + int delta = -1; + int i_tmp = i + d * l_max; + if (0 <= i_tmp && i_tmp < n) { + delta = bvh_common_prefix(morton, leafIdx, i, i_tmp, n); + } + while (delta > deltaMin) { + l_max <<= 1; + i_tmp = i + d * l_max; + delta = -1; + if (0 <= i_tmp && i_tmp < n) { + delta = bvh_common_prefix(morton, leafIdx, i, i_tmp, n); + } + } + int l = 0; + int t = (l_max) >> 1; + while (t > 0) { + i_tmp = i + d*(l + t); + delta = -1; + if (0 <= i_tmp && i_tmp < n) { + delta = bvh_common_prefix(morton, leafIdx, i, i_tmp, n); + } + if (delta > deltaMin) { + l += t; + } + t >>= 1; + } + if (d < 0) { + right = i; + left = i + d*l; + } else { + left = i; + right = i + d*l; + } + + int gamma; + if (morton[left] == morton[right] && leafIdx[left] == leafIdx[right]) { + gamma = (left+right) >> 1; + } else { + int deltaNode = bvh_common_prefix(morton, leafIdx, left, right, n); + int split = left; + int stride = right - left; + do { + stride = (stride + 1) >> 1; + int middle = split + stride; + if (middle < right) { + int delta = bvh_common_prefix(morton, leafIdx, left, middle, n); + if (delta > deltaNode) { + split = middle; + } + } + } while (stride > 1); + gamma = split; + } + + int left_node = gamma; + int right_node = gamma + 1; + if (left == gamma) { + left_node += n-1; + } + if (right == gamma + 1) { + right_node += n-1; + } + + childLeft [idx] = left_node; + childRight[idx] = right_node; + parent[left_node] = idx; + parent[right_node] = idx; +} + +__global__ +void bvh_build_root_kernel( + int *root, + int *parent, + size_t total_rects) { + + int tid = blockIdx.x*blockDim.x + threadIdx.x; + if (tid >= 2 * total_rects - 1) return; + if (parent[tid] == -1) { + *root = tid; + } +} + +} \ No newline at end of file diff --git a/src/realm/indexspace.h b/src/realm/indexspace.h index 842213c467..b61a77d689 100644 --- a/src/realm/indexspace.h +++ b/src/realm/indexspace.h @@ -29,6 +29,7 @@ #include "realm/realm_c.h" 
#include "realm/realm_config.h" +#include "realm/realm_assert.h" #include "realm/sparsity.h" #include "realm/dynamic_templates.h" @@ -782,6 +783,17 @@ namespace Realm { Event wait_on = Event::NO_EVENT) const; ///@} + ///@{ + /// + + template + REALM_PUBLIC_API Event gpu_subspaces_by_image( + const DomainTransform &domain_transform, + const std::vector> &sources, + std::vector> &images, const ProfilingRequestSet &reqs, + std::pair &sizes, RegionInstance buffer = RegionInstance::NO_INST, Event wait_on = Event::NO_EVENT) const; + ///@} + ///@{ /** * Computes subspaces of this index space by determining what subsets are @@ -813,6 +825,14 @@ namespace Realm { std::vector> &images, const ProfilingRequestSet &reqs, Event wait_on = Event::NO_EVENT) const; + template + REALM_PUBLIC_API Event gpu_subspaces_by_image( + const std::vector, Point>> + &field_data, + const std::vector> &sources, + std::vector> &images, const ProfilingRequestSet &reqs, + std::pair &sizes, RegionInstance buffer = RegionInstance::NO_INST, Event wait_on = Event::NO_EVENT) const; + // range versions template REALM_PUBLIC_API Event create_subspace_by_image( diff --git a/src/realm/indexspace.inl b/src/realm/indexspace.inl index cb0a83e6cb..c633aa5e46 100644 --- a/src/realm/indexspace.inl +++ b/src/realm/indexspace.inl @@ -488,13 +488,12 @@ namespace Realm { SparsityMapPublicImpl *impl = sparsity.impl(); // if we don't have the data, it's too late - somebody should have waited - // we should have the metadata valid REALM_ASSERT(impl->is_valid(precise)); // always use precise info if it's available if(impl->is_valid(true /*precise*/)) { IndexSpace result; - const std::vector> &entries = impl->get_entries(); + span> entries = impl->get_entries(); // three cases: // 1) empty index space if(entries.empty()) { @@ -534,7 +533,7 @@ namespace Realm { log_dpops.info() << "tighten: " << *this << " = " << result; return result; } else { - const std::vector> &approx_rects = impl->get_approx_rects(); + span> 
approx_rects = impl->get_approx_rects(); // two cases: // 1) empty index space @@ -561,7 +560,7 @@ namespace Realm { // the index of the entry that contains the point, or the first one to appear after // that point template - static size_t bsearch_map_entries(const std::vector> &entries, + static size_t bsearch_map_entries(const span> &entries, const Point &p) { assert(N == 1); @@ -592,41 +591,40 @@ namespace Realm { if(dense()) return true; - SparsityMapPublicImpl *impl = sparsity.impl(); - const std::vector> &entries = impl->get_entries(); + SparsityMapPublicImpl *impl = sparsity.impl(); + span> entries = impl->get_entries(); if(N == 1) { // binary search to find the element we want - size_t idx = bsearch_map_entries(entries, p); - if(idx >= entries.size()) - return false; + size_t idx = bsearch_map_entries(entries, p); + if(idx >= entries.size()) return false; - const SparsityMapEntry &e = entries[idx]; + const SparsityMapEntry& e = entries[idx]; // the search guaranteed we're below the upper bound of the returned entry, // but we might be below the lower bound if(p[0] < e.bounds.lo[0]) - return false; + return false; if(e.sparsity.exists()) { - assert(0); + assert(0); } if(e.bitmap != 0) { - assert(0); + assert(0); } return true; } else { - for(typename std::vector>::const_iterator it = - entries.begin(); - it != entries.end(); it++) { - if(!it->bounds.contains(p)) - continue; - if(it->sparsity.exists()) { - assert(0); - } else if(it->bitmap != 0) { - assert(0); - } else { - return true; - } + for(size_t i = 0; i < entries.size(); i++) { + SparsityMapEntry entry = entries[i]; + if(!entry.bounds.contains(p)) { + continue; + } + if(entry.sparsity.exists()) { + assert(0); + } else if(entry.bitmap != 0) { + assert(0); + } else { + return true; + } } } @@ -644,21 +642,19 @@ namespace Realm { if(!dense()) { // test against sparsity map too size_t total_volume = 0; - SparsityMapPublicImpl *impl = sparsity.impl(); - const std::vector> &entries = impl->get_entries(); - 
for(typename std::vector>::const_iterator it = - entries.begin(); - it != entries.end(); it++) { - if(!it->bounds.overlaps(r)) - continue; - if(it->sparsity.exists()) { - assert(0); - } else if(it->bitmap != 0) { - assert(0); - } else { - Rect isect = it->bounds.intersection(r); + SparsityMapPublicImpl *impl = sparsity.impl(); + span> entries = impl->get_entries(); + for(size_t i = 0; i < entries.size(); i++) { + SparsityMapEntry entry = entries[i]; + if(!entry.bounds.overlaps(r)) continue; + if(entry.sparsity.exists()) { + assert(0); + } else if(entry.bitmap != 0) { + assert(0); + } else { + Rect isect = entry.bounds.intersection(r); total_volume += isect.volume(); - } + } } // did we miss anything? @@ -678,22 +674,20 @@ namespace Realm { if(!dense()) { // test against sparsity map too - SparsityMapPublicImpl *impl = sparsity.impl(); - const std::vector> &entries = impl->get_entries(); - for(typename std::vector>::const_iterator it = - entries.begin(); - it != entries.end(); it++) { - if(!it->bounds.overlaps(r)) - continue; - if(it->sparsity.exists()) { - assert(0); - } else if(it->bitmap != 0) { - assert(0); - } else { - return true; - } + SparsityMapPublicImpl *impl = sparsity.impl(); + span> entries = impl->get_entries(); + for(size_t i = 0; i < entries.size(); i++) { + SparsityMapEntry entry = entries[i]; + if(!entry.bounds.overlaps(r)) continue; + if(entry.sparsity.exists()) { + assert(0); + } else if(entry.bitmap != 0) { + assert(0); + } else { + return true; + } } - + return false; } @@ -732,15 +726,15 @@ namespace Realm { size_t total = 0; SparsityMapPublicImpl *impl = sparsity.impl(); - const std::vector> &entries = impl->get_entries(); - for(typename std::vector>::const_iterator it = entries.begin(); - it != entries.end(); it++) { - Rect isect = bounds.intersection(it->bounds); + span> entries = impl->get_entries(); + for(size_t i = 0; i < entries.size(); i++) { + SparsityMapEntry entry = entries[i]; + Rect isect = bounds.intersection(entry.bounds); 
if(isect.empty()) continue; - if(it->sparsity.exists()) { + if(entry.sparsity.exists()) { assert(0); - } else if(it->bitmap != 0) { + } else if(entry.bitmap != 0) { assert(0); } else { total += isect.volume(); @@ -764,19 +758,20 @@ namespace Realm { if(dense()) return true; - SparsityMapPublicImpl *impl = sparsity.impl(); - const std::vector> &approx_rects = impl->get_approx_rects(); - for(typename std::vector>::const_iterator it = approx_rects.begin(); - it != approx_rects.end(); it++) - if(it->contains(p)) - return true; + SparsityMapPublicImpl *impl = sparsity.impl(); + span> approx_rects = impl->get_approx_rects(); + for(size_t i = 0; i < approx_rects.size(); i++) { + Rect entry = approx_rects[i]; + if(entry.contains(p)) + return true; + } // no entries matched, so the point is definitely not contained in this space return false; } template - inline bool IndexSpace::contains_all_approx(const Rect &r) const + inline bool IndexSpace::contains_all_approx(const Rect& r) const { // test on bounding box first if(!bounds.contains(r)) @@ -786,14 +781,14 @@ namespace Realm { if(dense()) return true; - SparsityMapPublicImpl *impl = sparsity.impl(); - const std::vector> &approx_rects = impl->get_approx_rects(); - for(typename std::vector>::const_iterator it = approx_rects.begin(); - it != approx_rects.end(); it++) { - if(it->contains(r)) - return true; - if(it->overlaps(r)) - assert(0); + SparsityMapPublicImpl *impl = sparsity.impl(); + span> approx_rects = impl->get_approx_rects(); + for(size_t i = 0; i < approx_rects.size(); i++) { + Rect entry = approx_rects[i]; + if(entry.contains(r)) + return true; + if(entry.overlaps(r)) + assert(0); } // no entries matched, so the point is definitely not contained in this space @@ -811,12 +806,12 @@ namespace Realm { if(dense()) return true; - SparsityMapPublicImpl *impl = sparsity.impl(); - const std::vector> &approx_rects = impl->get_approx_rects(); - for(typename std::vector>::const_iterator it = approx_rects.begin(); - it != 
approx_rects.end(); it++) { - if(it->overlaps(r)) - return true; + SparsityMapPublicImpl *impl = sparsity.impl(); + span> approx_rects = impl->get_approx_rects(); + for(size_t i = 0; i < approx_rects.size(); i++) { + Rect entry = approx_rects[i]; + if(entry.overlaps(r)) + return true; } // no entries matched, so the point is definitely not contained in this space @@ -838,29 +833,27 @@ namespace Realm { return contains_any_approx(other.bounds); // both sparse case can be expensive... - SparsityMapPublicImpl *impl = sparsity.impl(); - SparsityMapPublicImpl *other_impl = other.sparsity.impl(); + SparsityMapPublicImpl *impl = sparsity.impl(); + SparsityMapPublicImpl *other_impl = other.sparsity.impl(); // overlap can only be within intersecion of bounds - Rect isect = bounds.intersection(other.bounds); + Rect isect = bounds.intersection(other.bounds); return impl->overlaps(other_impl, isect, true /*approx*/); } - // approximage number of points in index space (may be less than volume of bounding box, - // but larger than + // approximage number of points in index space (may be less than volume of bounding box, but larger than // actual volume) template - inline size_t IndexSpace::volume_approx(void) const + inline size_t IndexSpace::volume_approx(void) const { if(dense()) return bounds.volume(); size_t total = 0; - SparsityMapPublicImpl *impl = sparsity.impl(); - const std::vector> &approx_rects = impl->get_approx_rects(); - for(typename std::vector>::const_iterator it = approx_rects.begin(); - it != approx_rects.end(); it++) - total += it->volume(); + SparsityMapPublicImpl *impl = sparsity.impl(); + span> approx_rects = impl->get_approx_rects(); + for(size_t i = 0; i < approx_rects.size(); i++) + total += approx_rects[i].volume(); return total; } @@ -981,6 +974,18 @@ namespace Realm { images, reqs, wait_on); } + template + template + inline Event IndexSpace::gpu_subspaces_by_image( + const std::vector, Point>> &field_data, + const std::vector> &sources, + std::vector> 
&images, const ProfilingRequestSet &reqs, + std::pair &sizes, RegionInstance buffer, Event wait_on) const + { + return gpu_subspaces_by_image(DomainTransform(field_data), sources, + images, reqs, sizes, buffer, wait_on); + } + template template inline Event IndexSpace::create_subspaces_by_image( @@ -1320,7 +1325,7 @@ namespace Realm { rect = Rect::make_empty(); - const std::vector> &entries = s_impl->get_entries(); + span> entries = s_impl->get_entries(); // find the first entry that overlaps our restriction - speed this up with a // binary search on the low end of the restriction if we're 1-D @@ -1356,7 +1361,7 @@ namespace Realm { // TODO: handle iteration within a sparsity entry // move onto the next sparsity entry (that overlaps our restriction) - const std::vector> &entries = s_impl->get_entries(); + const span> entries = s_impl->get_entries(); for(cur_entry++; cur_entry < entries.size(); cur_entry++) { const SparsityMapEntry &e = entries[cur_entry]; rect = restriction.intersection(e.bounds); diff --git a/src/realm/inst_layout.inl b/src/realm/inst_layout.inl index 0ee4db6960..acb2896e41 100644 --- a/src/realm/inst_layout.inl +++ b/src/realm/inst_layout.inl @@ -90,13 +90,13 @@ namespace Realm { // we need precise data for non-dense index spaces (the original // 'bounds' on the IndexSpace is often VERY conservative) SparsityMapPublicImpl *impl = is.sparsity.impl(); - const std::vector> &entries = impl->get_entries(); + span> entries = impl->get_entries(); if(!entries.empty()) { // TODO: set some sort of threshold for merging entries - typename std::vector>::const_iterator it = entries.begin(); - Rect bbox = is.bounds.intersection(it->bounds); - while(++it != entries.end()) - bbox = bbox.union_bbox(is.bounds.intersection(it->bounds)); + size_t i = 0; + Rect bbox = is.bounds.intersection(entries[i].bounds); + while(++i < entries.size()) + bbox = bbox.union_bbox(is.bounds.intersection(entries[i].bounds)); if(!bbox.empty()) piece_bounds.push_back(bbox); } diff --git 
a/src/realm/sparsity.h b/src/realm/sparsity.h index 1dc402a709..bf46284c6d 100644 --- a/src/realm/sparsity.h +++ b/src/realm/sparsity.h @@ -205,7 +205,7 @@ namespace Realm { * @return the entries of this sparsity map */ REALM_PUBLIC_API - const std::vector> &get_entries(void); + const span> get_entries(void); /** * Get the approximate rectangles of this sparsity map. @@ -215,7 +215,7 @@ namespace Realm { * @return the approximate rectangles of this sparsity map */ REALM_PUBLIC_API - const std::vector> &get_approx_rects(void); + const span> get_approx_rects(void); /** * Check if this sparsity map overlaps another sparsity map. @@ -246,8 +246,23 @@ namespace Realm { protected: atomic entries_valid{false}, approx_valid{false}; - std::vector> entries; - std::vector> approx_rects; + + //BOTH RegionInstance and vector are returned as a span + //only on can be valid (i.e. only finalize or gpu_finalize can be called, not both) + + //Stores rectangles for CPU deppart (easy manipulation for sort/merge entries) + std::vector > entries; + std::vector > approx_rects; + + //Stores rectangles for GPU deppart (allows fast copy after merged on GPU) + RegionInstance entries_instance = RegionInstance::NO_INST; + size_t num_entries = 0; + + RegionInstance approx_instance = RegionInstance::NO_INST; + size_t num_approx = 0; + + //Tracks whether to use instance or vector + bool from_gpu = false; }; }; // namespace Realm diff --git a/src/realm/sparsity.inl b/src/realm/sparsity.inl index a4a72fec05..7ff00ef552 100644 --- a/src/realm/sparsity.inl +++ b/src/realm/sparsity.inl @@ -18,9 +18,9 @@ // sparsity maps for Realm // nop, but helps IDEs +#include "realm/inst_layout.h" #include "realm/sparsity.h" -#include "realm/realm_assert.h" #include "realm/serialize.h" TEMPLATE_TYPE_IS_SERIALIZABLE2(int N, typename T, Realm::SparsityMap); @@ -84,19 +84,37 @@ namespace Realm { } template - inline const std::vector> & - SparsityMapPublicImpl::get_entries(void) + inline const span> 
SparsityMapPublicImpl::get_entries(void) { REALM_ASSERT(entries_valid.load_acquire()); - return entries; + if(from_gpu) { + if (num_entries == 0) { + return span>(); + } + return span>( + reinterpret_cast *>(entries_instance.pointer_untyped( + 0, num_entries * sizeof(SparsityMapEntry))), + num_entries); + } else { + return span>(entries.data(), entries.size()); + } } template - inline const std::vector> & - SparsityMapPublicImpl::get_approx_rects(void) + inline const span> SparsityMapPublicImpl::get_approx_rects(void) { REALM_ASSERT(approx_valid.load_acquire()); - return approx_rects; + if(from_gpu) { + if (num_approx == 0) { + return span>(); + } + return span>( + reinterpret_cast *>( + approx_instance.pointer_untyped(0, num_approx * sizeof(Rect))), + num_approx); + } else { + return span>(approx_rects.data(), approx_rects.size()); + } } }; // namespace Realm diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index a6213d8b46..e166888637 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -439,6 +439,10 @@ if(TEST_USE_GPU) task_stream "${REALM_TEST_DIR}/task_stream.cc" "${REALM_TEST_DIR}/task_stream_gpu.cu" ) target_link_libraries(task_stream ${TEST_GPU_LIBS}) + set(gpu_deppart_1d_ARGS -ll:gpu 1) + set(gpu_deppart_1d_RESOURCE_LOCK gpu) + add_integration_test(gpu_deppart_1d "${REALM_TEST_DIR}/gpu_deppart_1d.cc") + target_link_libraries(gpu_deppart_1d ${TEST_GPU_LIBS}) endif() #### C API tests diff --git a/tests/deppart.cc b/tests/deppart.cc index e33708daf0..815f2cb490 100644 --- a/tests/deppart.cc +++ b/tests/deppart.cc @@ -41,6 +41,10 @@ enum { TOP_LEVEL_TASK = Processor::TASK_ID_FIRST_AVAILABLE + 0, INIT_CIRCUIT_DATA_TASK, + INIT_BASIC_DATA_TASK, + INIT_TILE_DATA_TASK, + INIT_RANGE_DATA_TASK, + INIT_2D_DATA_TASK, INIT_PENNANT_DATA_TASK, INIT_MINIAERO_DATA_TASK, }; @@ -87,14 +91,14 @@ void dump_sparse_index_space(const char *pfx, IndexSpace is) if(!is.sparsity.exists()) return; SparsityMapPublicImpl *impl = is.sparsity.impl(); - const 
std::vector> &entries = impl->get_entries(); - for(typename std::vector>::const_iterator it = entries.begin(); - it != entries.end(); it++) { - std::cout << " " << it->bounds; - if(it->bitmap) - std::cout << " bitmap(" << it->bitmap << ")"; - if(it->sparsity.exists()) - std::cout << " sparsity(" << it->sparsity << ")"; + span> entries = impl->get_entries(); + for(size_t i = 0; i < entries.size(); i++) { + SparsityMapEntry entry = entries[i]; + std::cout << " " << entry.bounds; + if(entry.bitmap) + std::cout << " bitmap(" << entry.bitmap << ")"; + if(entry.sparsity.exists()) + std::cout << " sparsity(" << entry.sparsity << ")"; std::cout << "\n"; } } @@ -161,6 +165,1571 @@ int find_split(const std::vector &cuts, T v) return 0; } +/* + * Basic test - create a graph, partition it by + * node subgraph id and then check that the partitioning + * is correct + */ +class BasicTest : public TestInterface { +public: + // graph config parameters + int num_nodes = 1000; + int num_edges = 1000; + int num_pieces = 4; + std::string filename; + + BasicTest(int argc, const char *argv[]) + { + for(int i = 1; i < argc; i++) { + + if(!strcmp(argv[i], "-p")) { + num_pieces = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-n")) { + num_nodes = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-e")) { + num_edges = atoi(argv[++i]); + continue; + } + } + + + if (num_nodes <= 0 || num_pieces <= 0 || num_edges <= 0) { + log_app.error() << "Invalid config: nodes=" << num_nodes << " edges=" << num_edges << " pieces=" << num_pieces << "\n"; + exit(1); + } + } + + struct InitDataArgs { + int index; + RegionInstance ri_nodes; + RegionInstance ri_edges; + }; + + enum PRNGStreams + { + NODE_SUBGRAPH_STREAM, + }; + + // assign subgraph ids to nodes + void random_node_data(int idx, int &subgraph) + { + if(random_colors) + subgraph = + Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_pieces); + else + subgraph = idx * num_pieces / num_nodes; + } + + void 
random_edge_data(int idx, int& src, int& dst) + { + src = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_nodes); + dst = Philox_2x32<>::rand_int(random_seed, idx + 1, NODE_SUBGRAPH_STREAM, num_nodes); + } + + static void init_data_task_wrapper(const void *args, size_t arglen, + const void *userdata, size_t userlen, Processor p) + { + BasicTest *me = (BasicTest *)testcfg; + me->init_data_task(args, arglen, p); + } + + //Each piece has a task to initialize its data + void init_data_task(const void *args, size_t arglen, Processor p) + { + const InitDataArgs &i_args = *(const InitDataArgs *)args; + + log_app.info() << "init task #" << i_args.index << " (ri_nodes=" << i_args.ri_nodes + << ")"; + + i_args.ri_nodes.fetch_metadata(p).wait(); + i_args.ri_edges.fetch_metadata(p).wait(); + + IndexSpace<1> is_nodes = i_args.ri_nodes.get_indexspace<1>(); + IndexSpace<1> is_edges = i_args.ri_edges.get_indexspace<1>(); + + log_app.debug() << "N: " << is_nodes; + log_app.debug() << "E: " << is_edges; + + //For each node in the graph, mark it with a random (or deterministic) subgraph id + { + AffineAccessor a_piece_id(i_args.ri_nodes, 0 /* offset */); + + for(int i = is_nodes.bounds.lo; i <= is_nodes.bounds.hi; i++) { + int subgraph; + random_node_data(i, subgraph); + a_piece_id.write(i, subgraph); + } + + AffineAccessor,1> a_src(i_args.ri_edges, 0 /* offset */); + AffineAccessor,1> a_dst(i_args.ri_edges, sizeof(Point<1>)/* offset */); + + for(int i = is_edges.bounds.lo; i <= is_edges.bounds.hi; i++) { + int src, dst; + random_edge_data(i, src, dst); + a_src.write(i, Point<1>(src)); + a_dst.write(i, Point<1>(dst)); + } + } + + //Optionally print out the assigned subgraph ids + if(show_graph) { + AffineAccessor a_piece_id(i_args.ri_nodes, 0 /* offset */); + + for(int i = is_nodes.bounds.lo; i <= is_nodes.bounds.hi; i++) + log_app.info() << "piece_id[" << i << "] = " << a_piece_id.read(i) << "\n"; + + AffineAccessor,1> a_src(i_args.ri_edges, 0 /* offset */); + 
AffineAccessor,1> a_dst(i_args.ri_edges, sizeof(Point<1>)/* offset */); + + for(int i = is_edges.bounds.lo; i <= is_edges.bounds.hi; i++) + log_app.info() << "src, dst[" << i << "] = " << a_src.read(i) << ", " << a_dst.read(i) << "\n"; + } + } + + IndexSpace<1> is_nodes, is_edges; + std::vector ri_nodes, ri_edges; + std::vector, int> > piece_id_field_data; + std::vector, Point<1> > > src_node_field_data, dst_node_field_data; + + virtual void print_info(void) + { + printf("Realm dependent partitioning test - basic: %d nodes, %d edges, %d pieces\n", + (int)num_nodes, (int) num_edges, (int)num_pieces); + } + + virtual Event initialize_data(const std::vector &memories, + const std::vector &procs) + { + // now create index space for nodes + is_nodes = Rect<1>(0, num_nodes - 1); + is_edges = Rect<1>(0, num_edges - 1); + + // equal partition is used to do initial population of edges and nodes + std::vector > ss_nodes_eq; + std::vector > ss_edges_eq; + + log_app.info() << "Creating equal subspaces\n"; + + is_nodes.create_equal_subspaces(num_pieces, 1, ss_nodes_eq, Realm::ProfilingRequestSet()).wait(); + is_edges.create_equal_subspaces(num_pieces, 1, ss_edges_eq, Realm::ProfilingRequestSet()).wait(); + + log_app.debug() << "Initial partitions:"; + for(size_t i = 0; i < ss_nodes_eq.size(); i++) + log_app.debug() << " Nodes #" << i << ": " << ss_nodes_eq[i]; + for(size_t i = 0; i < ss_edges_eq.size(); i++) + log_app.debug() << " Edges #" << i << ": " << ss_edges_eq[i]; + + // create instances for each of these subspaces + std::vector node_fields, edge_fields; + node_fields.push_back(sizeof(int)); // piece_id + assert(sizeof(int) == sizeof(Point<1>)); + edge_fields.push_back(sizeof(Point<1>)); // src_node + edge_fields.push_back(sizeof(Point<1>)); // dst_node + + ri_nodes.resize(num_pieces); + piece_id_field_data.resize(num_pieces); + + for(size_t i = 0; i < ss_nodes_eq.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, memories[i % memories.size()], 
ss_nodes_eq[i], + node_fields, 0 /*SOA*/, + Realm::ProfilingRequestSet()) + .wait(); + ri_nodes[i] = ri; + + piece_id_field_data[i].index_space = ss_nodes_eq[i]; + piece_id_field_data[i].inst = ri_nodes[i]; + piece_id_field_data[i].field_offset = 0; + } + + + // Fire off tasks to initialize data + ri_edges.resize(num_pieces); + src_node_field_data.resize(num_pieces); + dst_node_field_data.resize(num_pieces); + + for(size_t i = 0; i < ss_edges_eq.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, + memories[i % memories.size()], + ss_edges_eq[i], + edge_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + ri_edges[i] = ri; + + src_node_field_data[i].index_space = ss_edges_eq[i]; + src_node_field_data[i].inst = ri_edges[i]; + src_node_field_data[i].field_offset = 0 * sizeof(Point<1>); + + dst_node_field_data[i].index_space = ss_edges_eq[i]; + dst_node_field_data[i].inst = ri_edges[i]; + dst_node_field_data[i].field_offset = 1 * sizeof(Point<1>); + } + + // fire off tasks to initialize data + std::set events; + for(int i = 0; i < num_pieces; i++) { + Processor p = procs[i % procs.size()]; + InitDataArgs args; + args.index = i; + args.ri_nodes = ri_nodes[i]; + args.ri_edges = ri_edges[i]; + Event e = p.spawn(INIT_BASIC_DATA_TASK, &args, sizeof(args)); + events.insert(e); + } + + return Event::merge_events(events); + } + + // the outputs of our partitioning will be: + // p_nodes - nodes partitioned by subgraph id (from GPU) + // p_nodes_cpu - nodes partitioned by subgraph id (from CPU) + + + std::vector > p_nodes, p_rd; + std::vector > p_edges, p_preimage_edges; + + std::vector > p_nodes_cpu, p_rd_cpu; + std::vector > p_edges_cpu, p_preimage_edges_cpu; + + virtual Event perform_partitioning(void) + { + // Partition nodes by subgraph id - do this twice, once on CPU and once on GPU + // Ensure that the results are identical + + std::vector colors(num_pieces); + for(int i = 0; i < num_pieces; i++) + colors[i] = i; + + // We need a GPU 
memory for GPU partitioning + Memory gpu_memory; + bool found_gpu_memory = false; + Machine machine = Machine::get_machine(); + std::set all_memories; + machine.get_all_memories(all_memories); + for(Memory memory : all_memories) { + if(memory.kind() == Memory::GPU_FB_MEM) { + gpu_memory = memory; + found_gpu_memory = true; + break; + } + } + if (!found_gpu_memory) { + log_app.error() << "No GPU memory found for partitioning test\n"; + return Event::NO_EVENT; + } + std::vector edge_fields; + edge_fields.push_back(sizeof(Point<1>)); + edge_fields.push_back(sizeof(Point<1>)) ; + std::vector node_fields; + node_fields.push_back(sizeof(int)); + + std::vector, Point<1> > > src_field_data_gpu; + std::vector, Point<1> > > dst_field_data_gpu; + std::vector, int> > piece_field_data_gpu; + piece_field_data_gpu.resize(num_pieces); + src_field_data_gpu.resize(num_pieces); + dst_field_data_gpu.resize(num_pieces); + for (int i = 0; i < num_pieces; i++) { + RegionInstance src_gpu_instance; + RegionInstance dst_gpu_instance; + RegionInstance piece_gpu_instance; + RegionInstance::create_instance(src_gpu_instance, + gpu_memory, + src_node_field_data[i].index_space, + edge_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + RegionInstance::create_instance(dst_gpu_instance, + gpu_memory, + dst_node_field_data[i].index_space, + edge_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + RegionInstance::create_instance(piece_gpu_instance, + gpu_memory, + piece_id_field_data[i].index_space, + node_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + CopySrcDstField src_gpu_field, src_cpu_field, dst_gpu_field, dst_cpu_field, piece_gpu_field, piece_cpu_field; + src_gpu_field.inst = src_gpu_instance; + src_gpu_field.size = sizeof(Point<1>); + src_gpu_field.field_id = 0; + src_cpu_field.inst = src_node_field_data[i].inst; + src_cpu_field.size = sizeof(Point<1>); + src_cpu_field.field_id = 0; + dst_gpu_field.inst = dst_gpu_instance; + dst_gpu_field.size = 
sizeof(Point<1>); + dst_gpu_field.field_id = sizeof(Point<1>); + dst_cpu_field.inst = dst_node_field_data[i].inst; + dst_cpu_field.size = sizeof(Point<1>); + dst_cpu_field.field_id = sizeof(Point<1>); + piece_gpu_field.inst = piece_gpu_instance; + piece_gpu_field.size = sizeof(int); + piece_gpu_field.field_id = 0; + piece_cpu_field.inst = piece_id_field_data[i].inst; + piece_cpu_field.size = sizeof(int); + piece_cpu_field.field_id = 0; + std::vector src_cpu_data, src_gpu_data, dst_cpu_data, dst_gpu_data, piece_cpu_data, piece_gpu_data; + src_cpu_data.push_back(src_cpu_field); + dst_cpu_data.push_back(dst_cpu_field); + src_gpu_data.push_back(src_gpu_field); + dst_gpu_data.push_back(dst_gpu_field); + piece_gpu_data.push_back(piece_gpu_field); + piece_cpu_data.push_back(piece_cpu_field); + Event copy_event = src_node_field_data[i].index_space.copy(src_cpu_data, src_gpu_data, Realm::ProfilingRequestSet()); + copy_event.wait(); + Event second_copy_event = dst_node_field_data[i].index_space.copy(dst_cpu_data, dst_gpu_data, Realm::ProfilingRequestSet()); + second_copy_event.wait(); + Event third_copy_event = piece_id_field_data[i].index_space.copy(piece_cpu_data, piece_gpu_data, Realm::ProfilingRequestSet()); + third_copy_event.wait(); + src_field_data_gpu[i].inst = src_gpu_instance; + src_field_data_gpu[i].index_space = src_node_field_data[i].index_space; + src_field_data_gpu[i].field_offset = 0; + dst_field_data_gpu[i].inst = dst_gpu_instance; + dst_field_data_gpu[i].index_space = dst_node_field_data[i].index_space; + dst_field_data_gpu[i].field_offset = 1 * sizeof(Point<1>); + piece_field_data_gpu[i].inst = piece_gpu_instance; + piece_field_data_gpu[i].index_space = piece_id_field_data[i].index_space; + piece_field_data_gpu[i].field_offset = 0; + } + wait_on_events = true; + log_app.info() << "warming up" << Clock::current_time_in_microseconds() << "\n"; + std::vector > p_garbage_nodes, p_garbage_edges, p_garbage_rd, p_garbage_preimage_edges; + Event e01 = 
is_nodes.create_subspaces_by_field(piece_field_data_gpu, + colors, + p_garbage_nodes, + Realm::ProfilingRequestSet()); + if (wait_on_events) e01.wait(); + Event e02 = is_edges.create_subspaces_by_preimage(dst_node_field_data, + p_garbage_nodes, + p_garbage_edges, + Realm::ProfilingRequestSet(), + e01); + if(wait_on_events) e02.wait(); + std::pair estimate; + Event _e = is_nodes.gpu_subspaces_by_image(src_field_data_gpu, + p_garbage_edges, + p_garbage_rd, + Realm::ProfilingRequestSet(), + estimate, + RegionInstance::NO_INST, + e02); + std::cout << "Minimum size: " << estimate.first << " bytes, " + << "Maximum size: " << estimate.second << " bytes\n"; + + // an image of p_edges through out_node gives us all the shared nodes, along + // with some private nodes + const char* val = std::getenv("TILE_SIZE"); // or any env var + size_t tile_size = 100000000; //default + if (val) { + tile_size = atoi(val); + } + std::vector byte_fields = {sizeof(char)}; + IndexSpace<1> instance_index_space(Rect<1>(0, tile_size-1)); + RegionInstance buffer; + RegionInstance::create_instance(buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); + estimate.first = tile_size; + Event e03 = is_nodes.gpu_subspaces_by_image(src_field_data_gpu, + p_garbage_edges, + p_garbage_rd, + Realm::ProfilingRequestSet(), + estimate, + buffer, + e02); + if(wait_on_events) e03.wait(); + + Event e04 = is_edges.create_subspaces_by_preimage(dst_node_field_data, + p_garbage_rd, + p_garbage_preimage_edges, + Realm::ProfilingRequestSet(), + e03); + e04.wait(); + log_app.info() << "warming up complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting GPU Partitioning " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting GPU By Field " << Clock::current_time_in_microseconds() << "\n"; + Event e1 = is_nodes.create_subspaces_by_field(piece_field_data_gpu, + colors, + p_nodes, + Realm::ProfilingRequestSet()); + 
if(wait_on_events) e1.wait(); + log_app.info() << "GPU By Field complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting GPU Preimage " << Clock::current_time_in_microseconds() << "\n"; + // now compute p_edges based on the color of their in_node (i.e. a preimage) + Event e2 = is_edges.create_subspaces_by_preimage(dst_node_field_data, + p_nodes, + p_edges, + Realm::ProfilingRequestSet(), + e1); + if(wait_on_events) e2.wait(); + log_app.info() << "GPU Preimage complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting GPU Image " << Clock::current_time_in_microseconds() << "\n"; + + // an image of p_edges through out_node gives us all the shared nodes, along + // with some private nodes + Event e3 = is_nodes.gpu_subspaces_by_image(src_field_data_gpu, + p_edges, + p_rd, + Realm::ProfilingRequestSet(), + estimate, + buffer, + e2); + if(wait_on_events) e3.wait(); + log_app.info() << "GPU Image complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting second GPU preimage " << Clock::current_time_in_microseconds() << "\n"; + + Event e4 = is_edges.create_subspaces_by_preimage(dst_node_field_data, + p_rd, + p_preimage_edges, + Realm::ProfilingRequestSet(), + e3); + e4.wait(); + log_app.info() << "Second GPU preimage complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "GPU Partitioning complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting CPU Partitioning " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting CPU By Field " << Clock::current_time_in_microseconds() << "\n"; + Event e5 = is_nodes.create_subspaces_by_field(piece_id_field_data, + colors, + p_nodes_cpu, + Realm::ProfilingRequestSet()); + if(wait_on_events) e5.wait(); + log_app.info() << "CPU By Field complete " << Clock::current_time_in_microseconds() << "\n"; + // now compute p_edges based on the color of their in_node 
(i.e. a preimage) + log_app.info() << "Starting CPU Preimage " << Clock::current_time_in_microseconds() << "\n"; + Event e6 = is_edges.create_subspaces_by_preimage(dst_node_field_data, + p_nodes_cpu, + p_edges_cpu, + Realm::ProfilingRequestSet(), + e5); + if(wait_on_events) e6.wait(); + log_app.info() << "CPU Preimage complete " << Clock::current_time_in_microseconds() << "\n"; + + // an image of p_edges through out_node gives us all the shared nodes, along + // with some private nodes + log_app.info() << "Starting CPU Image " << Clock::current_time_in_microseconds() << "\n"; + Event e7 = is_nodes.create_subspaces_by_image(src_node_field_data, + p_edges_cpu, + p_rd_cpu, + Realm::ProfilingRequestSet(), + e6); + if(wait_on_events) e7.wait(); + log_app.info() << "CPU Image complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting second CPU preimage " << Clock::current_time_in_microseconds() << "\n"; + + Event e8 = is_edges.create_subspaces_by_preimage(dst_node_field_data, + p_rd_cpu, + p_preimage_edges_cpu, + Realm::ProfilingRequestSet(), + e7); + e8.wait(); + log_app.info() << "Second CPU preimage complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "CPU Partitioning complete " << Clock::current_time_in_microseconds() << "\n"; + return e8; + } + + virtual int perform_dynamic_checks(void) + { + // Nothing to do here + return 0; + } + + virtual int check_partitioning(void) + { + int errors = 0; + + if (!p_nodes.size()) { + return 0; + } + + log_app.info() << "Checking correctness of partitioning " << "\n"; + + for(int i = 0; i < num_pieces; i++) { + for(IndexSpaceIterator<1> it(p_nodes[i]); it.valid; it.step()) { + for(PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_nodes_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! 
GPU has extra byfield point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for(IndexSpaceIterator<1> it(p_nodes_cpu[i]); it.valid; it.step()) { + for(PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_nodes[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing byfield point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_edges[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_edges_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU has extra preimage edge " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_edges_cpu[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_edges[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing preimage edge " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_rd[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_rd_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU has extra image node " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_rd_cpu[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_rd[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing image node " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_preimage_edges[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_preimage_edges_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! 
GPU has extra second preimage edge " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_preimage_edges_cpu[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_preimage_edges[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing second preimage edge " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + + } + return errors; + } +}; + +class TileTest : public TestInterface { +public: + // graph config parameters + int num_nodes = 1000; + int num_edges = 1000; + int num_pieces = 4; + int num_tiles = 1; + std::string filename; + + TileTest(int argc, const char *argv[]) + { + for(int i = 1; i < argc; i++) { + + if(!strcmp(argv[i], "-p")) { + num_pieces = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-n")) { + num_nodes = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-e")) { + num_edges = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-t")) { + num_tiles = atoi(argv[++i]); + continue; + } + } + + + if (num_nodes <= 0 || num_pieces <= 0 || num_edges <= 0) { + log_app.error() << "Invalid config: nodes=" << num_nodes << " edges=" << num_edges << " pieces=" << num_pieces << "\n"; + exit(1); + } + } + + struct InitDataArgs { + int index; + RegionInstance ri_nodes; + RegionInstance ri_edges; + }; + + enum PRNGStreams + { + NODE_SUBGRAPH_STREAM, + }; + + // assign subgraph ids to nodes + void random_node_data(int idx, int &subgraph) + { + if(random_colors) + subgraph = + Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_pieces); + else + subgraph = idx * num_pieces / num_nodes; + } + + void random_edge_data(int idx, int& src, int& dst) + { + src = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_nodes); + dst = Philox_2x32<>::rand_int(random_seed, idx + 1, NODE_SUBGRAPH_STREAM, num_nodes); + } + + static void init_data_task_wrapper(const void *args, size_t arglen, + const 
void *userdata, size_t userlen, Processor p) + { + TileTest *me = (TileTest *)testcfg; + me->init_data_task(args, arglen, p); + } + + //Each piece has a task to initialize its data + void init_data_task(const void *args, size_t arglen, Processor p) + { + const InitDataArgs &i_args = *(const InitDataArgs *)args; + + log_app.info() << "init task #" << i_args.index << " (ri_nodes=" << i_args.ri_nodes + << ")"; + + i_args.ri_nodes.fetch_metadata(p).wait(); + i_args.ri_edges.fetch_metadata(p).wait(); + + IndexSpace<1> is_nodes = i_args.ri_nodes.get_indexspace<1>(); + IndexSpace<1> is_edges = i_args.ri_edges.get_indexspace<1>(); + + log_app.debug() << "N: " << is_nodes; + log_app.debug() << "E: " << is_edges; + + //For each node in the graph, mark it with a random (or deterministic) subgraph id + { + AffineAccessor a_piece_id(i_args.ri_nodes, 0 /* offset */); + + for(int i = is_nodes.bounds.lo; i <= is_nodes.bounds.hi; i++) { + int subgraph; + random_node_data(i, subgraph); + a_piece_id.write(i, subgraph); + } + + AffineAccessor,1> a_src(i_args.ri_edges, 0 /* offset */); + AffineAccessor,1> a_dst(i_args.ri_edges, sizeof(Point<1>)/* offset */); + + for(int i = is_edges.bounds.lo; i <= is_edges.bounds.hi; i++) { + int src, dst; + random_edge_data(i, src, dst); + a_src.write(i, Point<1>(src)); + a_dst.write(i, Point<1>(dst)); + } + } + + //Optionally print out the assigned subgraph ids + if(show_graph) { + AffineAccessor a_piece_id(i_args.ri_nodes, 0 /* offset */); + + for(int i = is_nodes.bounds.lo; i <= is_nodes.bounds.hi; i++) + log_app.info() << "piece_id[" << i << "] = " << a_piece_id.read(i) << "\n"; + + AffineAccessor,1> a_src(i_args.ri_edges, 0 /* offset */); + AffineAccessor,1> a_dst(i_args.ri_edges, sizeof(Point<1>)/* offset */); + + for(int i = is_edges.bounds.lo; i <= is_edges.bounds.hi; i++) + log_app.info() << "src, dst[" << i << "] = " << a_src.read(i) << ", " << a_dst.read(i) << "\n"; + } + } + + IndexSpace<1> is_nodes, is_edges; + std::vector ri_nodes, 
ri_edges; + std::vector, int> > piece_id_field_data; + std::vector, Point<1> > > src_node_field_data, dst_node_field_data; + + virtual void print_info(void) + { + printf("Realm dependent partitioning test - tile: %d nodes, %d edges, %d pieces, %d tiles\n", + (int)num_nodes, (int) num_edges, (int)num_pieces, (int)num_tiles); + } + + virtual Event initialize_data(const std::vector &memories, + const std::vector &procs) + { + // now create index space for nodes + is_nodes = Rect<1>(0, num_nodes - 1); + is_edges = Rect<1>(0, num_edges - 1); + // equal partition is used to do initial population of edges and nodes + std::vector > ss_nodes_eq; + std::vector > ss_edges_eq; + + log_app.info() << "Creating equal subspaces\n"; + + is_nodes.create_equal_subspaces(num_pieces, 1, ss_nodes_eq, Realm::ProfilingRequestSet()).wait(); + is_edges.create_equal_subspaces(num_pieces, 1, ss_edges_eq, Realm::ProfilingRequestSet()).wait(); + + log_app.debug() << "Initial partitions:"; + for(size_t i = 0; i < ss_nodes_eq.size(); i++) + log_app.debug() << " Nodes #" << i << ": " << ss_nodes_eq[i]; + for(size_t i = 0; i < ss_edges_eq.size(); i++) + log_app.debug() << " Edges #" << i << ": " << ss_edges_eq[i]; + + // create instances for each of these subspaces + std::vector node_fields, edge_fields; + node_fields.push_back(sizeof(int)); // piece_id + assert(sizeof(int) == sizeof(Point<1>)); + edge_fields.push_back(sizeof(Point<1>)); // src_node + edge_fields.push_back(sizeof(Point<1>)); // dst_node + + ri_nodes.resize(num_pieces); + piece_id_field_data.resize(num_pieces); + + for(size_t i = 0; i < ss_nodes_eq.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, memories[i % memories.size()], ss_nodes_eq[i], + node_fields, 0 /*SOA*/, + Realm::ProfilingRequestSet()) + .wait(); + ri_nodes[i] = ri; + + piece_id_field_data[i].index_space = ss_nodes_eq[i]; + piece_id_field_data[i].inst = ri_nodes[i]; + piece_id_field_data[i].field_offset = 0; + } + + + // Fire off tasks to 
initialize data + ri_edges.resize(num_pieces); + src_node_field_data.resize(num_pieces); + dst_node_field_data.resize(num_pieces); + + for(size_t i = 0; i < ss_edges_eq.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, + memories[i % memories.size()], + ss_edges_eq[i], + edge_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + ri_edges[i] = ri; + + src_node_field_data[i].index_space = ss_edges_eq[i]; + src_node_field_data[i].inst = ri_edges[i]; + src_node_field_data[i].field_offset = 0 * sizeof(Point<1>); + + dst_node_field_data[i].index_space = ss_edges_eq[i]; + dst_node_field_data[i].inst = ri_edges[i]; + dst_node_field_data[i].field_offset = 1 * sizeof(Point<1>); + } + + // fire off tasks to initialize data + std::set events; + for(int i = 0; i < num_pieces; i++) { + Processor p = procs[i % procs.size()]; + InitDataArgs args; + args.index = i; + args.ri_nodes = ri_nodes[i]; + args.ri_edges = ri_edges[i]; + Event e = p.spawn(INIT_TILE_DATA_TASK, &args, sizeof(args)); + events.insert(e); + } + + return Event::merge_events(events); + } + + // the outputs of our partitioning will be: + // p_nodes - nodes partitioned by subgraph id (from GPU) + // p_nodes_cpu - nodes partitioned by subgraph id (from CPU) + + + std::vector > p_nodes, p_rd; + std::vector > p_edges, p_preimage_edges; + + std::vector > p_nodes_cpu, p_rd_cpu; + std::vector > p_edges_cpu, p_preimage_edges_cpu; + + virtual Event perform_partitioning(void) + { + // Partition nodes by subgraph id - do this twice, once on CPU and once on GPU + // Ensure that the results are identical + + std::vector colors(num_pieces); + for(int i = 0; i < num_pieces; i++) + colors[i] = i; + + // We need a GPU memory for GPU partitioning + Memory gpu_memory; + bool found_gpu_memory = false; + Machine machine = Machine::get_machine(); + std::set all_memories; + machine.get_all_memories(all_memories); + for(Memory memory : all_memories) { + if(memory.kind() == Memory::GPU_FB_MEM) { + gpu_memory 
= memory; + found_gpu_memory = true; + break; + } + } + if (!found_gpu_memory) { + log_app.error() << "No GPU memory found for partitioning test\n"; + return Event::NO_EVENT; + } + std::vector edge_fields; + edge_fields.push_back(sizeof(Point<1>)); + edge_fields.push_back(sizeof(Point<1>)) ; + std::vector node_fields; + node_fields.push_back(sizeof(int)); + + std::vector, Point<1> > > src_field_data_gpu; + std::vector, Point<1> > > dst_field_data_gpu; + std::vector, int> > piece_field_data_gpu; + piece_field_data_gpu.resize(num_pieces); + src_field_data_gpu.resize(num_pieces); + dst_field_data_gpu.resize(num_pieces); + for (int i = 0; i < num_pieces; i++) { + RegionInstance src_gpu_instance; + RegionInstance dst_gpu_instance; + RegionInstance piece_gpu_instance; + RegionInstance::create_instance(src_gpu_instance, + gpu_memory, + src_node_field_data[i].index_space, + edge_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + RegionInstance::create_instance(dst_gpu_instance, + gpu_memory, + dst_node_field_data[i].index_space, + edge_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + RegionInstance::create_instance(piece_gpu_instance, + gpu_memory, + piece_id_field_data[i].index_space, + node_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + CopySrcDstField src_gpu_field, src_cpu_field, dst_gpu_field, dst_cpu_field, piece_gpu_field, piece_cpu_field; + src_gpu_field.inst = src_gpu_instance; + src_gpu_field.size = sizeof(Point<1>); + src_gpu_field.field_id = 0; + src_cpu_field.inst = src_node_field_data[i].inst; + src_cpu_field.size = sizeof(Point<1>); + src_cpu_field.field_id = 0; + dst_gpu_field.inst = dst_gpu_instance; + dst_gpu_field.size = sizeof(Point<1>); + dst_gpu_field.field_id = sizeof(Point<1>); + dst_cpu_field.inst = dst_node_field_data[i].inst; + dst_cpu_field.size = sizeof(Point<1>); + dst_cpu_field.field_id = sizeof(Point<1>); + piece_gpu_field.inst = piece_gpu_instance; + piece_gpu_field.size = sizeof(int); + 
piece_gpu_field.field_id = 0; + piece_cpu_field.inst = piece_id_field_data[i].inst; + piece_cpu_field.size = sizeof(int); + piece_cpu_field.field_id = 0; + std::vector src_cpu_data, src_gpu_data, dst_cpu_data, dst_gpu_data, piece_cpu_data, piece_gpu_data; + src_cpu_data.push_back(src_cpu_field); + dst_cpu_data.push_back(dst_cpu_field); + src_gpu_data.push_back(src_gpu_field); + dst_gpu_data.push_back(dst_gpu_field); + piece_gpu_data.push_back(piece_gpu_field); + piece_cpu_data.push_back(piece_cpu_field); + Event copy_event = src_node_field_data[i].index_space.copy(src_cpu_data, src_gpu_data, Realm::ProfilingRequestSet()); + copy_event.wait(); + Event second_copy_event = dst_node_field_data[i].index_space.copy(dst_cpu_data, dst_gpu_data, Realm::ProfilingRequestSet()); + second_copy_event.wait(); + Event third_copy_event = piece_id_field_data[i].index_space.copy(piece_cpu_data, piece_gpu_data, Realm::ProfilingRequestSet()); + third_copy_event.wait(); + src_field_data_gpu[i].inst = src_gpu_instance; + src_field_data_gpu[i].index_space = src_node_field_data[i].index_space; + src_field_data_gpu[i].field_offset = 0; + dst_field_data_gpu[i].inst = dst_gpu_instance; + dst_field_data_gpu[i].index_space = dst_node_field_data[i].index_space; + dst_field_data_gpu[i].field_offset = 1 * sizeof(Point<1>); + piece_field_data_gpu[i].inst = piece_gpu_instance; + piece_field_data_gpu[i].index_space = piece_id_field_data[i].index_space; + piece_field_data_gpu[i].field_offset = 0; + } + wait_on_events = true; + log_app.info() << "warming up" << Clock::current_time_in_microseconds() << "\n"; + std::vector > p_garbage_nodes, p_garbage_edges, p_garbage_rd, p_garbage_preimage_edges; + Event e01 = is_nodes.create_subspaces_by_field(piece_field_data_gpu, + colors, + p_garbage_nodes, + Realm::ProfilingRequestSet()); + if (wait_on_events) e01.wait(); + Event e02 = is_edges.create_subspaces_by_preimage(dst_node_field_data, + p_garbage_nodes, + p_garbage_edges, + Realm::ProfilingRequestSet(), + 
e01); + if(wait_on_events) e02.wait(); + + // an image of p_edges through out_node gives us all the shared nodes, along + // with some private nodes + Event e03 = is_nodes.create_subspaces_by_image(src_field_data_gpu, + p_garbage_edges, + p_garbage_rd, + Realm::ProfilingRequestSet(), + e02); + if(wait_on_events) e03.wait(); + + Event e04 = is_edges.create_subspaces_by_preimage(dst_node_field_data, + p_garbage_rd, + p_garbage_preimage_edges, + Realm::ProfilingRequestSet(), + e03); + e04.wait(); + log_app.info() << "warming up complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting GPU Partitioning " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting GPU By Field " << Clock::current_time_in_microseconds() << "\n"; + Event e1 = is_nodes.create_subspaces_by_field(piece_field_data_gpu, + colors, + p_nodes, + Realm::ProfilingRequestSet()); + if(wait_on_events) e1.wait(); + log_app.info() << "GPU By Field complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting GPU Preimage " << Clock::current_time_in_microseconds() << "\n"; + // now compute p_edges based on the color of their in_node (i.e. 
a preimage) + Event e2 = is_edges.create_subspaces_by_preimage(dst_node_field_data, + p_nodes, + p_edges, + Realm::ProfilingRequestSet(), + e1); + if(wait_on_events) e2.wait(); + log_app.info() << "GPU Preimage complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting GPU Image " << Clock::current_time_in_microseconds() << "\n"; + + // an image of p_edges through out_node gives us all the shared nodes, along + // with some private nodes + Event e3 = is_nodes.create_subspaces_by_image(src_field_data_gpu, + p_edges, + p_rd, + Realm::ProfilingRequestSet(), + e2); + if(wait_on_events) e3.wait(); + log_app.info() << "GPU Image complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting second GPU preimage " << Clock::current_time_in_microseconds() << "\n"; + + Event e4 = is_edges.create_subspaces_by_preimage(dst_node_field_data, + p_rd, + p_preimage_edges, + Realm::ProfilingRequestSet(), + e3); + e4.wait(); + log_app.info() << "Second GPU preimage complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "GPU Partitioning complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting CPU Partitioning " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting CPU By Field " << Clock::current_time_in_microseconds() << "\n"; + Event e5 = is_nodes.create_subspaces_by_field(piece_id_field_data, + colors, + p_nodes_cpu, + Realm::ProfilingRequestSet()); + if(wait_on_events) e5.wait(); + log_app.info() << "CPU By Field complete " << Clock::current_time_in_microseconds() << "\n"; + // now compute p_edges based on the color of their in_node (i.e. 
a preimage) + log_app.info() << "Starting CPU Preimage " << Clock::current_time_in_microseconds() << "\n"; + Event e6 = is_edges.create_subspaces_by_preimage(dst_node_field_data, + p_nodes_cpu, + p_edges_cpu, + Realm::ProfilingRequestSet(), + e5); + if(wait_on_events) e6.wait(); + log_app.info() << "CPU Preimage complete " << Clock::current_time_in_microseconds() << "\n"; + + // an image of p_edges through out_node gives us all the shared nodes, along + // with some private nodes + log_app.info() << "Starting CPU Image " << Clock::current_time_in_microseconds() << "\n"; + Event e7 = is_nodes.create_subspaces_by_image(src_node_field_data, + p_edges_cpu, + p_rd_cpu, + Realm::ProfilingRequestSet(), + e6); + if(wait_on_events) e7.wait(); + log_app.info() << "CPU Image complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting second CPU preimage " << Clock::current_time_in_microseconds() << "\n"; + + Event e8 = is_edges.create_subspaces_by_preimage(dst_node_field_data, + p_rd_cpu, + p_preimage_edges_cpu, + Realm::ProfilingRequestSet(), + e7); + e8.wait(); + log_app.info() << "Second CPU preimage complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "CPU Partitioning complete " << Clock::current_time_in_microseconds() << "\n"; + return e8; + } + + virtual int perform_dynamic_checks(void) + { + // Nothing to do here + return 0; + } + + virtual int check_partitioning(void) + { + int errors = 0; + + if (!p_nodes.size()) { + return 0; + } + + log_app.info() << "Checking correctness of partitioning " << "\n"; + + for(int i = 0; i < num_pieces; i++) { + for(IndexSpaceIterator<1> it(p_nodes[i]); it.valid; it.step()) { + for(PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_nodes_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! 
GPU has extra byfield point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for(IndexSpaceIterator<1> it(p_nodes_cpu[i]); it.valid; it.step()) { + for(PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_nodes[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing byfield point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_edges[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_edges_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU has extra preimage edge " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_edges_cpu[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_edges[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing preimage edge " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_rd[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_rd_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU has extra image node " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_rd_cpu[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_rd[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing image node " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_preimage_edges[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_preimage_edges_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! 
GPU has extra second preimage edge " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_preimage_edges_cpu[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_preimage_edges[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing second preimage edge " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + + } + return errors; + } +}; + +class RangeTest : public TestInterface { +public: + // graph config parameters + int num_nodes = 1000; + int num_rects = 1000; + int max_rect_size = 10; + int num_pieces = 4; + std::string filename; + + RangeTest(int argc, const char *argv[]) + { + for(int i = 1; i < argc; i++) { + + if(!strcmp(argv[i], "-p")) { + num_pieces = atoi(argv[++i]); + continue; + } + + if(!strcmp(argv[i], "-n")) { + num_nodes = atoi(argv[++i]); + continue; + } + + if(!strcmp(argv[i], "-r")) { + num_rects = atoi(argv[++i]); + continue; + } + + if(!strcmp(argv[i], "-m")) { + max_rect_size = atoi(argv[++i]); + continue; + } + } + + + + if (num_nodes <= 0 || num_rects <= 0) { + log_app.error() << "Invalid graph dimensions in input file: rects=" << num_rects << " nodes=" << num_nodes; + exit(1); + } + + } + + + + struct InitDataArgs { + int index; + RegionInstance ri_nodes; + RegionInstance ri_rects; + }; + + enum PRNGStreams { + NODE_SUBGRAPH_STREAM, + }; + + void random_rect_data(int idx, int& subgraph) + { + if(random_colors) + subgraph = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_pieces); + else + subgraph = idx * num_pieces / num_rects; + } + + void random_node_data(int idx, int& subgraph) + { + if(true) + subgraph = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_pieces); + else + subgraph = idx * num_pieces / num_nodes; + } + + void initialize_rect_data(int idx, Rect<1> &rect, int max_rect_size = 10) + { + + int first = Philox_2x32<>::rand_int(random_seed, idx, 
NODE_SUBGRAPH_STREAM, num_nodes); + int amount = Philox_2x32<>::rand_int(random_seed, idx + 1, NODE_SUBGRAPH_STREAM, max_rect_size); + rect = Rect<1>(first, first + amount); + } + + + static void init_data_task_wrapper(const void *args, size_t arglen, + const void *userdata, size_t userlen, Processor p) + { + RangeTest *me = (RangeTest *)testcfg; + me->init_data_task(args, arglen, p); + } + + void init_data_task(const void *args, size_t arglen, Processor p) + { + const InitDataArgs& i_args = *(const InitDataArgs *)args; + + log_app.info() << "init task #" << i_args.index << " (ri_nodes=" << i_args.ri_nodes << ", ri_rects=" << i_args.ri_rects << ")"; + + i_args.ri_nodes.fetch_metadata(p).wait(); + i_args.ri_rects.fetch_metadata(p).wait(); + + IndexSpace<1> is_nodes = i_args.ri_nodes.get_indexspace<1>(); + IndexSpace<1> is_rects = i_args.ri_rects.get_indexspace<1>(); + + log_app.debug() << "N: " << is_nodes; + log_app.debug() << "E: " << is_rects; + + //Write out colors and rectangles + + { + AffineAccessor a_rect_id(i_args.ri_rects, 0 /* offset */); + + for(int i = is_rects.bounds.lo; i <= is_rects.bounds.hi; i++) { + int subgraph; + random_rect_data(i, subgraph); + a_rect_id.write(i, subgraph); + } + } + { + AffineAccessor a_piece_id(i_args.ri_nodes, 0 /* offset */); + + for(int i = is_nodes.bounds.lo; i <= is_nodes.bounds.hi; i++) { + int subgraph; + random_node_data(i, subgraph); + a_piece_id.write(i, subgraph); + } + } + + + { + + AffineAccessor, 1> a_rect_val(i_args.ri_rects, 1 * sizeof(int) /* offset */); + + for(int i = is_rects.bounds.lo; i <= is_rects.bounds.hi; i++) { + Rect<1> rect; + initialize_rect_data(i, rect, max_rect_size); + a_rect_val.write(i, rect); + } + } + + if(show_graph) { + AffineAccessor a_piece_id(i_args.ri_nodes, 0 /* offset */); + + for(int i = is_nodes.bounds.lo; i <= is_nodes.bounds.hi; i++) + log_app.info() << "node_id[" << i << "] = " << a_piece_id.read(i) << "\n"; + + AffineAccessor a_rect_id(i_args.ri_rects, 0 * sizeof(Point<1>) 
/* offset */); + + for(int i = is_rects.bounds.lo; i <= is_rects.bounds.hi; i++) + log_app.info() << "rect_id[" << i << "] = " << a_rect_id.read(i) << "\n"; + + AffineAccessor,1> a_rect_val(i_args.ri_rects, 1 * sizeof(int) /* offset */); + + for(int i = is_rects.bounds.lo; i <= is_rects.bounds.hi; i++) + log_app.info() << "rect_val[" << i << "] = " << a_rect_val.read(i) << "\n"; + } + } + + IndexSpace<1> is_nodes, is_rects; + std::vector ri_nodes; + std::vector, int> > node_id_field_data; + std::vector ri_rects; + std::vector, int> > rect_id_field_data; + std::vector, Rect<1> > > rect_val_field_data; + + virtual void print_info(void) + { + printf("Realm dependent partitioning test - ranges: %d nodes, %d rects, %d pieces\n", + (int)num_nodes, (int)num_rects, (int)num_pieces); + } + + virtual Event initialize_data(const std::vector& memories, + const std::vector& procs) + { + // now create index spaces for nodes and edges + is_nodes = Rect<1>(0, num_nodes - 1); + is_rects = Rect<1>(0, num_rects - 1); + + // equal partition is used to do initial population of edges and nodes + std::vector > ss_nodes_eq; + std::vector > ss_rects_eq; + + log_app.info() << "Creating equal subspaces" << "\n"; + + is_nodes.create_equal_subspaces(num_pieces, 1, ss_nodes_eq, Realm::ProfilingRequestSet()).wait(); + is_rects.create_equal_subspaces(num_pieces, 1, ss_rects_eq, Realm::ProfilingRequestSet()).wait(); + + log_app.debug() << "Initial partitions:"; + for(size_t i = 0; i < ss_nodes_eq.size(); i++) + log_app.debug() << " Nodes #" << i << ": " << ss_nodes_eq[i]; + for(size_t i = 0; i < ss_rects_eq.size(); i++) + log_app.debug() << " Rects #" << i << ": " << ss_rects_eq[i]; + + // create instances for each of these subspaces + std::vector node_fields, rect_fields; + node_fields.push_back(sizeof(int)); // piece_id + rect_fields.push_back(sizeof(int)); // src_node + rect_fields.push_back(sizeof(Rect<1>)); // dst_node + + ri_nodes.resize(num_pieces); + node_id_field_data.resize(num_pieces); 
+ + for(size_t i = 0; i < ss_nodes_eq.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, + memories[i % memories.size()], + ss_nodes_eq[i], + node_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + ri_nodes[i] = ri; + + node_id_field_data[i].index_space = ss_nodes_eq[i]; + node_id_field_data[i].inst = ri_nodes[i]; + node_id_field_data[i].field_offset = 0; + } + + ri_rects.resize(num_pieces); + rect_id_field_data.resize(num_pieces); + rect_val_field_data.resize(num_pieces); + + for(size_t i = 0; i < ss_rects_eq.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, + memories[i % memories.size()], + ss_rects_eq[i], + rect_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + ri_rects[i] = ri; + + rect_id_field_data[i].index_space = ss_rects_eq[i]; + rect_id_field_data[i].inst = ri_rects[i]; + rect_id_field_data[i].field_offset = 0; + + rect_val_field_data[i].index_space = ss_rects_eq[i]; + rect_val_field_data[i].inst = ri_rects[i]; + rect_val_field_data[i].field_offset = 1 * sizeof(int); + } + + // fire off tasks to initialize data + std::set events; + for(int i = 0; i < num_pieces; i++) { + Processor p = procs[i % procs.size()]; + InitDataArgs args; + args.index = i; + args.ri_nodes = ri_nodes[i]; + args.ri_rects = ri_rects[i]; + Event e = p.spawn(INIT_RANGE_DATA_TASK, &args, sizeof(args)); + events.insert(e); + } + + return Event::merge_events(events); + } + + // the outputs of our partitioning will be: + //p_colored_rects -> all of our rectangles marked with the color given by random_rect_data + //p_rects -> image range by p colored rects into nodes + + std::vector > p_colored_rects, p_rects; + std::vector > p_colored_rects_cpu, p_rects_cpu; + + virtual Event perform_partitioning(void) + { + + std::vector colors(num_pieces); + for(int i = 0; i < num_pieces; i++) + colors[i] = i; + + Memory gpu_memory; + bool found_gpu_memory = false; + Machine machine = Machine::get_machine(); + std::set all_memories; 
+ machine.get_all_memories(all_memories); + for(auto& memory : all_memories) { + if(memory.kind() == Memory::GPU_FB_MEM) { + gpu_memory = memory; + found_gpu_memory = true; + break; + } + } + assert(found_gpu_memory); + std::vector rect_fields; + rect_fields.push_back(sizeof(int)); + rect_fields.push_back(sizeof(Rect<1>)); + std::vector node_fields; + node_fields.push_back(sizeof(int)); + + std::vector, int > > node_id_data_gpu; + std::vector, int > > rect_id_data_gpu; + std::vector, Rect<1>>> rect_val_data_gpu; + node_id_data_gpu.resize(num_pieces); + rect_id_data_gpu.resize(num_pieces); + rect_val_data_gpu.resize(num_pieces); + for (int i = 0; i < num_pieces; i++) { + RegionInstance node_id_instance; + RegionInstance rect_id_instance; + RegionInstance rect_val_instance; + RegionInstance::create_instance(node_id_instance, + gpu_memory, + node_id_field_data[i].index_space, + node_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + RegionInstance::create_instance(rect_id_instance, + gpu_memory, + rect_id_field_data[i].index_space, + rect_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + RegionInstance::create_instance(rect_val_instance, + gpu_memory, + rect_val_field_data[i].index_space, + rect_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + CopySrcDstField node_id_gpu_field, node_id_cpu_field, rect_id_gpu_field, rect_id_cpu_field, rect_val_gpu_field, rect_val_cpu_field; + node_id_gpu_field.inst = node_id_instance; + node_id_gpu_field.size = sizeof(int); + node_id_gpu_field.field_id = 0; + node_id_cpu_field.inst = node_id_field_data[i].inst; + node_id_cpu_field.size = sizeof(int); + node_id_cpu_field.field_id = 0; + rect_id_gpu_field.inst = rect_id_instance; + rect_id_gpu_field.size = sizeof(int); + rect_id_gpu_field.field_id = 0; + rect_id_cpu_field.inst = rect_id_field_data[i].inst; + rect_id_cpu_field.size = sizeof(int); + rect_id_cpu_field.field_id = 0; + rect_val_gpu_field.inst = rect_val_instance; + 
rect_val_gpu_field.size = sizeof(Rect<1>); + rect_val_gpu_field.field_id = sizeof(int); + rect_val_cpu_field.inst = rect_val_field_data[i].inst; + rect_val_cpu_field.size = sizeof(Rect<1>); + rect_val_cpu_field.field_id = sizeof(int); + std::vector node_id_gpu_data, node_id_cpu_data, rect_id_gpu_data, rect_id_cpu_data, rect_val_gpu_data, rect_val_cpu_data; + node_id_gpu_data.push_back(node_id_gpu_field); + node_id_cpu_data.push_back(node_id_cpu_field); + rect_id_gpu_data.push_back(rect_id_gpu_field); + rect_id_cpu_data.push_back(rect_id_cpu_field); + rect_val_gpu_data.push_back(rect_val_gpu_field); + rect_val_cpu_data.push_back(rect_val_cpu_field); + Event copy_event = node_id_field_data[i].index_space.copy(node_id_cpu_data, node_id_gpu_data, Realm::ProfilingRequestSet()); + copy_event.wait(); + Event second_copy_event = rect_id_field_data[i].index_space.copy(rect_id_cpu_data, rect_id_gpu_data, Realm::ProfilingRequestSet()); + second_copy_event.wait(); + Event third_copy_event = rect_val_field_data[i].index_space.copy(rect_val_cpu_data, rect_val_gpu_data, Realm::ProfilingRequestSet()); + third_copy_event.wait(); + node_id_data_gpu[i].inst = node_id_instance; + node_id_data_gpu[i].index_space = node_id_field_data[i].index_space; + node_id_data_gpu[i].field_offset = 0; + rect_id_data_gpu[i].inst = rect_id_instance; + rect_id_data_gpu[i].index_space = rect_id_field_data[i].index_space; + rect_id_data_gpu[i].field_offset = 0; + rect_val_data_gpu[i].inst = rect_val_instance; + rect_val_data_gpu[i].index_space = rect_val_field_data[i].index_space; + rect_val_data_gpu[i].field_offset = sizeof(int); + } + wait_on_events = true; + std::vector> p_garbage_rects, p_garbage_colors; + log_app.info() << "WARMING UP " << "\n"; + + Event e001 = is_rects.create_subspaces_by_field(rect_id_data_gpu, + colors, + p_garbage_colors, + Realm::ProfilingRequestSet()); + if (wait_on_events) e001.wait(); + Event e002 = is_nodes.create_subspaces_by_image(rect_val_data_gpu, + p_garbage_colors, + 
p_garbage_rects, + Realm::ProfilingRequestSet(), + e001); + if(wait_on_events) e002.wait(); + + log_app.info() << "FINISHED WARMING UP " << "\n"; + log_app.info() << "starting GPU partitioning " << Clock::current_time_in_microseconds() << "\n"; + + log_app.info() << "STARTING GPU BY FIELD " << Clock::current_time_in_microseconds() << "\n"; + + Event e01 = is_rects.create_subspaces_by_field(rect_id_data_gpu, + colors, + p_colored_rects, + Realm::ProfilingRequestSet()); + if (wait_on_events) e01.wait(); + + log_app.info() << "FINISHED GPU BY FIELD " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "STARTING GPU BY IMAGE " << Clock::current_time_in_microseconds() << "\n"; + Event e02 = is_nodes.create_subspaces_by_image(rect_val_data_gpu, + p_colored_rects, + p_rects, + Realm::ProfilingRequestSet(), + e01); + if(wait_on_events) e02.wait(); + + log_app.info() << "FINISHED GPU BY IMAGE " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "STARTING CPU partitioning " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "STARTING CPU BY FIELD " << Clock::current_time_in_microseconds() << "\n"; + Event e1 = is_rects.create_subspaces_by_field(rect_id_field_data, + colors, + p_colored_rects_cpu, + Realm::ProfilingRequestSet()); + if (wait_on_events) e1.wait(); + log_app.info() << "FINISHED CPU BY FIELD " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "STARTING CPU BY IMAGE " << Clock::current_time_in_microseconds() << "\n"; + Event e2 = is_nodes.create_subspaces_by_image(rect_val_field_data, + p_colored_rects_cpu, + p_rects_cpu, + Realm::ProfilingRequestSet(), + e1); + if(wait_on_events) e2.wait(); + log_app.info() << "FINISHED CPU BY IMAGE " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "CPU Partitioning complete " << Clock::current_time_in_microseconds() << "\n"; + return e2; + } + + + + virtual int perform_dynamic_checks(void) + { + return 0; + } + + virtual int 
check_partitioning(void) + { + log_app.info() << "Checking correctness of partitioning " << "\n"; + int errors = 0; + + for (int i = 0; i < num_pieces; i++) { + for (IndexSpaceIterator<1> it(p_colored_rects[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_colored_rects_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU has extra colored rect point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_colored_rects_cpu[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if(!p_colored_rects[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing colored rect point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_rects[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_rects_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU has extra rect point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_rects_cpu[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if(!p_rects[i].contains(point.p)) { + log_app.error() << "Mismatch! 
GPU is missing rect point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + } + return errors; + } +}; + class MiniAeroTest : public TestInterface { public: enum ProblemType @@ -625,7 +2194,7 @@ class MiniAeroTest : public TestInterface { AffineAccessor a_cell_blockid(i_args.ri_cells, 0 /* offset */); for(int i = is_cells.bounds.lo[0]; i <= is_cells.bounds.hi[0]; i++) - std::cout << "Z[" << i << "]: blockid=" << a_cell_blockid.read(i) << std::endl; + std::cout << "Z[" << i << "]: blockid=" << a_cell_blockid.read(i) << "\n"; AffineAccessor, 1> a_face_left(i_args.ri_faces, 0 * sizeof(Point<1>) /* offset */); @@ -637,7 +2206,7 @@ class MiniAeroTest : public TestInterface { for(int i = is_faces.bounds.lo[0]; i <= is_faces.bounds.hi[0]; i++) std::cout << "S[" << i << "]:" << " left=" << a_face_left.read(i) << " right=" << a_face_right.read(i) - << " type=" << a_face_type.read(i) << std::endl; + << " type=" << a_face_type.read(i) << "\n"; } } @@ -1006,7 +2575,6 @@ class CircuitTest : public TestInterface { { AffineAccessor a_subckt_id(i_args.ri_nodes, 0 /* offset */); - // std::cout << "a_subckt_id = " << a_subckt_id << "\n"; for(int i = is_nodes.bounds.lo; i <= is_nodes.bounds.hi; i++) { int subckt; @@ -1021,9 +2589,6 @@ class CircuitTest : public TestInterface { AffineAccessor, 1> a_out_node(i_args.ri_edges, 1 * sizeof(Point<1>) /* offset */); - // std::cout << "a_in_node = " << a_in_node << "\n"; - // std::cout << "a_out_node = " << a_out_node << "\n"; - for(int i = is_edges.bounds.lo; i <= is_edges.bounds.hi; i++) { Point<1> in_node, out_node; random_edge_data(i, in_node, out_node); @@ -1036,19 +2601,19 @@ class CircuitTest : public TestInterface { AffineAccessor a_subckt_id(i_args.ri_nodes, 0 /* offset */); for(int i = is_nodes.bounds.lo; i <= is_nodes.bounds.hi; i++) - std::cout << "subckt_id[" << i << "] = " << a_subckt_id.read(i) << std::endl; + std::cout << "subckt_id[" << i << "] = " << a_subckt_id.read(i) << "\n"; AffineAccessor, 1> 
a_in_node(i_args.ri_edges, 0 * sizeof(Point<1>) /* offset */); for(int i = is_edges.bounds.lo; i <= is_edges.bounds.hi; i++) - std::cout << "in_node[" << i << "] = " << a_in_node.read(i) << std::endl; + std::cout << "in_node[" << i << "] = " << a_in_node.read(i) << "\n"; AffineAccessor, 1> a_out_node(i_args.ri_edges, 1 * sizeof(Point<1>) /* offset */); for(int i = is_edges.bounds.lo; i <= is_edges.bounds.hi; i++) - std::cout << "out_node[" << i << "] = " << a_out_node.read(i) << std::endl; + std::cout << "out_node[" << i << "] = " << a_out_node.read(i) << "\n"; } } @@ -1761,7 +3326,7 @@ class PennantTest : public TestInterface { AffineAccessor a_zone_color(i_args.ri_zones, 0 /* offset */); for(int i = is_zones.bounds.lo; i <= is_zones.bounds.hi; i++) - std::cout << "Z[" << i << "]: color=" << a_zone_color.read(i) << std::endl; + std::cout << "Z[" << i << "]: color=" << a_zone_color.read(i) << "\n"; AffineAccessor, 1> a_side_mapsz(i_args.ri_sides, 0 * sizeof(Point<1>) /* offset */); @@ -1777,7 +3342,7 @@ class PennantTest : public TestInterface { << " mapsz=" << a_side_mapsz.read(i) << " mapss3=" << a_side_mapss3.read(i) << " mapsp1=" << a_side_mapsp1.read(i) << " ok=" << a_side_ok.read(i) - << std::endl; + << "\n"; } } @@ -2831,6 +4396,21 @@ int main(int argc, char **argv) break; } + if(!strcmp(argv[i], "basic")) { + testcfg = new BasicTest(argc - i, const_cast(argv + i)); + break; + } + + if(!strcmp(argv[i], "tile")) { + testcfg = new TileTest(argc - i, const_cast(argv + i)); + break; + } + + if (!strcmp(argv[i], "range")) { + testcfg = new RangeTest(argc - i, const_cast(argv + i)); + break; + } + if(!strcmp(argv[i], "pennant")) { testcfg = new PennantTest(argc - i, const_cast(argv + i)); break; @@ -2867,6 +4447,9 @@ int main(int argc, char **argv) rt.register_task(TOP_LEVEL_TASK, top_level_task); rt.register_task(INIT_CIRCUIT_DATA_TASK, CircuitTest::init_data_task_wrapper); rt.register_task(INIT_PENNANT_DATA_TASK, PennantTest::init_data_task_wrapper); + 
rt.register_task(INIT_BASIC_DATA_TASK, BasicTest::init_data_task_wrapper); + rt.register_task(INIT_TILE_DATA_TASK, TileTest::init_data_task_wrapper); + rt.register_task(INIT_RANGE_DATA_TASK, RangeTest::init_data_task_wrapper); rt.register_task(INIT_MINIAERO_DATA_TASK, MiniAeroTest::init_data_task_wrapper); signal(SIGALRM, sigalrm_handler); diff --git a/tests/gpu_deppart_1d.cc b/tests/gpu_deppart_1d.cc new file mode 100644 index 0000000000..250a63f2df --- /dev/null +++ b/tests/gpu_deppart_1d.cc @@ -0,0 +1,327 @@ +/* + * Copyright 2025 Stanford University, NVIDIA + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include +#include +#include +#include +#include "realm.h" +#include "realm/id.h" +#include "realm/machine.h" +#include "realm/cmdline.h" +#include "philox.h" + +using namespace Realm; + +#ifdef REALM_USE_CUDA +#include "realm/cuda/cuda_memcpy.h" +#include "realm/cuda/cuda_module.h" +#endif +#ifdef REALM_USE_HIP +#include "hip_cuda_compat/hip_cuda.h" +#include "realm/hip/hip_module.h" +#endif + +#ifdef REALM_USE_CUDA +using namespace Realm::Cuda; +#endif +#ifdef REALM_USE_HIP +using namespace Realm::Hip; +#endif + +Logger log_app("app"); + +// ---------------- Config (matches transpose_test style) ---------------- +namespace TestConfig { + int num_nodes = 1000; + int num_edges = 5000; + int num_pieces = 4; + int random = 0; // 0 deterministic, 1 random + unsigned long long seed = 123456789ULL; + int show = 0; // print assigned ids + int verify = 1; // do correctness check +}; +static const FieldID FID_SUBGRAPH = 0; +static const FieldID FID_SRC = 0; +static const FieldID FID_DST = sizeof(Point<1, int>); + +// ---------------- Small helpers (same idioms as transpose_test) -------- +template +static void fill_index_space(RegionInstance inst, + FieldID fid, + const IndexSpace& is, + Fn gen) +{ + AffineAccessor acc(inst, fid); + for (IndexSpaceIterator it(is); it.valid; it.step()) { + for (PointInRectIterator p(it.rect); p.valid; p.step()) + acc[p.p] = 
gen(p.p); + } +} + +template +static void copy_field(const IndexSpace& is, + RegionInstance src, RegionInstance dst, FieldID fid) +{ + std::vector srcs(1), dsts(1); + srcs[0].set_field(src, fid, sizeof(DT)); + dsts[0].set_field(dst, fid, sizeof(DT)); + is.copy(srcs, dsts, ProfilingRequestSet()).wait(); +} + +static void choose_cpu_and_gpu_mems(Memory& cpu_mem, Memory& gpu_mem, bool& have_gpu) +{ + have_gpu = false; + for (auto mem : Machine::MemoryQuery(Machine::get_machine())) { + if (!cpu_mem.exists() && (mem.kind() == Memory::SYSTEM_MEM)) + cpu_mem = mem; + if (!gpu_mem.exists() && (mem.kind() == Memory::GPU_FB_MEM)) { + gpu_mem = mem; + have_gpu = true; + } + } +} + +// For brevity, we use the simple vector layout helper (as in many Realm tests) +static Event make_instance(RegionInstance& ri, + Memory mem, + const IndexSpace<1,int>& is, + std::vector fields) +{ + return RegionInstance::create_instance(ri, mem, is, fields, + /*soa=*/0, ProfilingRequestSet()); +} + +// Compare two partitions index-space-by-index-space +static int compare_partitions(const std::vector>& A, + const std::vector>& B) +{ + int errors = 0; + if (A.size() != B.size()) return 1; + for (size_t i = 0; i < A.size(); i++) { + // Check A minus B + for (IndexSpaceIterator<1,int> it(A[i]); it.valid; it.step()) + for (PointInRectIterator<1,int> p(it.rect); p.valid; p.step()) + if (!B[i].contains(p.p)) { errors++; } + // Check B minus A + for (IndexSpaceIterator<1,int> it(B[i]); it.valid; it.step()) + for (PointInRectIterator<1,int> p(it.rect); p.valid; p.step()) + if (!A[i].contains(p.p)) { errors++; } + } + return errors; +} + +// ---------------- Top-level task (like transpose_test_gpu) -------------- +enum { + TOP_LEVEL_TASK = Processor::TASK_ID_FIRST_AVAILABLE + 300, +}; + +static void top_level_task(const void*, size_t, const void*, size_t, Processor) +{ + log_app.print() << "deppart_byfield_itest starting"; + + // Build the 1D node space [0 .. 
N-1] + IndexSpace<1,int> is_nodes(Rect<1,int>(0, TestConfig::num_nodes - 1)); + IndexSpace<1,int> is_edges(Rect<1, int>(0, TestConfig::num_edges - 1)); + + // Choose memories + Memory cpu_mem, gpu_mem; + bool have_gpu = false; + choose_cpu_and_gpu_mems(cpu_mem, gpu_mem, have_gpu); + if (!cpu_mem.exists()) { + log_app.fatal() << "No SYSTEM_MEM found"; + assert(0); + return; + } + if (!have_gpu) { + log_app.warning() << "No GPU_FB_MEM found; running CPU-only check."; + } + + // Create CPU instance holding subgraph ids + RegionInstance cpu_inst_nodes; + make_instance(cpu_inst_nodes, cpu_mem, is_nodes, {sizeof(int)}).wait(); + + RegionInstance cpu_inst_edges; + make_instance(cpu_inst_edges, cpu_mem, is_edges, {sizeof(Point<1, int>), sizeof(Point<1, int>)}).wait(); + + // Fill ids (deterministic or random) + auto gen_id = [&](Point<1,int> p)->int { + if (TestConfig::random) { + return Philox_2x32<>::rand_int(TestConfig::seed, + /*counter=*/p[0], + /*stream=*/0, + /*bound=*/TestConfig::num_pieces); + } else { + // even split + return int((long long)p[0] * TestConfig::num_pieces / TestConfig::num_nodes); + } + }; + fill_index_space<1,int,int>(cpu_inst_nodes, FID_SUBGRAPH, is_nodes, gen_id); + + auto gen_src = [&](Point<1,int> p)->Point<1, int> { + if (TestConfig::random) { + return Point<1, int>(Philox_2x32<>::rand_int(TestConfig::seed, + /*counter=*/p[0], + /*stream=*/0, + /*bound=*/TestConfig::num_nodes)); + } else { + return Point<1, int>(p[0] % TestConfig::num_nodes); + } + }; + + fill_index_space<1,int,Point<1,int>>(cpu_inst_edges, FID_SRC, is_edges, gen_src); + + auto gen_dst = [&](Point<1,int> p)->Point<1, int> { + if (TestConfig::random) { + return Point<1, int>(Philox_2x32<>::rand_int(TestConfig::seed, + /*counter=*/p[0]+TestConfig::num_edges, + /*stream=*/0, + /*bound=*/TestConfig::num_nodes)); + } else { + return Point<1, int>((p[0]+1) % TestConfig::num_nodes); + } + }; + + fill_index_space<1,int,Point<1,int>>(cpu_inst_edges, FID_DST, is_edges, gen_dst); + + if 
(TestConfig::show) { + AffineAccessor acc(cpu_inst_nodes, FID_SUBGRAPH); + for (IndexSpaceIterator<1,int> it(is_nodes); it.valid; it.step()) + for (PointInRectIterator<1,int> p(it.rect); p.valid; p.step()) + log_app.print() << "id[" << p.p << "]=" << acc[p.p]; + + AffineAccessor,1,int> acc_src(cpu_inst_edges, FID_SRC); + AffineAccessor,1,int> acc_dst(cpu_inst_edges, FID_DST); + for (IndexSpaceIterator<1,int> it(is_edges); it.valid; it.step()) + for (PointInRectIterator<1,int> p(it.rect); p.valid; p.step()) + log_app.print() << "edge[" << p.p << "]=" << acc_src[p.p] << "->" << acc_dst[p.p]; + } + + // Describe the field data (CPU) + FieldDataDescriptor, int> cpu_field_nodes; + cpu_field_nodes.index_space = is_nodes; + cpu_field_nodes.inst = cpu_inst_nodes; + cpu_field_nodes.field_offset = 0; + + FieldDataDescriptor, Point<1, int>> cpu_field_src; + cpu_field_src.index_space = is_edges; + cpu_field_src.inst = cpu_inst_edges; + cpu_field_src.field_offset = 0; + + FieldDataDescriptor, Point<1, int>> cpu_field_dst; + cpu_field_dst.index_space = is_edges; + cpu_field_dst.inst = cpu_inst_edges; + cpu_field_dst.field_offset = sizeof(Point<1,int>); + + std::vector, int>> cpu_nodes(1, cpu_field_nodes); + std::vector, Point<1, int>>> cpu_src(1, cpu_field_src); + std::vector, Point<1, int>>> cpu_dst(1, cpu_field_dst); + + + // Colors 0..num_pieces-1 + std::vector colors(TestConfig::num_pieces); + for (int i = 0; i < TestConfig::num_pieces; i++) colors[i] = i; + + // CPU partitioning + std::vector> p_cpu_nodes, p_cpu_edges, p_cpu_rd; + Event e_cpu_byfield = is_nodes.create_subspaces_by_field(cpu_nodes, colors, p_cpu_nodes, ProfilingRequestSet()); + Event e_cpu_bypreimage = is_edges.create_subspaces_by_preimage(cpu_dst, p_cpu_nodes, p_cpu_edges, ProfilingRequestSet(), e_cpu_byfield); + Event e_cpu_image = is_nodes.create_subspaces_by_image(cpu_src, p_cpu_edges, p_cpu_rd, ProfilingRequestSet(), e_cpu_bypreimage); + + // GPU path (optional if GPU exists) + std::vector> p_gpu_nodes, 
p_gpu_edges, p_gpu_rd; + if (have_gpu) { + RegionInstance gpu_inst_nodes, gpu_inst_edges; + make_instance(gpu_inst_nodes, gpu_mem, is_nodes, {sizeof(int)}).wait(); + make_instance(gpu_inst_edges, gpu_mem, is_edges, {sizeof(Point<1, int>), sizeof(Point<1, int>)}).wait(); + + // Copy field data CPU -> GPU + copy_field<1,int,int>(is_nodes, cpu_inst_nodes, gpu_inst_nodes, FID_SUBGRAPH); + copy_field<1,int,Point<1,int>>(is_edges, cpu_inst_edges, gpu_inst_edges, FID_SRC); + copy_field<1,int,Point<1,int>>(is_edges, cpu_inst_edges, gpu_inst_edges, FID_DST); + + // Describe the field data (GPU) + FieldDataDescriptor<IndexSpace<1, int>, int> gpu_field_nodes; + gpu_field_nodes.index_space = is_nodes; + gpu_field_nodes.inst = gpu_inst_nodes; + gpu_field_nodes.field_offset = 0; + + FieldDataDescriptor<IndexSpace<1, int>, Point<1, int>> gpu_field_src; + gpu_field_src.index_space = is_edges; + gpu_field_src.inst = gpu_inst_edges; + gpu_field_src.field_offset = 0; + + FieldDataDescriptor<IndexSpace<1, int>, Point<1, int>> gpu_field_dst; + gpu_field_dst.index_space = is_edges; + gpu_field_dst.inst = gpu_inst_edges; + gpu_field_dst.field_offset = sizeof(Point<1,int>); + + std::vector<FieldDataDescriptor<IndexSpace<1, int>, int>> gpu_nodes(1, gpu_field_nodes); + std::vector<FieldDataDescriptor<IndexSpace<1, int>, Point<1, int>>> gpu_src(1, gpu_field_src); + std::vector<FieldDataDescriptor<IndexSpace<1, int>, Point<1, int>>> gpu_dst(1, gpu_field_dst); + + std::vector<IndexSpace<1, int>> p_gpu_nodes, p_gpu_edges, p_gpu_rd; + Event e_gpu_byfield = is_nodes.create_subspaces_by_field(gpu_nodes, colors, p_gpu_nodes, + ProfilingRequestSet()); + Event e_gpu_bypreimage = is_edges.create_subspaces_by_preimage(gpu_dst, p_gpu_nodes, p_gpu_edges, ProfilingRequestSet(), e_gpu_byfield); + Event e_gpu_image = is_nodes.create_subspaces_by_image(gpu_src, p_gpu_edges, p_gpu_rd, ProfilingRequestSet(), e_gpu_bypreimage); + + e_cpu_image.wait(); + e_gpu_image.wait(); + // Compare CPU vs GPU partitions + if (TestConfig::verify) { + int errs = compare_partitions(p_cpu_nodes, p_gpu_nodes) + + compare_partitions(p_cpu_edges, p_gpu_edges) + + compare_partitions(p_cpu_rd, p_gpu_rd); + if (errs) { + 
log_app.fatal() << "Mismatch between CPU and GPU partitions, errors=" << errs; + assert(0); + } + } + gpu_inst_nodes.destroy(); + gpu_inst_edges.destroy(); + } else { + e_cpu_image.wait(); + } + + // Cleanup + cpu_inst_nodes.destroy(); + cpu_inst_edges.destroy(); + is_nodes.destroy(); + is_edges.destroy(); + + log_app.print() << "deppart_1d_itest: PASS"; +} + +// ---------------- Main (same as transpose_test pattern) ----------------- +int main(int argc, char** argv) +{ + Runtime rt; + rt.init(&argc, &argv); + + // Parse simple flags similar to the example + CommandLineParser cp; + cp.add_option_int("-n", TestConfig::num_nodes) + .add_option_int("-e", TestConfig::num_edges) + .add_option_int("-p", TestConfig::num_pieces) + .add_option_int("-random", TestConfig::random) + .add_option_int("-show", TestConfig::show) + .add_option_int("-verify", TestConfig::verify); + bool ok = cp.parse_command_line(argc, const_cast(argv)); + assert(ok); + + rt.register_task(TOP_LEVEL_TASK, top_level_task); + + Processor p = Machine::ProcessorQuery(Machine::get_machine()) + .only_kind(Processor::LOC_PROC) + .first(); + assert(p.exists()); + + Event e = rt.collective_spawn(p, TOP_LEVEL_TASK, nullptr, 0); + rt.shutdown(e); + rt.wait_for_shutdown(); + return 0; +} \ No newline at end of file From 45d9973b7dbc972f377129efcb8c8ba9975c0147 Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Tue, 20 Jan 2026 17:03:41 -0800 Subject: [PATCH 02/32] Updated api --- src/realm/deppart/image.cc | 147 ++++----------------------- src/realm/deppart/image.h | 34 +------ src/realm/deppart/image_gpu_impl.hpp | 3 +- src/realm/deppart/image_tmpl.cc | 8 +- src/realm/indexspace.h | 27 ++--- src/realm/indexspace.inl | 20 +--- tests/deppart.cc | 19 ++-- 7 files changed, 43 insertions(+), 215 deletions(-) diff --git a/src/realm/deppart/image.cc b/src/realm/deppart/image.cc index 660d0f77ad..c57b86b426 100644 --- a/src/realm/deppart/image.cc +++ b/src/realm/deppart/image.cc @@ -32,15 +32,16 @@ namespace Realm { 
template template - Event IndexSpace::gpu_subspaces_by_image( + Event IndexSpace::create_subspaces_by_image( const DomainTransform &domain_transform, const std::vector> &sources, std::vector> &images, const ProfilingRequestSet &reqs, - std::pair &sizes, RegionInstance buffer, Event wait_on) const { - // output vector should start out empty - assert(images.empty()); + Event wait_on, + RegionInstance buffer, std::pair* buffer_bounds) const { + // output vector should start out empty + assert(images.empty()); - if (buffer==RegionInstance::NO_INST) { + if (buffer_bounds != nullptr || buffer != RegionInstance::NO_INST) { size_t optimal_size = 0; for (size_t i = 0; i < sources.size(); i++) { optimal_size += 5 * sources[i].volume() * sizeof(RectDesc); @@ -73,49 +74,21 @@ namespace Realm { (2 * source_entries * sizeof(uint64_t)) + (source_entries * sizeof(uint64_t)); } - sizes = std::make_pair(minimal_size, minimal_size + optimal_size); - return Event::NO_EVENT; - } - - GenEventImpl *finish_event = GenEventImpl::create_genevent(); - Event e = finish_event->current_event(); - - GPUImageOperation *op = new GPUImageOperation( - *this, domain_transform, reqs, sizes.first, buffer, finish_event, ID(e).event_generation()); - - size_t n = sources.size(); - images.resize(n); - for (size_t i = 0; i < n; i++) { - images[i] = op->add_source(sources[i]); - - if(!images[i].dense()) { - e = Event::merge_events( - {e, SparsityMapRefCounter(images[i].sparsity.id).add_references(1)}); + if (buffer_bounds != nullptr && buffer == RegionInstance::NO_INST) { + *buffer_bounds = std::make_pair(minimal_size, minimal_size + optimal_size); + return Event::NO_EVENT; } - - log_dpops.info() << "image: " << *this << " src=" << sources[i] << " -> " - << images[i] << " (" << e << ")"; + assert(buffer != RegionInstance::NO_INST); + size_t buffer_size = buffer.get_layout()->bytes_used; + assert(buffer_size >= minimal_size); } - op->launch(wait_on); - return e; - } - - template - template - Event 
IndexSpace::create_subspaces_by_image( - const DomainTransform &domain_transform, - const std::vector> &sources, - std::vector> &images, const ProfilingRequestSet &reqs, - Event wait_on) const { - // output vector should start out empty - assert(images.empty()); GenEventImpl *finish_event = GenEventImpl::create_genevent(); Event e = finish_event->current_event(); ImageOperation *op = new ImageOperation( - *this, domain_transform, reqs, finish_event, ID(e).event_generation()); + *this, domain_transform, reqs, finish_event, ID(e).event_generation(), buffer); size_t n = sources.size(); images.resize(n); @@ -507,10 +480,11 @@ namespace Realm { const IndexSpace &_parent, const DomainTransform &_domain_transform, const ProfilingRequestSet &reqs, GenEventImpl *_finish_event, - EventImpl::gen_t _finish_gen) + EventImpl::gen_t _finish_gen, RegionInstance _buffer) : PartitioningOperation(reqs, _finish_event, _finish_gen), parent(_parent), - domain_transform(_domain_transform) {} + domain_transform(_domain_transform), + buffer(_buffer) {} template ImageOperation::~ImageOperation(void) @@ -715,24 +689,9 @@ namespace Realm { if (gpu_data) { std::swap(domain_transform.ptr_data, gpu_ptr_data); std::swap(domain_transform.range_data, gpu_rect_data); - const char* val = std::getenv("TILE_SIZE"); // or any env var - size_t tile_size = 100000000; //default - if (val) { - tile_size = atoi(val); - } - std::vector byte_fields = {sizeof(char)}; - IndexSpace<1> instance_index_space(Rect<1>(0, tile_size-1)); - RegionInstance buffer; - Memory my_mem; - if (domain_transform.ptr_data.size() > 0) { - my_mem = domain_transform.ptr_data[0].inst.get_location(); - } else { - my_mem = domain_transform.range_data[0].inst.get_location(); - } - RegionInstance::create_instance(buffer, my_mem, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); GPUImageMicroOp *micro_op = new GPUImageMicroOp( - parent, domain_transform, !cpu_data, tile_size, buffer); + parent, domain_transform, 
!cpu_data, buffer); for (size_t j = 0; j < sources.size(); j++) { micro_op->add_sparsity_output(sources[j], images[j]); } @@ -818,74 +777,6 @@ namespace Realm { os << "ImageOperation(" << parent << ")"; } - //////////////////////////////////////////////////////////////////////// - // - // class GPUImageOperation - - template - GPUImageOperation::GPUImageOperation( - const IndexSpace &_parent, - const DomainTransform &_domain_transform, - const ProfilingRequestSet &reqs, size_t _buffer_size, RegionInstance _buffer, - GenEventImpl *_finish_event, EventImpl::gen_t _finish_gen) - : PartitioningOperation(reqs, _finish_event, _finish_gen), - parent(_parent), - domain_transform(_domain_transform), - buffer_size(_buffer_size), - buffer(_buffer) {} - - template - GPUImageOperation::~GPUImageOperation(void) - {} - - template - IndexSpace GPUImageOperation::add_source(const IndexSpace& source) - { - // try to filter out obviously empty sources - if(parent.empty() || source.empty()) - return IndexSpace::make_empty(); - - // otherwise it'll be something smaller than the current parent - IndexSpace image; - image.bounds = parent.bounds; - - // if the source has a sparsity map, use the same node - otherwise - // get a sparsity ID by round-robin'ing across the nodes that have field data - int target_node = 0; - if(!source.dense()) - target_node = ID(source.sparsity).sparsity_creator_node(); - else - if(!domain_transform.ptr_data.empty()) - target_node = ID(domain_transform.ptr_data[sources.size() % domain_transform.ptr_data.size()].inst).instance_owner_node(); - else - target_node = ID(domain_transform.range_data[sources.size() % domain_transform.range_data.size()].inst).instance_owner_node(); - - SparsityMap sparsity = get_runtime()->get_available_sparsity_impl(target_node)->me.convert >(); - image.sparsity = sparsity; - - sources.push_back(source); - images.push_back(sparsity); - - return image; - } - - template - void GPUImageOperation::execute(void) { - GPUImageMicroOp 
*micro_op = - new GPUImageMicroOp( - parent, domain_transform, true, buffer_size, buffer); - for (size_t j = 0; j < sources.size(); j++) { - micro_op->add_sparsity_output(sources[j], images[j]); - } - micro_op->dispatch(this, true); - } - - template - void GPUImageOperation::print(std::ostream& os) const - { - os << "ImageOperation(" << parent << ")"; - } - //////////////////////////////////////////////////////////////////////// // // class StructuredImageMicroOp @@ -1015,8 +906,8 @@ namespace Realm { GPUImageMicroOp::GPUImageMicroOp( const IndexSpace &_parent, const DomainTransform &_domain_transform, - bool _exclusive, size_t _fixed_buffer_size, RegionInstance _buffer) - : parent_space(_parent), domain_transform(_domain_transform), fixed_buffer_size(_fixed_buffer_size), buffer(_buffer) + bool _exclusive, RegionInstance _buffer) + : parent_space(_parent), domain_transform(_domain_transform), buffer(_buffer) { this->exclusive = _exclusive; } diff --git a/src/realm/deppart/image.h b/src/realm/deppart/image.h index 82b6393eb7..2f0347c5ff 100644 --- a/src/realm/deppart/image.h +++ b/src/realm/deppart/image.h @@ -97,7 +97,7 @@ namespace Realm { ImageOperation(const IndexSpace &_parent, const DomainTransform &_domain_transform, const ProfilingRequestSet &reqs, GenEventImpl *_finish_event, - EventImpl::gen_t _finish_gen); + EventImpl::gen_t _finish_gen, RegionInstance buffer = RegionInstance::NO_INST); virtual ~ImageOperation(void); @@ -122,6 +122,7 @@ namespace Realm { std::vector > diff_rhss; std::vector > images; bool is_intersection; + RegionInstance buffer; }; template @@ -149,41 +150,13 @@ namespace Realm { std::vector > sparsity_outputs; }; - template - class GPUImageOperation : public PartitioningOperation { - public: - GPUImageOperation(const IndexSpace &_parent, - const DomainTransform &_domain_transform, - const ProfilingRequestSet &reqs, - size_t _buffer_size, - RegionInstance _buffer, - GenEventImpl *_finish_event, - EventImpl::gen_t _finish_gen); - - 
virtual ~GPUImageOperation(void); - - IndexSpace add_source(const IndexSpace &source); - - virtual void execute(void); - - virtual void print(std::ostream &os) const; - - protected: - IndexSpace parent; - DomainTransform domain_transform; - std::vector > sources; - std::vector > images; - size_t buffer_size; - RegionInstance buffer; - }; - template class GPUImageMicroOp : public GPUMicroOp { public: GPUImageMicroOp( const IndexSpace &_parent, const DomainTransform &_domain_transform, - bool _exclusive, size_t fixed_buffer_size = 0, RegionInstance buffer = RegionInstance::NO_INST); + bool _exclusive, RegionInstance buffer = RegionInstance::NO_INST); virtual ~GPUImageMicroOp(void); @@ -203,7 +176,6 @@ namespace Realm { DomainTransform domain_transform; std::vector > sources; std::vector > sparsity_outputs; - size_t fixed_buffer_size; RegionInstance buffer; }; }; // namespace Realm diff --git a/src/realm/deppart/image_gpu_impl.hpp b/src/realm/deppart/image_gpu_impl.hpp index 6abb27c043..ce357436b7 100644 --- a/src/realm/deppart/image_gpu_impl.hpp +++ b/src/realm/deppart/image_gpu_impl.hpp @@ -231,7 +231,8 @@ void GPUImageMicroOp::gpu_populate_ptrs() cudaStream_t stream = Cuda::get_task_cuda_stream(); - size_t tile_size = fixed_buffer_size; + size_t tile_size = buffer.get_layout()->bytes_used; + std::cout << "Using tile size of " << tile_size << " bytes." 
<< std::endl; RegionInstance fixed_buffer = buffer; Arena buffer_arena(reinterpret_cast(AffineAccessor(fixed_buffer, 0).base), tile_size); diff --git a/src/realm/deppart/image_tmpl.cc b/src/realm/deppart/image_tmpl.cc index c12dfdb138..6f4371bae2 100644 --- a/src/realm/deppart/image_tmpl.cc +++ b/src/realm/deppart/image_tmpl.cc @@ -52,12 +52,8 @@ namespace Realm { template ImageMicroOp::ImageMicroOp(NodeID, AsyncMicroOp *, Serialization::FixedBufferDeserializer&); \ template Event IndexSpace::create_subspaces_by_image( \ const DomainTransform &, const std::vector > &, \ - std::vector > &, const ProfilingRequestSet &, Event) \ - const; \ - template Event IndexSpace::gpu_subspaces_by_image( \ - const DomainTransform &, const std::vector > &, \ - std::vector > &, const ProfilingRequestSet &, std::pair &, \ - RegionInstance, Event) const; \ + std::vector > &, const ProfilingRequestSet &, Event, \ + RegionInstance, std::pair*) const; \ template Event IndexSpace::create_subspaces_by_image_with_difference( \ const DomainTransform &, \ const std::vector >&, \ diff --git a/src/realm/indexspace.h b/src/realm/indexspace.h index b61a77d689..448b2815fb 100644 --- a/src/realm/indexspace.h +++ b/src/realm/indexspace.h @@ -780,19 +780,10 @@ namespace Realm { const DomainTransform &domain_transform, const std::vector> &sources, std::vector> &images, const ProfilingRequestSet &reqs, - Event wait_on = Event::NO_EVENT) const; + Event wait_on = Event::NO_EVENT, RegionInstance buffer = RegionInstance::NO_INST, + std::pair* buffer_bounds = nullptr) const; ///@} - ///@{ - /// - - template - REALM_PUBLIC_API Event gpu_subspaces_by_image( - const DomainTransform &domain_transform, - const std::vector> &sources, - std::vector> &images, const ProfilingRequestSet &reqs, - std::pair &sizes, RegionInstance buffer = RegionInstance::NO_INST, Event wait_on = Event::NO_EVENT) const; - ///@} ///@{ /** @@ -823,15 +814,8 @@ namespace Realm { &field_data, const std::vector> &sources, std::vector> 
&images, const ProfilingRequestSet &reqs, - Event wait_on = Event::NO_EVENT) const; - - template - REALM_PUBLIC_API Event gpu_subspaces_by_image( - const std::vector, Point>> - &field_data, - const std::vector> &sources, - std::vector> &images, const ProfilingRequestSet &reqs, - std::pair &sizes, RegionInstance buffer = RegionInstance::NO_INST, Event wait_on = Event::NO_EVENT) const; + Event wait_on = Event::NO_EVENT, RegionInstance buffer = RegionInstance::NO_INST, + std::pair* buffer_bounds = nullptr) const; // range versions template @@ -847,7 +831,8 @@ namespace Realm { &field_data, const std::vector> &sources, std::vector> &images, const ProfilingRequestSet &reqs, - Event wait_on = Event::NO_EVENT) const; + Event wait_on = Event::NO_EVENT, RegionInstance buffer = RegionInstance::NO_INST, + std::pair* buffer_bounds = nullptr) const; ///@} ///@{ diff --git a/src/realm/indexspace.inl b/src/realm/indexspace.inl index c633aa5e46..87cab4ce47 100644 --- a/src/realm/indexspace.inl +++ b/src/realm/indexspace.inl @@ -968,22 +968,10 @@ namespace Realm { const std::vector, Point>> &field_data, const std::vector> &sources, std::vector> &images, const ProfilingRequestSet &reqs, - Event wait_on) const + Event wait_on, RegionInstance buffer, std::pair* buffer_bounds) const { return create_subspaces_by_image(DomainTransform(field_data), sources, - images, reqs, wait_on); - } - - template - template - inline Event IndexSpace::gpu_subspaces_by_image( - const std::vector, Point>> &field_data, - const std::vector> &sources, - std::vector> &images, const ProfilingRequestSet &reqs, - std::pair &sizes, RegionInstance buffer, Event wait_on) const - { - return gpu_subspaces_by_image(DomainTransform(field_data), sources, - images, reqs, sizes, buffer, wait_on); + images, reqs, wait_on, buffer, buffer_bounds); } template @@ -992,10 +980,10 @@ namespace Realm { const std::vector, Rect>> &field_data, const std::vector> &sources, std::vector> &images, const ProfilingRequestSet &reqs, - 
Event wait_on) const + Event wait_on, RegionInstance buffer, std::pair* buffer_bounds) const { return create_subspaces_by_image(DomainTransform(field_data), sources, - images, reqs, wait_on); + images, reqs, wait_on, buffer, buffer_bounds); } template diff --git a/tests/deppart.cc b/tests/deppart.cc index 815f2cb490..18d74c44f4 100644 --- a/tests/deppart.cc +++ b/tests/deppart.cc @@ -514,13 +514,13 @@ class BasicTest : public TestInterface { e01); if(wait_on_events) e02.wait(); std::pair estimate; - Event _e = is_nodes.gpu_subspaces_by_image(src_field_data_gpu, + Event _e = is_nodes.create_subspaces_by_image(src_field_data_gpu, p_garbage_edges, p_garbage_rd, Realm::ProfilingRequestSet(), - estimate, + e02, RegionInstance::NO_INST, - e02); + &estimate); std::cout << "Minimum size: " << estimate.first << " bytes, " << "Maximum size: " << estimate.second << " bytes\n"; @@ -535,14 +535,11 @@ class BasicTest : public TestInterface { IndexSpace<1> instance_index_space(Rect<1>(0, tile_size-1)); RegionInstance buffer; RegionInstance::create_instance(buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); - estimate.first = tile_size; - Event e03 = is_nodes.gpu_subspaces_by_image(src_field_data_gpu, + Event e03 = is_nodes.create_subspaces_by_image(src_field_data_gpu, p_garbage_edges, p_garbage_rd, Realm::ProfilingRequestSet(), - estimate, - buffer, - e02); + e02, buffer); if(wait_on_events) e03.wait(); Event e04 = is_edges.create_subspaces_by_preimage(dst_node_field_data, @@ -573,13 +570,11 @@ class BasicTest : public TestInterface { // an image of p_edges through out_node gives us all the shared nodes, along // with some private nodes - Event e3 = is_nodes.gpu_subspaces_by_image(src_field_data_gpu, + Event e3 = is_nodes.create_subspaces_by_image(src_field_data_gpu, p_edges, p_rd, Realm::ProfilingRequestSet(), - estimate, - buffer, - e2); + e2, buffer); if(wait_on_events) e3.wait(); log_app.info() << "GPU Image complete " << 
Clock::current_time_in_microseconds() << "\n"; log_app.info() << "Starting second GPU preimage " << Clock::current_time_in_microseconds() << "\n"; From fd2fe62316f2b1b6261a90a7f984d8301d9ebe8c Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Thu, 29 Jan 2026 00:00:22 -0800 Subject: [PATCH 03/32] API that builds --- src/realm/deppart/image.cc | 191 +++++++++++++++----------------- src/realm/deppart/image.h | 4 +- src/realm/deppart/image_tmpl.cc | 4 +- src/realm/indexspace.h | 31 +++++- src/realm/indexspace.inl | 8 +- tests/deppart.cc | 16 +-- 6 files changed, 129 insertions(+), 125 deletions(-) diff --git a/src/realm/deppart/image.cc b/src/realm/deppart/image.cc index c57b86b426..19ecf60ea5 100644 --- a/src/realm/deppart/image.cc +++ b/src/realm/deppart/image.cc @@ -30,65 +30,56 @@ namespace Realm { extern Logger log_part; extern Logger log_uop_timing; + template + template + void IndexSpace::estimate_image( + const DeppartInput& input, + DeppartSuggestion& suggestion) { + size_t minimal_size = 0; + size_t source_entries = 0; + bool bvh = false; + for (size_t size : input.source_sizes) { + source_entries += size == 0 ? 
1 : size; + } + minimal_size += sizeof(Rect) * source_entries; + if (this->dense()) { + minimal_size += sizeof(Rect); + } else { + minimal_size += sizeof(Rect) * input.parent_size; + } + if (bvh) { + minimal_size += + (source_entries * sizeof(uint64_t)) + + (source_entries * sizeof(size_t)) + + ((2*source_entries - 1) * sizeof(Rect)) + + (2 * (2*source_entries - 1) * sizeof(int)) + + sizeof(Rect) + + (2 * source_entries * sizeof(uint64_t)) + + (source_entries * sizeof(uint64_t)); + } + for (size_t i = 0; i < input.insts.size(); i++) { + IndexSpace is = input.insts[i].first; + Memory mem = input.insts[i].second; + size_t optimal_size = is.bounds.volume() * sizeof(Rect) * input.source_sizes.size() + minimal_size; + suggestion.suggestions[mem] = std::make_pair(minimal_size, optimal_size); + } + } + template template Event IndexSpace::create_subspaces_by_image( const DomainTransform &domain_transform, const std::vector> &sources, std::vector> &images, const ProfilingRequestSet &reqs, - Event wait_on, - RegionInstance buffer, std::pair* buffer_bounds) const { + Event wait_on, DeppartOutput* buffers) const { // output vector should start out empty assert(images.empty()); - if (buffer_bounds != nullptr || buffer != RegionInstance::NO_INST) { - size_t optimal_size = 0; - for (size_t i = 0; i < sources.size(); i++) { - optimal_size += 5 * sources[i].volume() * sizeof(RectDesc); - } - size_t minimal_size = 0; - size_t source_entries = 0; - bool bvh = false; - for (size_t i = 0; i < sources.size(); ++i) { - IndexSpace my_space = sources[i]; - if (my_space.dense()) { - source_entries += 1; - } else { - bvh = true; - source_entries += my_space.sparsity.impl()->get_entries().size(); - } - } - minimal_size += sizeof(Rect) * source_entries; - if (this->dense()) { - minimal_size += sizeof(Rect); - } else { - minimal_size += sizeof(Rect) * this->sparsity.impl()->get_entries().size(); - } - if (bvh) { - minimal_size += - (source_entries * sizeof(uint64_t)) + - (source_entries * 
sizeof(size_t)) + - ((2*source_entries - 1) * sizeof(Rect)) + - (2 * (2*source_entries - 1) * sizeof(int)) + - sizeof(Rect) + - (2 * source_entries * sizeof(uint64_t)) + - (source_entries * sizeof(uint64_t)); - } - if (buffer_bounds != nullptr && buffer == RegionInstance::NO_INST) { - *buffer_bounds = std::make_pair(minimal_size, minimal_size + optimal_size); - return Event::NO_EVENT; - } - assert(buffer != RegionInstance::NO_INST); - size_t buffer_size = buffer.get_layout()->bytes_used; - assert(buffer_size >= minimal_size); - } - - GenEventImpl *finish_event = GenEventImpl::create_genevent(); Event e = finish_event->current_event(); ImageOperation *op = new ImageOperation( - *this, domain_transform, reqs, finish_event, ID(e).event_generation(), buffer); + *this, domain_transform, reqs, finish_event, ID(e).event_generation(), buffers); size_t n = sources.size(); images.resize(n); @@ -480,11 +471,11 @@ namespace Realm { const IndexSpace &_parent, const DomainTransform &_domain_transform, const ProfilingRequestSet &reqs, GenEventImpl *_finish_event, - EventImpl::gen_t _finish_gen, RegionInstance _buffer) + EventImpl::gen_t _finish_gen, DeppartOutput* _buffers) : PartitioningOperation(reqs, _finish_event, _finish_gen), parent(_parent), domain_transform(_domain_transform), - buffer(_buffer) {} + buffers(_buffers) {} template ImageOperation::~ImageOperation(void) @@ -592,14 +583,14 @@ namespace Realm { template void ImageOperation::execute(void) { - std::vector,Point> > gpu_ptr_data; + std::map,Point> >> gpu_ptr_data; std::vector,Point> > cpu_ptr_data; - std::vector,Rect> > gpu_rect_data; + std::map,Rect> >> gpu_rect_data; std::vector,Rect> > cpu_rect_data; for (size_t i = 0; i < domain_transform.ptr_data.size(); i++) { if (domain_transform.ptr_data[i].inst.get_location().kind() == Memory::GPU_FB_MEM) { - gpu_ptr_data.push_back(domain_transform.ptr_data[i]); + gpu_ptr_data[domain_transform.ptr_data[i].inst.get_location()].push_back(domain_transform.ptr_data[i]); } else 
{ cpu_ptr_data.push_back(domain_transform.ptr_data[i]); } @@ -607,13 +598,12 @@ namespace Realm { for (size_t i = 0; i < domain_transform.range_data.size(); i++) { if (domain_transform.range_data[i].inst.get_location().kind() == Memory::GPU_FB_MEM) { - gpu_rect_data.push_back(domain_transform.range_data[i]); + gpu_rect_data[domain_transform.range_data[i].inst.get_location()].push_back(domain_transform.range_data[i]); } else { cpu_rect_data.push_back(domain_transform.range_data[i]); } } bool gpu_data = !gpu_ptr_data.empty() || !gpu_rect_data.empty(); - bool cpu_data = !cpu_ptr_data.empty() || !cpu_rect_data.empty(); if (domain_transform.type == DomainTransform::DomainTransformType::STRUCTURED && !gpu_data) { @@ -649,54 +639,55 @@ namespace Realm { uop->dispatch(this, true /* ok to run in this thread */); } else { - if (cpu_data) { - // launch full cross-product of image micro ops right away - for (size_t i = 0; i < sources.size(); i++) - SparsityMapImpl::lookup(images[i])->set_contributor_count( - cpu_ptr_data.size() + - cpu_rect_data.size() + (gpu_data ? 
1 : 0)); - - for (size_t i = 0; i < cpu_ptr_data.size(); i++) { - ImageMicroOp *uop = new ImageMicroOp( - parent, cpu_ptr_data[i].index_space, - cpu_ptr_data[i].inst, - cpu_ptr_data[i].field_offset, false /*ptrs*/); - for (size_t j = 0; j < sources.size(); j++) - if (diff_rhss.empty()) - uop->add_sparsity_output(sources[j], images[j]); - else - uop->add_sparsity_output_with_difference(sources[j], diff_rhss[j], - images[j]); - - uop->dispatch(this, true /* ok to run in this thread */); - } - - for (size_t i = 0; i < cpu_rect_data.size(); i++) { - ImageMicroOp *uop = new ImageMicroOp( - parent, cpu_rect_data[i].index_space, - cpu_rect_data[i].inst, - cpu_rect_data[i].field_offset, true /*ranges*/); - for (size_t j = 0; j < sources.size(); j++) - if (diff_rhss.empty()) - uop->add_sparsity_output(sources[j], images[j]); - else - uop->add_sparsity_output_with_difference(sources[j], diff_rhss[j], - images[j]); - - uop->dispatch(this, true /* ok to run in this thread */); - } - } - if (gpu_data) { - std::swap(domain_transform.ptr_data, gpu_ptr_data); - std::swap(domain_transform.range_data, gpu_rect_data); - GPUImageMicroOp *micro_op = - new GPUImageMicroOp( - parent, domain_transform, !cpu_data, buffer); - for (size_t j = 0; j < sources.size(); j++) { - micro_op->add_sparsity_output(sources[j], images[j]); + if (gpu_data) assert(buffers); + bool opcount = cpu_ptr_data.size() + cpu_rect_data.size() + gpu_ptr_data.size() + gpu_rect_data.size(); + bool exclusive = gpu_data && (opcount == 1); + if (!exclusive) { + // launch full cross-product of image micro ops right away + for (size_t i = 0; i < sources.size(); i++) { + SparsityMapImpl::lookup(images[i])->set_contributor_count(opcount); } - micro_op->dispatch(this, true); - } + } + for (size_t i = 0; i < cpu_ptr_data.size(); i++) { + ImageMicroOp *uop = new ImageMicroOp( + parent, cpu_ptr_data[i].index_space, + cpu_ptr_data[i].inst, + cpu_ptr_data[i].field_offset, false /*ptrs*/); + for (size_t j = 0; j < sources.size(); 
j++) + if (diff_rhss.empty()) + uop->add_sparsity_output(sources[j], images[j]); + else + uop->add_sparsity_output_with_difference(sources[j], diff_rhss[j], + images[j]); + uop->dispatch(this, true /* ok to run in this thread */); + } + for (size_t i = 0; i < cpu_rect_data.size(); i++) { + ImageMicroOp *uop = new ImageMicroOp( + parent, cpu_rect_data[i].index_space, + cpu_rect_data[i].inst, + cpu_rect_data[i].field_offset, true /*ranges*/); + for (size_t j = 0; j < sources.size(); j++) + if (diff_rhss.empty()) + uop->add_sparsity_output(sources[j], images[j]); + else + uop->add_sparsity_output_with_difference(sources[j], diff_rhss[j], + images[j]); + uop->dispatch(this, true /* ok to run in this thread */); + } + for (auto it = gpu_ptr_data.begin(); it != gpu_ptr_data.end(); it++) { + // launch full cross-product of image micro ops right away + Memory my_mem = it->first; + domain_transform.ptr_data = it->second; + assert(buffers->buffers.find(my_mem) != buffers->buffers.end()); + RegionInstance buffer = buffers->buffers[my_mem]; + GPUImageMicroOp *micro_op = + new GPUImageMicroOp( + parent, domain_transform, exclusive, buffer); + for (size_t j = 0; j < sources.size(); j++) { + micro_op->add_sparsity_output(sources[j], images[j]); + } + micro_op->dispatch(this, true); + } } } diff --git a/src/realm/deppart/image.h b/src/realm/deppart/image.h index 2f0347c5ff..cafa58b56e 100644 --- a/src/realm/deppart/image.h +++ b/src/realm/deppart/image.h @@ -97,7 +97,7 @@ namespace Realm { ImageOperation(const IndexSpace &_parent, const DomainTransform &_domain_transform, const ProfilingRequestSet &reqs, GenEventImpl *_finish_event, - EventImpl::gen_t _finish_gen, RegionInstance buffer = RegionInstance::NO_INST); + EventImpl::gen_t _finish_gen, DeppartOutput* buffers = nullptr); virtual ~ImageOperation(void); @@ -122,7 +122,7 @@ namespace Realm { std::vector > diff_rhss; std::vector > images; bool is_intersection; - RegionInstance buffer; + DeppartOutput* buffers; }; template diff 
--git a/src/realm/deppart/image_tmpl.cc b/src/realm/deppart/image_tmpl.cc index 6f4371bae2..f07359c745 100644 --- a/src/realm/deppart/image_tmpl.cc +++ b/src/realm/deppart/image_tmpl.cc @@ -50,10 +50,12 @@ namespace Realm { template class GPUImageMicroOp; \ template class ImageOperation; \ template ImageMicroOp::ImageMicroOp(NodeID, AsyncMicroOp *, Serialization::FixedBufferDeserializer&); \ + template void IndexSpace::estimate_image( \ + const DeppartInput&, DeppartSuggestion&); \ template Event IndexSpace::create_subspaces_by_image( \ const DomainTransform &, const std::vector > &, \ std::vector > &, const ProfilingRequestSet &, Event, \ - RegionInstance, std::pair*) const; \ + DeppartOutput*) const; \ template Event IndexSpace::create_subspaces_by_image_with_difference( \ const DomainTransform &, \ const std::vector >&, \ diff --git a/src/realm/indexspace.h b/src/realm/indexspace.h index 448b2815fb..4e56b4e4a8 100644 --- a/src/realm/indexspace.h +++ b/src/realm/indexspace.h @@ -111,6 +111,21 @@ namespace Realm { size_t field_offset; }; + template + struct DeppartInput { + std::vector, Memory>> insts; + std::vector source_sizes; + size_t parent_size; + }; + + struct DeppartSuggestion { + std::map> suggestions; + }; + + struct DeppartOutput { + std::map buffers; + }; + /** * \class TranslationTransform * A translation transform is a special case of an affine transform @@ -780,8 +795,14 @@ namespace Realm { const DomainTransform &domain_transform, const std::vector> &sources, std::vector> &images, const ProfilingRequestSet &reqs, - Event wait_on = Event::NO_EVENT, RegionInstance buffer = RegionInstance::NO_INST, - std::pair* buffer_bounds = nullptr) const; + Event wait_on = Event::NO_EVENT, DeppartOutput *buffers = nullptr) const; + + template + REALM_PUBLIC_API void estimate_image( + const DeppartInput &input, + DeppartSuggestion &suggestion); + + ///@} @@ -814,8 +835,7 @@ namespace Realm { &field_data, const std::vector> &sources, std::vector> &images, const 
ProfilingRequestSet &reqs, - Event wait_on = Event::NO_EVENT, RegionInstance buffer = RegionInstance::NO_INST, - std::pair* buffer_bounds = nullptr) const; + Event wait_on = Event::NO_EVENT, DeppartOutput* buffers = nullptr) const; // range versions template @@ -831,8 +851,7 @@ namespace Realm { &field_data, const std::vector> &sources, std::vector> &images, const ProfilingRequestSet &reqs, - Event wait_on = Event::NO_EVENT, RegionInstance buffer = RegionInstance::NO_INST, - std::pair* buffer_bounds = nullptr) const; + Event wait_on = Event::NO_EVENT, DeppartOutput* buffers = nullptr) const; ///@} ///@{ diff --git a/src/realm/indexspace.inl b/src/realm/indexspace.inl index 87cab4ce47..0627e9799a 100644 --- a/src/realm/indexspace.inl +++ b/src/realm/indexspace.inl @@ -968,10 +968,10 @@ namespace Realm { const std::vector, Point>> &field_data, const std::vector> &sources, std::vector> &images, const ProfilingRequestSet &reqs, - Event wait_on, RegionInstance buffer, std::pair* buffer_bounds) const + Event wait_on, DeppartOutput* buffers) const { return create_subspaces_by_image(DomainTransform(field_data), sources, - images, reqs, wait_on, buffer, buffer_bounds); + images, reqs, wait_on, buffers); } template @@ -980,10 +980,10 @@ namespace Realm { const std::vector, Rect>> &field_data, const std::vector> &sources, std::vector> &images, const ProfilingRequestSet &reqs, - Event wait_on, RegionInstance buffer, std::pair* buffer_bounds) const + Event wait_on, DeppartOutput* buffers) const { return create_subspaces_by_image(DomainTransform(field_data), sources, - images, reqs, wait_on, buffer, buffer_bounds); + images, reqs, wait_on, buffers); } template diff --git a/tests/deppart.cc b/tests/deppart.cc index 18d74c44f4..624bb84a97 100644 --- a/tests/deppart.cc +++ b/tests/deppart.cc @@ -513,16 +513,7 @@ class BasicTest : public TestInterface { Realm::ProfilingRequestSet(), e01); if(wait_on_events) e02.wait(); - std::pair estimate; - Event _e = 
is_nodes.create_subspaces_by_image(src_field_data_gpu, - p_garbage_edges, - p_garbage_rd, - Realm::ProfilingRequestSet(), - e02, - RegionInstance::NO_INST, - &estimate); - std::cout << "Minimum size: " << estimate.first << " bytes, " - << "Maximum size: " << estimate.second << " bytes\n"; + DeppartOutput output; // an image of p_edges through out_node gives us all the shared nodes, along // with some private nodes @@ -535,11 +526,12 @@ class BasicTest : public TestInterface { IndexSpace<1> instance_index_space(Rect<1>(0, tile_size-1)); RegionInstance buffer; RegionInstance::create_instance(buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); + output.buffers[gpu_memory] = buffer; Event e03 = is_nodes.create_subspaces_by_image(src_field_data_gpu, p_garbage_edges, p_garbage_rd, Realm::ProfilingRequestSet(), - e02, buffer); + e02, &output); if(wait_on_events) e03.wait(); Event e04 = is_edges.create_subspaces_by_preimage(dst_node_field_data, @@ -574,7 +566,7 @@ class BasicTest : public TestInterface { p_edges, p_rd, Realm::ProfilingRequestSet(), - e2, buffer); + e2, &output); if(wait_on_events) e3.wait(); log_app.info() << "GPU Image complete " << Clock::current_time_in_microseconds() << "\n"; log_app.info() << "Starting second GPU preimage " << Clock::current_time_in_microseconds() << "\n"; From 0987dc89a4bb9f9db09fd1635900eaf2b9583b88 Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Sun, 1 Feb 2026 20:18:15 -0800 Subject: [PATCH 04/32] Finished image API --- src/realm/deppart/image.cc | 103 ++++++++++++++-------- src/realm/deppart/image.h | 6 +- src/realm/deppart/image_gpu_impl.hpp | 5 +- src/realm/deppart/image_tmpl.cc | 9 +- src/realm/deppart/partitions_gpu_impl.hpp | 2 +- src/realm/indexspace.h | 33 ++++--- src/realm/indexspace.inl | 8 +- tests/deppart.cc | 11 ++- 8 files changed, 103 insertions(+), 74 deletions(-) diff --git a/src/realm/deppart/image.cc b/src/realm/deppart/image.cc index 19ecf60ea5..d2585bbef0 100644 
--- a/src/realm/deppart/image.cc +++ b/src/realm/deppart/image.cc @@ -32,20 +32,21 @@ namespace Realm { template template - void IndexSpace::estimate_image( - const DeppartInput& input, - DeppartSuggestion& suggestion) { + void IndexSpace::suggest_deppart_buffer_size( + const std::vector>& source_spaces, + const std::vector>& inputs, + std::vector& suggestions) const { size_t minimal_size = 0; size_t source_entries = 0; bool bvh = false; - for (size_t size : input.source_sizes) { - source_entries += size == 0 ? 1 : size; + for (auto subspace : source_spaces) { + source_entries += subspace.entries == 0 ? 1 : subspace.entries; } minimal_size += sizeof(Rect) * source_entries; if (this->dense()) { minimal_size += sizeof(Rect); } else { - minimal_size += sizeof(Rect) * input.parent_size; + minimal_size += sizeof(Rect) * this->sparsity.impl()->get_entries().size(); } if (bvh) { minimal_size += @@ -57,11 +58,27 @@ namespace Realm { (2 * source_entries * sizeof(uint64_t)) + (source_entries * sizeof(uint64_t)); } - for (size_t i = 0; i < input.insts.size(); i++) { - IndexSpace is = input.insts[i].first; - Memory mem = input.insts[i].second; - size_t optimal_size = is.bounds.volume() * sizeof(Rect) * input.source_sizes.size() + minimal_size; - suggestion.suggestions[mem] = std::make_pair(minimal_size, optimal_size); + std::vector result(inputs.size()); + for (size_t i = 0; i < inputs.size(); i++) { + IndexSpace is = inputs[i].space; + Memory mem = inputs[i].location; + if (mem.kind() == Memory::GPU_FB_MEM || + mem.kind() == Memory::Z_COPY_MEM) { + const char* val = std::getenv("MIN_SIZE"); // or any env var + size_t device_size = 2000000; //default + if (val) { + device_size = atoi(val); + } + minimal_size = max(minimal_size, device_size); + size_t optimal_size = is.bounds.volume() * sizeof(Rect) * source_spaces.size() + minimal_size; + result[i].suggested = mem; + result[i].lower_bound = minimal_size; + result[i].upper_bound = optimal_size; + } else { + result[i].suggested 
= Memory::NO_MEMORY; + result[i].lower_bound = 0; + result[i].upper_bound = 0; + } } } @@ -71,7 +88,7 @@ namespace Realm { const DomainTransform &domain_transform, const std::vector> &sources, std::vector> &images, const ProfilingRequestSet &reqs, - Event wait_on, DeppartOutput* buffers) const { + Event wait_on) const { // output vector should start out empty assert(images.empty()); @@ -79,7 +96,7 @@ namespace Realm { Event e = finish_event->current_event(); ImageOperation *op = new ImageOperation( - *this, domain_transform, reqs, finish_event, ID(e).event_generation(), buffers); + *this, domain_transform, reqs, finish_event, ID(e).event_generation()); size_t n = sources.size(); images.resize(n); @@ -471,11 +488,10 @@ namespace Realm { const IndexSpace &_parent, const DomainTransform &_domain_transform, const ProfilingRequestSet &reqs, GenEventImpl *_finish_event, - EventImpl::gen_t _finish_gen, DeppartOutput* _buffers) + EventImpl::gen_t _finish_gen) : PartitioningOperation(reqs, _finish_event, _finish_gen), parent(_parent), - domain_transform(_domain_transform), - buffers(_buffers) {} + domain_transform(_domain_transform) {} template ImageOperation::~ImageOperation(void) @@ -583,26 +599,26 @@ namespace Realm { template void ImageOperation::execute(void) { - std::map,Point> >> gpu_ptr_data; + std::vector,Point> > gpu_ptr_data; std::vector,Point> > cpu_ptr_data; - std::map,Rect> >> gpu_rect_data; + std::vector,Rect> > gpu_rect_data; std::vector,Rect> > cpu_rect_data; for (size_t i = 0; i < domain_transform.ptr_data.size(); i++) { if (domain_transform.ptr_data[i].inst.get_location().kind() == - Memory::GPU_FB_MEM) { - gpu_ptr_data[domain_transform.ptr_data[i].inst.get_location()].push_back(domain_transform.ptr_data[i]); + Memory::GPU_FB_MEM || domain_transform.ptr_data[i].inst.get_location().kind() == Memory::Z_COPY_MEM) { + gpu_ptr_data.push_back(domain_transform.ptr_data[i]); } else { cpu_ptr_data.push_back(domain_transform.ptr_data[i]); } } - for (size_t i = 0; i 
< domain_transform.range_data.size(); i++) { - if (domain_transform.range_data[i].inst.get_location().kind() == - Memory::GPU_FB_MEM) { - gpu_rect_data[domain_transform.range_data[i].inst.get_location()].push_back(domain_transform.range_data[i]); - } else { - cpu_rect_data.push_back(domain_transform.range_data[i]); - } - } + for (size_t i = 0; i < domain_transform.range_data.size(); i++) { + if (domain_transform.range_data[i].inst.get_location().kind() == + Memory::GPU_FB_MEM || domain_transform.range_data[i].inst.get_location().kind() == Memory::Z_COPY_MEM) { + gpu_rect_data.push_back(domain_transform.range_data[i]); + } else { + cpu_rect_data.push_back(domain_transform.range_data[i]); + } + } bool gpu_data = !gpu_ptr_data.empty() || !gpu_rect_data.empty(); if (domain_transform.type == DomainTransform::DomainTransformType::STRUCTURED && !gpu_data) { @@ -639,8 +655,7 @@ namespace Realm { uop->dispatch(this, true /* ok to run in this thread */); } else { - if (gpu_data) assert(buffers); - bool opcount = cpu_ptr_data.size() + cpu_rect_data.size() + gpu_ptr_data.size() + gpu_rect_data.size(); + size_t opcount = cpu_ptr_data.size() + cpu_rect_data.size() + gpu_ptr_data.size() + gpu_rect_data.size(); bool exclusive = gpu_data && (opcount == 1); if (!exclusive) { // launch full cross-product of image micro ops right away @@ -674,20 +689,30 @@ namespace Realm { images[j]); uop->dispatch(this, true /* ok to run in this thread */); } - for (auto it = gpu_ptr_data.begin(); it != gpu_ptr_data.end(); it++) { + for (auto ptr_fdd : gpu_ptr_data) { // launch full cross-product of image micro ops right away - Memory my_mem = it->first; - domain_transform.ptr_data = it->second; - assert(buffers->buffers.find(my_mem) != buffers->buffers.end()); - RegionInstance buffer = buffers->buffers[my_mem]; + assert(ptr_fdd.scratch_buffer != RegionInstance::NO_INST); + domain_transform.ptr_data = {ptr_fdd}; GPUImageMicroOp *micro_op = - new GPUImageMicroOp( - parent, domain_transform, 
exclusive, buffer); + new GPUImageMicroOp( + parent, domain_transform, exclusive); for (size_t j = 0; j < sources.size(); j++) { micro_op->add_sparsity_output(sources[j], images[j]); } micro_op->dispatch(this, true); } + for (auto rect_fdd : gpu_rect_data) { + // launch full cross-product of image micro ops right away + assert(rect_fdd.scratch_buffer != RegionInstance::NO_INST); + domain_transform.range_data = {rect_fdd}; + GPUImageMicroOp *micro_op = + new GPUImageMicroOp( + parent, domain_transform, exclusive); + for (size_t j = 0; j < sources.size(); j++) { + micro_op->add_sparsity_output(sources[j], images[j]); + } + micro_op->dispatch(this, true); + } } } @@ -897,8 +922,8 @@ namespace Realm { GPUImageMicroOp::GPUImageMicroOp( const IndexSpace &_parent, const DomainTransform &_domain_transform, - bool _exclusive, RegionInstance _buffer) - : parent_space(_parent), domain_transform(_domain_transform), buffer(_buffer) + bool _exclusive) + : parent_space(_parent), domain_transform(_domain_transform) { this->exclusive = _exclusive; } diff --git a/src/realm/deppart/image.h b/src/realm/deppart/image.h index cafa58b56e..58131338a3 100644 --- a/src/realm/deppart/image.h +++ b/src/realm/deppart/image.h @@ -97,7 +97,7 @@ namespace Realm { ImageOperation(const IndexSpace &_parent, const DomainTransform &_domain_transform, const ProfilingRequestSet &reqs, GenEventImpl *_finish_event, - EventImpl::gen_t _finish_gen, DeppartOutput* buffers = nullptr); + EventImpl::gen_t _finish_gen); virtual ~ImageOperation(void); @@ -122,7 +122,6 @@ namespace Realm { std::vector > diff_rhss; std::vector > images; bool is_intersection; - DeppartOutput* buffers; }; template @@ -156,7 +155,7 @@ namespace Realm { GPUImageMicroOp( const IndexSpace &_parent, const DomainTransform &_domain_transform, - bool _exclusive, RegionInstance buffer = RegionInstance::NO_INST); + bool _exclusive); virtual ~GPUImageMicroOp(void); @@ -176,7 +175,6 @@ namespace Realm { DomainTransform domain_transform; 
std::vector > sources; std::vector > sparsity_outputs; - RegionInstance buffer; }; }; // namespace Realm diff --git a/src/realm/deppart/image_gpu_impl.hpp b/src/realm/deppart/image_gpu_impl.hpp index ce357436b7..b3c38789f5 100644 --- a/src/realm/deppart/image_gpu_impl.hpp +++ b/src/realm/deppart/image_gpu_impl.hpp @@ -224,6 +224,8 @@ void GPUImageMicroOp::gpu_populate_ptrs() return; } + RegionInstance buffer = domain_transform.ptr_data[0].scratch_buffer; + NVTX_DEPPART(gpu_image); Memory sysmem; @@ -233,8 +235,7 @@ void GPUImageMicroOp::gpu_populate_ptrs() size_t tile_size = buffer.get_layout()->bytes_used; std::cout << "Using tile size of " << tile_size << " bytes." << std::endl; - RegionInstance fixed_buffer = buffer; - Arena buffer_arena(reinterpret_cast(AffineAccessor(fixed_buffer, 0).base), tile_size); + Arena buffer_arena(reinterpret_cast(AffineAccessor(buffer, 0).base), tile_size); collapsed_space src_space; src_space.offsets = buffer_arena.alloc(sources.size()+1); diff --git a/src/realm/deppart/image_tmpl.cc b/src/realm/deppart/image_tmpl.cc index f07359c745..288e583758 100644 --- a/src/realm/deppart/image_tmpl.cc +++ b/src/realm/deppart/image_tmpl.cc @@ -50,12 +50,13 @@ namespace Realm { template class GPUImageMicroOp; \ template class ImageOperation; \ template ImageMicroOp::ImageMicroOp(NodeID, AsyncMicroOp *, Serialization::FixedBufferDeserializer&); \ - template void IndexSpace::estimate_image( \ - const DeppartInput&, DeppartSuggestion&); \ + template void IndexSpace::suggest_deppart_buffer_size( \ + const std::vector>&, \ + const std::vector>&, \ + std::vector&) const; \ template Event IndexSpace::create_subspaces_by_image( \ const DomainTransform &, const std::vector > &, \ - std::vector > &, const ProfilingRequestSet &, Event, \ - DeppartOutput*) const; \ + std::vector > &, const ProfilingRequestSet &, Event) const; \ template Event IndexSpace::create_subspaces_by_image_with_difference( \ const DomainTransform &, \ const std::vector >&, \ diff 
--git a/src/realm/deppart/partitions_gpu_impl.hpp b/src/realm/deppart/partitions_gpu_impl.hpp index 678102b56f..b1459f2ede 100644 --- a/src/realm/deppart/partitions_gpu_impl.hpp +++ b/src/realm/deppart/partitions_gpu_impl.hpp @@ -1524,7 +1524,7 @@ namespace Realm { std::vector> h_rects(end - start); CUDA_CHECK(cudaMemcpyAsync(h_rects.data(), final_rects + start, (end - start) * sizeof(Rect), cudaMemcpyDeviceToHost, stream), stream); CUDA_CHECK(cudaStreamSynchronize(stream), stream); - impl->contribute_dense_rect_list(h_rects, true); + impl->contribute_dense_rect_list(h_rects, false); } else { impl->contribute_nothing(); } diff --git a/src/realm/indexspace.h b/src/realm/indexspace.h index 4e56b4e4a8..cf6caf9a26 100644 --- a/src/realm/indexspace.h +++ b/src/realm/indexspace.h @@ -109,21 +109,25 @@ namespace Realm { IS index_space; RegionInstance inst; size_t field_offset; + RegionInstance scratch_buffer = RegionInstance::NO_INST; }; template - struct DeppartInput { - std::vector, Memory>> insts; - std::vector source_sizes; - size_t parent_size; + struct DeppartSubspace { + IndexSpace space; + size_t entries; }; - struct DeppartSuggestion { - std::map> suggestions; + template + struct DeppartEstimateInput { + IndexSpace space; + Memory location; }; - struct DeppartOutput { - std::map buffers; + struct DeppartEstimateSuggestion { + Memory suggested; + size_t lower_bound; + size_t upper_bound; }; /** @@ -795,12 +799,13 @@ namespace Realm { const DomainTransform &domain_transform, const std::vector> &sources, std::vector> &images, const ProfilingRequestSet &reqs, - Event wait_on = Event::NO_EVENT, DeppartOutput *buffers = nullptr) const; + Event wait_on = Event::NO_EVENT) const; template - REALM_PUBLIC_API void estimate_image( - const DeppartInput &input, - DeppartSuggestion &suggestion); + REALM_PUBLIC_API void suggest_deppart_buffer_size( + const std::vector>& source_spaces, + const std::vector>& inputs, + std::vector& suggestions) const; ///@} @@ -835,7 +840,7 @@ 
namespace Realm { &field_data, const std::vector> &sources, std::vector> &images, const ProfilingRequestSet &reqs, - Event wait_on = Event::NO_EVENT, DeppartOutput* buffers = nullptr) const; + Event wait_on = Event::NO_EVENT) const; // range versions template @@ -851,7 +856,7 @@ namespace Realm { &field_data, const std::vector> &sources, std::vector> &images, const ProfilingRequestSet &reqs, - Event wait_on = Event::NO_EVENT, DeppartOutput* buffers = nullptr) const; + Event wait_on = Event::NO_EVENT) const; ///@} ///@{ diff --git a/src/realm/indexspace.inl b/src/realm/indexspace.inl index 0627e9799a..d2c41e4c4e 100644 --- a/src/realm/indexspace.inl +++ b/src/realm/indexspace.inl @@ -968,10 +968,10 @@ namespace Realm { const std::vector, Point>> &field_data, const std::vector> &sources, std::vector> &images, const ProfilingRequestSet &reqs, - Event wait_on, DeppartOutput* buffers) const + Event wait_on) const { return create_subspaces_by_image(DomainTransform(field_data), sources, - images, reqs, wait_on, buffers); + images, reqs, wait_on); } template @@ -980,10 +980,10 @@ namespace Realm { const std::vector, Rect>> &field_data, const std::vector> &sources, std::vector> &images, const ProfilingRequestSet &reqs, - Event wait_on, DeppartOutput* buffers) const + Event wait_on) const { return create_subspaces_by_image(DomainTransform(field_data), sources, - images, reqs, wait_on, buffers); + images, reqs, wait_on); } template diff --git a/tests/deppart.cc b/tests/deppart.cc index 624bb84a97..eaf4a012e8 100644 --- a/tests/deppart.cc +++ b/tests/deppart.cc @@ -513,7 +513,6 @@ class BasicTest : public TestInterface { Realm::ProfilingRequestSet(), e01); if(wait_on_events) e02.wait(); - DeppartOutput output; // an image of p_edges through out_node gives us all the shared nodes, along // with some private nodes @@ -524,14 +523,14 @@ class BasicTest : public TestInterface { } std::vector byte_fields = {sizeof(char)}; IndexSpace<1> instance_index_space(Rect<1>(0, tile_size-1)); 
- RegionInstance buffer; - RegionInstance::create_instance(buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); - output.buffers[gpu_memory] = buffer; + for (size_t i = 0; i < src_field_data_gpu.size(); i++) { + RegionInstance::create_instance(src_field_data_gpu[i].scratch_buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); + } Event e03 = is_nodes.create_subspaces_by_image(src_field_data_gpu, p_garbage_edges, p_garbage_rd, Realm::ProfilingRequestSet(), - e02, &output); + e02); if(wait_on_events) e03.wait(); Event e04 = is_edges.create_subspaces_by_preimage(dst_node_field_data, @@ -566,7 +565,7 @@ class BasicTest : public TestInterface { p_edges, p_rd, Realm::ProfilingRequestSet(), - e2, &output); + e2); if(wait_on_events) e3.wait(); log_app.info() << "GPU Image complete " << Clock::current_time_in_microseconds() << "\n"; log_app.info() << "Starting second GPU preimage " << Clock::current_time_in_microseconds() << "\n"; From 460710d2582a7cafb3bf3444cc4405e0511a1719 Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Sun, 1 Feb 2026 22:06:28 -0800 Subject: [PATCH 05/32] builds with new APIs (ops themselves are slightly broken) --- src/CMakeLists.txt | 2 +- src/realm/deppart/byfield.cc | 62 +- src/realm/deppart/byfield_tmpl.cc | 6 +- src/realm/deppart/image.cc | 16 +- src/realm/deppart/image_tmpl.cc | 2 +- src/realm/deppart/preimage.cc | 1149 +++++++++++--------- src/realm/deppart/preimage.h | 35 +- src/realm/deppart/preimage_gpu_impl.hpp | 468 ++++++++ src/realm/deppart/preimage_gpu_kernels.hpp | 256 +++++ src/realm/deppart/preimage_gpu_tmpl.cu | 69 ++ src/realm/deppart/preimage_tmpl.cc | 43 +- src/realm/indexspace.h | 13 +- 12 files changed, 1572 insertions(+), 549 deletions(-) create mode 100644 src/realm/deppart/preimage_gpu_impl.hpp create mode 100644 src/realm/deppart/preimage_gpu_kernels.hpp create mode 100644 src/realm/deppart/preimage_gpu_tmpl.cu diff --git 
a/src/CMakeLists.txt b/src/CMakeLists.txt index fd0b1fb81a..c277a1b74d 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -158,7 +158,7 @@ endforeach() # Generate per-dimension object files for GPU deppart. foreach(INST_N1 RANGE 1 ${REALM_MAX_DIM}) foreach(INST_N2 RANGE 1 ${REALM_MAX_DIM}) - foreach(SRCFILE realm/deppart/byfield realm/deppart/image) + foreach(SRCFILE realm/deppart/byfield realm/deppart/image realm/deppart/preimage) set(_result_file "${CMAKE_CURRENT_BINARY_DIR}/${SRCFILE}_gpu_${INST_N1}_${INST_N2}.cu") # use cmake's configure_file for a portable way of creating wrapper source files configure_file("${PROJECT_SOURCE_DIR}/cmake/deppart_tmpl.cu.in" "${_result_file}") diff --git a/src/realm/deppart/byfield.cc b/src/realm/deppart/byfield.cc index 51b106f519..c6ccacc6ce 100644 --- a/src/realm/deppart/byfield.cc +++ b/src/realm/deppart/byfield.cc @@ -29,6 +29,34 @@ namespace Realm { extern Logger log_part; extern Logger log_uop_timing; + template + template + void IndexSpace::suggest_byfield_buffer_size( + const std::vector>& inputs, + std::vector& suggestions) const { + suggestions = std::vector(inputs.size()); + for (size_t i = 0; i < inputs.size(); i++) { + IndexSpace is = inputs[i].space; + Memory mem = inputs[i].location; + if (mem.kind() == Memory::GPU_FB_MEM || + mem.kind() == Memory::Z_COPY_MEM) { + const char* val = std::getenv("MIN_SIZE"); // or any env var + size_t device_size = 2000000; //default + if (val) { + device_size = atoi(val); + } + size_t optimal_size = is.bounds.volume() * sizeof(Rect); + suggestions[i].suggested = mem; + suggestions[i].lower_bound = device_size; + suggestions[i].upper_bound = max(device_size, optimal_size); + } else { + suggestions[i].suggested = Memory::NO_MEMORY; + suggestions[i].lower_bound = 0; + suggestions[i].upper_bound = 0; + } + } + } + template template @@ -380,33 +408,35 @@ namespace Realm { std::vector,FT> > gpu_field_data; std::vector,FT> > cpu_field_data; for (size_t i = 0; i < 
field_data.size(); i++) { - if (field_data[i].inst.get_location().kind() == Memory::GPU_FB_MEM) { + if (field_data[i].inst.get_location().kind() == Memory::GPU_FB_MEM + || field_data[i].inst.get_location().kind() == Memory::Z_COPY_MEM) { gpu_field_data.push_back(field_data[i]); } else { cpu_field_data.push_back(field_data[i]); } } - if (!cpu_field_data.empty()) { + bool exclusive = (gpu_field_data.size() == 1) && cpu_field_data.empty(); + if (!exclusive) { for (size_t i = 0; i < subspaces.size(); i++) - SparsityMapImpl::lookup(subspaces[i])->set_contributor_count(cpu_field_data.size() + (gpu_field_data.empty() ? 0 : 1)); - for (size_t i = 0; i < cpu_field_data.size(); i++) { - ByFieldMicroOp *uop = new ByFieldMicroOp(parent, - cpu_field_data[i].index_space, - cpu_field_data[i].inst, - cpu_field_data[i].field_offset); - for (size_t j = 0; j < colors.size(); j++) - uop->add_sparsity_output(colors[j], subspaces[j]); - - uop->dispatch(this, true /* ok to run in this thread */); - } + SparsityMapImpl::lookup(subspaces[i])->set_contributor_count(cpu_field_data.size() + gpu_field_data.size()); } - if (!gpu_field_data.empty()) { - GPUByFieldMicroOp *uop = new GPUByFieldMicroOp(parent, gpu_field_data, cpu_field_data.empty()); + for (size_t i = 0; i < cpu_field_data.size(); i++) { + ByFieldMicroOp *uop = new ByFieldMicroOp(parent, + cpu_field_data[i].index_space, + cpu_field_data[i].inst, + cpu_field_data[i].field_offset); + for (size_t j = 0; j < colors.size(); j++) + uop->add_sparsity_output(colors[j], subspaces[j]); + + uop->dispatch(this, true /* ok to run in this thread */); + } + for (auto fdd : gpu_field_data) { + std::vector,FT> > single_gpu_field_data = {fdd}; + GPUByFieldMicroOp *uop = new GPUByFieldMicroOp(parent, single_gpu_field_data, exclusive); for (size_t i = 0; i < colors.size(); i++) { uop->add_sparsity_output(colors[i], subspaces[i]); } uop->dispatch(this, false); - } } diff --git a/src/realm/deppart/byfield_tmpl.cc b/src/realm/deppart/byfield_tmpl.cc 
index 38a95a040d..7575607ea2 100644 --- a/src/realm/deppart/byfield_tmpl.cc +++ b/src/realm/deppart/byfield_tmpl.cc @@ -52,7 +52,11 @@ namespace Realm { const std::vector&, \ std::vector >&, \ const ProfilingRequestSet &, \ - Event) const; + Event) const; \ + template void IndexSpace::suggest_byfield_buffer_size( \ + const std::vector>&, \ + std::vector&) const; + FOREACH_NTF(DOIT) diff --git a/src/realm/deppart/image.cc b/src/realm/deppart/image.cc index d2585bbef0..d207161b22 100644 --- a/src/realm/deppart/image.cc +++ b/src/realm/deppart/image.cc @@ -32,7 +32,7 @@ namespace Realm { template template - void IndexSpace::suggest_deppart_buffer_size( + void IndexSpace::suggest_image_buffer_size( const std::vector>& source_spaces, const std::vector>& inputs, std::vector& suggestions) const { @@ -58,7 +58,7 @@ namespace Realm { (2 * source_entries * sizeof(uint64_t)) + (source_entries * sizeof(uint64_t)); } - std::vector result(inputs.size()); + suggestions = std::vector(inputs.size()); for (size_t i = 0; i < inputs.size(); i++) { IndexSpace is = inputs[i].space; Memory mem = inputs[i].location; @@ -71,13 +71,13 @@ namespace Realm { } minimal_size = max(minimal_size, device_size); size_t optimal_size = is.bounds.volume() * sizeof(Rect) * source_spaces.size() + minimal_size; - result[i].suggested = mem; - result[i].lower_bound = minimal_size; - result[i].upper_bound = optimal_size; + suggestions[i].suggested = mem; + suggestions[i].lower_bound = minimal_size; + suggestions[i].upper_bound = optimal_size; } else { - result[i].suggested = Memory::NO_MEMORY; - result[i].lower_bound = 0; - result[i].upper_bound = 0; + suggestions[i].suggested = Memory::NO_MEMORY; + suggestions[i].lower_bound = 0; + suggestions[i].upper_bound = 0; } } } diff --git a/src/realm/deppart/image_tmpl.cc b/src/realm/deppart/image_tmpl.cc index 288e583758..8a0e686f22 100644 --- a/src/realm/deppart/image_tmpl.cc +++ b/src/realm/deppart/image_tmpl.cc @@ -50,7 +50,7 @@ namespace Realm { template class 
GPUImageMicroOp; \ template class ImageOperation; \ template ImageMicroOp::ImageMicroOp(NodeID, AsyncMicroOp *, Serialization::FixedBufferDeserializer&); \ - template void IndexSpace::suggest_deppart_buffer_size( \ + template void IndexSpace::suggest_image_buffer_size( \ const std::vector>&, \ const std::vector>&, \ std::vector&) const; \ diff --git a/src/realm/deppart/preimage.cc b/src/realm/deppart/preimage.cc index 0e43956865..5df628f2f6 100644 --- a/src/realm/deppart/preimage.cc +++ b/src/realm/deppart/preimage.cc @@ -17,13 +17,14 @@ // preimage operations for Realm dependent partitioning -#include "realm/deppart/preimage.h" - -#include "realm/deppart/deppart_config.h" -#include "realm/deppart/rectlist.h" -#include "realm/deppart/inst_helper.h" -#include "realm/deppart/image.h" -#include "realm/logging.h" +#include "preimage.h" + +#include "deppart_config.h" +#include "rectlist.h" +#include "inst_helper.h" +#include "image.h" +#include "../logging.h" +#include #include namespace Realm { @@ -31,6 +32,58 @@ namespace Realm { extern Logger log_part; extern Logger log_uop_timing; + template + template + void IndexSpace::suggest_preimage_buffer_size( + const std::vector>& target_spaces, + const std::vector>& inputs, + std::vector& suggestions) const { + size_t minimal_size = 0; + size_t source_entries = 0; + bool bvh = false; + for (auto subspace : target_spaces) { + source_entries += subspace.entries == 0 ? 
1 : subspace.entries; + } + minimal_size += sizeof(Rect) * source_entries; + if (this->dense()) { + minimal_size += sizeof(Rect); + } else { + minimal_size += sizeof(Rect) * this->sparsity.impl()->get_entries().size(); + } + if (bvh) { + minimal_size += + (source_entries * sizeof(uint64_t)) + + (source_entries * sizeof(size_t)) + + ((2*source_entries - 1) * sizeof(Rect)) + + (2 * (2*source_entries - 1) * sizeof(int)) + + sizeof(Rect) + + (2 * source_entries * sizeof(uint64_t)) + + (source_entries * sizeof(uint64_t)); + } + suggestions = std::vector(inputs.size()); + for (size_t i = 0; i < inputs.size(); i++) { + IndexSpace is = inputs[i].space; + Memory mem = inputs[i].location; + if (mem.kind() == Memory::GPU_FB_MEM || + mem.kind() == Memory::Z_COPY_MEM) { + const char* val = std::getenv("MIN_SIZE"); // or any env var + size_t device_size = 2000000; //default + if (val) { + device_size = atoi(val); + } + minimal_size = max(minimal_size, device_size); + size_t optimal_size = is.bounds.volume() * sizeof(Rect) * target_spaces.size() + minimal_size; + suggestions[i].suggested = mem; + suggestions[i].lower_bound = minimal_size; + suggestions[i].upper_bound = optimal_size; + } else { + suggestions[i].suggested = Memory::NO_MEMORY; + suggestions[i].lower_bound = 0; + suggestions[i].upper_bound = 0; + } + } + } + template template Event IndexSpace::create_subspaces_by_preimage( @@ -165,529 +218,625 @@ namespace Realm { std::cout << " " << targets[it->first] << " = " << it->second->rects.size() << " rectangles" << std::endl; #endif - // iterate over sparsity outputs and contribute to all (even if we didn't have any - // points found for it) - int empty_count = 0; - for(size_t i = 0; i < sparsity_outputs.size(); i++) { - SparsityMapImpl *impl = SparsityMapImpl::lookup(sparsity_outputs[i]); - typename std::map *>::const_iterator it2 = rect_map.find(i); - if(it2 != rect_map.end()) { - impl->contribute_dense_rect_list(it2->second->rects, true /*disjoint*/); - delete 
it2->second; - } else { - impl->contribute_nothing(); - empty_count++; - } - } - if(empty_count > 0) - log_part.info() << empty_count << " empty preimages (out of " << sparsity_outputs.size() << ")"; - } - - template - void PreimageMicroOp::dispatch(PartitioningOperation *op, bool inline_ok) - { - // a PreimageMicroOp should always be executed on whichever node the field data lives - NodeID exec_node = ID(inst).instance_owner_node(); - - if(exec_node != Network::my_node_id) { - forward_microop >(exec_node, op, this); - return; - } - - // Need valid data for the instance space - if (!inst_space.dense()) { - // it's safe to add the count after the registration only because we initialized - // the count to 2 instead of 1 - bool registered = SparsityMapImpl::lookup(inst_space.sparsity)->add_waiter(this, true /*precise*/); - if(registered) - wait_count.fetch_add(1); - } - - // need valid data for each target - for(size_t i = 0; i < targets.size(); i++) { - if(!targets[i].dense()) { - // it's safe to add the count after the registration only because we initialized - // the count to 2 instead of 1 - bool registered = SparsityMapImpl::lookup(targets[i].sparsity)->add_waiter(this, true /*precise*/); - if(registered) - wait_count.fetch_add(1); - } - } - - // need valid data for the parent space too - if(!parent_space.dense()) { - // it's safe to add the count after the registration only because we initialized - // the count to 2 instead of 1 - bool registered = SparsityMapImpl::lookup(parent_space.sparsity)->add_waiter(this, true /*precise*/); - if(registered) - wait_count.fetch_add(1); - } - - finish_dispatch(op, inline_ok); - } - - template - template - bool PreimageMicroOp::serialize_params(S& s) const - { - return((s << parent_space) && - (s << inst_space) && - (s << inst) && - (s << field_offset) && - (s << is_ranged) && - (s << targets) && - (s << sparsity_outputs)); - } - - template - template - PreimageMicroOp::PreimageMicroOp(NodeID _requestor, - AsyncMicroOp 
*_async_microop, S& s) - : PartitioningMicroOp(_requestor, _async_microop) - { - bool ok = ((s >> parent_space) && - (s >> inst_space) && - (s >> inst) && - (s >> field_offset) && - (s >> is_ranged) && - (s >> targets) && - (s >> sparsity_outputs)); - assert(ok); - (void)ok; - } - - template - ActiveMessageHandlerReg > > PreimageMicroOp::areg; - - - //////////////////////////////////////////////////////////////////////// - // - // class PreimageOperation - - template - PreimageOperation::PreimageOperation( - const IndexSpace &_parent, - const DomainTransform &_domain_transform, - const ProfilingRequestSet &reqs, GenEventImpl *_finish_event, - EventImpl::gen_t _finish_gen) - : PartitioningOperation(reqs, _finish_event, _finish_gen), - parent(_parent), - domain_transform(_domain_transform), - overlap_tester(0), - dummy_overlap_uop(0) { - areg.force_instantiation(); - } - - template - PreimageOperation::~PreimageOperation(void) - { - if(overlap_tester) - delete overlap_tester; - } - - template - IndexSpace PreimageOperation::add_target(const IndexSpace& target) - { - // try to filter out obviously empty targets - if(parent.empty() || target.empty()) - return IndexSpace::make_empty(); - - // otherwise it'll be something smaller than the current parent - IndexSpace preimage; - preimage.bounds = parent.bounds; - - // if the target has a sparsity map, use the same node - otherwise - // get a sparsity ID by round-robin'ing across the nodes that have field data - int target_node; - if(!target.dense()) - target_node = ID(target.sparsity).sparsity_creator_node(); - else if (!domain_transform.ptr_data.empty()) - target_node = - ID(domain_transform - .ptr_data[targets.size() % domain_transform.ptr_data.size()] - .inst) - .instance_owner_node(); - else - target_node = - ID(domain_transform - .range_data[targets.size() % domain_transform.range_data.size()] - .inst) - .instance_owner_node(); - SparsityMap sparsity = 
get_runtime()->get_available_sparsity_impl(target_node)->me.convert >(); - preimage.sparsity = sparsity; - - targets.push_back(target); - preimages.push_back(sparsity); - - return preimage; - } - - template - void PreimageOperation::execute(void) { - if (domain_transform.type == - DomainTransform::DomainTransformType::STRUCTURED) { - for (size_t i = 0; i < preimages.size(); i++) { - SparsityMapImpl::lookup(preimages[i])->set_contributor_count(1); - } - - StructuredPreimageMicroOp *micro_op = - new StructuredPreimageMicroOp( - domain_transform.structured_transform, parent); - - for (size_t j = 0; j < targets.size(); j++) { - micro_op->add_sparsity_output(targets[j], preimages[j]); - } - micro_op->dispatch(this, true); - } else { - if (!DeppartConfig::cfg_disable_intersection_optimization) { - // build the overlap tester based on the targets, since they're at least - // known - ComputeOverlapMicroOp *uop = - new ComputeOverlapMicroOp(this); - - remaining_sparse_images.store(domain_transform.ptr_data.size() + - domain_transform.range_data.size()); - contrib_counts.resize(preimages.size(), atomic(0)); - - // create a dummy async microop that lives until we've received all the - // sparse images - dummy_overlap_uop = new AsyncMicroOp(this, 0); - add_async_work_item(dummy_overlap_uop); - - // add each target, but also generate a bounding box for all of them - Rect target_bbox; - for (size_t i = 0; i < targets.size(); i++) { - uop->add_input_space(targets[i]); - if (i == 0) - target_bbox = targets[i].bounds; - else - target_bbox = target_bbox.union_bbox(targets[i].bounds); - } - - for (size_t i = 0; i < domain_transform.ptr_data.size(); i++) { - // in parallel, we will request the approximate images of each instance's - // data (ideally limited to the target_bbox) - ImageMicroOp *img = new ImageMicroOp( - target_bbox, domain_transform.ptr_data[i].index_space, - domain_transform.ptr_data[i].inst, - domain_transform.ptr_data[i].field_offset, false /*ptrs*/); - 
img->add_approx_output(i, this); - img->dispatch(this, false /* do not run in this thread */); - } - - for (size_t i = 0; i < domain_transform.range_data.size(); i++) { - // in parallel, we will request the approximate images of each instance's - // data (ideally limited to the target_bbox) - ImageMicroOp *img = new ImageMicroOp( - target_bbox, domain_transform.range_data[i].index_space, - domain_transform.range_data[i].inst, - domain_transform.range_data[i].field_offset, true /*ranges*/); - img->add_approx_output(i + domain_transform.ptr_data.size(), this); - img->dispatch(this, false /* do not run in this thread */); - } + // iterate over sparsity outputs and contribute to all (even if we didn't have any + // points found for it) + int empty_count = 0; + for (size_t i = 0; i < sparsity_outputs.size(); i++) { + SparsityMapImpl *impl = SparsityMapImpl::lookup(sparsity_outputs[i]); + typename std::map *>::const_iterator it2 = rect_map.find(i); + if (it2 != rect_map.end()) { + impl->contribute_dense_rect_list(it2->second->rects, true /*disjoint*/); + delete it2->second; + } else { + impl->contribute_nothing(); + empty_count++; + } + } + if (empty_count > 0) { + log_part.info() << empty_count << " empty preimages (out of " << sparsity_outputs.size() << ")"; + } + } - uop->dispatch(this, true /* ok to run in this thread */); - } else { - for (size_t i = 0; i < preimages.size(); i++) - SparsityMapImpl::lookup(preimages[i]) - ->set_contributor_count(domain_transform.ptr_data.size() + - domain_transform.range_data.size()); - - for (size_t i = 0; i < domain_transform.ptr_data.size(); i++) { - PreimageMicroOp *uop = new PreimageMicroOp( - parent, domain_transform.ptr_data[i].index_space, - domain_transform.ptr_data[i].inst, - domain_transform.ptr_data[i].field_offset, false /*ptrs*/); - for (size_t j = 0; j < targets.size(); j++) - uop->add_sparsity_output(targets[j], preimages[j]); - uop->dispatch(this, true /* ok to run in this thread */); - } + template + void 
PreimageMicroOp::dispatch(PartitioningOperation *op, bool inline_ok) { + // a PreimageMicroOp should always be executed on whichever node the field data lives + NodeID exec_node = ID(inst).instance_owner_node(); + + if (exec_node != Network::my_node_id) { + forward_microop >(exec_node, op, this); + return; + } + + // Need valid data for the instance space + if (!inst_space.dense()) { + // it's safe to add the count after the registration only because we initialized + // the count to 2 instead of 1 + bool registered = SparsityMapImpl::lookup(inst_space.sparsity)->add_waiter(this, true /*precise*/); + if (registered) + wait_count.fetch_add(1); + } + + // need valid data for each target + for (size_t i = 0; i < targets.size(); i++) { + if (!targets[i].dense()) { + // it's safe to add the count after the registration only because we initialized + // the count to 2 instead of 1 + bool registered = SparsityMapImpl::lookup(targets[i].sparsity)->add_waiter( + this, true /*precise*/); + if (registered) + wait_count.fetch_add(1); + } + } + + // need valid data for the parent space too + if (!parent_space.dense()) { + // it's safe to add the count after the registration only because we initialized + // the count to 2 instead of 1 + bool registered = SparsityMapImpl::lookup(parent_space.sparsity)->add_waiter(this, true /*precise*/); + if (registered) + wait_count.fetch_add(1); + } + + finish_dispatch(op, inline_ok); + } - for (size_t i = 0; i < domain_transform.range_data.size(); i++) { - PreimageMicroOp *uop = new PreimageMicroOp( - parent, domain_transform.range_data[i].index_space, - domain_transform.range_data[i].inst, - domain_transform.range_data[i].field_offset, true /*ranges*/); - for (size_t j = 0; j < targets.size(); j++) - uop->add_sparsity_output(targets[j], preimages[j]); - uop->dispatch(this, true /* ok to run in this thread */); - } - } - } - } + template + template + bool PreimageMicroOp::serialize_params(S &s) const { + return ((s << parent_space) && + (s << 
inst_space) && + (s << inst) && + (s << field_offset) && + (s << is_ranged) && + (s << targets) && + (s << sparsity_outputs)); + } - template - void PreimageOperation::provide_sparse_image(int index, const Rect *rects, size_t count) - { - // atomically check the overlap tester's readiness and queue us if not - bool tester_ready = false; - { - AutoLock<> al(mutex); - if(overlap_tester != 0) { - tester_ready = true; - } else { - std::vector >& r = pending_sparse_images[index]; - r.insert(r.end(), rects, rects + count); - } - } + template + template + PreimageMicroOp::PreimageMicroOp(NodeID _requestor, + AsyncMicroOp *_async_microop, S &s) + : PartitioningMicroOp(_requestor, _async_microop) { + bool ok = ((s >> parent_space) && + (s >> inst_space) && + (s >> inst) && + (s >> field_offset) && + (s >> is_ranged) && + (s >> targets) && + (s >> sparsity_outputs)); + assert(ok); + (void) ok; + } - if(tester_ready) { - // see which of the targets this image overlaps - std::set overlaps; - overlap_tester->test_overlap(rects, count, overlaps); - if((size_t)index < domain_transform.ptr_data.size()) { - log_part.info() << "image of ptr_data[" << index << "] overlaps " << overlaps.size() << " targets"; - PreimageMicroOp *uop = new PreimageMicroOp( - parent, domain_transform.ptr_data[index].index_space, - domain_transform.ptr_data[index].inst, - domain_transform.ptr_data[index].field_offset, false /*ptrs*/); - for(std::set::const_iterator it2 = overlaps.begin(); - it2 != overlaps.end(); - it2++) { - int j = *it2; - contrib_counts[j].fetch_add(1); - uop->add_sparsity_output(targets[j], preimages[j]); + template + ActiveMessageHandlerReg > > PreimageMicroOp::areg; + + + //////////////////////////////////////////////////////////////////////// + // + // class PreimageOperation + + template + PreimageOperation::PreimageOperation( + const IndexSpace &_parent, + const DomainTransform &_domain_transform, + const ProfilingRequestSet &reqs, GenEventImpl *_finish_event, + EventImpl::gen_t 
_finish_gen) + : PartitioningOperation(reqs, _finish_event, _finish_gen), + parent(_parent), + domain_transform(_domain_transform), + overlap_tester(0), + dummy_overlap_uop(0) { + areg.force_instantiation(); } - uop->dispatch(this, false /* do not run in this thread */); - } else { - size_t rel_index = index - domain_transform.ptr_data.size(); - assert(rel_index < domain_transform.range_data.size()); - log_part.info() << "image of range_data[" << rel_index << "] overlaps " << overlaps.size() << " targets"; - PreimageMicroOp *uop = new PreimageMicroOp( - parent, domain_transform.range_data[rel_index].index_space, - domain_transform.range_data[rel_index].inst, - domain_transform.range_data[rel_index].field_offset, - true /*ranges*/); - for(std::set::const_iterator it2 = overlaps.begin(); - it2 != overlaps.end(); - it2++) { - int j = *it2; - contrib_counts[j].fetch_add(1); - uop->add_sparsity_output(targets[j], preimages[j]); + + template + PreimageOperation::~PreimageOperation(void) { + if (overlap_tester) + delete overlap_tester; } - uop->dispatch(this, false /* do not run in this thread */); - } - // if these were the last sparse images, we can now set the contributor counts - int v = remaining_sparse_images.fetch_sub(1) - 1; - if(v == 0) { - for(size_t j = 0; j < preimages.size(); j++) { - log_part.info() << contrib_counts[j].load() << " total contributors to preimage " << j; - SparsityMapImpl::lookup(preimages[j])->set_contributor_count(contrib_counts[j].load()); + template + IndexSpace PreimageOperation::add_target(const IndexSpace &target) { + // try to filter out obviously empty targets + if (parent.empty() || target.empty()) + return IndexSpace::make_empty(); + + // otherwise it'll be something smaller than the current parent + IndexSpace preimage; + preimage.bounds = parent.bounds; + + // if the target has a sparsity map, use the same node - otherwise + // get a sparsity ID by round-robin'ing across the nodes that have field data + int target_node; + if 
(!target.dense()) + target_node = ID(target.sparsity).sparsity_creator_node(); + else if (!domain_transform.ptr_data.empty()) + target_node = + ID(domain_transform + .ptr_data[targets.size() % domain_transform.ptr_data.size()] + .inst) + .instance_owner_node(); + else + target_node = + ID(domain_transform + .range_data[targets.size() % domain_transform.range_data.size()] + .inst) + .instance_owner_node(); + SparsityMap sparsity = get_runtime()->get_available_sparsity_impl(target_node)->me.convert >(); + preimage.sparsity = sparsity; + + targets.push_back(target); + preimages.push_back(sparsity); + + return preimage; } - dummy_overlap_uop->mark_finished(true /*successful*/); - } - } - } - template - void PreimageOperation::set_overlap_tester(void *tester) - { - // atomically set the overlap tester and see if there are any pending entries - std::map > > pending; - { - AutoLock<> al(mutex); - assert(overlap_tester == 0); - overlap_tester = static_cast *>(tester); - pending.swap(pending_sparse_images); - } + template + void PreimageOperation::execute(void) { + std::vector,Point> > gpu_ptr_data; + std::vector,Point> > cpu_ptr_data; + std::vector,Rect> > gpu_rect_data; + std::vector,Rect> > cpu_rect_data; + for (size_t i = 0; i < domain_transform.ptr_data.size(); i++) { + if (domain_transform.ptr_data[i].inst.get_location().kind() == + Memory::GPU_FB_MEM || domain_transform.ptr_data[i].inst.get_location().kind() == Memory::Z_COPY_MEM) { + gpu_ptr_data.push_back(domain_transform.ptr_data[i]); + } else { + cpu_ptr_data.push_back(domain_transform.ptr_data[i]); + } + } + for (size_t i = 0; i < domain_transform.range_data.size(); i++) { + if (domain_transform.range_data[i].inst.get_location().kind() == + Memory::GPU_FB_MEM || domain_transform.range_data[i].inst.get_location().kind() == Memory::Z_COPY_MEM) { + gpu_rect_data.push_back(domain_transform.range_data[i]); + } else { + cpu_rect_data.push_back(domain_transform.range_data[i]); + } + } + bool gpu_data = 
!gpu_ptr_data.empty() || !gpu_rect_data.empty(); + bool opcount = cpu_ptr_data.size() + cpu_rect_data.size() + gpu_ptr_data.size() + gpu_rect_data.size(); + bool exclusive = (gpu_data && (opcount == 1)); + if (domain_transform.type == + DomainTransform::DomainTransformType::STRUCTURED && !gpu_data) { + for (size_t i = 0; i < preimages.size(); i++) { + SparsityMapImpl::lookup(preimages[i])->set_contributor_count(1); + } + + StructuredPreimageMicroOp *micro_op = + new StructuredPreimageMicroOp( + domain_transform.structured_transform, parent); + + for (size_t j = 0; j < targets.size(); j++) { + micro_op->add_sparsity_output(targets[j], preimages[j]); + } + micro_op->dispatch(this, true); + } else if (!DeppartConfig::cfg_disable_intersection_optimization && !gpu_data) { + // build the overlap tester based on the targets, since they're at least + // known + ComputeOverlapMicroOp *uop = + new ComputeOverlapMicroOp(this); + + remaining_sparse_images.store(domain_transform.ptr_data.size() + + domain_transform.range_data.size()); + contrib_counts.resize(preimages.size(), atomic(0)); + + // create a dummy async microop that lives until we've received all the + // sparse images + dummy_overlap_uop = new AsyncMicroOp(this, 0); + add_async_work_item(dummy_overlap_uop); + + // add each target, but also generate a bounding box for all of them + Rect target_bbox; + for (size_t i = 0; i < targets.size(); i++) { + uop->add_input_space(targets[i]); + if (i == 0) + target_bbox = targets[i].bounds; + else + target_bbox = target_bbox.union_bbox(targets[i].bounds); + } + + for (size_t i = 0; i < domain_transform.ptr_data.size(); i++) { + // in parallel, we will request the approximate images of each instance's + // data (ideally limited to the target_bbox) + ImageMicroOp *img = new ImageMicroOp( + target_bbox, domain_transform.ptr_data[i].index_space, + domain_transform.ptr_data[i].inst, + domain_transform.ptr_data[i].field_offset, false /*ptrs*/); + img->add_approx_output(i, this); + 
img->dispatch(this, false /* do not run in this thread */); + } + + for (size_t i = 0; i < domain_transform.range_data.size(); i++) { + // in parallel, we will request the approximate images of each instance's + // data (ideally limited to the target_bbox) + ImageMicroOp *img = new ImageMicroOp( + target_bbox, domain_transform.range_data[i].index_space, + domain_transform.range_data[i].inst, + domain_transform.range_data[i].field_offset, true /*ranges*/); + img->add_approx_output(i + domain_transform.ptr_data.size(), this); + img->dispatch(this, false /* do not run in this thread */); + } + + uop->dispatch(this, true /* ok to run in this thread */); + } else { + if (!exclusive) { + for (size_t i = 0; i < preimages.size(); i++) + SparsityMapImpl::lookup(preimages[i]) + ->set_contributor_count(opcount); + } + for (size_t i = 0; i < cpu_ptr_data.size(); i++) { + PreimageMicroOp *uop = new PreimageMicroOp( + parent, cpu_ptr_data[i].index_space, + cpu_ptr_data[i].inst, + cpu_ptr_data[i].field_offset, false /*ptrs*/); + for (size_t j = 0; j < targets.size(); j++) + uop->add_sparsity_output(targets[j], preimages[j]); + uop->dispatch(this, true /* ok to run in this thread */); + } + for (size_t i = 0; i < cpu_rect_data.size(); i++) { + PreimageMicroOp *uop = new PreimageMicroOp( + parent, cpu_rect_data[i].index_space, + cpu_rect_data[i].inst, + cpu_rect_data[i].field_offset, true /*ranges*/); + for (size_t j = 0; j < targets.size(); j++) + uop->add_sparsity_output(targets[j], preimages[j]); + uop->dispatch(this, true /* ok to run in this thread */); + } + for (auto ptr_fdd : gpu_ptr_data) { + domain_transform.ptr_data = {ptr_fdd}; + GPUPreimageMicroOp *micro_op = + new GPUPreimageMicroOp( + domain_transform, parent, exclusive); + for (size_t j = 0; j < targets.size(); j++) { + micro_op->add_sparsity_output(targets[j], preimages[j]); + } + micro_op->dispatch(this, true); + } + for (auto range_fdd : gpu_rect_data) { + domain_transform.range_data = {range_fdd}; + 
GPUPreimageMicroOp *micro_op = + new GPUPreimageMicroOp( + domain_transform, parent, exclusive); + for (size_t j = 0; j < targets.size(); j++) { + micro_op->add_sparsity_output(targets[j], preimages[j]); + } + micro_op->dispatch(this, true); + } + } + } - // now issue work for any sparse images we got before the tester was ready - if(!pending.empty()) { - for(typename std::map > >::const_iterator it = pending.begin(); - it != pending.end(); - it++) { - // see which instance this is an image from - size_t idx = it->first; - // see which of the targets that image overlaps - std::set overlaps; - overlap_tester->test_overlap(&it->second[0], it->second.size(), overlaps); - if(idx < domain_transform.ptr_data.size()) { - log_part.info() << "image of ptr_data[" << idx << "] overlaps " << overlaps.size() << " targets"; - PreimageMicroOp *uop = - new PreimageMicroOp( - parent, domain_transform.ptr_data[idx].index_space, - domain_transform.ptr_data[idx].inst, - domain_transform.ptr_data[idx].field_offset, false /*ptrs*/); - for(std::set::const_iterator it2 = overlaps.begin(); - it2 != overlaps.end(); - it2++) { - int j = *it2; - contrib_counts[j].fetch_add(1); - uop->add_sparsity_output(targets[j], preimages[j]); - } - uop->dispatch(this, true /* ok to run in this thread */); - } else { - size_t rel_index = idx - domain_transform.ptr_data.size(); - assert(rel_index < domain_transform.range_data.size()); - log_part.info() << "image of range_data[" << rel_index << "] overlaps " << overlaps.size() << " targets"; - PreimageMicroOp *uop = - new PreimageMicroOp( - parent, domain_transform.range_data[rel_index].index_space, - domain_transform.range_data[rel_index].inst, - domain_transform.range_data[rel_index].field_offset, - true /*ranges*/); - for(std::set::const_iterator it2 = overlaps.begin(); - it2 != overlaps.end(); - it2++) { - int j = *it2; - contrib_counts[j].fetch_add(1); - uop->add_sparsity_output(targets[j], preimages[j]); - } - uop->dispatch(this, true /* ok to run in 
this thread */); + template + void PreimageOperation::provide_sparse_image(int index, const Rect *rects, size_t count) { + // atomically check the overlap tester's readiness and queue us if not + bool tester_ready = false; + { + AutoLock<> al(mutex); + if (overlap_tester != 0) { + tester_ready = true; + } else { + std::vector > &r = pending_sparse_images[index]; + r.insert(r.end(), rects, rects + count); + } + } + + if (tester_ready) { + // see which of the targets this image overlaps + std::set overlaps; + overlap_tester->test_overlap(rects, count, overlaps); + if ((size_t) index < domain_transform.ptr_data.size()) { + log_part.info() << "image of ptr_data[" << index << "] overlaps " << overlaps.size() << " targets"; + PreimageMicroOp *uop = new PreimageMicroOp( + parent, domain_transform.ptr_data[index].index_space, + domain_transform.ptr_data[index].inst, + domain_transform.ptr_data[index].field_offset, false /*ptrs*/); + for (std::set::const_iterator it2 = overlaps.begin(); + it2 != overlaps.end(); + it2++) { + int j = *it2; + contrib_counts[j].fetch_add(1); + uop->add_sparsity_output(targets[j], preimages[j]); + } + uop->dispatch(this, false /* do not run in this thread */); + } else { + size_t rel_index = index - domain_transform.ptr_data.size(); + assert(rel_index < domain_transform.range_data.size()); + log_part.info() << "image of range_data[" << rel_index << "] overlaps " << overlaps.size() << + " targets"; + PreimageMicroOp *uop = new PreimageMicroOp( + parent, domain_transform.range_data[rel_index].index_space, + domain_transform.range_data[rel_index].inst, + domain_transform.range_data[rel_index].field_offset, + true /*ranges*/); + for (std::set::const_iterator it2 = overlaps.begin(); + it2 != overlaps.end(); + it2++) { + int j = *it2; + contrib_counts[j].fetch_add(1); + uop->add_sparsity_output(targets[j], preimages[j]); + } + uop->dispatch(this, false /* do not run in this thread */); + } + + // if these were the last sparse images, we can now set 
the contributor counts + int v = remaining_sparse_images.fetch_sub(1) - 1; + if (v == 0) { + for (size_t j = 0; j < preimages.size(); j++) { + log_part.info() << contrib_counts[j].load() << " total contributors to preimage " << j; + SparsityMapImpl::lookup(preimages[j])->set_contributor_count(contrib_counts[j].load()); + } + dummy_overlap_uop->mark_finished(true /*successful*/); + } + } } - } - // if these were the last sparse images, we can now set the contributor counts - int v = remaining_sparse_images.fetch_sub(pending.size()) - pending.size(); - if(v == 0) { - for(size_t j = 0; j < preimages.size(); j++) { - log_part.info() << contrib_counts[j].load() << " total contributors to preimage " << j; - SparsityMapImpl::lookup(preimages[j])->set_contributor_count(contrib_counts[j].load()); + template + void PreimageOperation::set_overlap_tester(void *tester) { + // atomically set the overlap tester and see if there are any pending entries + std::map > > pending; + { + AutoLock<> al(mutex); + assert(overlap_tester == 0); + overlap_tester = static_cast *>(tester); + pending.swap(pending_sparse_images); + } + + // now issue work for any sparse images we got before the tester was ready + if (!pending.empty()) { + for (typename std::map > >::const_iterator it = pending.begin(); + it != pending.end(); + it++) { + // see which instance this is an image from + size_t idx = it->first; + // see which of the targets that image overlaps + std::set overlaps; + overlap_tester->test_overlap(&it->second[0], it->second.size(), overlaps); + if (idx < domain_transform.ptr_data.size()) { + log_part.info() << "image of ptr_data[" << idx << "] overlaps " << overlaps.size() << " targets"; + PreimageMicroOp *uop = + new PreimageMicroOp( + parent, domain_transform.ptr_data[idx].index_space, + domain_transform.ptr_data[idx].inst, + domain_transform.ptr_data[idx].field_offset, false /*ptrs*/); + for (std::set::const_iterator it2 = overlaps.begin(); + it2 != overlaps.end(); + it2++) { + int j = 
*it2; + contrib_counts[j].fetch_add(1); + uop->add_sparsity_output(targets[j], preimages[j]); + } + uop->dispatch(this, true /* ok to run in this thread */); + } else { + size_t rel_index = idx - domain_transform.ptr_data.size(); + assert(rel_index < domain_transform.range_data.size()); + log_part.info() << "image of range_data[" << rel_index << "] overlaps " << overlaps.size() << + " targets"; + PreimageMicroOp *uop = + new PreimageMicroOp( + parent, domain_transform.range_data[rel_index].index_space, + domain_transform.range_data[rel_index].inst, + domain_transform.range_data[rel_index].field_offset, + true /*ranges*/); + for (std::set::const_iterator it2 = overlaps.begin(); + it2 != overlaps.end(); + it2++) { + int j = *it2; + contrib_counts[j].fetch_add(1); + uop->add_sparsity_output(targets[j], preimages[j]); + } + uop->dispatch(this, true /* ok to run in this thread */); + } + } + + // if these were the last sparse images, we can now set the contributor counts + int v = remaining_sparse_images.fetch_sub(pending.size()) - pending.size(); + if (v == 0) { + for (size_t j = 0; j < preimages.size(); j++) { + log_part.info() << contrib_counts[j].load() << " total contributors to preimage " << j; + SparsityMapImpl::lookup(preimages[j])->set_contributor_count(contrib_counts[j].load()); + } + dummy_overlap_uop->mark_finished(true /*successful*/); + } + } } - dummy_overlap_uop->mark_finished(true /*successful*/); - } - } - } - template - void PreimageOperation::print(std::ostream& os) const - { - os << "PreimageOperation(" << parent << ")"; - } + template + void PreimageOperation::print(std::ostream &os) const { + os << "PreimageOperation(" << parent << ")"; + } - template - ActiveMessageHandlerReg > > PreimageOperation::areg; + template + ActiveMessageHandlerReg > > PreimageOperation::areg; - //////////////////////////////////////////////////////////////////////// - // - // class ApproxImageResponseMessage + 
//////////////////////////////////////////////////////////////////////// + // + // class ApproxImageResponseMessage - template - /*static*/ void ApproxImageResponseMessage::handle_message(NodeID sender, - const ApproxImageResponseMessage &msg, - const void *data, size_t datalen) - { - T *op = reinterpret_cast(msg.approx_output_op); - op->provide_sparse_image(msg.approx_output_index, - static_cast *>(data), - datalen / sizeof(Rect)); - } + template + /*static*/ void ApproxImageResponseMessage::handle_message(NodeID sender, + const ApproxImageResponseMessage &msg, + const void *data, size_t datalen) { + T *op = reinterpret_cast(msg.approx_output_op); + op->provide_sparse_image(msg.approx_output_index, + static_cast *>(data), + datalen / sizeof(Rect)); + } - //////////////////////////////////////////////////////////////////////// - // - // class StructuredPreimageMicroOp + //////////////////////////////////////////////////////////////////////// + // + // class StructuredPreimageMicroOp - template - StructuredPreimageMicroOp::StructuredPreimageMicroOp( - const StructuredTransform &_transform, - IndexSpace _parent_space) - : transform(_transform), parent_space(_parent_space) {} + template + StructuredPreimageMicroOp::StructuredPreimageMicroOp( + const StructuredTransform &_transform, + IndexSpace _parent_space) + : transform(_transform), parent_space(_parent_space) { + } - template - StructuredPreimageMicroOp::~StructuredPreimageMicroOp(void) {} + template + StructuredPreimageMicroOp::~StructuredPreimageMicroOp(void) { + } - template - void StructuredPreimageMicroOp::add_sparsity_output( - IndexSpace _target, SparsityMap _sparsity) { - targets.push_back(_target); - sparsity_outputs.push_back(_sparsity); - } + template + void StructuredPreimageMicroOp::add_sparsity_output( + IndexSpace _target, SparsityMap _sparsity) { + targets.push_back(_target); + sparsity_outputs.push_back(_sparsity); + } - template - template - void StructuredPreimageMicroOp::populate_bitmasks( - 
std::map &bitmasks) { - Rect target_bbox = targets[0].bounds; - for (size_t i = 1; i < targets.size(); i++) { - target_bbox = target_bbox.union_bbox(targets[i].bounds); - } - for (IndexSpaceIterator it2(parent_space); it2.valid; it2.step()) { - Rect parent_bbox; - parent_bbox.lo = transform[it2.rect.lo]; - parent_bbox.hi = transform[it2.rect.hi]; - - if (target_bbox.intersection(parent_bbox).empty()) continue; - - for (PointInRectIterator pir(it2.rect); pir.valid; pir.step()) { - Point target_point = transform[pir.p]; - for (size_t i = 0; i < targets.size(); i++) { - if (targets[i].contains(target_point)) { - BM *&bmp = bitmasks[i]; - if (!bmp) bmp = new BM; - bmp->add_point(pir.p); - } - } - } - } - } + template + template + void StructuredPreimageMicroOp::populate_bitmasks( + std::map &bitmasks) { + Rect target_bbox = targets[0].bounds; + for (size_t i = 1; i < targets.size(); i++) { + target_bbox = target_bbox.union_bbox(targets[i].bounds); + } + for (IndexSpaceIterator it2(parent_space); it2.valid; it2.step()) { + Rect parent_bbox; + parent_bbox.lo = transform[it2.rect.lo]; + parent_bbox.hi = transform[it2.rect.hi]; + + if (target_bbox.intersection(parent_bbox).empty()) continue; + + for (PointInRectIterator pir(it2.rect); pir.valid; pir.step()) { + Point target_point = transform[pir.p]; + for (size_t i = 0; i < targets.size(); i++) { + if (targets[i].contains(target_point)) { + BM *&bmp = bitmasks[i]; + if (!bmp) bmp = new BM; + bmp->add_point(pir.p); + } + } + } + } + } - template - void StructuredPreimageMicroOp::execute(void) - { - TimeStamp ts("PreimageMicroOp::execute", true, &log_uop_timing); - std::map *> rect_map; + template + void StructuredPreimageMicroOp::execute(void) { + TimeStamp ts("PreimageMicroOp::execute", true, &log_uop_timing); + std::map *> rect_map; - populate_bitmasks(rect_map); + populate_bitmasks(rect_map); #ifdef DEBUG_PARTITIONING - std::cout << rect_map.size() << " non-empty preimages present in instance " - << inst << std::endl; - 
for (typename std::map *>::const_iterator it = - rect_map.begin(); - it != rect_map.end(); it++) - std::cout << " " << targets[it->first] << " = " - << it->second->rects.size() << " rectangles" << std::endl; + std::cout << rect_map.size() << " non-empty preimages present in instance " + << inst << std::endl; + for (typename std::map *>::const_iterator it = + rect_map.begin(); + it != rect_map.end(); it++) + std::cout << " " << targets[it->first] << " = " + << it->second->rects.size() << " rectangles" << std::endl; #endif - // iterate over sparsity outputs and contribute to all (even if we - // didn't have any points found for it) - int empty_count = 0; - for (size_t i = 0; i < sparsity_outputs.size(); i++) { - SparsityMapImpl *impl = - SparsityMapImpl::lookup(sparsity_outputs[i]); - typename std::map *>::const_iterator it2 = - rect_map.find(i); - if (it2 != rect_map.end()) { - impl->contribute_dense_rect_list(it2->second->rects, true /*disjoint*/); - delete it2->second; - } else { - impl->contribute_nothing(); - empty_count++; - } - } + // iterate over sparsity outputs and contribute to all (even if we + // didn't have any points found for it) + int empty_count = 0; + for (size_t i = 0; i < sparsity_outputs.size(); i++) { + SparsityMapImpl *impl = + SparsityMapImpl::lookup(sparsity_outputs[i]); + typename std::map *>::const_iterator it2 = + rect_map.find(i); + if (it2 != rect_map.end()) { + impl->contribute_dense_rect_list(it2->second->rects, true /*disjoint*/); + delete it2->second; + } else { + impl->contribute_nothing(); + empty_count++; + } + } + + if (empty_count > 0) { + log_part.info() << empty_count << " empty preimages (out of " + << sparsity_outputs.size() << ")"; + } + } - if (empty_count > 0) { - log_part.info() << empty_count << " empty preimages (out of " - << sparsity_outputs.size() << ")"; - } - } + template + void StructuredPreimageMicroOp::dispatch( + PartitioningOperation *op, bool inline_ok) { + // need valid data for each target + for (size_t i 
= 0; i < targets.size(); i++) { + if (!targets[i].dense()) { + // it's safe to add the count after the registration only because we + // initialized the count to 2 instead of 1 + bool registered = SparsityMapImpl::lookup(targets[i].sparsity) + ->add_waiter(this, true /*precise*/); + if (registered) wait_count.fetch_add(1); + } + } + + // need valid data for the parent space too + if (!parent_space.dense()) { + // it's safe to add the count after the registration only because we + // initialized the count to 2 instead of 1 + bool registered = SparsityMapImpl::lookup(parent_space.sparsity) + ->add_waiter(this, true /*precise*/); + if (registered) wait_count.fetch_add(1); + } + + this->finish_dispatch(op, inline_ok); + } - template - void StructuredPreimageMicroOp::dispatch( - PartitioningOperation *op, bool inline_ok) { - // need valid data for each target - for (size_t i = 0; i < targets.size(); i++) { - if (!targets[i].dense()) { - // it's safe to add the count after the registration only because we - // initialized the count to 2 instead of 1 - bool registered = SparsityMapImpl::lookup(targets[i].sparsity) - ->add_waiter(this, true /*precise*/); - if (registered) wait_count.fetch_add(1); - } - } + //////////////////////////////////////////////////////////////////////// + // + // class GPUPreimageMicroOp - // need valid data for the parent space too - if (!parent_space.dense()) { - // it's safe to add the count after the registration only because we - // initialized the count to 2 instead of 1 - bool registered = SparsityMapImpl::lookup(parent_space.sparsity) - ->add_waiter(this, true /*precise*/); - if (registered) wait_count.fetch_add(1); - } + template + GPUPreimageMicroOp::GPUPreimageMicroOp( + const DomainTransform &_domain_transform, + IndexSpace _parent_space, bool _exclusive) + : domain_transform(_domain_transform), parent_space(_parent_space) { + this->exclusive = _exclusive; + } - finish_dispatch(op, inline_ok); - } + template + 
GPUPreimageMicroOp::~GPUPreimageMicroOp(void) { + } + + template + void GPUPreimageMicroOp::add_sparsity_output( + IndexSpace _target, SparsityMap _sparsity) { + targets.push_back(_target); + sparsity_outputs.push_back(_sparsity); + } + + template + void GPUPreimageMicroOp::execute(void) { + TimeStamp ts("GPUPreimageMicroOp::execute", true, &log_uop_timing); + if (domain_transform.ptr_data.size() > 0) { + gpu_populate_bitmasks(); + } else if (domain_transform.range_data.size() > 0) { + gpu_populate_ranges(); + } + } - // instantiations of templates handled in preimage_tmpl.cc + template + void GPUPreimageMicroOp::dispatch( + PartitioningOperation *op, bool inline_ok) { + // need valid data for each target + for (size_t i = 0; i < targets.size(); i++) { + if (!targets[i].dense()) { + // it's safe to add the count after the registration only because we + // initialized the count to 2 instead of 1 + bool registered = SparsityMapImpl::lookup(targets[i].sparsity) + ->add_waiter(this, true /*precise*/); + if (registered) this->wait_count.fetch_add(1); + } + } + + // need valid data for the parent space too + if (!parent_space.dense()) { + // it's safe to add the count after the registration only because we + // initialized the count to 2 instead of 1 + bool registered = SparsityMapImpl::lookup(parent_space.sparsity) + ->add_waiter(this, true /*precise*/); + if (registered) this->wait_count.fetch_add(1); + } + + this->finish_dispatch(op, inline_ok); + } -}; // namespace Realm + // instantiations of templates handled in preimage_tmpl.cc +}; // namespace Realm \ No newline at end of file diff --git a/src/realm/deppart/preimage.h b/src/realm/deppart/preimage.h index c08c0dfd30..1a67c12aee 100644 --- a/src/realm/deppart/preimage.h +++ b/src/realm/deppart/preimage.h @@ -20,7 +20,8 @@ #ifndef REALM_DEPPART_PREIMAGE_H #define REALM_DEPPART_PREIMAGE_H -#include "realm/deppart/partitions.h" +#include "partitions.h" +#include "realm/deppart/rectlist.h" namespace Realm { @@ -152,6 
+153,36 @@ namespace Realm { std::vector > sparsity_outputs; }; + template + class GPUPreimageMicroOp : public GPUMicroOp { + public: + static const int DIM = N; + typedef T IDXTYPE; + static const int DIM2 = N2; + typedef T2 IDXTYPE2; + + GPUPreimageMicroOp(const DomainTransform &_domain_transform, + IndexSpace _parent_space, bool _exclusive); + + virtual ~GPUPreimageMicroOp(void); + + void add_sparsity_output(IndexSpace _target, SparsityMap _sparsity); + + virtual void execute(void); + + void dispatch(PartitioningOperation *op, bool inline_ok); + + protected: + + void gpu_populate_ranges(); + void gpu_populate_bitmasks(); + + DomainTransform domain_transform; + IndexSpace parent_space; + std::vector > targets; + std::vector > sparsity_outputs; + }; + }; // namespace Realm -#endif // REALM_DEPPART_PREIMAGE_H +#endif // REALM_DEPPART_PREIMAGE_H \ No newline at end of file diff --git a/src/realm/deppart/preimage_gpu_impl.hpp b/src/realm/deppart/preimage_gpu_impl.hpp new file mode 100644 index 0000000000..3793b32458 --- /dev/null +++ b/src/realm/deppart/preimage_gpu_impl.hpp @@ -0,0 +1,468 @@ +#pragma once +#include "realm/deppart/preimage.h" +#include "realm/deppart/preimage_gpu_kernels.hpp" +#include "realm/deppart/byfield_gpu_kernels.hpp" +#include "realm/deppart/partitions_gpu_impl.hpp" +#include +#include +#include "realm/nvtx.h" + +namespace Realm { + + template + void GPUPreimageMicroOp::gpu_populate_ranges() { + if (targets.size() == 0) { + return; + } + + Memory my_mem = domain_transform.range_data[0].inst.get_location(); + + const char* val = std::getenv("TILE_SIZE"); // or any env var + size_t tile_size = 100000000; //default + if (val) { + tile_size = atoi(val); + } + + RegionInstance fixed_buffer = this->realm_malloc(tile_size, my_mem); + Arena buffer_arena(reinterpret_cast(AffineAccessor(fixed_buffer, 0).base), tile_size); + + NVTX_DEPPART(gpu_preimage); + + cudaStream_t stream = Cuda::get_task_cuda_stream(); + + collapsed_space inst_space; + + // We 
combine all of our instances into one to batch work, tracking the offsets between instances. + RegionInstance inst_offsets_instance = this->realm_malloc((domain_transform.range_data.size() + 1) * sizeof(size_t), my_mem); + inst_space.offsets = reinterpret_cast(AffineAccessor(inst_offsets_instance, 0).base); + inst_space.num_children = domain_transform.range_data.size(); + + RegionInstance inst_entries_instance; + + GPUMicroOp::collapse_multi_space(domain_transform.range_data, inst_space, buffer_arena, stream); + + RegionInstance parent_entries_instance; + collapsed_space collapsed_parent; + + // We collapse the parent space to undifferentiate between dense and sparse and match downstream APIs. + GPUMicroOp::collapse_parent_space(parent_space, collapsed_parent, buffer_arena, stream); + + + // This is used for count + emit: first pass counts how many rectangles survive intersection, second pass uses the counter + // to figure out where to write each rectangle. + RegionInstance inst_counters_instance = this->realm_malloc((2*domain_transform.range_data.size() + 1) * sizeof(uint32_t), my_mem); + uint32_t* d_inst_counters = reinterpret_cast(AffineAccessor(inst_counters_instance, 0).base); + + // This will be a prefix sum over the counters, used first to figure out where to write in the emit phase, and second + // to track which instance each rectangle came from in the populate phase. + uint32_t* d_inst_prefix = d_inst_counters + domain_transform.range_data.size(); + RegionInstance out_instance; + size_t num_valid_rects; + + Rect* d_valid_rects; + + // Here we intersect the instance spaces with the parent space, and make sure we know which instance each resulting rectangle came from. 
+ GPUMicroOp::template construct_input_rectlist>(inst_space, collapsed_parent, d_valid_rects, num_valid_rects, d_inst_counters, d_inst_prefix, buffer_arena, stream); + inst_entries_instance.destroy(); + parent_entries_instance.destroy(); + inst_offsets_instance.destroy(); + + if (num_valid_rects == 0) { + for (auto it : sparsity_outputs) { + SparsityMapImpl *impl = SparsityMapImpl::lookup(it); + if (this->exclusive) { + impl->gpu_finalize(); + } else { + impl->contribute_nothing(); + } + } + out_instance.destroy(); + inst_counters_instance.destroy(); + return; + } + + // Prefix sum the valid rectangles by volume. + RegionInstance prefix_rects_instance; + size_t total_pts; + + size_t* d_prefix_rects; + GPUMicroOp::volume_prefix_sum(d_valid_rects, num_valid_rects, d_prefix_rects, total_pts, buffer_arena, stream); + + nvtx_range_push("cuda", "build target entries"); + + collapsed_space target_space; + RegionInstance offsets_instance = this->realm_malloc((targets.size()+1) * sizeof(size_t), my_mem); + target_space.offsets = reinterpret_cast(AffineAccessor(offsets_instance, 0).base); + target_space.num_children = targets.size(); + + RegionInstance targets_entries_instance; + + GPUMicroOp::collapse_multi_space(targets, target_space, buffer_arena, stream); + + Memory zcpy_mem; + assert(find_memory(zcpy_mem, Memory::Z_COPY_MEM)); + RegionInstance accessors_instance = this->realm_malloc(domain_transform.range_data.size() * sizeof(AffineAccessor,N,T>), zcpy_mem); + AffineAccessor,N,T>* d_accessors = reinterpret_cast,N,T>*>(AffineAccessor(accessors_instance, 0).base); + for (size_t i = 0; i < domain_transform.range_data.size(); ++i) { + d_accessors[i] = AffineAccessor,N,T>(domain_transform.range_data[i].inst, domain_transform.range_data[i].field_offset); + } + + RegionInstance points_instance; + PointDesc* d_points; + size_t num_valid_points; + + RegionInstance target_counters_instance = this->realm_malloc((2*targets.size()+1) * sizeof(uint32_t), my_mem); + uint32_t* 
d_target_counters = reinterpret_cast(AffineAccessor(target_counters_instance, 0).base); + uint32_t* d_targets_prefix = d_target_counters + targets.size(); + CUDA_CHECK(cudaMemsetAsync(d_target_counters, 0, targets.size() * sizeof(uint32_t), stream), stream); + + if (target_space.num_entries > targets.size()) { + BVH preimage_bvh; + RegionInstance bvh_instance; + GPUMicroOp::build_bvh(target_space, preimage_bvh, buffer_arena, stream); + + preimage_gpuPopulateBitmasksPtrsKernel < N, T, N2, T2 ><<>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, preimage_bvh.root, preimage_bvh.childLeft, preimage_bvh.childRight, preimage_bvh.indices, + preimage_bvh.labels, preimage_bvh.boxes, total_pts, num_valid_rects, domain_transform.range_data.size(), preimage_bvh.num_leaves, nullptr, d_target_counters, nullptr); + KERNEL_CHECK(stream); + + std::vector h_target_counters(targets.size()+1); + h_target_counters[0] = 0; // prefix sum starts at 0 + CUDA_CHECK(cudaMemcpyAsync(h_target_counters.data()+1, d_target_counters, targets.size() * sizeof(uint32_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + for (size_t i = 0; i < targets.size(); ++i) { + h_target_counters[i+1] += h_target_counters[i]; + } + + num_valid_points = h_target_counters[targets.size()]; + + if (num_valid_points == 0) { + for (auto it : sparsity_outputs) { + SparsityMapImpl *impl = SparsityMapImpl::lookup(it); + if (this->exclusive) { + impl->gpu_finalize(); + } else { + impl->contribute_nothing(); + } + } + target_counters_instance.destroy(); + accessors_instance.destroy(); + targets_entries_instance.destroy(); + offsets_instance.destroy(); + prefix_rects_instance.destroy(); + out_instance.destroy(); + inst_counters_instance.destroy(); + bvh_instance.destroy(); + return; + } + + CUDA_CHECK(cudaMemcpyAsync(d_targets_prefix, h_target_counters.data(), (targets.size() + 1) * sizeof(uint32_t), cudaMemcpyHostToDevice, stream), stream); + + points_instance = 
this->realm_malloc(num_valid_points * sizeof(PointDesc), my_mem); + d_points = reinterpret_cast*>(AffineAccessor(points_instance, 0).base); + + CUDA_CHECK(cudaMemsetAsync(d_target_counters, 0, (targets.size()) * sizeof(uint32_t), stream), stream); + + preimage_gpuPopulateBitmasksPtrsKernel < N, T, N2, T2 ><<>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, preimage_bvh.root, preimage_bvh.childLeft, preimage_bvh.childRight, preimage_bvh.indices, + preimage_bvh.labels, preimage_bvh.boxes, total_pts, num_valid_rects, domain_transform.range_data.size(), preimage_bvh.num_leaves, d_targets_prefix, d_target_counters, d_points); + KERNEL_CHECK(stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + bvh_instance.destroy(); + } else { + preimage_dense_populate_bitmasks_kernel < N, T, N2, T2 ><<< COMPUTE_GRID(total_pts), THREADS_PER_BLOCK, 0, stream>>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, target_space.entries_buffer, target_space.offsets, total_pts, + num_valid_rects, domain_transform.range_data.size(), targets.size(), nullptr, d_target_counters, nullptr); + KERNEL_CHECK(stream); + + std::vector h_target_counters(targets.size()+1); + h_target_counters[0] = 0; // prefix sum starts at 0 + CUDA_CHECK(cudaMemcpyAsync(h_target_counters.data()+1, d_target_counters, targets.size() * sizeof(uint32_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + for (size_t i = 0; i < targets.size(); ++i) { + h_target_counters[i+1] += h_target_counters[i]; + } + + num_valid_points = h_target_counters[targets.size()]; + + if (num_valid_points == 0) { + for (auto it : sparsity_outputs) { + SparsityMapImpl *impl = SparsityMapImpl::lookup(it); + if (this->exclusive) { + impl->gpu_finalize(); + } else { + impl->contribute_nothing(); + } + } + target_counters_instance.destroy(); + accessors_instance.destroy(); + targets_entries_instance.destroy(); + offsets_instance.destroy(); + prefix_rects_instance.destroy(); + 
out_instance.destroy(); + inst_counters_instance.destroy(); + return; + } + + CUDA_CHECK(cudaMemcpyAsync(d_targets_prefix, h_target_counters.data(), (targets.size() + 1) * sizeof(uint32_t), cudaMemcpyHostToDevice, stream), stream); + + points_instance = this->realm_malloc(num_valid_points * sizeof(PointDesc), my_mem); + d_points = reinterpret_cast*>(AffineAccessor(points_instance, 0).base); + + CUDA_CHECK(cudaMemsetAsync(d_target_counters, 0, (targets.size()) * sizeof(uint32_t), stream), stream); + + preimage_dense_populate_bitmasks_kernel < N, T, N2, T2 ><<< COMPUTE_GRID(total_pts), THREADS_PER_BLOCK, 0, stream>>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, target_space.entries_buffer, target_space.offsets, total_pts, + num_valid_rects, domain_transform.range_data.size(), targets.size(), d_targets_prefix, d_target_counters, d_points); + KERNEL_CHECK(stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + } + + target_counters_instance.destroy(); + accessors_instance.destroy(); + targets_entries_instance.destroy(); + offsets_instance.destroy(); + prefix_rects_instance.destroy(); + out_instance.destroy(); + inst_counters_instance.destroy(); + + size_t out_rects = 0; + RectDesc* trash; + this->complete_pipeline(d_points, num_valid_points, trash, out_rects, buffer_arena, + /* the Container: */ sparsity_outputs, + /* getIndex: */ [&](auto const& elem){ + // elem is a SparsityMap from the vector + return size_t(&elem - sparsity_outputs.data()); + }, + /* getMap: */ [&](auto const& elem){ + // return the SparsityMap key itself + return elem; + }); + + points_instance.destroy(); + } + + template + void GPUPreimageMicroOp::gpu_populate_bitmasks() { + if (targets.size() == 0) { + return; + } + + Memory my_mem = domain_transform.ptr_data[0].inst.get_location(); + + const char* val = std::getenv("TILE_SIZE"); // or any env var + size_t tile_size = 100000000; //default + if (val) { + tile_size = atoi(val); + } + + RegionInstance fixed_buffer = 
this->realm_malloc(tile_size, my_mem); + Arena buffer_arena(reinterpret_cast(AffineAccessor(fixed_buffer, 0).base), tile_size); + + NVTX_DEPPART(gpu_preimage); + + cudaStream_t stream = Cuda::get_task_cuda_stream(); + + collapsed_space inst_space; + + // We combine all of our instances into one to batch work, tracking the offsets between instances. + RegionInstance inst_offsets_instance = this->realm_malloc((domain_transform.ptr_data.size() + 1) * sizeof(size_t), my_mem); + inst_space.offsets = reinterpret_cast(AffineAccessor(inst_offsets_instance, 0).base); + inst_space.num_children = domain_transform.ptr_data.size(); + + RegionInstance inst_entries_instance; + + GPUMicroOp::collapse_multi_space(domain_transform.ptr_data, inst_space, buffer_arena, stream); + + RegionInstance parent_entries_instance; + collapsed_space collapsed_parent; + + // We collapse the parent space to undifferentiate between dense and sparse and match downstream APIs. + GPUMicroOp::collapse_parent_space(parent_space, collapsed_parent, buffer_arena, stream); + + + // This is used for count + emit: first pass counts how many rectangles survive intersection, second pass uses the counter + // to figure out where to write each rectangle. + RegionInstance inst_counters_instance = this->realm_malloc((2*domain_transform.ptr_data.size() + 1) * sizeof(uint32_t), my_mem); + uint32_t* d_inst_counters = reinterpret_cast(AffineAccessor(inst_counters_instance, 0).base); + + // This will be a prefix sum over the counters, used first to figure out where to write in the emit phase, and second + // to track which instance each rectangle came from in the populate phase. + uint32_t* d_inst_prefix = d_inst_counters + domain_transform.ptr_data.size(); + RegionInstance out_instance; + size_t num_valid_rects; + + Rect* d_valid_rects; + // Here we intersect the instance spaces with the parent space, and make sure we know which instance each resulting rectangle came from. 
+ GPUMicroOp::template construct_input_rectlist>(inst_space, collapsed_parent, d_valid_rects, num_valid_rects, d_inst_counters, d_inst_prefix, buffer_arena, stream); + inst_entries_instance.destroy(); + parent_entries_instance.destroy(); + inst_offsets_instance.destroy(); + + if (num_valid_rects == 0) { + for (auto it : sparsity_outputs) { + SparsityMapImpl *impl = SparsityMapImpl::lookup(it); + if (this->exclusive) { + impl->gpu_finalize(); + } else { + impl->contribute_nothing(); + } + } + out_instance.destroy(); + inst_counters_instance.destroy(); + return; + } + + // Prefix sum the valid rectangles by volume. + RegionInstance prefix_rects_instance; + size_t total_pts; + + size_t* d_prefix_rects; + GPUMicroOp::volume_prefix_sum(d_valid_rects, num_valid_rects, d_prefix_rects, total_pts, buffer_arena, stream); + + nvtx_range_push("cuda", "build target entries"); + + collapsed_space target_space; + RegionInstance offsets_instance = this->realm_malloc((targets.size()+1) * sizeof(size_t), my_mem); + target_space.offsets = reinterpret_cast(AffineAccessor(offsets_instance, 0).base); + target_space.num_children = targets.size(); + + RegionInstance targets_entries_instance; + + GPUMicroOp::collapse_multi_space(targets, target_space, buffer_arena, stream); + + Memory zcpy_mem; + assert(find_memory(zcpy_mem, Memory::Z_COPY_MEM)); + RegionInstance accessors_instance = this->realm_malloc(domain_transform.ptr_data.size() * sizeof(AffineAccessor,N,T>), zcpy_mem); + AffineAccessor,N,T>* d_accessors = reinterpret_cast,N,T>*>(AffineAccessor(accessors_instance, 0).base); + for (size_t i = 0; i < domain_transform.ptr_data.size(); ++i) { + d_accessors[i] = AffineAccessor,N,T>(domain_transform.ptr_data[i].inst, domain_transform.ptr_data[i].field_offset); + } + + RegionInstance points_instance; + PointDesc* d_points; + size_t num_valid_points; + + RegionInstance target_counters_instance = this->realm_malloc((2*targets.size()+1) * sizeof(uint32_t), my_mem); + uint32_t* 
d_target_counters = reinterpret_cast(AffineAccessor(target_counters_instance, 0).base); + uint32_t* d_targets_prefix = d_target_counters + targets.size(); + CUDA_CHECK(cudaMemsetAsync(d_target_counters, 0, targets.size() * sizeof(uint32_t), stream), stream); + + if (target_space.num_entries > targets.size()) { + BVH preimage_bvh; + RegionInstance bvh_instance; + GPUMicroOp::build_bvh(target_space, preimage_bvh, buffer_arena, stream); + + preimage_gpuPopulateBitmasksPtrsKernel < N, T, N2, T2 ><<>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, preimage_bvh.root, preimage_bvh.childLeft, preimage_bvh.childRight, preimage_bvh.indices, + preimage_bvh.labels, preimage_bvh.boxes, total_pts, num_valid_rects, domain_transform.ptr_data.size(), preimage_bvh.num_leaves, nullptr, d_target_counters, nullptr); + KERNEL_CHECK(stream); + + std::vector h_target_counters(targets.size()+1); + h_target_counters[0] = 0; // prefix sum starts at 0 + CUDA_CHECK(cudaMemcpyAsync(h_target_counters.data()+1, d_target_counters, targets.size() * sizeof(uint32_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + for (size_t i = 0; i < targets.size(); ++i) { + h_target_counters[i+1] += h_target_counters[i]; + } + + num_valid_points = h_target_counters[targets.size()]; + + if (num_valid_points == 0) { + for (auto it : sparsity_outputs) { + SparsityMapImpl *impl = SparsityMapImpl::lookup(it); + if (this->exclusive) { + impl->gpu_finalize(); + } else { + impl->contribute_nothing(); + } + } + target_counters_instance.destroy(); + accessors_instance.destroy(); + targets_entries_instance.destroy(); + offsets_instance.destroy(); + prefix_rects_instance.destroy(); + out_instance.destroy(); + inst_counters_instance.destroy(); + bvh_instance.destroy(); + return; + } + + CUDA_CHECK(cudaMemcpyAsync(d_targets_prefix, h_target_counters.data(), (targets.size() + 1) * sizeof(uint32_t), cudaMemcpyHostToDevice, stream), stream); + + points_instance = 
this->realm_malloc(num_valid_points * sizeof(PointDesc), my_mem); + d_points = reinterpret_cast*>(AffineAccessor(points_instance, 0).base); + + CUDA_CHECK(cudaMemsetAsync(d_target_counters, 0, (targets.size()) * sizeof(uint32_t), stream), stream); + + preimage_gpuPopulateBitmasksPtrsKernel < N, T, N2, T2 ><<>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, preimage_bvh.root, preimage_bvh.childLeft, preimage_bvh.childRight, preimage_bvh.indices, + preimage_bvh.labels, preimage_bvh.boxes, total_pts, num_valid_rects, domain_transform.ptr_data.size(), preimage_bvh.num_leaves, d_targets_prefix, d_target_counters, d_points); + KERNEL_CHECK(stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + bvh_instance.destroy(); + } else { + preimage_dense_populate_bitmasks_kernel< N, T, N2, T2 ><<< COMPUTE_GRID(total_pts), THREADS_PER_BLOCK, 0, stream>>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, target_space.entries_buffer, target_space.offsets, total_pts, + num_valid_rects, domain_transform.ptr_data.size(), targets.size(), nullptr, d_target_counters, nullptr); + KERNEL_CHECK(stream); + + std::vector h_target_counters(targets.size()+1); + h_target_counters[0] = 0; // prefix sum starts at 0 + CUDA_CHECK(cudaMemcpyAsync(h_target_counters.data()+1, d_target_counters, targets.size() * sizeof(uint32_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + for (size_t i = 0; i < targets.size(); ++i) { + h_target_counters[i+1] += h_target_counters[i]; + } + + num_valid_points = h_target_counters[targets.size()]; + + if (num_valid_points == 0) { + for (auto it : sparsity_outputs) { + SparsityMapImpl *impl = SparsityMapImpl::lookup(it); + if (this->exclusive) { + impl->gpu_finalize(); + } else { + impl->contribute_nothing(); + } + } + target_counters_instance.destroy(); + accessors_instance.destroy(); + targets_entries_instance.destroy(); + offsets_instance.destroy(); + prefix_rects_instance.destroy(); + 
out_instance.destroy(); + inst_counters_instance.destroy(); + return; + } + + CUDA_CHECK(cudaMemcpyAsync(d_targets_prefix, h_target_counters.data(), (targets.size() + 1) * sizeof(uint32_t), cudaMemcpyHostToDevice, stream), stream); + + points_instance = this->realm_malloc(num_valid_points * sizeof(PointDesc), my_mem); + d_points = reinterpret_cast*>(AffineAccessor(points_instance, 0).base); + + CUDA_CHECK(cudaMemsetAsync(d_target_counters, 0, (targets.size()) * sizeof(uint32_t), stream), stream); + + preimage_dense_populate_bitmasks_kernel< N, T, N2, T2 ><<< COMPUTE_GRID(total_pts), THREADS_PER_BLOCK, 0, stream>>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, target_space.entries_buffer, target_space.offsets, total_pts, + num_valid_rects, domain_transform.ptr_data.size(), targets.size(), d_targets_prefix, d_target_counters, d_points); + KERNEL_CHECK(stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + } + + target_counters_instance.destroy(); + accessors_instance.destroy(); + targets_entries_instance.destroy(); + offsets_instance.destroy(); + prefix_rects_instance.destroy(); + out_instance.destroy(); + inst_counters_instance.destroy(); + + size_t out_rects = 0; + RectDesc* trash; + this->complete_pipeline(d_points, num_valid_points, trash, out_rects, buffer_arena, + /* the Container: */ sparsity_outputs, + /* getIndex: */ [&](auto const& elem){ + // elem is a SparsityMap from the vector + return size_t(&elem - sparsity_outputs.data()); + }, + /* getMap: */ [&](auto const& elem){ + // return the SparsityMap key itself + return elem; + }); + + points_instance.destroy(); + } +} \ No newline at end of file diff --git a/src/realm/deppart/preimage_gpu_kernels.hpp b/src/realm/deppart/preimage_gpu_kernels.hpp new file mode 100644 index 0000000000..10d9c5225c --- /dev/null +++ b/src/realm/deppart/preimage_gpu_kernels.hpp @@ -0,0 +1,256 @@ +#pragma once +#include "realm/deppart/preimage.h" + +namespace Realm { + + +template +__global__ void 
preimage_build_morton_codes( + const SparsityMapEntry* d_targets_entries, + const size_t* d_offsets_rects, + const Rect* d_global_bounds, + size_t total_rects, + size_t num_targets, + uint64_t* d_morton_codes, + uint64_t* d_indices, + uint64_t* d_targets_indices) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= total_rects) return; + const auto &entry = d_targets_entries[idx]; + d_morton_codes[idx] = bvh_morton_code(entry.bounds, *d_global_bounds); + d_indices[idx] = idx; + size_t low = 0, high = num_targets; + while (low < high) { + size_t mid = (low + high) >> 1; + if (d_offsets_rects[mid+1] <= idx) low = mid + 1; + else high = mid; + } + d_targets_indices[idx] = low; +} + +// +// 2) Initialize leaf boxes +// +template +__global__ +void preimage_init_leaf_boxes_kernel( + const SparsityMapEntry *rects, // [G] all flattened Rects + const uint64_t *leafIdx, // [n] maps leaf→orig Rect index + size_t total_rects, + Rect *boxes) // [(2n−1)] +{ + int k = blockIdx.x*blockDim.x + threadIdx.x; + if (k >= total_rects) return; + + size_t orig = leafIdx[k]; + boxes[k + total_rects - 1] = rects[orig].bounds; +} + + template +__device__ void preimage_queryBVH( + const Rect *boxes, + const int* childLeft, + const int* childRight, + const uint64_t* leafIdx, + const size_t* targets_indices, + int root, + size_t numTargetRects, + const Q& in_query, + Point out_point, + uint32_t* d_targets_prefix, + uint32_t* d_target_counters, + PointDesc *d_points) +{ + constexpr int MAX_STACK = 64; // max stack size for BVH traversal + int stack[MAX_STACK]; + int sp = 0; + + // start at the root + stack[sp++] = -1; + int node = root; + do + { + + int left = childLeft[node]; + int right = childRight[node]; + + bool overlapL; + bool overlapR; + + if constexpr (std::is_same_v>) { + overlapL = boxes[left].overlaps(in_query); + overlapR = boxes[right].overlaps(in_query); + } else { + static_assert(std::is_same_v>, + "Q must be Rect or Point"); + overlapL = 
boxes[left].contains(in_query); + overlapR = boxes[right].contains(in_query); + } + + + if (overlapL && left >= numTargetRects - 1) { + // left child is a leaf + uint64_t rect_idx = leafIdx[left - (numTargetRects - 1)]; + size_t target_idx = targets_indices[rect_idx]; + uint32_t local = atomicAdd(&d_target_counters[target_idx], 1); + if (d_points != nullptr) { + PointDesc point_desc; + point_desc.src_idx = target_idx; + point_desc.point = out_point; + uint32_t out_idx = d_targets_prefix[target_idx] + local; + d_points[out_idx] = point_desc; + } + } + if (overlapR && right >= numTargetRects - 1) { + uint64_t rect_idx = leafIdx[right - (numTargetRects - 1)]; + size_t target_idx = targets_indices[rect_idx]; + uint32_t local = atomicAdd(&d_target_counters[target_idx], 1); + if (d_points != nullptr) { + PointDesc point_desc; + point_desc.src_idx = target_idx; + point_desc.point = out_point; + uint32_t out_idx = d_targets_prefix[target_idx] + local; + d_points[out_idx] = point_desc; + } + } + + bool traverseL = overlapL && left < numTargetRects - 1; + bool traverseR = overlapR && right < numTargetRects - 1; + + if (!traverseL && !traverseR) { + node = stack[--sp]; + } else { + node = (traverseL ? 
left : right); + if (traverseL && traverseR) { + stack[sp++] = right; + } + } + } while (node != -1); +} + +template < + int N, typename T, + int N2, typename T2, typename Q +> +__global__ +void preimage_gpuPopulateBitmasksPtrsKernel( + AffineAccessor *accessors, + Rect* rects, + size_t* prefix, + uint32_t* inst_offsets, + int root, + int *childLeft, + int *childRight, + uint64_t *indices, + uint64_t *targets_indices, + Rect *boxes, + size_t numPoints, + size_t numRects, + size_t numInsts, + size_t numTargetRects, + uint32_t* d_targets_prefix, + uint32_t* d_target_counters, + PointDesc *d_points +) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= numPoints) return; + size_t low = 0, high = numRects; + while (low < high) { + size_t mid = (low + high) >> 1; + if (prefix[mid+1] <= idx) low = mid + 1; + else high = mid; + } + size_t r = low; + low = 0, high = numInsts; + while (low < high) { + size_t mid = (low + high) >> 1; + if (inst_offsets[mid+1] <= r) low = mid + 1; + else high = mid; + } + size_t inst_idx = low; + size_t offset = idx - prefix[r]; + Point p; + for (int k = N-1; k >= 0; --k) { + size_t dim = rects[r].hi[k] + 1 - rects[r].lo[k]; + p[k] = rects[r].lo[k] + (offset % dim); + offset /= dim; + } + Q ptr = accessors[inst_idx].read(p); + preimage_queryBVH(boxes, childLeft, childRight, indices, targets_indices, root, numTargetRects, ptr, p, d_targets_prefix, d_target_counters, d_points); +} + +template < + int N, typename T, + int N2, typename T2, typename Q +> +__global__ +void preimage_dense_populate_bitmasks_kernel( + AffineAccessor* accessors, + Rect* rects, + size_t* prefix, + uint32_t* inst_offsets, + SparsityMapEntry* targets_entries, + size_t* target_offsets, + size_t numPoints, + size_t numRects, + size_t numInsts, + size_t numTargets, + uint32_t *d_targets_prefix, + uint32_t *d_target_counters, + PointDesc *d_points +) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= numPoints) return; + size_t low = 0, high 
= numRects; + while (low < high) { + size_t mid = (low + high) >> 1; + if (prefix[mid+1] <= idx) low = mid + 1; + else high = mid; + } + size_t r = low; + low = 0, high = numInsts; + while (low < high) { + size_t mid = (low + high) >> 1; + if (inst_offsets[mid+1] <= r) low = mid + 1; + else high = mid; + } + size_t inst_idx = low; + size_t offset = idx - prefix[r]; + Point p; + for (int k = N-1; k >= 0; --k) { + size_t dim = rects[r].hi[k] + 1 - rects[r].lo[k]; + p[k] = rects[r].lo[k] + (offset % dim); + offset /= dim; + } + Q ptr = accessors[inst_idx].read(p); + for (size_t i = 0; i < numTargets; i++) { + bool inside = false; + for (size_t j = target_offsets[i]; j < target_offsets[i+1]; j++) { + if constexpr (std::is_same_v>) { + if (targets_entries[j].bounds.overlaps(ptr)) { + inside = true; + break; + } + } else { + static_assert(std::is_same_v>, + "Q must be Rect or Point"); + if (targets_entries[j].bounds.contains(ptr)) { + inside = true; + break; + } + } + } + if (inside) { + uint32_t local = atomicAdd(&d_target_counters[i], 1); + if (d_points != nullptr) { + PointDesc point_desc; + point_desc.src_idx = i; + point_desc.point = p; + uint32_t out_idx = d_targets_prefix[i] + local; + d_points[out_idx] = point_desc; + } + } + } +} + +} // namespace Realm \ No newline at end of file diff --git a/src/realm/deppart/preimage_gpu_tmpl.cu b/src/realm/deppart/preimage_gpu_tmpl.cu new file mode 100644 index 0000000000..eb532a5a1d --- /dev/null +++ b/src/realm/deppart/preimage_gpu_tmpl.cu @@ -0,0 +1,69 @@ +/* Copyright 2024 Stanford University, NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// per‐dimension instantiator for the GPU version of +// ImageMicroOp<…>::gpu_populate_bitmasks_ptrs + +#define REALM_TEMPLATES_ONLY +#include "realm/deppart/preimage_gpu_kernels.hpp" +#include "realm/deppart/preimage_gpu_impl.hpp" + +#ifndef INST_N1 + #error "INST_N1 must be defined before including preimage_gpu_tmpl.cu" +#endif +#ifndef INST_N2 + #error "INST_N2 must be defined before including preimage_gpu_tmpl.cu" +#endif + +// same set of T1,T2 pairs you use on the CPU side: +#define FOREACH_TT(__func__) \ + __func__(int, int) \ + __func__(int, unsigned) \ + __func__(int, long long) \ + __func__(unsigned,int) \ + __func__(unsigned,unsigned) \ + __func__(unsigned,long long) \ + __func__(long long, int) \ + __func__(long long, unsigned) \ + __func__(long long, long long) + +#define FOREACH_T(__func__) \ + __func__(int) \ + __func__(unsigned) \ + __func__(long long) + +namespace Realm { + #define N1 INST_N1 + #define N2 INST_N2 + + // Replace MyBitmask with whatever bitmask‐type you actually use + // (it must have an `as_vector.rects` member that your code touches). 
+ // + // This explicitly instantiates: + // template void + // ImageMicroOp::gpu_populate_bitmasks_ptrs( + // std::map&); + // + #define DO_DOUBLE(T1,T2) \ + template class GPUPreimageMicroOp; \ + template class PreimageMicroOp; + + FOREACH_TT(DO_DOUBLE) + + #undef DO_DOUBLE + #undef N1 + #undef N2 + +} // namespace Realm \ No newline at end of file diff --git a/src/realm/deppart/preimage_tmpl.cc b/src/realm/deppart/preimage_tmpl.cc index 50bc3a1ba8..2d3d73e5b2 100644 --- a/src/realm/deppart/preimage_tmpl.cc +++ b/src/realm/deppart/preimage_tmpl.cc @@ -1,5 +1,5 @@ /* - * Copyright 2025 Stanford University, NVIDIA Corporation +* Copyright 2025 Stanford University, NVIDIA Corporation * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -28,15 +28,15 @@ #endif #define FOREACH_TT(__func__) \ - __func__(int,int) \ - __func__(int,unsigned) \ - __func__(int,long long) \ - __func__(unsigned,int) \ - __func__(unsigned,unsigned) \ - __func__(unsigned,long long) \ - __func__(long long,int) \ - __func__(long long,unsigned) \ - __func__(long long,long long) +__func__(int,int) \ +__func__(int,unsigned) \ +__func__(int,long long) \ +__func__(unsigned,int) \ +__func__(unsigned,unsigned) \ +__func__(unsigned,long long) \ +__func__(long long,int) \ +__func__(long long,unsigned) \ +__func__(long long,long long) namespace Realm { @@ -44,16 +44,21 @@ namespace Realm { #define N2 INST_N2 #define DOIT(T1,T2) \ - template class PreimageMicroOp; \ - template class StructuredPreimageMicroOp; \ - template class PreimageOperation; \ - template PreimageMicroOp::PreimageMicroOp(NodeID, AsyncMicroOp *, Serialization::FixedBufferDeserializer&); \ - template Event IndexSpace::create_subspaces_by_preimage( \ - const DomainTransform &, const std::vector > &, \ - std::vector > &, const ProfilingRequestSet &, Event) \ - const; +template class PreimageMicroOp; \ +template class GPUPreimageMicroOp; \ +template class StructuredPreimageMicroOp; \ 
+template class PreimageOperation; \ +template PreimageMicroOp::PreimageMicroOp(NodeID, AsyncMicroOp *, Serialization::FixedBufferDeserializer&); \ +template void IndexSpace::suggest_preimage_buffer_size( \ + const std::vector>&, \ + const std::vector>&, \ + std::vector&) const; \ +template Event IndexSpace::create_subspaces_by_preimage( \ +const DomainTransform &, const std::vector > &, \ +std::vector > &, const ProfilingRequestSet &, Event) \ +const; FOREACH_TT(DOIT) -}; +}; \ No newline at end of file diff --git a/src/realm/indexspace.h b/src/realm/indexspace.h index cf6caf9a26..9ea593b392 100644 --- a/src/realm/indexspace.h +++ b/src/realm/indexspace.h @@ -736,6 +736,11 @@ namespace Realm { const std::vector &colors, std::vector> &subspaces, const ProfilingRequestSet &reqs, Event wait_on = Event::NO_EVENT) const; + template + REALM_PUBLIC_API void suggest_byfield_buffer_size( + const std::vector>& inputs, + std::vector& suggestions) const; + ///@{ /** * Allows the "function" described by the field to be composed with a @@ -802,7 +807,7 @@ namespace Realm { Event wait_on = Event::NO_EVENT) const; template - REALM_PUBLIC_API void suggest_deppart_buffer_size( + REALM_PUBLIC_API void suggest_image_buffer_size( const std::vector>& source_spaces, const std::vector>& inputs, std::vector& suggestions) const; @@ -927,6 +932,12 @@ namespace Realm { const std::vector> &targets, std::vector> &preimages, const ProfilingRequestSet &reqs, Event wait_on = Event::NO_EVENT) const; + + template + REALM_PUBLIC_API void suggest_preimage_buffer_size( + const std::vector>& target_spaces, + const std::vector>& inputs, + std::vector& suggestions) const; ///@} ///@{ From d7e9e478a5588b12f23c59bd8e3267cc834ec631 Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Mon, 2 Feb 2026 17:53:29 -0800 Subject: [PATCH 06/32] Added ifdef REALM_USE_CUDA guards to gpu deppart --- src/realm/deppart/byfield.cc | 7 ++++++ src/realm/deppart/byfield.h | 4 ++++ src/realm/deppart/byfield_tmpl.cc | 8 ++++++- 
src/realm/deppart/image.cc | 10 ++++++++- src/realm/deppart/image.h | 2 ++ src/realm/deppart/image_tmpl.cc | 8 ++++++- src/realm/deppart/partitions.h | 35 ++++++------------------------ src/realm/deppart/preimage.cc | 8 +++++++ src/realm/deppart/preimage.h | 4 ++++ src/realm/deppart/preimage_tmpl.cc | 8 ++++++- 10 files changed, 62 insertions(+), 32 deletions(-) diff --git a/src/realm/deppart/byfield.cc b/src/realm/deppart/byfield.cc index c6ccacc6ce..9c9d5a4ad1 100644 --- a/src/realm/deppart/byfield.cc +++ b/src/realm/deppart/byfield.cc @@ -309,6 +309,7 @@ namespace Realm { ActiveMessageHandlerReg > > ByFieldMicroOp::areg; +#ifdef REALM_USE_CUDA //////////////////////////////////////////////////////////////////////// // // class GPUByFieldMicroOp @@ -355,6 +356,8 @@ namespace Realm { sparsity_outputs[_val] = _sparsity; } +#endif + //////////////////////////////////////////////////////////////////////// // @@ -430,6 +433,7 @@ namespace Realm { uop->dispatch(this, true /* ok to run in this thread */); } +#ifdef REALM_USE_CUDA for (auto fdd : gpu_field_data) { std::vector,FT> > single_gpu_field_data = {fdd}; GPUByFieldMicroOp *uop = new GPUByFieldMicroOp(parent, single_gpu_field_data, exclusive); @@ -438,6 +442,9 @@ namespace Realm { } uop->dispatch(this, false); } +#else + assert(gpu_field_data.empty()); +#endif } template diff --git a/src/realm/deppart/byfield.h b/src/realm/deppart/byfield.h index 92902efbd1..cc21234f32 100644 --- a/src/realm/deppart/byfield.h +++ b/src/realm/deppart/byfield.h @@ -68,6 +68,8 @@ namespace Realm { std::map > sparsity_outputs; }; +#ifdef REALM_USE_CUDA + template class GPUByFieldMicroOp : public GPUMicroOp { public: @@ -91,6 +93,8 @@ namespace Realm { std::map > sparsity_outputs; }; +#endif + template class ByFieldOperation : public PartitioningOperation { public: diff --git a/src/realm/deppart/byfield_tmpl.cc b/src/realm/deppart/byfield_tmpl.cc index 7575607ea2..c8e6db0bcd 100644 --- a/src/realm/deppart/byfield_tmpl.cc +++ 
b/src/realm/deppart/byfield_tmpl.cc @@ -43,9 +43,15 @@ namespace Realm { #define N1 INST_N1 #define N2 INST_N2 +#ifdef REALM_USE_CUDA + #define GPU_BYFIELD_LINE(N, T, ...) template class GPUByFieldMicroOp; +#else + #define GPU_BYFIELD_LINE(N, T, ...) /* no CUDA */ +#endif + #define DOIT(N,T,F) \ template class ByFieldMicroOp; \ - template class GPUByFieldMicroOp; \ + GPU_BYFIELD_LINE(N, T, F) \ template class ByFieldOperation; \ template ByFieldMicroOp::ByFieldMicroOp(NodeID, AsyncMicroOp *, Serialization::FixedBufferDeserializer&); \ template Event IndexSpace::create_subspaces_by_field(const std::vector,F> >&, \ diff --git a/src/realm/deppart/image.cc b/src/realm/deppart/image.cc index d207161b22..d0251687b4 100644 --- a/src/realm/deppart/image.cc +++ b/src/realm/deppart/image.cc @@ -689,6 +689,7 @@ namespace Realm { images[j]); uop->dispatch(this, true /* ok to run in this thread */); } +#ifdef REALM_USE_CUDA for (auto ptr_fdd : gpu_ptr_data) { // launch full cross-product of image micro ops right away assert(ptr_fdd.scratch_buffer != RegionInstance::NO_INST); @@ -713,6 +714,9 @@ namespace Realm { } micro_op->dispatch(this, true); } +#else + assert(!gpu_data); +#endif } } @@ -916,7 +920,9 @@ namespace Realm { //////////////////////////////////////////////////////////////////////// // - // class StructuredImageMicroOp + // class GPUImageMicroOp + +#ifdef REALM_USE_CUDA template GPUImageMicroOp::GPUImageMicroOp( @@ -979,6 +985,8 @@ namespace Realm { gpu_populate_rngs(); } } +#endif + //////////////////////////////////////////////////////////////////////// diff --git a/src/realm/deppart/image.h b/src/realm/deppart/image.h index 58131338a3..ab81ecafae 100644 --- a/src/realm/deppart/image.h +++ b/src/realm/deppart/image.h @@ -148,6 +148,7 @@ namespace Realm { std::vector > sources; std::vector > sparsity_outputs; }; +#ifdef REALM_USE_CUDA template class GPUImageMicroOp : public GPUMicroOp { @@ -176,6 +177,7 @@ namespace Realm { std::vector > sources; std::vector > 
sparsity_outputs; }; +#endif }; // namespace Realm #endif // REALM_DEPPART_IMAGE_H diff --git a/src/realm/deppart/image_tmpl.cc b/src/realm/deppart/image_tmpl.cc index 8a0e686f22..19242fa9ca 100644 --- a/src/realm/deppart/image_tmpl.cc +++ b/src/realm/deppart/image_tmpl.cc @@ -44,10 +44,16 @@ namespace Realm { #define N1 INST_N1 #define N2 INST_N2 +#ifdef REALM_USE_CUDA + #define GPU_IMAGE_LINE(N1,T1,N2,T2) template class GPUImageMicroOp; +#else + #define GPU_IMAGE_LINE(N1,T1,N2,T2) /* no CUDA */ +#endif + #define DOIT(T1,T2) \ template class StructuredImageMicroOp; \ template class ImageMicroOp; \ - template class GPUImageMicroOp; \ + GPU_IMAGE_LINE(N1, T1, N2, T2) \ template class ImageOperation; \ template ImageMicroOp::ImageMicroOp(NodeID, AsyncMicroOp *, Serialization::FixedBufferDeserializer&); \ template void IndexSpace::suggest_image_buffer_size( \ diff --git a/src/realm/deppart/partitions.h b/src/realm/deppart/partitions.h index 4ec4560984..051717d803 100644 --- a/src/realm/deppart/partitions.h +++ b/src/realm/deppart/partitions.h @@ -43,21 +43,7 @@ namespace Realm { class PartitioningMicroOp; class PartitioningOperation; - template - constexpr std::string_view type_name() { - #if defined(__clang__) - std::string_view p = __PRETTY_FUNCTION__; - return {p.data() + 34, p.size() - 34 - 1}; - #elif defined(__GNUC__) - std::string_view p = __PRETTY_FUNCTION__; - return {p.data() + 49, p.size() - 49 - 1}; - #elif defined(_MSC_VER) - std::string_view p = __FUNCSIG__; - return {p.data() + 84, p.size() - 84 - 7}; - #else - return "unknown"; - #endif - } +#ifdef REALM_USE_CUDA template struct HiFlag { @@ -139,19 +125,7 @@ namespace Realm { template T* alloc(size_t count = 1) { - try { - if (parity_) { - return alloc_right(count); - } else { - return alloc_left(count); - } - } catch (arena_oom&) { - std::cout << "Arena OOM: requested " << count << " of " << type_name() - << " capacity " << cap_ << " bytes, " - << " used " << used() << " bytes, " - << " left " << 
(cap_ - left_ - right_) << " bytes.\n"; - throw arena_oom{}; - } + return parity_ ? alloc_right(count) : alloc_left(count); } void flip_parity(void) noexcept { @@ -241,6 +215,9 @@ namespace Realm { size_t base_right_; }; + +#endif + template class OverlapTester { public: @@ -349,6 +326,7 @@ namespace Realm { std::vector *> extra_deps; }; +#ifdef REALM_USE_CUDA //The parent class for all GPU partitioning micro-ops. Provides output utility functions template @@ -387,6 +365,7 @@ namespace Realm { bool exclusive = false; }; +#endif //////////////////////////////////////// // diff --git a/src/realm/deppart/preimage.cc b/src/realm/deppart/preimage.cc index 5df628f2f6..63131916bc 100644 --- a/src/realm/deppart/preimage.cc +++ b/src/realm/deppart/preimage.cc @@ -484,6 +484,7 @@ namespace Realm { uop->add_sparsity_output(targets[j], preimages[j]); uop->dispatch(this, true /* ok to run in this thread */); } +#ifdef REALM_USE_CUDA for (auto ptr_fdd : gpu_ptr_data) { domain_transform.ptr_data = {ptr_fdd}; GPUPreimageMicroOp *micro_op = @@ -504,6 +505,10 @@ namespace Realm { } micro_op->dispatch(this, true); } +#else + assert(!gpu_data); +#endif + } } @@ -782,6 +787,7 @@ namespace Realm { //////////////////////////////////////////////////////////////////////// // // class GPUPreimageMicroOp +#ifdef REALM_USE_CUDA template GPUPreimageMicroOp::GPUPreimageMicroOp( @@ -837,6 +843,8 @@ namespace Realm { this->finish_dispatch(op, inline_ok); } +#endif + // instantiations of templates handled in preimage_tmpl.cc }; // namespace Realm \ No newline at end of file diff --git a/src/realm/deppart/preimage.h b/src/realm/deppart/preimage.h index 1a67c12aee..ed301ad51e 100644 --- a/src/realm/deppart/preimage.h +++ b/src/realm/deppart/preimage.h @@ -153,6 +153,8 @@ namespace Realm { std::vector > sparsity_outputs; }; + #ifdef REALM_USE_CUDA + template class GPUPreimageMicroOp : public GPUMicroOp { public: @@ -183,6 +185,8 @@ namespace Realm { std::vector > sparsity_outputs; }; +#endif + }; // 
namespace Realm #endif // REALM_DEPPART_PREIMAGE_H \ No newline at end of file diff --git a/src/realm/deppart/preimage_tmpl.cc b/src/realm/deppart/preimage_tmpl.cc index 2d3d73e5b2..2df0d80502 100644 --- a/src/realm/deppart/preimage_tmpl.cc +++ b/src/realm/deppart/preimage_tmpl.cc @@ -43,9 +43,15 @@ namespace Realm { #define N1 INST_N1 #define N2 INST_N2 +#ifdef REALM_USE_CUDA + #define GPU_PREIMAGE_LINE(N1,T1,N2,T2) template class GPUPreimageMicroOp; +#else + #define GPU_PREIMAGE_LINE(N1,T1,N2,T2) /* no CUDA */ +#endif + #define DOIT(T1,T2) \ template class PreimageMicroOp; \ -template class GPUPreimageMicroOp; \ +GPU_PREIMAGE_LINE(N1,T1,N2,T2) \ template class StructuredPreimageMicroOp; \ template class PreimageOperation; \ template PreimageMicroOp::PreimageMicroOp(NodeID, AsyncMicroOp *, Serialization::FixedBufferDeserializer&); \ From d82566d10092e95be1eba0ada606aa18705decfe Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Mon, 9 Feb 2026 15:02:30 -0800 Subject: [PATCH 07/32] renamed suggested to required and provided target proc instead of mem in buffer descriptor --- src/realm/deppart/byfield.cc | 42 +++++++++++++++++++----------- src/realm/deppart/byfield_tmpl.cc | 4 +-- src/realm/deppart/image.cc | 29 ++++++++++++++------- src/realm/deppart/image_tmpl.cc | 4 +-- src/realm/deppart/preimage.cc | 30 ++++++++++++++------- src/realm/deppart/preimage_tmpl.cc | 4 +-- src/realm/indexspace.h | 17 ++++++------ 7 files changed, 83 insertions(+), 47 deletions(-) diff --git a/src/realm/deppart/byfield.cc b/src/realm/deppart/byfield.cc index 9c9d5a4ad1..203843d81d 100644 --- a/src/realm/deppart/byfield.cc +++ b/src/realm/deppart/byfield.cc @@ -31,28 +31,40 @@ namespace Realm { template template - void IndexSpace::suggest_byfield_buffer_size( + void IndexSpace::required_byfield_buffer_size( const std::vector>& inputs, - std::vector& suggestions) const { - suggestions = std::vector(inputs.size()); + std::vector& requirements) const { + requirements = 
std::vector(inputs.size()); for (size_t i = 0; i < inputs.size(); i++) { IndexSpace is = inputs[i].space; Memory mem = inputs[i].location; if (mem.kind() == Memory::GPU_FB_MEM || mem.kind() == Memory::Z_COPY_MEM) { - const char* val = std::getenv("MIN_SIZE"); // or any env var - size_t device_size = 2000000; //default - if (val) { - device_size = atoi(val); - } - size_t optimal_size = is.bounds.volume() * sizeof(Rect); - suggestions[i].suggested = mem; - suggestions[i].lower_bound = device_size; - suggestions[i].upper_bound = max(device_size, optimal_size); + const char* val = std::getenv("MIN_SIZE"); // or any env var + size_t device_size = 2000000; //default + if (val) { + device_size = atoi(val); + } + size_t optimal_size = is.bounds.volume() * sizeof(Rect); + std::vector affinities; + unsigned best_bandwidth = 0; + Processor best_proc = Processor::NO_PROC; + Machine::get_machine().get_proc_mem_affinity(affinities, Processor::NO_PROC, mem); + for (auto affinity : affinities) { + if (affinity.bandwidth > best_bandwidth) { + best_bandwidth = affinity.bandwidth; + best_proc = affinity.p; + } + } + requirements[i].target_proc = best_proc; + requirements[i].lower_bound = device_size; + requirements[i].upper_bound = max(device_size, optimal_size); + requirements[i].minimum_alignment = 128; } else { - suggestions[i].suggested = Memory::NO_MEMORY; - suggestions[i].lower_bound = 0; - suggestions[i].upper_bound = 0; + requirements[i].target_proc = Processor::NO_PROC; + requirements[i].lower_bound = 0; + requirements[i].upper_bound = 0; + requirements[i].minimum_alignment = 0; } } } diff --git a/src/realm/deppart/byfield_tmpl.cc b/src/realm/deppart/byfield_tmpl.cc index c8e6db0bcd..fc15f5b94a 100644 --- a/src/realm/deppart/byfield_tmpl.cc +++ b/src/realm/deppart/byfield_tmpl.cc @@ -59,9 +59,9 @@ namespace Realm { std::vector >&, \ const ProfilingRequestSet &, \ Event) const; \ - template void IndexSpace::suggest_byfield_buffer_size( \ + template void 
IndexSpace::required_byfield_buffer_size( \ const std::vector>&, \ - std::vector&) const; + std::vector&) const; FOREACH_NTF(DOIT) diff --git a/src/realm/deppart/image.cc b/src/realm/deppart/image.cc index d0251687b4..437167a95b 100644 --- a/src/realm/deppart/image.cc +++ b/src/realm/deppart/image.cc @@ -32,10 +32,10 @@ namespace Realm { template template - void IndexSpace::suggest_image_buffer_size( + void IndexSpace::required_image_buffer_size( const std::vector>& source_spaces, const std::vector>& inputs, - std::vector& suggestions) const { + std::vector& requirements) const { size_t minimal_size = 0; size_t source_entries = 0; bool bvh = false; @@ -58,7 +58,7 @@ namespace Realm { (2 * source_entries * sizeof(uint64_t)) + (source_entries * sizeof(uint64_t)); } - suggestions = std::vector(inputs.size()); + requirements = std::vector(inputs.size()); for (size_t i = 0; i < inputs.size(); i++) { IndexSpace is = inputs[i].space; Memory mem = inputs[i].location; @@ -71,13 +71,24 @@ namespace Realm { } minimal_size = max(minimal_size, device_size); size_t optimal_size = is.bounds.volume() * sizeof(Rect) * source_spaces.size() + minimal_size; - suggestions[i].suggested = mem; - suggestions[i].lower_bound = minimal_size; - suggestions[i].upper_bound = optimal_size; + std::vector affinities; + unsigned best_bandwidth = 0; + Processor best_proc = Processor::NO_PROC; + Machine::get_machine().get_proc_mem_affinity(affinities, Processor::NO_PROC, mem); + for (auto affinity : affinities) { + if (affinity.bandwidth > best_bandwidth) { + best_bandwidth = affinity.bandwidth; + best_proc = affinity.p; + } + } + requirements[i].target_proc = best_proc; + requirements[i].lower_bound = minimal_size; + requirements[i].upper_bound = optimal_size; + requirements[i].minimum_alignment = 128; } else { - suggestions[i].suggested = Memory::NO_MEMORY; - suggestions[i].lower_bound = 0; - suggestions[i].upper_bound = 0; + requirements[i].target_proc = Processor::NO_PROC; + 
requirements[i].lower_bound = 0; + requirements[i].upper_bound = 0; } } } diff --git a/src/realm/deppart/image_tmpl.cc b/src/realm/deppart/image_tmpl.cc index 19242fa9ca..a2cb2cb9e6 100644 --- a/src/realm/deppart/image_tmpl.cc +++ b/src/realm/deppart/image_tmpl.cc @@ -56,10 +56,10 @@ namespace Realm { GPU_IMAGE_LINE(N1, T1, N2, T2) \ template class ImageOperation; \ template ImageMicroOp::ImageMicroOp(NodeID, AsyncMicroOp *, Serialization::FixedBufferDeserializer&); \ - template void IndexSpace::suggest_image_buffer_size( \ + template void IndexSpace::required_image_buffer_size( \ const std::vector>&, \ const std::vector>&, \ - std::vector&) const; \ + std::vector&) const; \ template Event IndexSpace::create_subspaces_by_image( \ const DomainTransform &, const std::vector > &, \ std::vector > &, const ProfilingRequestSet &, Event) const; \ diff --git a/src/realm/deppart/preimage.cc b/src/realm/deppart/preimage.cc index 63131916bc..d327df1c74 100644 --- a/src/realm/deppart/preimage.cc +++ b/src/realm/deppart/preimage.cc @@ -34,10 +34,10 @@ namespace Realm { template template - void IndexSpace::suggest_preimage_buffer_size( + void IndexSpace::required_preimage_buffer_size( const std::vector>& target_spaces, const std::vector>& inputs, - std::vector& suggestions) const { + std::vector& requirements) const { size_t minimal_size = 0; size_t source_entries = 0; bool bvh = false; @@ -60,7 +60,7 @@ namespace Realm { (2 * source_entries * sizeof(uint64_t)) + (source_entries * sizeof(uint64_t)); } - suggestions = std::vector(inputs.size()); + requirements = std::vector(inputs.size()); for (size_t i = 0; i < inputs.size(); i++) { IndexSpace is = inputs[i].space; Memory mem = inputs[i].location; @@ -73,13 +73,25 @@ namespace Realm { } minimal_size = max(minimal_size, device_size); size_t optimal_size = is.bounds.volume() * sizeof(Rect) * target_spaces.size() + minimal_size; - suggestions[i].suggested = mem; - suggestions[i].lower_bound = minimal_size; - 
suggestions[i].upper_bound = optimal_size; + std::vector affinities; + unsigned best_bandwidth = 0; + Processor best_proc = Processor::NO_PROC; + Machine::get_machine().get_proc_mem_affinity(affinities, Processor::NO_PROC, mem); + for (auto affinity : affinities) { + if (affinity.bandwidth > best_bandwidth) { + best_bandwidth = affinity.bandwidth; + best_proc = affinity.p; + } + } + requirements[i].target_proc = best_proc; + requirements[i].lower_bound = minimal_size; + requirements[i].upper_bound = optimal_size; + requirements[i].minimum_alignment = 128; } else { - suggestions[i].suggested = Memory::NO_MEMORY; - suggestions[i].lower_bound = 0; - suggestions[i].upper_bound = 0; + requirements[i].target_proc = Processor::NO_PROC; + requirements[i].lower_bound = 0; + requirements[i].upper_bound = 0; + requirements[i].minimum_alignment = 0; } } } diff --git a/src/realm/deppart/preimage_tmpl.cc b/src/realm/deppart/preimage_tmpl.cc index 2df0d80502..ef6725f567 100644 --- a/src/realm/deppart/preimage_tmpl.cc +++ b/src/realm/deppart/preimage_tmpl.cc @@ -55,10 +55,10 @@ GPU_PREIMAGE_LINE(N1,T1,N2,T2) \ template class StructuredPreimageMicroOp; \ template class PreimageOperation; \ template PreimageMicroOp::PreimageMicroOp(NodeID, AsyncMicroOp *, Serialization::FixedBufferDeserializer&); \ -template void IndexSpace::suggest_preimage_buffer_size( \ +template void IndexSpace::required_preimage_buffer_size( \ const std::vector>&, \ const std::vector>&, \ - std::vector&) const; \ + std::vector&) const; \ template Event IndexSpace::create_subspaces_by_preimage( \ const DomainTransform &, const std::vector > &, \ std::vector > &, const ProfilingRequestSet &, Event) \ diff --git a/src/realm/indexspace.h b/src/realm/indexspace.h index 9ea593b392..61109181fc 100644 --- a/src/realm/indexspace.h +++ b/src/realm/indexspace.h @@ -124,10 +124,11 @@ namespace Realm { Memory location; }; - struct DeppartEstimateSuggestion { - Memory suggested; + struct DeppartBufferRequirements { size_t 
lower_bound; size_t upper_bound; + size_t minimum_alignment = 128; + Processor target_proc; }; /** @@ -737,9 +738,9 @@ namespace Realm { const ProfilingRequestSet &reqs, Event wait_on = Event::NO_EVENT) const; template - REALM_PUBLIC_API void suggest_byfield_buffer_size( + REALM_PUBLIC_API void required_byfield_buffer_size( const std::vector>& inputs, - std::vector& suggestions) const; + std::vector& suggestions) const; ///@{ /** @@ -807,10 +808,10 @@ namespace Realm { Event wait_on = Event::NO_EVENT) const; template - REALM_PUBLIC_API void suggest_image_buffer_size( + REALM_PUBLIC_API void required_image_buffer_size( const std::vector>& source_spaces, const std::vector>& inputs, - std::vector& suggestions) const; + std::vector& suggestions) const; ///@} @@ -934,10 +935,10 @@ namespace Realm { Event wait_on = Event::NO_EVENT) const; template - REALM_PUBLIC_API void suggest_preimage_buffer_size( + REALM_PUBLIC_API void required_preimage_buffer_size( const std::vector>& target_spaces, const std::vector>& inputs, - std::vector& suggestions) const; + std::vector& suggestions) const; ///@} ///@{ From 0d921066e718b8f1b6a5611ecec72d08d0263f34 Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Tue, 10 Feb 2026 10:00:30 -0800 Subject: [PATCH 08/32] deleted default alignment --- src/realm/indexspace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/realm/indexspace.h b/src/realm/indexspace.h index 61109181fc..61ff97da55 100644 --- a/src/realm/indexspace.h +++ b/src/realm/indexspace.h @@ -127,7 +127,7 @@ namespace Realm { struct DeppartBufferRequirements { size_t lower_bound; size_t upper_bound; - size_t minimum_alignment = 128; + size_t minimum_alignment; Processor target_proc; }; From 59ad8780b83a171949893542ccfa8b91870e256b Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Tue, 10 Feb 2026 23:30:34 -0800 Subject: [PATCH 09/32] removed ft from byfield estimate template --- src/realm/deppart/byfield.cc | 1 - src/realm/deppart/byfield_tmpl.cc | 14 
++++++++++---- src/realm/indexspace.h | 1 - tests/deppart.cc | 3 +++ 4 files changed, 13 insertions(+), 6 deletions(-) diff --git a/src/realm/deppart/byfield.cc b/src/realm/deppart/byfield.cc index 203843d81d..9af275eb3d 100644 --- a/src/realm/deppart/byfield.cc +++ b/src/realm/deppart/byfield.cc @@ -30,7 +30,6 @@ namespace Realm { extern Logger log_uop_timing; template - template void IndexSpace::required_byfield_buffer_size( const std::vector>& inputs, std::vector& requirements) const { diff --git a/src/realm/deppart/byfield_tmpl.cc b/src/realm/deppart/byfield_tmpl.cc index fc15f5b94a..b9896c5c53 100644 --- a/src/realm/deppart/byfield_tmpl.cc +++ b/src/realm/deppart/byfield_tmpl.cc @@ -45,6 +45,13 @@ namespace Realm { #ifdef REALM_USE_CUDA #define GPU_BYFIELD_LINE(N, T, ...) template class GPUByFieldMicroOp; + #define DOIT_NT(N, T) \ + template void IndexSpace::required_byfield_buffer_size( \ + const std::vector>&, \ + std::vector&) const; + +FOREACH_NT(DOIT_NT) + #else #define GPU_BYFIELD_LINE(N, T, ...) 
/* no CUDA */ #endif @@ -58,10 +65,9 @@ namespace Realm { const std::vector&, \ std::vector >&, \ const ProfilingRequestSet &, \ - Event) const; \ - template void IndexSpace::required_byfield_buffer_size( \ - const std::vector>&, \ - std::vector&) const; + Event) const; + + FOREACH_NTF(DOIT) diff --git a/src/realm/indexspace.h b/src/realm/indexspace.h index 61ff97da55..14c8561e20 100644 --- a/src/realm/indexspace.h +++ b/src/realm/indexspace.h @@ -737,7 +737,6 @@ namespace Realm { const std::vector &colors, std::vector> &subspaces, const ProfilingRequestSet &reqs, Event wait_on = Event::NO_EVENT) const; - template REALM_PUBLIC_API void required_byfield_buffer_size( const std::vector>& inputs, std::vector& suggestions) const; diff --git a/tests/deppart.cc b/tests/deppart.cc index eaf4a012e8..742c9d9c8b 100644 --- a/tests/deppart.cc +++ b/tests/deppart.cc @@ -559,6 +559,9 @@ class BasicTest : public TestInterface { log_app.info() << "GPU Preimage complete " << Clock::current_time_in_microseconds() << "\n"; log_app.info() << "Starting GPU Image " << Clock::current_time_in_microseconds() << "\n"; + std::vector> spaces = {}; + std::vector requirements; + is_nodes.required_byfield_buffer_size(spaces, requirements); // an image of p_edges through out_node gives us all the shared nodes, along // with some private nodes Event e3 = is_nodes.create_subspaces_by_image(src_field_data_gpu, From b2f64a9c6f4392194123c2534cd299ffabcee371 Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Thu, 12 Feb 2026 11:48:05 -0800 Subject: [PATCH 10/32] renamed gpu deppart requirement functions --- src/realm/deppart/byfield.cc | 6 +++--- src/realm/deppart/byfield_tmpl.cc | 12 +++++------- src/realm/deppart/image.cc | 6 +++--- src/realm/deppart/image_tmpl.cc | 2 +- src/realm/deppart/preimage.cc | 6 +++--- src/realm/deppart/preimage_tmpl.cc | 2 +- src/realm/indexspace.h | 14 +++++++------- tests/deppart.cc | 2 +- 8 files changed, 24 insertions(+), 26 deletions(-) diff --git 
a/src/realm/deppart/byfield.cc b/src/realm/deppart/byfield.cc index 9af275eb3d..cfd2927589 100644 --- a/src/realm/deppart/byfield.cc +++ b/src/realm/deppart/byfield.cc @@ -30,7 +30,7 @@ namespace Realm { extern Logger log_uop_timing; template - void IndexSpace::required_byfield_buffer_size( + void IndexSpace::by_field_buffer_requirements( const std::vector>& inputs, std::vector& requirements) const { requirements = std::vector(inputs.size()); @@ -55,12 +55,12 @@ namespace Realm { best_proc = affinity.p; } } - requirements[i].target_proc = best_proc; + requirements[i].affinity_processor = best_proc; requirements[i].lower_bound = device_size; requirements[i].upper_bound = max(device_size, optimal_size); requirements[i].minimum_alignment = 128; } else { - requirements[i].target_proc = Processor::NO_PROC; + requirements[i].affinity_processor = Processor::NO_PROC; requirements[i].lower_bound = 0; requirements[i].upper_bound = 0; requirements[i].minimum_alignment = 0; diff --git a/src/realm/deppart/byfield_tmpl.cc b/src/realm/deppart/byfield_tmpl.cc index b9896c5c53..3da5121f04 100644 --- a/src/realm/deppart/byfield_tmpl.cc +++ b/src/realm/deppart/byfield_tmpl.cc @@ -45,12 +45,6 @@ namespace Realm { #ifdef REALM_USE_CUDA #define GPU_BYFIELD_LINE(N, T, ...) template class GPUByFieldMicroOp; - #define DOIT_NT(N, T) \ - template void IndexSpace::required_byfield_buffer_size( \ - const std::vector>&, \ - std::vector&) const; - -FOREACH_NT(DOIT_NT) #else #define GPU_BYFIELD_LINE(N, T, ...) 
/* no CUDA */ @@ -67,9 +61,13 @@ FOREACH_NT(DOIT_NT) const ProfilingRequestSet &, \ Event) const; +#define DOIT_NT(N, T) \ + template void IndexSpace::by_field_buffer_requirements( \ + const std::vector>&, \ + std::vector&) const; - +FOREACH_NT(DOIT_NT) FOREACH_NTF(DOIT) #define ZP(N,T) Point diff --git a/src/realm/deppart/image.cc b/src/realm/deppart/image.cc index 437167a95b..ff1122d820 100644 --- a/src/realm/deppart/image.cc +++ b/src/realm/deppart/image.cc @@ -32,7 +32,7 @@ namespace Realm { template template - void IndexSpace::required_image_buffer_size( + void IndexSpace::by_image_buffer_requirements( const std::vector>& source_spaces, const std::vector>& inputs, std::vector& requirements) const { @@ -81,12 +81,12 @@ namespace Realm { best_proc = affinity.p; } } - requirements[i].target_proc = best_proc; + requirements[i].affinity_processor = best_proc; requirements[i].lower_bound = minimal_size; requirements[i].upper_bound = optimal_size; requirements[i].minimum_alignment = 128; } else { - requirements[i].target_proc = Processor::NO_PROC; + requirements[i].affinity_processor = Processor::NO_PROC; requirements[i].lower_bound = 0; requirements[i].upper_bound = 0; } diff --git a/src/realm/deppart/image_tmpl.cc b/src/realm/deppart/image_tmpl.cc index a2cb2cb9e6..a0d3d7319a 100644 --- a/src/realm/deppart/image_tmpl.cc +++ b/src/realm/deppart/image_tmpl.cc @@ -56,7 +56,7 @@ namespace Realm { GPU_IMAGE_LINE(N1, T1, N2, T2) \ template class ImageOperation; \ template ImageMicroOp::ImageMicroOp(NodeID, AsyncMicroOp *, Serialization::FixedBufferDeserializer&); \ - template void IndexSpace::required_image_buffer_size( \ + template void IndexSpace::by_image_buffer_requirements( \ const std::vector>&, \ const std::vector>&, \ std::vector&) const; \ diff --git a/src/realm/deppart/preimage.cc b/src/realm/deppart/preimage.cc index d327df1c74..4ae8cd4ddc 100644 --- a/src/realm/deppart/preimage.cc +++ b/src/realm/deppart/preimage.cc @@ -34,7 +34,7 @@ namespace Realm { 
template template - void IndexSpace::required_preimage_buffer_size( + void IndexSpace::by_preimage_buffer_requirements( const std::vector>& target_spaces, const std::vector>& inputs, std::vector& requirements) const { @@ -83,12 +83,12 @@ namespace Realm { best_proc = affinity.p; } } - requirements[i].target_proc = best_proc; + requirements[i].affinity_processor = best_proc; requirements[i].lower_bound = minimal_size; requirements[i].upper_bound = optimal_size; requirements[i].minimum_alignment = 128; } else { - requirements[i].target_proc = Processor::NO_PROC; + requirements[i].affinity_processor = Processor::NO_PROC; requirements[i].lower_bound = 0; requirements[i].upper_bound = 0; requirements[i].minimum_alignment = 0; diff --git a/src/realm/deppart/preimage_tmpl.cc b/src/realm/deppart/preimage_tmpl.cc index ef6725f567..dadf4b8aa6 100644 --- a/src/realm/deppart/preimage_tmpl.cc +++ b/src/realm/deppart/preimage_tmpl.cc @@ -55,7 +55,7 @@ GPU_PREIMAGE_LINE(N1,T1,N2,T2) \ template class StructuredPreimageMicroOp; \ template class PreimageOperation; \ template PreimageMicroOp::PreimageMicroOp(NodeID, AsyncMicroOp *, Serialization::FixedBufferDeserializer&); \ -template void IndexSpace::required_preimage_buffer_size( \ +template void IndexSpace::by_preimage_buffer_requirements( \ const std::vector>&, \ const std::vector>&, \ std::vector&) const; \ diff --git a/src/realm/indexspace.h b/src/realm/indexspace.h index 14c8561e20..82071fd6ae 100644 --- a/src/realm/indexspace.h +++ b/src/realm/indexspace.h @@ -128,7 +128,7 @@ namespace Realm { size_t lower_bound; size_t upper_bound; size_t minimum_alignment; - Processor target_proc; + Processor affinity_processor; }; /** @@ -737,9 +737,9 @@ namespace Realm { const std::vector &colors, std::vector> &subspaces, const ProfilingRequestSet &reqs, Event wait_on = Event::NO_EVENT) const; - REALM_PUBLIC_API void required_byfield_buffer_size( + REALM_PUBLIC_API void by_field_buffer_requirements( const std::vector>& inputs, - 
std::vector& suggestions) const; + std::vector& requirements) const; ///@{ /** @@ -807,10 +807,10 @@ namespace Realm { Event wait_on = Event::NO_EVENT) const; template - REALM_PUBLIC_API void required_image_buffer_size( + REALM_PUBLIC_API void by_image_buffer_requirements( const std::vector>& source_spaces, const std::vector>& inputs, - std::vector& suggestions) const; + std::vector& requirements) const; ///@} @@ -934,10 +934,10 @@ namespace Realm { Event wait_on = Event::NO_EVENT) const; template - REALM_PUBLIC_API void required_preimage_buffer_size( + REALM_PUBLIC_API void by_preimage_buffer_requirements( const std::vector>& target_spaces, const std::vector>& inputs, - std::vector& suggestions) const; + std::vector& requirements) const; ///@} ///@{ diff --git a/tests/deppart.cc b/tests/deppart.cc index 742c9d9c8b..448d3a60d0 100644 --- a/tests/deppart.cc +++ b/tests/deppart.cc @@ -561,7 +561,7 @@ class BasicTest : public TestInterface { std::vector> spaces = {}; std::vector requirements; - is_nodes.required_byfield_buffer_size(spaces, requirements); + is_nodes.by_field_buffer_requirements(spaces, requirements); // an image of p_edges through out_node gives us all the shared nodes, along // with some private nodes Event e3 = is_nodes.create_subspaces_by_image(src_field_data_gpu, From 9f7be25d0d207dfa037c71d1114957114f4428b8 Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Thu, 12 Feb 2026 12:25:40 -0800 Subject: [PATCH 11/32] Added default initializations to DeppartBufferRequirements --- src/realm/indexspace.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/realm/indexspace.h b/src/realm/indexspace.h index 82071fd6ae..c1a61b21cb 100644 --- a/src/realm/indexspace.h +++ b/src/realm/indexspace.h @@ -125,9 +125,9 @@ namespace Realm { }; struct DeppartBufferRequirements { - size_t lower_bound; - size_t upper_bound; - size_t minimum_alignment; + size_t lower_bound = 0; + size_t upper_bound = 0; + size_t minimum_alignment = 0; Processor 
affinity_processor; }; From a72be3ed5522d2eaf50231a5d79b120b3dc00369 Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Tue, 17 Feb 2026 19:15:56 -0800 Subject: [PATCH 12/32] updated 1d image range --- src/realm/deppart/image.cc | 15 +- src/realm/deppart/image_gpu_impl.hpp | 283 +++++++++++-------- src/realm/deppart/partitions_gpu_impl.hpp | 25 +- src/realm/deppart/partitions_gpu_kernels.hpp | 1 + src/realm/deppart/rectlist.inl | 6 +- tests/deppart.cc | 30 ++ 6 files changed, 231 insertions(+), 129 deletions(-) diff --git a/src/realm/deppart/image.cc b/src/realm/deppart/image.cc index ff1122d820..b0dcd4383a 100644 --- a/src/realm/deppart/image.cc +++ b/src/realm/deppart/image.cc @@ -38,7 +38,7 @@ namespace Realm { std::vector& requirements) const { size_t minimal_size = 0; size_t source_entries = 0; - bool bvh = false; + bool bvh = true; for (auto subspace : source_spaces) { source_entries += subspace.entries == 0 ? 1 : subspace.entries; } @@ -70,7 +70,7 @@ namespace Realm { device_size = atoi(val); } minimal_size = max(minimal_size, device_size); - size_t optimal_size = is.bounds.volume() * sizeof(Rect) * source_spaces.size() + minimal_size; + size_t optimal_size = is.bounds.volume() * sizeof(Rect) * source_spaces.size() * 10 + minimal_size; std::vector affinities; unsigned best_bandwidth = 0; Processor best_proc = Processor::NO_PROC; @@ -285,6 +285,7 @@ namespace Realm { if(!bmpp) bmpp = &bitmasks[i]; if(!*bmpp) *bmpp = new BM; (*bmpp)->add_rect(it3.rect); + } } } @@ -704,10 +705,11 @@ namespace Realm { for (auto ptr_fdd : gpu_ptr_data) { // launch full cross-product of image micro ops right away assert(ptr_fdd.scratch_buffer != RegionInstance::NO_INST); - domain_transform.ptr_data = {ptr_fdd}; + DomainTransform domain_transform_copy = domain_transform; + domain_transform_copy.ptr_data = {ptr_fdd}; GPUImageMicroOp *micro_op = new GPUImageMicroOp( - parent, domain_transform, exclusive); + parent, domain_transform_copy, exclusive); for (size_t j = 0; j < 
sources.size(); j++) { micro_op->add_sparsity_output(sources[j], images[j]); } @@ -716,10 +718,11 @@ namespace Realm { for (auto rect_fdd : gpu_rect_data) { // launch full cross-product of image micro ops right away assert(rect_fdd.scratch_buffer != RegionInstance::NO_INST); - domain_transform.range_data = {rect_fdd}; + DomainTransform domain_transform_copy = domain_transform; + domain_transform_copy.range_data = {rect_fdd}; GPUImageMicroOp *micro_op = new GPUImageMicroOp( - parent, domain_transform, exclusive); + parent, domain_transform_copy, exclusive); for (size_t j = 0; j < sources.size(); j++) { micro_op->add_sparsity_output(sources[j], images[j]); } diff --git a/src/realm/deppart/image_gpu_impl.hpp b/src/realm/deppart/image_gpu_impl.hpp index b3c38789f5..ce83e03639 100644 --- a/src/realm/deppart/image_gpu_impl.hpp +++ b/src/realm/deppart/image_gpu_impl.hpp @@ -46,74 +46,40 @@ void GPUImageMicroOp::gpu_populate_rngs() NVTX_DEPPART(gpu_image); - Memory my_mem = domain_transform.range_data[0].inst.get_location(); + RegionInstance buffer = domain_transform.range_data[0].scratch_buffer; + size_t tile_size = buffer.get_layout()->bytes_used; + std::cout << "Using tile size of " << tile_size << " bytes." 
<< std::endl; + Arena buffer_arena(reinterpret_cast(AffineAccessor(buffer, 0).base), tile_size); cudaStream_t stream = Cuda::get_task_cuda_stream(); - const char* val = std::getenv("TILE_SIZE"); // or any env var - size_t tile_size = 100000000; //default - if (val) { - tile_size = atoi(val); - } - - RegionInstance fixed_buffer = this->realm_malloc(tile_size, my_mem); - Arena buffer_arena(reinterpret_cast(AffineAccessor(fixed_buffer, 0).base), tile_size); - collapsed_space src_space; - RegionInstance offsets_instance = this->realm_malloc((sources.size()+1) * sizeof(size_t), my_mem); - src_space.offsets = reinterpret_cast(AffineAccessor(offsets_instance, 0).base); + src_space.offsets = buffer_arena.alloc(sources.size()+1); src_space.num_children = sources.size(); - GPUMicroOp::collapse_multi_space(sources, src_space, buffer_arena, stream); collapsed_space inst_space; // We combine all of our instances into one to batch work, tracking the offsets between instances. - RegionInstance inst_offsets_instance = this->realm_malloc((domain_transform.range_data.size() + 1) * sizeof(size_t), my_mem); - inst_space.offsets = reinterpret_cast(AffineAccessor(inst_offsets_instance, 0).base); + inst_space.offsets = buffer_arena.alloc(domain_transform.range_data.size() + 1); inst_space.num_children = domain_transform.range_data.size(); - GPUMicroOp::collapse_multi_space(domain_transform.range_data, inst_space, buffer_arena, stream); + Arena sys_arena; + GPUMicroOp::collapse_multi_space(domain_transform.range_data, inst_space, sys_arena, stream); // This is used for count + emit: first pass counts how many rectangles survive intersection, second pass uses the counter // to figure out where to write each rectangle. 
- RegionInstance inst_counters_instance = this->realm_malloc((2*domain_transform.range_data.size() + 1) * sizeof(uint32_t), my_mem); - uint32_t* d_inst_counters = reinterpret_cast(AffineAccessor(inst_counters_instance, 0).base); + uint32_t* d_inst_counters = buffer_arena.alloc(2 * domain_transform.range_data.size()+1); // This will be a prefix sum over the counters, used first to figure out where to write in the emit phase, and second // to track which instance each rectangle came from in the populate phase. uint32_t* d_inst_prefix = d_inst_counters + domain_transform.range_data.size(); - RegionInstance valid_rects_instance; - size_t num_valid_rects; - RectDesc* d_valid_rects; - - // Here we intersect the instance spaces with the parent space, and make sure we know which instance each resulting rectangle came from. - GPUMicroOp::template construct_input_rectlist>(inst_space, src_space, d_valid_rects, num_valid_rects, d_inst_counters, d_inst_prefix, buffer_arena, stream); - inst_offsets_instance.destroy(); - - if (num_valid_rects == 0) { - for (SparsityMap it : sparsity_outputs) { - SparsityMapImpl *impl = SparsityMapImpl::lookup(it); - if (this->exclusive) { - impl->gpu_finalize(); - } else { - impl->contribute_nothing(); - } - } - valid_rects_instance.destroy(); - inst_counters_instance.destroy(); - return; - } - - // Prefix sum the valid rectangles by volume. - size_t* d_prefix_rects; - size_t total_pts; - - GPUMicroOp::volume_prefix_sum(d_valid_rects, num_valid_rects, d_prefix_rects, total_pts, buffer_arena, stream); + size_t num_valid_rects = tile_size; - RegionInstance rngs_instance = this->realm_malloc(total_pts * sizeof(RectDesc), my_mem); - RectDesc* d_rngs = reinterpret_cast*>(AffineAccessor(rngs_instance, 0).base); + collapsed_space collapsed_parent; + // We collapse the parent space to undifferentiate between dense and sparse and match downstream APIs. 
+ GPUMicroOp::collapse_parent_space(parent_space, collapsed_parent, buffer_arena, stream); Memory zcpy_mem; assert(find_memory(zcpy_mem, Memory::Z_COPY_MEM)); @@ -123,88 +89,174 @@ void GPUImageMicroOp::gpu_populate_rngs() d_accessors[i] = AffineAccessor,N2,T2>(domain_transform.range_data[i].inst, domain_transform.range_data[i].field_offset); } - image_gpuPopulateBitmasksRngsKernel<<>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, total_pts, num_valid_rects, domain_transform.range_data.size(), d_rngs); - KERNEL_CHECK(stream); + uint32_t* d_src_counters = buffer_arena.alloc(2 * sources.size() + 1); + uint32_t* d_src_prefix = d_src_counters + sources.size(); - RegionInstance parent_entries_instance; - collapsed_space collapsed_parent; + buffer_arena.commit(false); + size_t left = buffer_arena.used(); - // We collapse the parent space to undifferentiate between dense and sparse and match downstream APIs. - GPUMicroOp::collapse_parent_space(parent_space, collapsed_parent, buffer_arena, stream); - + size_t num_output = 0; + RectDesc* output_start = nullptr; + size_t num_completed = 0; + size_t curr_tile = tile_size / 2; + int count = 0; + while (num_completed < inst_space.num_entries) { + try { + std::cout << "Tile iteration " << count++ << ", completed " << num_completed << " / " << inst_space.num_entries << " entries." 
<< std::endl; + buffer_arena.start(); + buffer_arena.flip_parity(); + if (num_completed + curr_tile > inst_space.num_entries) { + curr_tile = inst_space.num_entries - num_completed; + } + collapsed_space inst_space_tile = inst_space; + inst_space_tile.num_entries = curr_tile; + inst_space_tile.entries_buffer = buffer_arena.alloc>(curr_tile); + CUDA_CHECK(cudaMemcpyAsync(inst_space_tile.entries_buffer, inst_space.entries_buffer + num_completed, curr_tile * sizeof(SparsityMapEntry), cudaMemcpyHostToDevice, stream), stream); + + RectDesc* d_valid_rects; + // Here we intersect the instance spaces with the parent space, and make sure we know which instance each resulting rectangle came from. + GPUMicroOp::template construct_input_rectlist>(inst_space_tile, src_space, d_valid_rects, num_valid_rects, d_inst_counters, d_inst_prefix, buffer_arena, stream); + + if (num_valid_rects == 0) { + num_completed += curr_tile; + curr_tile = tile_size / 2; + subtract_const<<>>(inst_space.offsets, domain_transform.range_data.size()+1, curr_tile); + KERNEL_CHECK(stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + continue; + } + + // Prefix sum the valid rectangles by volume. 
+ size_t* d_prefix_rects; + size_t total_pts; - RegionInstance src_counters_instance = this->realm_malloc(sources.size() * sizeof(uint32_t), my_mem); - uint32_t* d_src_counters = reinterpret_cast(AffineAccessor(src_counters_instance, 0).base); - CUDA_CHECK(cudaMemsetAsync(d_src_counters, 0, sources.size() * sizeof(uint32_t), stream), stream); + GPUMicroOp::volume_prefix_sum(d_valid_rects, num_valid_rects, d_prefix_rects, total_pts, buffer_arena, stream); + buffer_arena.flip_parity(); + RectDesc* d_rngs = buffer_arena.alloc>(total_pts); - //Finally, we do another two pass count + emit to intersect with the parent rectangles - image_intersect_output<<>>(collapsed_parent.entries_buffer, d_rngs, nullptr, collapsed_parent.num_entries, total_pts, d_src_counters, nullptr); - KERNEL_CHECK(stream); + image_gpuPopulateBitmasksRngsKernel<<>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, total_pts, num_valid_rects, domain_transform.range_data.size(), d_rngs); + KERNEL_CHECK(stream); - std::vector h_src_counters(sources.size()+1); - h_src_counters[0] = 0; // prefix sum starts at 0 - CUDA_CHECK(cudaMemcpyAsync(h_src_counters.data()+1, d_src_counters, sources.size() * sizeof(uint32_t), cudaMemcpyDeviceToHost, stream), stream); - CUDA_CHECK(cudaStreamSynchronize(stream), stream); - valid_rects_instance.destroy(); - accessors_instance.destroy(); + CUDA_CHECK(cudaMemsetAsync(d_src_counters, 0, sources.size() * sizeof(uint32_t), stream), stream); - for (size_t i = 0; i < sources.size(); ++i) { - h_src_counters[i+1] += h_src_counters[i]; - } - size_t num_valid_output = h_src_counters[sources.size()]; + //Finally, we do another two pass count + emit to intersect with the parent rectangles + image_intersect_output<<>>(collapsed_parent.entries_buffer, d_rngs, nullptr, collapsed_parent.num_entries, total_pts, d_src_counters, nullptr); + KERNEL_CHECK(stream); - if (num_valid_output == 0) { - for (SparsityMap it : sparsity_outputs) { - SparsityMapImpl *impl = 
SparsityMapImpl::lookup(it); - if (this->exclusive) { - impl->gpu_finalize(); - } else { - impl->contribute_nothing(); + std::vector h_src_counters(sources.size()+1); + h_src_counters[0] = 0; // prefix sum starts at 0 + CUDA_CHECK(cudaMemcpyAsync(h_src_counters.data()+1, d_src_counters, sources.size() * sizeof(uint32_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + for (size_t i = 0; i < sources.size(); ++i) { + h_src_counters[i+1] += h_src_counters[i]; } - } - parent_entries_instance.destroy(); - src_counters_instance.destroy(); - rngs_instance.destroy(); - return; - } + size_t num_valid_output = h_src_counters[sources.size()]; - RegionInstance valid_intersect_instance = this->realm_malloc(num_valid_output * sizeof(RectDesc), my_mem); - RectDesc* d_valid_intersect = reinterpret_cast*>(AffineAccessor(valid_intersect_instance, 0).base); + if (num_valid_output == 0) { + num_completed += curr_tile; + curr_tile = tile_size / 2; + subtract_const<<>>(inst_space.offsets, domain_transform.range_data.size()+1, curr_tile); + KERNEL_CHECK(stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + continue; + } - RegionInstance src_prefix_instance = this->realm_malloc((sources.size() + 1) * sizeof(uint32_t), my_mem); - uint32_t* d_src_prefix = reinterpret_cast(AffineAccessor(src_prefix_instance, 0).base); - CUDA_CHECK(cudaMemcpyAsync(d_src_prefix, h_src_counters.data(), (sources.size() + 1) * sizeof(size_t), cudaMemcpyHostToDevice, stream), stream); + buffer_arena.flip_parity(); + RectDesc* d_valid_intersect = buffer_arena.alloc>(num_valid_output); - CUDA_CHECK(cudaMemsetAsync(d_src_counters, 0, sources.size() * sizeof(uint32_t), stream), stream); + CUDA_CHECK(cudaMemcpyAsync(d_src_prefix, h_src_counters.data(), (sources.size() + 1) * sizeof(uint32_t), cudaMemcpyHostToDevice, stream), stream); + CUDA_CHECK(cudaMemsetAsync(d_src_counters, 0, sources.size() * sizeof(uint32_t), stream), stream); - 
image_intersect_output<<>>(collapsed_parent.entries_buffer, d_rngs, d_src_prefix, collapsed_parent.num_entries, total_pts, d_src_counters, d_valid_intersect); - KERNEL_CHECK(stream); + image_intersect_output<<>>(collapsed_parent.entries_buffer, d_rngs, d_src_prefix, collapsed_parent.num_entries, total_pts, d_src_counters, d_valid_intersect); + KERNEL_CHECK(stream); - CUDA_CHECK(cudaStreamSynchronize(stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); - src_prefix_instance.destroy(); - parent_entries_instance.destroy(); - src_counters_instance.destroy(); - rngs_instance.destroy(); + size_t num_new_rects = 2; + assert(!buffer_arena.get_parity()); + RectDesc* d_new_rects; - size_t out_rects = 0; - RectDesc* trash; - this->complete_rect_pipeline(d_valid_intersect, num_valid_output, trash, out_rects, buffer_arena, - /* the Container: */ sparsity_outputs, - /* getIndex: */ [&](auto const& elem){ - // elem is a SparsityMap from the vector - return size_t(&elem - sparsity_outputs.data()); - }, - /* getMap: */ [&](auto const& elem){ - // return the SparsityMap key itself - return elem; - }); + //Send it off for processing + this->complete_rect_pipeline(d_valid_intersect, num_valid_output, d_new_rects, num_new_rects, buffer_arena, + /* the Container: */ sparsity_outputs, + /* getIndex: */ [&](auto const& elem){ + // elem is a SparsityMap from the vector + return size_t(&elem - sparsity_outputs.data()); + }, + /* getMap: */ [&](auto const& elem){ + // return the SparsityMap key itself + return elem; + }); + + //Set our first set of output rectangles + if (num_output==0) { + + //We need to place the new output at the rightmost end of the buffer + buffer_arena.flip_parity(); + buffer_arena.reset(true); + output_start = buffer_arena.alloc>(num_new_rects); + buffer_arena.commit(true); + CUDA_CHECK(cudaMemcpyAsync(output_start, d_new_rects, num_new_rects * sizeof(RectDesc), cudaMemcpyDeviceToDevice, stream), stream); + num_output = num_new_rects; + num_completed 
+= curr_tile; + subtract_const<<>>(inst_space.offsets, domain_transform.range_data.size()+1, curr_tile); + KERNEL_CHECK(stream); + curr_tile = tile_size / 2; + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + continue; + } + + //Otherwise we merge with existing rectangles + RectDesc* d_old_rects = buffer_arena.alloc>(num_output); + assert(d_old_rects == d_new_rects + num_new_rects); + CUDA_CHECK(cudaMemcpyAsync(d_old_rects, output_start, num_output * sizeof(RectDesc), cudaMemcpyDeviceToDevice, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); - valid_intersect_instance.destroy(); + size_t num_final_rects = 1; + + //Send it off for processing + this->complete_rect_pipeline(d_new_rects, num_output + num_new_rects, output_start, num_final_rects, buffer_arena, + /* the Container: */ sparsity_outputs, + /* getIndex: */ [&](auto const& elem){ + // elem is a SparsityMap from the vector + return size_t(&elem - sparsity_outputs.data()); + }, + /* getMap: */ [&](auto const& elem){ + // return the SparsityMap key itself + return elem; + }); + num_completed += curr_tile; + num_output = num_final_rects; + subtract_const<<>>(inst_space.offsets, domain_transform.range_data.size()+1, curr_tile); + KERNEL_CHECK(stream); + curr_tile = tile_size / 2; + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + } + catch (arena_oom&) { + std::cout << "Caught arena_oom, reducing tile size from " << curr_tile << " to " << curr_tile / 2 << std::endl; + std::cout << buffer_arena.used() << " bytes used in arena." 
<< std::endl; + curr_tile /= 2; + if (curr_tile == 0) { + throw; + } + } + } + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + KERNEL_CHECK(stream); + this->send_output(output_start, num_output, buffer_arena, sparsity_outputs, + /* getIndex: */ [&](auto const& elem){ + // elem is a SparsityMap from the vector + return size_t(&elem - sparsity_outputs.data()); + }, + /* getMap: */ [&](auto const& elem){ + // return the SparsityMap key itself + return elem; + }); } @@ -249,8 +301,8 @@ void GPUImageMicroOp::gpu_populate_ptrs() inst_space.offsets = buffer_arena.alloc(domain_transform.ptr_data.size()+1); inst_space.num_children = domain_transform.ptr_data.size(); - Arena no; - GPUMicroOp::collapse_multi_space(domain_transform.ptr_data, inst_space, no, stream); + Arena sys_arena; + GPUMicroOp::collapse_multi_space(domain_transform.ptr_data, inst_space, sys_arena, stream); // This is used for count + emit: first pass counts how many rectangles survive intersection, second pass uses the counter // to figure out where to write each rectangle. 
@@ -305,15 +357,11 @@ void GPUImageMicroOp::gpu_populate_ptrs() RectDesc* d_valid_rects; GPUMicroOp::template construct_input_rectlist>(inst_space_tile, src_space, d_valid_rects, num_valid_rects, d_inst_counters, d_inst_prefix, buffer_arena, stream); - if (num_valid_rects == std::numeric_limits::max()) { - curr_tile /= 2; - continue; - } - if (num_valid_rects == 0) { num_completed += curr_tile; curr_tile = tile_size / 2; subtract_const<<>>(inst_space.offsets, domain_transform.ptr_data.size()+1, curr_tile); + KERNEL_CHECK(stream); CUDA_CHECK(cudaStreamSynchronize(stream), stream); continue; } @@ -344,6 +392,7 @@ void GPUImageMicroOp::gpu_populate_ptrs() num_completed += curr_tile; curr_tile = tile_size / 2; subtract_const<<>>(inst_space.offsets, domain_transform.ptr_data.size()+1, curr_tile); + KERNEL_CHECK(stream); CUDA_CHECK(cudaStreamSynchronize(stream), stream); continue; } @@ -385,6 +434,7 @@ void GPUImageMicroOp::gpu_populate_ptrs() num_output = num_new_rects; num_completed += curr_tile; subtract_const<<>>(inst_space.offsets, domain_transform.ptr_data.size()+1, curr_tile); + KERNEL_CHECK(stream); curr_tile = tile_size / 2; CUDA_CHECK(cudaStreamSynchronize(stream), stream); continue; @@ -411,6 +461,7 @@ void GPUImageMicroOp::gpu_populate_ptrs() num_completed += curr_tile; num_output = num_final_rects; subtract_const<<>>(inst_space.offsets, domain_transform.ptr_data.size()+1, curr_tile); + KERNEL_CHECK(stream); curr_tile = tile_size / 2; CUDA_CHECK(cudaStreamSynchronize(stream), stream); } diff --git a/src/realm/deppart/partitions_gpu_impl.hpp b/src/realm/deppart/partitions_gpu_impl.hpp index b1459f2ede..42b640660b 100644 --- a/src/realm/deppart/partitions_gpu_impl.hpp +++ b/src/realm/deppart/partitions_gpu_impl.hpp @@ -104,6 +104,8 @@ namespace Realm { void GPUMicroOp::collapse_multi_space(const std::vector& spaces, collapsed_space &out_space, Arena &my_arena, cudaStream_t stream) { + out_space.bounds = Rect::make_empty(); + char *val = 
std::getenv("SHATTER_SIZE"); // or any env var int shatter_size = 1; //default if (val) { @@ -123,6 +125,7 @@ namespace Realm { } else { my_space = spaces[i].index_space; } + out_space.bounds = out_space.bounds.union_bbox(my_space.bounds); if (my_space.dense()) { if constexpr (std::is_same_v>) { out_space.num_entries += 1; @@ -208,6 +211,7 @@ namespace Realm { entry.bounds = parent_space.bounds; out_space.entries_buffer = my_arena.alloc>(1); out_space.num_entries = 1; + out_space.bounds = parent_space.bounds; CUDA_CHECK(cudaMemcpyAsync(out_space.entries_buffer, &entry, sizeof(SparsityMapEntry), cudaMemcpyHostToDevice, stream), stream); } else { span> tmp = parent_space.sparsity.impl()->get_entries(); @@ -225,7 +229,6 @@ namespace Realm { template void GPUMicroOp::build_bvh(const collapsed_space &space, BVH &result, Arena &my_arena, cudaStream_t stream) { - //We want to keep the entire BVH that we return in one instance for convenience. size_t indices_instance_size = space.num_entries * sizeof(uint64_t); size_t labels_instance_size = space.offsets == nullptr ? 
0 : space.num_entries * sizeof(size_t); @@ -1316,18 +1319,28 @@ namespace Realm { CUDA_CHECK(cudaMemcpyAsync(&last_grp, &group_ids[num_intermediate-1], sizeof(size_t), cudaMemcpyDeviceToHost, stream), stream); CUDA_CHECK(cudaStreamSynchronize(stream), stream); - my_arena.rollback(prev); my_arena.flip_parity(); assert(my_arena.get_parity()); - my_arena.reset(true); + + if (out_rects == 1) { + my_arena.reset(true); + } d_rects_out = my_arena.alloc>(last_grp); - my_arena.commit(true); + if (out_rects == 1) { + my_arena.commit(true); + } init_rects_dim<<>>(d_rects_in, d_hi_flags_out, break_points, group_ids, d_rects_out, num_intermediate, 0); KERNEL_CHECK(stream); num_intermediate = last_grp; - std::swap(d_rects_in, d_rects_out); + if (out_rects == 2) { + my_arena.flip_parity(); + d_rects_in = my_arena.alloc>(num_intermediate); + CUDA_CHECK(cudaMemcpyAsync(d_rects_in, d_rects_out, num_intermediate * sizeof(RectDesc), cudaMemcpyDeviceToDevice, stream), stream); + } else { + std::swap(d_rects_in, d_rects_out); + } CUDA_CHECK(cudaStreamSynchronize(stream), stream); } @@ -1496,6 +1509,8 @@ namespace Realm { CUDA_CHECK(cudaMemsetAsync(d_ends, 0, ctr.size()*sizeof(size_t),stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + //Convert RectDesc to SparsityMapEntry and determine where each src's rectangles start and end. 
build_final_output<<>>(d_rects, final_entries, final_rects, d_starts, d_ends, total_rects); KERNEL_CHECK(stream); diff --git a/src/realm/deppart/partitions_gpu_kernels.hpp b/src/realm/deppart/partitions_gpu_kernels.hpp index f3c1dd514e..2f607930d9 100644 --- a/src/realm/deppart/partitions_gpu_kernels.hpp +++ b/src/realm/deppart/partitions_gpu_kernels.hpp @@ -674,6 +674,7 @@ void mark_breaks_dim(const RectDesc* in, int d) { size_t i = blockIdx.x*blockDim.x + threadIdx.x; + if(i >= M) return; if(i == 0) { brk[0] = 1; return; } diff --git a/src/realm/deppart/rectlist.inl b/src/realm/deppart/rectlist.inl index 621476e511..233d14c5c2 100644 --- a/src/realm/deppart/rectlist.inl +++ b/src/realm/deppart/rectlist.inl @@ -647,8 +647,10 @@ namespace Realm { // as_map.rbegin()->second << "\n"; // bigger than everything - see if we can merge with the last guy T &last = as_map.rbegin()->second; - if(last == (r.lo[0] - 1)) - last = r.hi[0]; + if(last >= (r.lo[0] - 1)) { + if (last < r.hi[0]) + last = r.hi[0]; + } else if(last < (r.lo[0] - 1)) as_map[r.lo[0]] = r.hi[0]; } else { diff --git a/tests/deppart.cc b/tests/deppart.cc index 448d3a60d0..70c6e9dfc1 100644 --- a/tests/deppart.cc +++ b/tests/deppart.cc @@ -1612,11 +1612,41 @@ class RangeTest : public TestInterface { std::vector> p_garbage_rects, p_garbage_colors; log_app.info() << "WARMING UP " << "\n"; + std::vector> field_estimate_input(rect_id_data_gpu.size()); + std::vector field_estimate_output(rect_id_data_gpu.size()); + std::vector> image_estimate_input(rect_val_data_gpu.size()); + std::vector image_estimate_output(rect_val_data_gpu.size()); + std::vector> subspace_input(colors.size()); + for (size_t i = 0; i < rect_id_data_gpu.size(); i++) { + field_estimate_input[i].location = rect_id_data_gpu[i].inst.get_location(); + field_estimate_input[i].space = rect_id_data_gpu[i].index_space; + } + for (size_t i = 0; i < rect_val_data_gpu.size(); i++) { + image_estimate_input[i].location = 
rect_val_data_gpu[i].inst.get_location(); + image_estimate_input[i].space = rect_val_data_gpu[i].index_space; + } + + is_rects.by_field_buffer_requirements(field_estimate_input, field_estimate_output); + std::vector byte_fields = {sizeof(char)}; + for (size_t i = 0; i < rect_id_data_gpu.size(); i++) { + IndexSpace<1> instance_index_space(Rect<1>(0, field_estimate_output[i].upper_bound-1)); + RegionInstance::create_instance(rect_id_data_gpu[i].scratch_buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); + } + Event e001 = is_rects.create_subspaces_by_field(rect_id_data_gpu, colors, p_garbage_colors, Realm::ProfilingRequestSet()); if (wait_on_events) e001.wait(); + for (size_t i = 0; i < colors.size(); i++) { + subspace_input[i].space = p_garbage_colors[i]; + subspace_input[i].entries = p_garbage_colors[i].sparsity.impl()->get_entries().size(); + } + is_nodes.by_image_buffer_requirements(subspace_input, image_estimate_input, image_estimate_output); + for (size_t i = 0; i < rect_val_data_gpu.size(); i++) { + IndexSpace<1> instance_index_space(Rect<1>(0, (image_estimate_output[i].upper_bound)/4-1)); + RegionInstance::create_instance(rect_val_data_gpu[i].scratch_buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); + } Event e002 = is_nodes.create_subspaces_by_image(rect_val_data_gpu, p_garbage_colors, p_garbage_rects, From c9325aed3803e157d0866df1ebb4464368c1896f Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Wed, 18 Feb 2026 22:00:54 -0800 Subject: [PATCH 13/32] working multidimensional, no fixed buffer --- src/realm/deppart/byfield.cc | 3 +- src/realm/deppart/byfield_gpu_impl.hpp | 88 ++-- src/realm/deppart/partitions.h | 24 +- src/realm/deppart/partitions_gpu_impl.hpp | 23 +- tests/deppart.cc | 543 +++++++++++++++++++++- 5 files changed, 594 insertions(+), 87 deletions(-) diff --git a/src/realm/deppart/byfield.cc b/src/realm/deppart/byfield.cc index cfd2927589..b9d4bf5e43 100644 
--- a/src/realm/deppart/byfield.cc +++ b/src/realm/deppart/byfield.cc @@ -44,7 +44,7 @@ namespace Realm { if (val) { device_size = atoi(val); } - size_t optimal_size = is.bounds.volume() * sizeof(Rect); + size_t optimal_size = is.bounds.volume() * sizeof(Rect) * 100; std::vector affinities; unsigned best_bandwidth = 0; Processor best_proc = Processor::NO_PROC; @@ -446,6 +446,7 @@ namespace Realm { } #ifdef REALM_USE_CUDA for (auto fdd : gpu_field_data) { + assert(fdd.scratch_buffer != RegionInstance::NO_INST); std::vector,FT> > single_gpu_field_data = {fdd}; GPUByFieldMicroOp *uop = new GPUByFieldMicroOp(parent, single_gpu_field_data, exclusive); for (size_t i = 0; i < colors.size(); i++) { diff --git a/src/realm/deppart/byfield_gpu_impl.hpp b/src/realm/deppart/byfield_gpu_impl.hpp index f2aa8c3288..c7e619e06d 100644 --- a/src/realm/deppart/byfield_gpu_impl.hpp +++ b/src/realm/deppart/byfield_gpu_impl.hpp @@ -23,24 +23,15 @@ void GPUByFieldMicroOp::execute() cudaStream_t stream = Cuda::get_task_cuda_stream(); - Memory my_mem = field_data[0].inst.get_location(); - collapsed_space inst_space; - const char* val = std::getenv("TILE_SIZE"); // or any env var - size_t tile_size = 100000000; //default - if (val) { - tile_size = atoi(val); - } + size_t tile_size = field_data[0].scratch_buffer.get_layout()->bytes_used; - RegionInstance fixed_buffer = this->realm_malloc(tile_size, my_mem); - Arena buffer_arena(reinterpret_cast(AffineAccessor(fixed_buffer, 0).base), tile_size); + Arena buffer_arena(reinterpret_cast(AffineAccessor(field_data[0].scratch_buffer, 0).base), tile_size); inst_space.offsets = buffer_arena.alloc(field_data.size() + 1); inst_space.num_children = field_data.size(); - GPUMicroOp::collapse_multi_space(field_data, inst_space, buffer_arena, stream); - collapsed_space collapsed_parent; // We collapse the parent space to undifferentiate between dense and sparse and match downstream APIs. 
@@ -49,8 +40,7 @@ void GPUByFieldMicroOp::execute() // This is used for count + emit: first pass counts how many rectangles survive intersection, second pass uses the counter // to figure out where to write each rectangle. - RegionInstance inst_counters_instance = this->realm_malloc((2*field_data.size() + 1) * sizeof(uint32_t), my_mem); - uint32_t* d_inst_counters = reinterpret_cast(AffineAccessor(inst_counters_instance, 0).base); + uint32_t* d_inst_counters = buffer_arena.alloc(2*field_data.size() + 1); // This will be a prefix sum over the counters, used first to figure out where to write in the emit phase, and second // to track which instance each rectangle came from in the populate phase. @@ -58,37 +48,7 @@ void GPUByFieldMicroOp::execute() size_t num_valid_rects = 0; Rect* d_valid_rects; - // Here we intersect the instance spaces with the parent space, and make sure we know which instance each resulting rectangle came from. - GPUMicroOp::template construct_input_rectlist>(inst_space, collapsed_parent, d_valid_rects, num_valid_rects, d_inst_counters, d_inst_prefix, buffer_arena, stream); - - - // Early out if we don't have any rectangles. - if (num_valid_rects == 0) { - for (std::pair> it : sparsity_outputs) { - SparsityMapImpl *impl = SparsityMapImpl::lookup(it.second); - if (this->exclusive) { - impl->gpu_finalize(); - } else { - impl->contribute_nothing(); - } - } - inst_counters_instance.destroy(); - return; - } - - - // Prefix sum the valid rectangles by volume. - size_t total_pts; - - size_t* d_prefix_rects; - GPUMicroOp::volume_prefix_sum(d_valid_rects, num_valid_rects, d_prefix_rects, total_pts, buffer_arena, stream); - - // Now we have everything we need to actually populate our outputs. 
- RegionInstance points_instance = this->realm_malloc(total_pts * sizeof(PointDesc), my_mem); - PointDesc* d_points = reinterpret_cast*>(AffineAccessor(points_instance, 0).base); - FT* d_colors; - RegionInstance colors_instance; // Memcpying a boolean vector breaks things for some reason so we have this disgusting workaround. @@ -97,13 +57,11 @@ void GPUByFieldMicroOp::execute() for (size_t i = 0; i < colors.size(); i++) { flat_colors[i] = colors[i] ? 1 : 0; } - colors_instance = this->realm_malloc(total_pts * sizeof(PointDesc), my_mem); - uint8_t* d_flat_colors = reinterpret_cast(AffineAccessor(colors_instance, 0).base); + uint8_t* d_flat_colors = buffer_arena.alloc(colors.size()); CUDA_CHECK(cudaMemcpyAsync(d_flat_colors, flat_colors.data(), colors.size() * sizeof(uint8_t), cudaMemcpyHostToDevice, stream), stream); d_colors = reinterpret_cast(d_flat_colors); } else { - colors_instance = this->realm_malloc(colors.size() * sizeof(FT), my_mem); - d_colors = reinterpret_cast(AffineAccessor(colors_instance, 0).base); + d_colors = buffer_arena.alloc(colors.size()); CUDA_CHECK(cudaMemcpyAsync(d_colors, colors.data(), colors.size() * sizeof(FT), cudaMemcpyHostToDevice, stream), stream); } @@ -118,8 +76,39 @@ void GPUByFieldMicroOp::execute() d_accessors[i] = AffineAccessor(field_data[i].inst, field_data[i].field_offset); } + buffer_arena.commit(false); + + GPUMicroOp::collapse_multi_space(field_data, inst_space, buffer_arena, stream); + + // Here we intersect the instance spaces with the parent space, and make sure we know which instance each resulting rectangle came from. + GPUMicroOp::template construct_input_rectlist>(inst_space, collapsed_parent, d_valid_rects, num_valid_rects, d_inst_counters, d_inst_prefix, buffer_arena, stream); + // Early out if we don't have any rectangles. 
+ if (num_valid_rects == 0) { + for (std::pair> it : sparsity_outputs) { + SparsityMapImpl *impl = SparsityMapImpl::lookup(it.second); + if (this->exclusive) { + impl->gpu_finalize(); + } else { + impl->contribute_nothing(); + } + } + return; + } + + + // Prefix sum the valid rectangles by volume. + size_t total_pts; + + size_t* d_prefix_rects; + GPUMicroOp::volume_prefix_sum(d_valid_rects, num_valid_rects, d_prefix_rects, total_pts, buffer_arena, stream); + + // Now we have everything we need to actually populate our outputs. + buffer_arena.flip_parity(); + assert(!buffer_arena.get_parity()); + PointDesc* d_points = buffer_arena.alloc>(total_pts); + // This is where the work is actually done - each thread figures out which points to read, reads it, marks a PointDesc with its color, and writes it out. byfield_gpuPopulateBitmasksKernel<<>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, d_colors, total_pts, colors.size(), num_valid_rects, field_data.size(), d_points); KERNEL_CHECK(stream); @@ -132,9 +121,6 @@ void GPUByFieldMicroOp::execute() } CUDA_CHECK(cudaStreamSynchronize(stream), stream); - colors_instance.destroy(); - accessors_instance.destroy(); - inst_counters_instance.destroy(); // Ship off the points for final processing. size_t out_rects = 0; @@ -149,7 +135,5 @@ void GPUByFieldMicroOp::execute() // return the SparsityMap key itself return kv.second; }); - - points_instance.destroy(); } } diff --git a/src/realm/deppart/partitions.h b/src/realm/deppart/partitions.h index 051717d803..222e553ee5 100644 --- a/src/realm/deppart/partitions.h +++ b/src/realm/deppart/partitions.h @@ -123,6 +123,18 @@ namespace Realm { } } + size_t mark(bool dir) const noexcept { + return dir ? right_ : left_; + } + + void rollback(size_t mark, bool dir) noexcept { + if (dir) { + right_ = mark; + } else { + left_ = mark; + } + } + template T* alloc(size_t count = 1) { return parity_ ? 
alloc_right(count) : alloc_left(count); @@ -171,16 +183,22 @@ namespace Realm { void* alloc_left_bytes(size_t bytes, size_t align = alignof(std::max_align_t)) { const size_t aligned = align_up(left_, align); - if (aligned + bytes + right_ > cap_) throw arena_oom{}; + if (aligned + bytes + right_ > cap_) { + throw arena_oom{}; + } void* p = base_ + aligned; left_ = aligned + bytes; return p; } void* alloc_right_bytes(size_t bytes, size_t align = alignof(std::max_align_t)) { - if (bytes + right_ > cap_) throw arena_oom{}; + if (bytes + right_ > cap_) { + throw arena_oom{}; + } const size_t aligned = align_down(cap_ - right_ - bytes, align); - if (aligned < left_) throw arena_oom{}; + if (aligned < left_) { + throw arena_oom{}; + } void *p = base_ + aligned; right_ = cap_ - aligned; return p; diff --git a/src/realm/deppart/partitions_gpu_impl.hpp b/src/realm/deppart/partitions_gpu_impl.hpp index 42b640660b..de21b7fc99 100644 --- a/src/realm/deppart/partitions_gpu_impl.hpp +++ b/src/realm/deppart/partitions_gpu_impl.hpp @@ -679,8 +679,6 @@ namespace Realm { RegionInstance exc_sum_instance = this->realm_malloc(num_corners * total_rects * sizeof(size_t), my_mem); - size_t per_elem_size = 2*alloc_size_1 + sizeof(uint8_t) + sizeof(size_t); - size_t* d_src_keys_in = reinterpret_cast(AffineAccessor(shared_instance, 0).base); size_t* d_src_keys_out = reinterpret_cast(AffineAccessor(shared_instance, 0).base) + num_corners * total_rects; T* d_coord_keys_in = reinterpret_cast(AffineAccessor(shared_instance, 0).base); @@ -962,17 +960,6 @@ namespace Realm { CUDA_CHECK(cudaMemcpyAsync(&last_count, &d_seg_counters[num_segments-1], sizeof(uint32_t), cudaMemcpyDeviceToHost, stream), stream); CUDA_CHECK(cudaStreamSynchronize(stream), stream); next_round += last_count; - if (out_rects > 0 && (next_round + last_count) * per_elem_size > out_rects) { - shared_instance.destroy(); - flags_instance.destroy(); - exc_sum_instance.destroy(); - seg_bound_instance.destroy(); - 
seg_counters.destroy(); - seg_counters_out.destroy(); - corners_instance.destroy(); - out_rects = std::numeric_limits::max(); - return; - } num_intermediate = next_round; @@ -1190,10 +1177,6 @@ namespace Realm { CUDA_CHECK(cudaStreamSynchronize(stream), stream); } - heads_instance.destroy(); - shared_instance.destroy(); - tmp_instance.destroy(); - //And... we're done if (out_rects > 0) { d_out_rects = d_rects_in; @@ -1369,7 +1352,8 @@ namespace Realm { NVTX_DEPPART(complete_pipeline); - size_t prev = my_arena.mark(); + my_arena.flip_parity(); + cudaStream_t stream = Cuda::get_task_cuda_stream(); @@ -1466,7 +1450,7 @@ namespace Realm { num_intermediate = last_grp; std::swap(d_rects_in, d_rects_out); } - my_arena.rollback(prev); + my_arena.flip_parity(); d_out_rects = my_arena.alloc>(num_intermediate); CUDA_CHECK(cudaMemcpyAsync(d_out_rects, d_rects_in, num_intermediate * sizeof(RectDesc), cudaMemcpyDeviceToDevice, stream), stream); CUDA_CHECK(cudaStreamSynchronize(stream), stream); @@ -1476,7 +1460,6 @@ namespace Realm { out_rects = num_intermediate; } else { this->send_output(d_rects_in, num_intermediate, my_arena, ctr, getIndex, getMap); - my_arena.rollback(prev); } } diff --git a/tests/deppart.cc b/tests/deppart.cc index 70c6e9dfc1..8fde66845d 100644 --- a/tests/deppart.cc +++ b/tests/deppart.cc @@ -44,7 +44,7 @@ enum INIT_BASIC_DATA_TASK, INIT_TILE_DATA_TASK, INIT_RANGE_DATA_TASK, - INIT_2D_DATA_TASK, + INIT_RANGE2D_DATA_TASK, INIT_PENNANT_DATA_TASK, INIT_MINIAERO_DATA_TASK, }; @@ -501,6 +501,19 @@ class BasicTest : public TestInterface { } wait_on_events = true; log_app.info() << "warming up" << Clock::current_time_in_microseconds() << "\n"; + const char* val = std::getenv("TILE_SIZE"); // or any env var + size_t tile_size = 100000000; //default + if (val) { + tile_size = atoi(val); + } + std::vector byte_fields = {sizeof(char)}; + IndexSpace<1> instance_index_space(Rect<1>(0, tile_size-1)); + for (size_t i = 0; i < piece_field_data_gpu.size(); i++) { + 
RegionInstance::create_instance(piece_field_data_gpu[i].scratch_buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); + } + for (size_t i = 0; i < src_field_data_gpu.size(); i++) { + RegionInstance::create_instance(src_field_data_gpu[i].scratch_buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); + } std::vector > p_garbage_nodes, p_garbage_edges, p_garbage_rd, p_garbage_preimage_edges; Event e01 = is_nodes.create_subspaces_by_field(piece_field_data_gpu, colors, @@ -516,16 +529,6 @@ class BasicTest : public TestInterface { // an image of p_edges through out_node gives us all the shared nodes, along // with some private nodes - const char* val = std::getenv("TILE_SIZE"); // or any env var - size_t tile_size = 100000000; //default - if (val) { - tile_size = atoi(val); - } - std::vector byte_fields = {sizeof(char)}; - IndexSpace<1> instance_index_space(Rect<1>(0, tile_size-1)); - for (size_t i = 0; i < src_field_data_gpu.size(); i++) { - RegionInstance::create_instance(src_field_data_gpu[i].scratch_buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); - } Event e03 = is_nodes.create_subspaces_by_image(src_field_data_gpu, p_garbage_edges, p_garbage_rd, @@ -1749,6 +1752,518 @@ class RangeTest : public TestInterface { } }; +class Range2DTest : public TestInterface { +public: + // graph config parameters + int num_nodes = 1000; + int num_rects = 1000; + int max_rect_size = 10; + int num_pieces = 4; + + Range2DTest(int argc, const char *argv[]) + { + for(int i = 1; i < argc; i++) { + + if(!strcmp(argv[i], "-p")) { + num_pieces = atoi(argv[++i]); + continue; + } + + if(!strcmp(argv[i], "-n")) { + num_nodes = atoi(argv[++i]); + continue; + } + + if (!strcmp(argv[i], "-r")) { + num_rects = atoi(argv[++i]); + continue; + } + + if (!strcmp(argv[i], "-m")) { + max_rect_size = atoi(argv[++i]); + continue; + } + } + + if (num_nodes <= 0 || num_rects <= 0) 
{ + log_app.error() << "Invalid graph dimensions in input file: rects=" << num_rects << " nodes=" << num_nodes; + exit(1); + } + + } + + + + struct InitDataArgs { + int index; + RegionInstance ri_nodes; + RegionInstance ri_rects; + }; + + enum PRNGStreams { + NODE_SUBGRAPH_STREAM, + }; + + void random_rect_data(int idx, int& subgraph) + { + if(random_colors) + subgraph = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_pieces); + else + subgraph = idx * num_pieces / num_rects; + } + + void random_node_data(int idx, int& subgraph) + { + if(true) + subgraph = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_pieces); + else + subgraph = idx * num_pieces / num_nodes; + } + + void initialize_rect_data(int idx, Rect<2> &rect, int max_rect_size = 10) + { + + int x = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_nodes); + int y = Philox_2x32<>::rand_int(random_seed, idx + 1, NODE_SUBGRAPH_STREAM, num_nodes); + int length = Philox_2x32<>::rand_int(random_seed, idx + 2, NODE_SUBGRAPH_STREAM, max_rect_size); + int height = Philox_2x32<>::rand_int(random_seed, idx + 3, NODE_SUBGRAPH_STREAM, max_rect_size); + rect.lo[0] = x; + rect.hi[0] = x + length; + rect.lo[1] = y; + rect.hi[1] = y + height; + } + + + static void init_data_task_wrapper(const void *args, size_t arglen, + const void *userdata, size_t userlen, Processor p) + { + Range2DTest *me = (Range2DTest *)testcfg; + me->init_data_task(args, arglen, p); + } + + void init_data_task(const void *args, size_t arglen, Processor p) + { + const InitDataArgs& i_args = *(const InitDataArgs *)args; + + log_app.info() << "init task #" << i_args.index << " (ri_nodes=" << i_args.ri_nodes << ", ri_rects=" << i_args.ri_rects << ")"; + + i_args.ri_nodes.fetch_metadata(p).wait(); + i_args.ri_rects.fetch_metadata(p).wait(); + + IndexSpace<2> is_nodes = i_args.ri_nodes.get_indexspace<2>(); + IndexSpace<1> is_rects = i_args.ri_rects.get_indexspace<1>(); + + log_app.debug() << "N: " 
<< is_nodes; + log_app.debug() << "E: " << is_rects; + + { + AffineAccessor a_piece_id(i_args.ri_rects, 0 /* offset */); + + for(int i = is_rects.bounds.lo; i <= is_rects.bounds.hi; i++) { + int subgraph; + random_rect_data(i, subgraph); + a_piece_id.write(i, subgraph); + } + } + { + AffineAccessor a_piece_id(i_args.ri_nodes, 0 /* offset */); + + for(int i = is_nodes.bounds.lo[0]; i <= is_nodes.bounds.hi[0]; i++) { + for (int j = is_nodes.bounds.lo[1]; j <= is_nodes.bounds.hi[1]; j++) { + int idx = i * (is_nodes.bounds.hi[1] - is_nodes.bounds.lo[1] + 1) + j; + int subgraph; + random_node_data(idx, subgraph); + a_piece_id.write(Point<2>(i, j), subgraph); + } + } + } + + + { + + AffineAccessor, 1> a_rect(i_args.ri_rects, 1 * sizeof(int) /* offset */); + + // Read edges line by line + for(int i = is_rects.bounds.lo; i <= is_rects.bounds.hi; i++) { + Rect<2> rect; + initialize_rect_data(i, rect, max_rect_size); + a_rect.write(i, rect); + } + } + + if(show_graph) { + AffineAccessor a_piece_id(i_args.ri_nodes, 0 /* offset */); + + for(int i = is_nodes.bounds.lo[0]; i <= is_nodes.bounds.hi[1]; i++) { + for (int j = is_nodes.bounds.lo[1]; j <= is_nodes.bounds.hi[1]; j++) { + Point<2> p(i, j); + log_app.info() << "node_id[" << p << "] = " << a_piece_id.read(p) << "\n"; + } + } + + AffineAccessor a_rect_id(i_args.ri_rects, 0 * sizeof(Point<1>) /* offset */); + + for(int i = is_rects.bounds.lo; i <= is_rects.bounds.hi; i++) + log_app.info() << "rect_id[" << i << "] = " << a_rect_id.read(i) << "\n"; + + AffineAccessor,1> a_rect_val(i_args.ri_rects, 1 * sizeof(int) /* offset */); + + for(int i = is_rects.bounds.lo; i <= is_rects.bounds.hi; i++) + log_app.info() << "rect_val[" << i << "] = " << a_rect_val.read(i) << "\n"; + } + } + + IndexSpace<1> is_rects; + IndexSpace<2> is_nodes; + std::vector ri_nodes; + std::vector, int> > node_id_field_data; + std::vector ri_rects; + std::vector, int> > rect_id_field_data; + std::vector, Rect<2> > > rect_val_field_data; + + virtual void 
print_info(void) + { + printf("Realm dependent partitioning test - 2D ranges: %d nodes, %d rects, %d pieces\n", + (int)num_nodes, (int)num_rects, (int)num_pieces); + } + + virtual Event initialize_data(const std::vector& memories, + const std::vector& procs) + { + // now create index spaces for nodes and edges + is_nodes = Rect<2>(Point<2>(0, 0), Point<2>(num_nodes - 1, num_nodes - 1)); + is_rects = Rect<1>(0, num_rects - 1); + + // equal partition is used to do initial population of edges and nodes + std::vector > ss_nodes_eq; + std::vector > ss_rects_eq; + + log_app.info() << "Creating equal subspaces" << "\n"; + + is_nodes.create_equal_subspaces(num_pieces, 1, ss_nodes_eq, Realm::ProfilingRequestSet()).wait(); + is_rects.create_equal_subspaces(num_pieces, 1, ss_rects_eq, Realm::ProfilingRequestSet()).wait(); + + log_app.debug() << "Initial partitions:\n"; + for(size_t i = 0; i < ss_nodes_eq.size(); i++) + log_app.debug() << " Nodes #" << i << ": " << ss_nodes_eq[i]; + for(size_t i = 0; i < ss_rects_eq.size(); i++) + log_app.debug() << " Rects #" << i << ": " << ss_rects_eq[i]; + + // create instances for each of these subspaces + std::vector node_fields, rect_fields; + node_fields.push_back(sizeof(int)); // piece_id + rect_fields.push_back(sizeof(int)); // src_node + rect_fields.push_back(sizeof(Rect<2>)); // dst_node + + ri_nodes.resize(num_pieces); + node_id_field_data.resize(num_pieces); + + for(size_t i = 0; i < ss_nodes_eq.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, + memories[i % memories.size()], + ss_nodes_eq[i], + node_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + ri_nodes[i] = ri; + + node_id_field_data[i].index_space = ss_nodes_eq[i]; + node_id_field_data[i].inst = ri_nodes[i]; + node_id_field_data[i].field_offset = 0; + } + + ri_rects.resize(num_pieces); + rect_id_field_data.resize(num_pieces); + rect_val_field_data.resize(num_pieces); + + for(size_t i = 0; i < ss_rects_eq.size(); i++) { + RegionInstance 
ri; + RegionInstance::create_instance(ri, + memories[i % memories.size()], + ss_rects_eq[i], + rect_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + ri_rects[i] = ri; + + rect_id_field_data[i].index_space = ss_rects_eq[i]; + rect_id_field_data[i].inst = ri_rects[i]; + rect_id_field_data[i].field_offset = 0; + + rect_val_field_data[i].index_space = ss_rects_eq[i]; + rect_val_field_data[i].inst = ri_rects[i]; + rect_val_field_data[i].field_offset = 1 * sizeof(int); + } + + // fire off tasks to initialize data + std::set events; + for(int i = 0; i < num_pieces; i++) { + Processor p = procs[i % procs.size()]; + InitDataArgs args; + args.index = i; + args.ri_nodes = ri_nodes[i]; + args.ri_rects = ri_rects[i]; + Event e = p.spawn(INIT_RANGE2D_DATA_TASK, &args, sizeof(args)); + events.insert(e); + } + + return Event::merge_events(events); + } + + // the outputs of our partitioning will be: + // is_private, is_shared - subsets of is_nodes based on private/shared + // p_rd, p_wr, p_ghost - subsets of the above split by subckt + // p_edges - subsets of is_edges for each subckt + + std::vector > p_colored_rects; + std::vector> p_rects, p_intersect, p_diff; + std::vector> p_colored_rects_cpu; + std::vector> p_rects_cpu, p_intersect_cpu, p_diff_cpu; + + IndexSpace<2> cpu_union, gpu_union, garbage_union; + + virtual Event perform_partitioning(void) + { + // first partition nodes by subckt id (this is the independent partition, + // but not actually used by the app) + + std::vector colors(num_pieces); + for(int i = 0; i < num_pieces; i++) + colors[i] = i; + + Memory gpu_memory; + bool found_gpu_memory = false; + Machine machine = Machine::get_machine(); + std::set all_memories; + machine.get_all_memories(all_memories); + for(auto& memory : all_memories) { + if(memory.kind() == Memory::GPU_FB_MEM) { + gpu_memory = memory; + found_gpu_memory = true; + break; + } + } + assert(found_gpu_memory); + std::vector rect_fields; + rect_fields.push_back(sizeof(int)); + 
rect_fields.push_back(sizeof(Rect<2>)); + std::vector node_fields; + node_fields.push_back(sizeof(int)); + + std::vector, int > > node_id_data_gpu; + std::vector, int > > rect_id_data_gpu; + std::vector, Rect<2>>> rect_val_data_gpu; + node_id_data_gpu.resize(num_pieces); + rect_id_data_gpu.resize(num_pieces); + rect_val_data_gpu.resize(num_pieces); + for (int i = 0; i < num_pieces; i++) { + RegionInstance node_id_instance; + RegionInstance rect_id_instance; + RegionInstance rect_val_instance; + RegionInstance::create_instance(node_id_instance, + gpu_memory, + node_id_field_data[i].index_space, + node_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + RegionInstance::create_instance(rect_id_instance, + gpu_memory, + rect_id_field_data[i].index_space, + rect_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + RegionInstance::create_instance(rect_val_instance, + gpu_memory, + rect_val_field_data[i].index_space, + rect_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + CopySrcDstField node_id_gpu_field, node_id_cpu_field, rect_id_gpu_field, rect_id_cpu_field, rect_val_gpu_field, rect_val_cpu_field; + node_id_gpu_field.inst = node_id_instance; + node_id_gpu_field.size = sizeof(int); + node_id_gpu_field.field_id = 0; + node_id_cpu_field.inst = node_id_field_data[i].inst; + node_id_cpu_field.size = sizeof(int); + node_id_cpu_field.field_id = 0; + rect_id_gpu_field.inst = rect_id_instance; + rect_id_gpu_field.size = sizeof(int); + rect_id_gpu_field.field_id = 0; + rect_id_cpu_field.inst = rect_id_field_data[i].inst; + rect_id_cpu_field.size = sizeof(int); + rect_id_cpu_field.field_id = 0; + rect_val_gpu_field.inst = rect_val_instance; + rect_val_gpu_field.size = sizeof(Rect<2>); + rect_val_gpu_field.field_id = sizeof(int); + rect_val_cpu_field.inst = rect_val_field_data[i].inst; + rect_val_cpu_field.size = sizeof(Rect<2>); + rect_val_cpu_field.field_id = sizeof(int); + std::vector node_id_gpu_data, node_id_cpu_data, rect_id_gpu_data, 
rect_id_cpu_data, rect_val_gpu_data, rect_val_cpu_data; + node_id_gpu_data.push_back(node_id_gpu_field); + node_id_cpu_data.push_back(node_id_cpu_field); + rect_id_gpu_data.push_back(rect_id_gpu_field); + rect_id_cpu_data.push_back(rect_id_cpu_field); + rect_val_gpu_data.push_back(rect_val_gpu_field); + rect_val_cpu_data.push_back(rect_val_cpu_field); + Event copy_event = node_id_field_data[i].index_space.copy(node_id_cpu_data, node_id_gpu_data, Realm::ProfilingRequestSet()); + copy_event.wait(); + Event second_copy_event = rect_id_field_data[i].index_space.copy(rect_id_cpu_data, rect_id_gpu_data, Realm::ProfilingRequestSet()); + second_copy_event.wait(); + Event third_copy_event = rect_val_field_data[i].index_space.copy(rect_val_cpu_data, rect_val_gpu_data, Realm::ProfilingRequestSet()); + third_copy_event.wait(); + node_id_data_gpu[i].inst = node_id_instance; + node_id_data_gpu[i].index_space = node_id_field_data[i].index_space; + node_id_data_gpu[i].field_offset = 0; + rect_id_data_gpu[i].inst = rect_id_instance; + rect_id_data_gpu[i].index_space = rect_id_field_data[i].index_space; + rect_id_data_gpu[i].field_offset = 0; + rect_val_data_gpu[i].inst = rect_val_instance; + rect_val_data_gpu[i].index_space = rect_val_field_data[i].index_space; + rect_val_data_gpu[i].field_offset = sizeof(int); + } + wait_on_events = true; + std::vector> p_garbage_colors; + std::vector> p_garbage_rects; + log_app.info() << "WARMING UP " << "\n"; + + std::vector> field_estimate_input(rect_id_data_gpu.size()); + std::vector field_estimate_output(rect_id_data_gpu.size()); + std::vector> image_estimate_input(rect_val_data_gpu.size()); + std::vector image_estimate_output(rect_val_data_gpu.size()); + std::vector> subspace_input(colors.size()); + for (size_t i = 0; i < rect_id_data_gpu.size(); i++) { + field_estimate_input[i].location = rect_id_data_gpu[i].inst.get_location(); + field_estimate_input[i].space = rect_id_data_gpu[i].index_space; + } + for (size_t i = 0; i < 
rect_val_data_gpu.size(); i++) { + image_estimate_input[i].location = rect_val_data_gpu[i].inst.get_location(); + image_estimate_input[i].space = rect_val_data_gpu[i].index_space; + } + + is_rects.by_field_buffer_requirements(field_estimate_input, field_estimate_output); + std::vector byte_fields = {sizeof(char)}; + for (size_t i = 0; i < rect_id_data_gpu.size(); i++) { + IndexSpace<1> instance_index_space(Rect<1>(0, field_estimate_output[i].upper_bound-1)); + RegionInstance::create_instance(rect_id_data_gpu[i].scratch_buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); + } + + Event e001 = is_rects.create_subspaces_by_field(rect_id_data_gpu, + colors, + p_garbage_colors, + Realm::ProfilingRequestSet()); + if (wait_on_events) e001.wait(); + for (size_t i = 0; i < colors.size(); i++) { + subspace_input[i].space = p_garbage_colors[i]; + subspace_input[i].entries = p_garbage_colors[i].sparsity.impl()->get_entries().size(); + } + is_nodes.by_image_buffer_requirements(subspace_input, image_estimate_input, image_estimate_output); + for (size_t i = 0; i < rect_val_data_gpu.size(); i++) { + IndexSpace<1> instance_index_space(Rect<1>(0, (image_estimate_output[i].upper_bound*5)-1)); + RegionInstance::create_instance(rect_val_data_gpu[i].scratch_buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); + } + Event e002 = is_nodes.create_subspaces_by_image(rect_val_data_gpu, + p_garbage_colors, + p_garbage_rects, + Realm::ProfilingRequestSet(), + e001); + if(wait_on_events) e002.wait(); + + log_app.info() << "FINISHED WARMING UP " << "\n"; + log_app.info() << "starting GPU partitioning " << Clock::current_time_in_microseconds() << "\n"; + + log_app.info() << "STARTING GPU BY FIELD " << Clock::current_time_in_microseconds() << "\n"; + + Event e01 = is_rects.create_subspaces_by_field(rect_id_data_gpu, + colors, + p_colored_rects, + Realm::ProfilingRequestSet()); + if (wait_on_events) e01.wait(); + 
+ log_app.info() << "FINISHED GPU BY FIELD " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "STARTING GPU BY IMAGE " << Clock::current_time_in_microseconds() << "\n"; + Event e02 = is_nodes.create_subspaces_by_image(rect_val_data_gpu, + p_colored_rects, + p_rects, + Realm::ProfilingRequestSet(), + e01); + if(wait_on_events) e02.wait(); + log_app.info() << "FINISHED GPU BY IMAGE " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "GPU Partitioning complete " << Clock::current_time_in_microseconds() << "\n"; + + log_app.info() << "STARTING CPU partitioning " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "STARTING CPU BY FIELD " << Clock::current_time_in_microseconds() << "\n"; + Event e1 = is_rects.create_subspaces_by_field(rect_id_field_data, + colors, + p_colored_rects_cpu, + Realm::ProfilingRequestSet()); + if (wait_on_events) e1.wait(); + log_app.info() << "FINISHED CPU BY FIELD " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "STARTING CPU BY IMAGE " << Clock::current_time_in_microseconds() << "\n"; + Event e2 = is_nodes.create_subspaces_by_image(rect_val_field_data, + p_colored_rects_cpu, + p_rects_cpu, + Realm::ProfilingRequestSet(), + e1); + if(wait_on_events) e2.wait(); + log_app.info() << "FINISHED CPU BY IMAGE " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "CPU Partitioning complete " << Clock::current_time_in_microseconds() << "\n"; + return e2; + } + + + + virtual int perform_dynamic_checks(void) + { + return 0; + } + + virtual int check_partitioning(void) + { + log_app.info() << "Checking correctness of partitioning " << "\n"; + int errors = 0; + + for (int i = 0; i < num_pieces; i++) { + for (IndexSpaceIterator<1> it(p_colored_rects[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if(!p_colored_rects_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! 
GPU has extra colored rect point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_colored_rects_cpu[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_colored_rects[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing colored rect point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<2> it(p_rects[i]); it.valid; it.step()) { + for (PointInRectIterator<2> point(it.rect); point.valid; point.step()) { + if (!p_rects_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU has extra rect point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<2> it(p_rects_cpu[i]); it.valid; it.step()) { + for (PointInRectIterator<2> point(it.rect); point.valid; point.step()) { + if (!p_rects[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing rect point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + } + return errors; + } +}; + class MiniAeroTest : public TestInterface { public: enum ProblemType @@ -4430,6 +4945,11 @@ int main(int argc, char **argv) break; } + if (!strcmp(argv[i], "multi")) { + testcfg = new Range2DTest(argc - i, const_cast(argv + i)); + break; + } + if(!strcmp(argv[i], "pennant")) { testcfg = new PennantTest(argc - i, const_cast(argv + i)); break; @@ -4469,6 +4989,7 @@ int main(int argc, char **argv) rt.register_task(INIT_BASIC_DATA_TASK, BasicTest::init_data_task_wrapper); rt.register_task(INIT_TILE_DATA_TASK, TileTest::init_data_task_wrapper); rt.register_task(INIT_RANGE_DATA_TASK, RangeTest::init_data_task_wrapper); + rt.register_task(INIT_RANGE2D_DATA_TASK, Range2DTest::init_data_task_wrapper); rt.register_task(INIT_MINIAERO_DATA_TASK, MiniAeroTest::init_data_task_wrapper); signal(SIGALRM, sigalrm_handler); From 761cd1b32bce41c459d090aa33dbab596e454b45 Mon Sep 17 00:00:00 2001 From: Rohan 
Chanani Date: Thu, 19 Feb 2026 13:36:56 -0800 Subject: [PATCH 14/32] working multidimensional --- src/realm/deppart/byfield_gpu_impl.hpp | 10 +- src/realm/deppart/partitions.h | 19 +- src/realm/deppart/partitions_gpu_impl.hpp | 424 +++++++++++----------- 3 files changed, 225 insertions(+), 228 deletions(-) diff --git a/src/realm/deppart/byfield_gpu_impl.hpp b/src/realm/deppart/byfield_gpu_impl.hpp index c7e619e06d..8765a57f11 100644 --- a/src/realm/deppart/byfield_gpu_impl.hpp +++ b/src/realm/deppart/byfield_gpu_impl.hpp @@ -33,10 +33,12 @@ void GPUByFieldMicroOp::execute() inst_space.num_children = field_data.size(); collapsed_space collapsed_parent; + collapsed_parent.offsets = buffer_arena.alloc(2); + collapsed_parent.num_children = 1; + std::vector> parent_spaces = {parent_space}; // We collapse the parent space to undifferentiate between dense and sparse and match downstream APIs. - GPUMicroOp::collapse_parent_space(parent_space, collapsed_parent, buffer_arena, stream); - + GPUMicroOp::collapse_multi_space(parent_spaces, collapsed_parent, buffer_arena, stream); // This is used for count + emit: first pass counts how many rectangles survive intersection, second pass uses the counter // to figure out where to write each rectangle. @@ -107,7 +109,9 @@ void GPUByFieldMicroOp::execute() // Now we have everything we need to actually populate our outputs. buffer_arena.flip_parity(); assert(!buffer_arena.get_parity()); - PointDesc* d_points = buffer_arena.alloc>(total_pts); + + RegionInstance points_instance = this->realm_malloc(total_pts * sizeof(PointDesc), zcpy_mem); + PointDesc* d_points = reinterpret_cast*>(AffineAccessor(points_instance, 0).base); // This is where the work is actually done - each thread figures out which points to read, reads it, marks a PointDesc with its color, and writes it out. 
byfield_gpuPopulateBitmasksKernel<<>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, d_colors, total_pts, colors.size(), num_valid_rects, field_data.size(), d_points); diff --git a/src/realm/deppart/partitions.h b/src/realm/deppart/partitions.h index 222e553ee5..4a8899e251 100644 --- a/src/realm/deppart/partitions.h +++ b/src/realm/deppart/partitions.h @@ -137,7 +137,12 @@ namespace Realm { template T* alloc(size_t count = 1) { - return parity_ ? alloc_right(count) : alloc_left(count); + static_assert(!std::is_void_v, "alloc is invalid"); + return reinterpret_cast(alloc_bytes(count * sizeof(T), alignof(T))); + } + + void* alloc_bytes(size_t bytes, size_t align = alignof(std::max_align_t)) { + return parity_ ? alloc_right_bytes(bytes, align) : alloc_left_bytes(bytes, align); } void flip_parity(void) noexcept { @@ -204,18 +209,6 @@ namespace Realm { return p; } - template - T* alloc_left(size_t count = 1) { - static_assert(!std::is_void_v, "alloc is invalid"); - return reinterpret_cast(alloc_left_bytes(sizeof(T) * count, alignof(T))); - } - - template - T* alloc_right(size_t count = 1) { - static_assert(!std::is_void_v, "alloc is invalid"); - return reinterpret_cast(alloc_right_bytes(sizeof(T) * count, alignof(T))); - } - static size_t align_up(size_t x, size_t a) noexcept { return (x + (a - 1)) & ~(a - 1); } diff --git a/src/realm/deppart/partitions_gpu_impl.hpp b/src/realm/deppart/partitions_gpu_impl.hpp index de21b7fc99..565a413fa0 100644 --- a/src/realm/deppart/partitions_gpu_impl.hpp +++ b/src/realm/deppart/partitions_gpu_impl.hpp @@ -515,35 +515,30 @@ namespace Realm { cudaStream_t stream = Cuda::get_task_cuda_stream(); Memory my_mem; - bool found = find_memory(my_mem, Memory::GPU_FB_MEM); - assert(found); + assert(find_memory(my_mem, Memory::GPU_FB_MEM)); - RegionInstance srcs_instance = this->realm_malloc(4*total_rects*sizeof(int32_t), my_mem); - RegionInstance crds_instance = this->realm_malloc(4*total_rects*sizeof(T), my_mem); - RegionInstance 
heads_instance = this->realm_malloc(2*total_rects * sizeof(uint8_t), my_mem); - RegionInstance sum_instance = this->realm_malloc(2*total_rects * sizeof(size_t), my_mem); + assert(!my_arena.get_parity()); + size_t beginning = my_arena.mark(); - RegionInstance B_src_inst[N]; - RegionInstance B_coord_inst[N]; + uint32_t* srcs_ptr = my_arena.alloc(4 * total_rects); + T* crds_ptr = my_arena.alloc(4 * total_rects); + uint8_t* heads_ptr = my_arena.alloc(2 * total_rects); + size_t* sums_ptr = my_arena.alloc(2 * total_rects); + + size_t left_restore = my_arena.mark(); + size_t right_restore = my_arena.mark(true); size_t *B_starts[N]; size_t *B_ends[N]; T* B_coord[N]; size_t B_size[N]; - - RegionInstance B_ptrs_instance = this->realm_malloc(2 * N * sizeof(size_t*), my_mem); - size_t** B_start_ptrs = reinterpret_cast(AffineAccessor(B_ptrs_instance, 0).base); - size_t** B_end_ptrs = reinterpret_cast(AffineAccessor(B_ptrs_instance, 0).base) + N; - - RegionInstance B_coord_ptrs_instance = this->realm_malloc(N * sizeof(T*), my_mem); - T** B_coord_ptrs = reinterpret_cast(AffineAccessor(B_coord_ptrs_instance, 0).base); int threads_per_block = 256; size_t grid_size = (total_rects + threads_per_block - 1) / threads_per_block; - RegionInstance tmp_instance; size_t orig_tmp = 0; + size_t temp_restore = my_arena.mark(); void *tmp_storage = nullptr; //Our first step is to find all the unique "boundaries" in each dimension (lo coord or hi+1 coord) @@ -553,10 +548,10 @@ namespace Realm { //We need the coordinates to be sorted by our curent dim and separated by src idx grid_size = (total_rects + threads_per_block - 1) / threads_per_block; - uint32_t* d_srcs_in = reinterpret_cast(AffineAccessor(srcs_instance, 0).base); - uint32_t* d_srcs_out = reinterpret_cast(AffineAccessor(srcs_instance, 0).base) + 2* total_rects; - T* d_coord_keys_in = reinterpret_cast(AffineAccessor(crds_instance,0).base); - T* d_coord_keys_out = reinterpret_cast(AffineAccessor(crds_instance,0).base) + 2 * total_rects; + 
uint32_t* d_srcs_in = srcs_ptr; + uint32_t* d_srcs_out = srcs_ptr + 2* total_rects; + T* d_coord_keys_in = crds_ptr; + T* d_coord_keys_out = crds_ptr + 2 * total_rects; mark_endpoints<<>>(d_rects, total_rects, d, d_srcs_in, d_coord_keys_in); KERNEL_CHECK(stream); size_t temp_bytes; @@ -566,11 +561,10 @@ namespace Realm { 2 * total_rects, 0, 8*sizeof(T), stream); if (temp_bytes > orig_tmp) { if (orig_tmp > 0) { - tmp_instance.destroy(); + my_arena.rollback(temp_restore); } - tmp_instance = this->realm_malloc(temp_bytes, my_mem); orig_tmp = temp_bytes; - tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); + tmp_storage = reinterpret_cast(my_arena.alloc(temp_bytes)); } cub::DeviceRadixSort::SortPairs(tmp_storage, temp_bytes, d_coord_keys_in, d_coord_keys_out, @@ -584,11 +578,10 @@ namespace Realm { 2 * total_rects, 0, 8*sizeof(uint32_t), stream); if (temp_bytes > orig_tmp) { if (orig_tmp > 0) { - tmp_instance.destroy(); + my_arena.rollback(temp_restore); } - tmp_instance = this->realm_malloc(temp_bytes, my_mem); orig_tmp = temp_bytes; - tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); + tmp_storage = reinterpret_cast(my_arena.alloc(temp_bytes)); } cub::DeviceRadixSort::SortPairs(tmp_storage, temp_bytes, d_srcs_in, d_srcs_out, @@ -597,19 +590,18 @@ namespace Realm { //Now mark the unique keys grid_size = (2*total_rects + threads_per_block - 1) / threads_per_block; - uint8_t * d_heads = reinterpret_cast(AffineAccessor(heads_instance, 0).base); - size_t *d_output = reinterpret_cast(AffineAccessor(sum_instance, 0).base); + uint8_t * d_heads = heads_ptr; + size_t *d_output = sums_ptr; mark_heads<<>>(d_srcs_out, d_coord_keys_out, 2 * total_rects, d_heads); KERNEL_CHECK(stream); cub::DeviceScan::ExclusiveSum(nullptr, temp_bytes, d_heads, d_output, 2 * total_rects, stream); if (temp_bytes > orig_tmp) { if (orig_tmp > 0) { - tmp_instance.destroy(); + my_arena.rollback(temp_restore); } - tmp_instance = this->realm_malloc(temp_bytes, 
my_mem); orig_tmp = temp_bytes; - tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); + tmp_storage = reinterpret_cast(my_arena.alloc(temp_bytes)); } cub::DeviceScan::ExclusiveSum(tmp_storage, temp_bytes, d_heads, d_output, 2 * total_rects, stream); @@ -620,13 +612,21 @@ namespace Realm { CUDA_CHECK(cudaStreamSynchronize(stream), stream); num_unique += last_bit; + my_arena.flip_parity(); + assert(my_arena.get_parity()); + my_arena.rollback(right_restore); + //Collect all the data we'll need later for this dimension - starts/ends by src, unique boundaries, unique boundaries count - B_coord_inst[d] = this->realm_malloc(num_unique * sizeof(T), my_mem); - B_src_inst[d] = this->realm_malloc(2*ctr.size() * sizeof(size_t), my_mem); - B_starts[d] = reinterpret_cast(AffineAccessor(B_src_inst[d], 0).base); - B_ends[d] = reinterpret_cast(AffineAccessor(B_src_inst[d], 0).base) + ctr.size(); - B_coord[d] = reinterpret_cast(AffineAccessor(B_coord_inst[d], 0).base); + B_starts[d] = my_arena.alloc(2 *ctr.size()); + B_ends[d] = B_starts[d] + ctr.size(); + B_coord[d] = my_arena.alloc(num_unique); B_size[d] = num_unique; + + right_restore = my_arena.mark(); + my_arena.flip_parity(); + assert(!my_arena.get_parity()); + my_arena.rollback(left_restore); + CUDA_CHECK(cudaMemsetAsync(B_starts[d], 0, ctr.size() * sizeof(size_t), stream), stream); CUDA_CHECK(cudaMemsetAsync(B_ends[d], 0, ctr.size() * sizeof(size_t), stream), stream); scatter_unique<<>>(d_srcs_out, d_coord_keys_out, d_output, d_heads, 2 * total_rects, B_starts[d], B_ends[d], B_coord[d]); @@ -645,13 +645,24 @@ namespace Realm { CUDA_CHECK(cudaMemcpyAsync(B_ends[d], d_ends_host.data(), ctr.size() * sizeof(size_t), cudaMemcpyHostToDevice, stream), stream); } + assert(!my_arena.get_parity()); + my_arena.rollback(beginning); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); } - srcs_instance.destroy(); - crds_instance.destroy(); - heads_instance.destroy(); - sum_instance.destroy(); + orig_tmp = 0; + + 
my_arena.flip_parity(); + assert(my_arena.get_parity()); + my_arena.rollback(right_restore); + + size_t** B_start_ptrs = my_arena.alloc(2 * N); + size_t** B_end_ptrs = B_start_ptrs + N; + + T** B_coord_ptrs = my_arena.alloc(N); + + right_restore = my_arena.mark(); //We need the arrays themselves on the device CUDA_CHECK(cudaMemcpyAsync(B_coord_ptrs, B_coord, N * sizeof(T*), cudaMemcpyHostToDevice, stream), stream); @@ -660,50 +671,54 @@ namespace Realm { //Next up, we generate all the corners of all the rectangles and mark them by parity size_t num_corners = (1 << N); - RegionInstance corners_instance = this->realm_malloc(2 * num_corners * total_rects * sizeof(CornerDesc), my_mem); - CornerDesc* d_corners_in = reinterpret_cast*>(AffineAccessor(corners_instance, 0).base); - CornerDesc* d_corners_out = reinterpret_cast*>(AffineAccessor(corners_instance, 0).base) + num_corners * total_rects; + CornerDesc* d_corners_in = my_arena.alloc>(2 * num_corners * total_rects); + CornerDesc* d_corners_out = d_corners_in + num_corners * total_rects; + + size_t corner_restore = my_arena.mark(); + + my_arena.flip_parity(); + assert(!my_arena.get_parity()); + my_arena.flip_parity(); + my_arena.rollback(corner_restore); populate_corners<<>>(d_rects, total_rects, d_corners_in); KERNEL_CHECK(stream); // We have a LOT of bookkeeping to do - std::set RLE_alloc_events; size_t alloc_size_1 = std::max({sizeof(size_t), sizeof(T), sizeof(int32_t), sizeof(DeltaFlag)}); + size_t align_1 = std::max({alignof(size_t), alignof(T), alignof(int32_t), alignof(DeltaFlag)}); - RegionInstance shared_instance = this->realm_malloc(2 * num_corners * total_rects * alloc_size_1, my_mem); - - RegionInstance flags_instance = this->realm_malloc(num_corners * total_rects * sizeof(uint8_t), my_mem); - - RegionInstance exc_sum_instance = this->realm_malloc(num_corners * total_rects * sizeof(size_t), my_mem); + char* shared_ptr = reinterpret_cast(my_arena.alloc_bytes(2 * num_corners * total_rects * alloc_size_1, 
align_1)); + uint8_t* d_flags = my_arena.alloc(num_corners * total_rects); + size_t* d_exc_sum = my_arena.alloc(num_corners * total_rects); - size_t* d_src_keys_in = reinterpret_cast(AffineAccessor(shared_instance, 0).base); - size_t* d_src_keys_out = reinterpret_cast(AffineAccessor(shared_instance, 0).base) + num_corners * total_rects; - T* d_coord_keys_in = reinterpret_cast(AffineAccessor(shared_instance, 0).base); - T* d_coord_keys_out = reinterpret_cast(AffineAccessor(shared_instance, 0).base) + num_corners * total_rects; - int32_t* d_deltas = reinterpret_cast(AffineAccessor(shared_instance, 0).base); - int32_t* d_deltas_out = reinterpret_cast(AffineAccessor(shared_instance, 0).base) + num_corners * total_rects; - DeltaFlag* d_delta_flags_in = reinterpret_cast(AffineAccessor(shared_instance, 0).base); - DeltaFlag* d_delta_flags_out = reinterpret_cast(AffineAccessor(shared_instance, 0).base) + num_corners * total_rects; - uint8_t* d_flags = reinterpret_cast(AffineAccessor(flags_instance, 0).base); - size_t* d_exc_sum = reinterpret_cast(AffineAccessor(exc_sum_instance, 0).base); + size_t* d_src_keys_in = reinterpret_cast(shared_ptr); + size_t* d_src_keys_out = d_src_keys_in + num_corners * total_rects; + T* d_coord_keys_in = reinterpret_cast(shared_ptr); + T* d_coord_keys_out = d_coord_keys_in + num_corners * total_rects; + int32_t* d_deltas = reinterpret_cast(shared_ptr); + int32_t* d_deltas_out = d_deltas + num_corners * total_rects; + DeltaFlag* d_delta_flags_in = reinterpret_cast(shared_ptr); + DeltaFlag* d_delta_flags_out = d_delta_flags_in + num_corners * total_rects; - RegionInstance seg_bound_instance; size_t* seg_starts; size_t* seg_ends; - RegionInstance seg_counters; uint32_t* d_seg_counters; - RegionInstance seg_counters_out; uint32_t* d_seg_counters_out; grid_size = (num_corners * total_rects + threads_per_block - 1) / threads_per_block; + orig_tmp = 0; + temp_restore = my_arena.mark(); + tmp_storage = nullptr; + //We need to reduce duplicate corners 
by their parity, so we sort to get duplicates next to each other and then reduce by key { + NVTX_DEPPART(sort_corners); for (int dim = 0; dim < N; dim++) { build_coord_key<<>>(d_coord_keys_in, d_corners_in, num_corners * total_rects, dim); @@ -714,10 +729,11 @@ namespace Realm { d_corners_in, d_corners_out, num_corners * total_rects, 0, 8*sizeof(T), stream); if (temp_bytes > orig_tmp) { - tmp_instance.destroy(); - tmp_instance = this->realm_malloc(temp_bytes, my_mem); + if (orig_tmp > 0) { + my_arena.rollback(temp_restore); + } orig_tmp = temp_bytes; - tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); + tmp_storage = reinterpret_cast(my_arena.alloc(temp_bytes)); } cub::DeviceRadixSort::SortPairs(tmp_storage, temp_bytes, d_coord_keys_in, d_coord_keys_out, @@ -737,10 +753,11 @@ namespace Realm { d_corners_in, d_corners_out, num_corners * total_rects, 0, 8*sizeof(size_t), stream); if (temp_bytes > orig_tmp) { - tmp_instance.destroy(); - tmp_instance = this->realm_malloc(temp_bytes, my_mem); + if (orig_tmp > 0) { + my_arena.rollback(temp_restore); + } orig_tmp = temp_bytes; - tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); + tmp_storage = reinterpret_cast(my_arena.alloc(temp_bytes)); } cub::DeviceRadixSort::SortPairs(tmp_storage, temp_bytes, d_src_keys_in, d_src_keys_out, @@ -751,8 +768,8 @@ namespace Realm { get_delta<<>>(d_deltas, d_corners_in, num_corners * total_rects); KERNEL_CHECK(stream); - RegionInstance num_runs_instance = this->realm_malloc(sizeof(int), my_mem); - int* d_num_runs = reinterpret_cast(AffineAccessor(num_runs_instance, 0).base); + my_arena.rollback(temp_restore); + int* d_num_runs = my_arena.alloc(1); //See above, we have custom equality and reduction operators for CornerDesc CustomSum red_op; @@ -765,12 +782,7 @@ namespace Realm { /*num_items=*/(int) (num_corners * total_rects), /*stream=*/stream); - if (temp_bytes > orig_tmp) { - tmp_instance.destroy(); - tmp_instance = this->realm_malloc(temp_bytes, 
my_mem); - orig_tmp = temp_bytes; - tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); - } + tmp_storage = reinterpret_cast(my_arena.alloc(temp_bytes)); cub::DeviceReduce::ReduceByKey( tmp_storage, temp_bytes, d_corners_in, d_corners_out, @@ -784,7 +796,7 @@ namespace Realm { CUDA_CHECK(cudaMemcpyAsync(&num_unique_corners, d_num_runs, sizeof(int), cudaMemcpyDeviceToHost, stream), stream); CUDA_CHECK(cudaStreamSynchronize(stream), stream); - num_runs_instance.destroy(); + my_arena.rollback(temp_restore); grid_size = (num_unique_corners + threads_per_block - 1) / threads_per_block; set_delta<<>>(d_deltas_out, d_corners_out, num_unique_corners); @@ -813,12 +825,10 @@ namespace Realm { d_coord_keys_in, d_coord_keys_out, d_corners_in, d_corners_out, num_intermediate, 0, 8*sizeof(T), stream); - if (temp_bytes > orig_tmp) { - tmp_instance.destroy(); - tmp_instance = this->realm_malloc(temp_bytes, my_mem); - orig_tmp = temp_bytes; - tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); - } + + my_arena.rollback(temp_restore); + tmp_storage = reinterpret_cast(my_arena.alloc(temp_bytes)); + cub::DeviceRadixSort::SortPairs(tmp_storage, temp_bytes, d_coord_keys_in, d_coord_keys_out, d_corners_in, d_corners_out, @@ -838,12 +848,10 @@ namespace Realm { d_coord_keys_in, d_coord_keys_out, d_corners_in, d_corners_out, num_intermediate, 0, 8*sizeof(T), stream); - if (temp_bytes > orig_tmp) { - tmp_instance.destroy(); - tmp_instance = this->realm_malloc(temp_bytes, my_mem); - orig_tmp = temp_bytes; - tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); - } + + my_arena.rollback(temp_restore); + tmp_storage = reinterpret_cast(my_arena.alloc(temp_bytes)); + cub::DeviceRadixSort::SortPairs(tmp_storage, temp_bytes, d_coord_keys_in, d_coord_keys_out, d_corners_in, d_corners_out, @@ -859,12 +867,10 @@ namespace Realm { d_src_keys_in, d_src_keys_out, d_corners_in, d_corners_out, num_intermediate, 0, 8*sizeof(size_t), stream); - if 
(temp_bytes > orig_tmp) { - tmp_instance.destroy(); - tmp_instance = this->realm_malloc(temp_bytes, my_mem); - orig_tmp = temp_bytes; - tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); - } + + my_arena.rollback(temp_restore); + tmp_storage = reinterpret_cast(my_arena.alloc(temp_bytes)); + cub::DeviceRadixSort::SortPairs(tmp_storage, temp_bytes, d_src_keys_in, d_src_keys_out, d_corners_in, d_corners_out, @@ -879,23 +885,20 @@ namespace Realm { KERNEL_CHECK(stream); cub::DeviceScan::InclusiveSum(nullptr, temp_bytes, d_flags, d_exc_sum, num_intermediate, stream); - if (temp_bytes > orig_tmp) { - if (orig_tmp > 0) { - tmp_instance.destroy(); - } - tmp_instance = this->realm_malloc(temp_bytes, my_mem); - orig_tmp = temp_bytes; - tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); - } + + my_arena.rollback(temp_restore); + tmp_storage = reinterpret_cast(my_arena.alloc(temp_bytes)); + cub::DeviceScan::InclusiveSum(tmp_storage, temp_bytes, d_flags, d_exc_sum, num_intermediate, stream); CUDA_CHECK(cudaMemcpyAsync(&num_segments, &d_exc_sum[num_intermediate-1], sizeof(size_t), cudaMemcpyDeviceToHost, stream), stream); CUDA_CHECK(cudaStreamSynchronize(stream), stream); //Mark the beginning and end of each segment for our kernel to use in binary search - seg_bound_instance = this->realm_malloc(2 * num_segments * sizeof(size_t), my_mem); - seg_starts = reinterpret_cast(AffineAccessor(seg_bound_instance, 0).base); - seg_ends = reinterpret_cast(AffineAccessor(seg_bound_instance, 0).base) + num_segments; + seg_starts = my_arena.alloc(2 * num_segments); + seg_ends = seg_starts + num_segments; + + temp_restore = my_arena.mark(); seg_boundaries<<>>(d_flags, d_exc_sum, num_intermediate, seg_starts, seg_ends); KERNEL_CHECK(stream); @@ -911,12 +914,8 @@ namespace Realm { /*stream=*/ stream ); - if (temp_bytes > orig_tmp) { - tmp_instance.destroy(); - tmp_instance = this->realm_malloc(temp_bytes, my_mem); - orig_tmp = temp_bytes; - tmp_storage = 
reinterpret_cast(AffineAccessor(tmp_instance, 0).base); - } + my_arena.rollback(temp_restore); + tmp_storage = reinterpret_cast(my_arena.alloc(temp_bytes)); cub::DeviceScan::InclusiveScan( /*d_temp=*/ tmp_storage, @@ -932,26 +931,21 @@ namespace Realm { //Per usual, we do a count + emit pass to track active segments and limit memory usage. If the evaluated prefix sum for a boundary within a segment //is 0, we can skip it because it won't contribute anything to future sums and also won't be emitted. - seg_counters = this->realm_malloc(num_segments * sizeof(uint32_t), my_mem); - d_seg_counters = reinterpret_cast(AffineAccessor(seg_counters, 0).base); + d_seg_counters = my_arena.alloc(2 * num_segments); + d_seg_counters_out = d_seg_counters + num_segments; CUDA_CHECK(cudaMemsetAsync(d_seg_counters, 0, num_segments * sizeof(uint32_t), stream), stream); + temp_restore = my_arena.mark(); + grid_size = ((num_segments*B_size[d]) + threads_per_block - 1) / threads_per_block; count_segments<<>>(d_delta_flags_out, seg_starts, seg_ends, B_starts[d], B_ends[d], d_corners_in, B_coord[d], B_size[d], num_segments, d, d_seg_counters); KERNEL_CHECK(stream); - seg_counters_out = this->realm_malloc(num_segments * sizeof(uint32_t), my_mem); - d_seg_counters_out = reinterpret_cast(AffineAccessor(seg_counters_out, 0).base); - cub::DeviceScan::ExclusiveSum(nullptr, temp_bytes, d_seg_counters, d_seg_counters_out, num_segments, stream); - if (temp_bytes > orig_tmp) { - if (orig_tmp > 0) { - tmp_instance.destroy(); - } - tmp_instance = this->realm_malloc(temp_bytes, my_mem); - orig_tmp = temp_bytes; - tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); - } + + my_arena.rollback(temp_restore); + tmp_storage = reinterpret_cast(my_arena.alloc(temp_bytes)); + cub::DeviceScan::ExclusiveSum(tmp_storage, temp_bytes, d_seg_counters, d_seg_counters_out, num_segments, stream); uint32_t next_round; @@ -968,52 +962,57 @@ namespace Realm { break; } - RegionInstance next_corners_instance 
= this->realm_malloc(2 * next_round * sizeof(CornerDesc), my_mem); - CornerDesc* d_next_corners = reinterpret_cast*>(AffineAccessor(next_corners_instance, 0).base); + my_arena.flip_parity(); + if (my_arena.get_parity()) { + my_arena.rollback(right_restore); + } + + CornerDesc* d_next_corners = my_arena.alloc>(2 * next_round); CUDA_CHECK(cudaMemsetAsync(d_seg_counters, 0, num_segments*sizeof(uint32_t), stream), stream); + corner_restore = my_arena.mark(); + my_arena.flip_parity(); + my_arena.flip_parity(); + my_arena.rollback(corner_restore); + write_segments<<>>(d_delta_flags_out, seg_starts, seg_ends, B_starts[d], B_ends[d], d_corners_in, B_coord[d], d_seg_counters_out, B_size[d], num_segments, d, d_seg_counters, d_next_corners); KERNEL_CHECK(stream); CUDA_CHECK(cudaStreamSynchronize(stream), stream); - corners_instance.destroy(); - corners_instance = next_corners_instance; d_corners_in = d_next_corners; d_corners_out = d_next_corners + next_round; //The segment count in each iter is not monotonic, so we have to realloc each time + shared_ptr = reinterpret_cast(my_arena.alloc_bytes(2 * num_intermediate * alloc_size_1, align_1)); + d_flags = my_arena.alloc(num_intermediate); + d_exc_sum = my_arena.alloc(num_intermediate); + + temp_restore = my_arena.mark(); + + d_src_keys_in = reinterpret_cast(shared_ptr); + d_src_keys_out = reinterpret_cast(shared_ptr) + num_intermediate; + + d_coord_keys_in = reinterpret_cast(shared_ptr); + d_coord_keys_out = reinterpret_cast(shared_ptr) + num_intermediate; - shared_instance.destroy(); - flags_instance.destroy(); - exc_sum_instance.destroy(); - seg_bound_instance.destroy(); - seg_counters.destroy(); - seg_counters_out.destroy(); - - shared_instance = this->realm_malloc(2 * num_intermediate * alloc_size_1, my_mem); - flags_instance = this->realm_malloc(num_intermediate * sizeof(uint8_t), my_mem); - exc_sum_instance = this->realm_malloc(num_intermediate * sizeof(size_t), my_mem); - - d_src_keys_in = 
reinterpret_cast(AffineAccessor(shared_instance, 0).base); - d_src_keys_out = reinterpret_cast(AffineAccessor(shared_instance, 0).base) + num_intermediate; - d_coord_keys_in = reinterpret_cast(AffineAccessor(shared_instance, 0).base); - d_coord_keys_out = reinterpret_cast(AffineAccessor(shared_instance, 0).base) + num_intermediate; - d_deltas = reinterpret_cast(AffineAccessor(shared_instance, 0).base); - d_deltas_out = reinterpret_cast(AffineAccessor(shared_instance, 0).base) + num_intermediate; - - d_flags = reinterpret_cast(AffineAccessor(flags_instance, 0).base); - d_exc_sum = reinterpret_cast(AffineAccessor(exc_sum_instance, 0).base); - d_delta_flags_in = reinterpret_cast(AffineAccessor(shared_instance, 0).base); - d_delta_flags_out = reinterpret_cast(AffineAccessor(shared_instance, 0).base) + num_intermediate; + d_deltas = reinterpret_cast(shared_ptr); + d_deltas_out = reinterpret_cast(shared_ptr) + num_intermediate; + + d_delta_flags_in = reinterpret_cast(shared_ptr); + d_delta_flags_out = reinterpret_cast(shared_ptr) + num_intermediate; } } + //Get to a known state + my_arena.flip_parity(); + if (my_arena.get_parity()) { + my_arena.rollback(right_restore); + } + //For our last dim, we emit rectangles rather than segments. These rectangles are a disjoint, precise covering of the original set. 
- RegionInstance rects_out_instance = this->realm_malloc(2 * num_intermediate * sizeof(RectDesc), my_mem); - RectDesc* d_rects_out = reinterpret_cast*>(AffineAccessor(rects_out_instance, 0).base); - RectDesc* d_rects_in = reinterpret_cast*>(AffineAccessor(rects_out_instance, 0).base) + num_intermediate; + RectDesc* d_rects_out = my_arena.alloc>(num_intermediate); CUDA_CHECK(cudaMemsetAsync(d_seg_counters, 0, num_segments*sizeof(uint32_t), stream), stream); write_segments<<>>(d_delta_flags_out, seg_starts, seg_ends, B_start_ptrs, B_end_ptrs, d_corners_in, B_coord_ptrs, d_seg_counters_out, B_size[0], num_segments, d_seg_counters, d_rects_out); @@ -1021,36 +1020,37 @@ namespace Realm { CUDA_CHECK(cudaStreamSynchronize(stream), stream); - //Don't need these anymore - flags_instance.destroy(); - exc_sum_instance.destroy(); - seg_bound_instance.destroy(); - seg_counters.destroy(); - seg_counters_out.destroy(); - corners_instance.destroy(); - for (int d = 0; d < N; d++) { - B_coord_inst[d].destroy(); - B_src_inst[d].destroy(); + //Force the rectangles to the left side of the buffer + if (my_arena.get_parity()) { + my_arena.flip_parity(); + RectDesc* tmp_out = my_arena.alloc>(num_intermediate); + CUDA_CHECK(cudaMemcpyAsync(tmp_out, d_rects_out, num_intermediate * sizeof(RectDesc), cudaMemcpyDeviceToDevice, stream), stream); } - B_ptrs_instance.destroy(); - B_coord_ptrs_instance.destroy(); - std::swap(d_rects_out, d_rects_in); + //Clear everything out, we should be on the left + my_arena.flip_parity(); + my_arena.flip_parity(); + assert(!my_arena.get_parity()); + + RectDesc* d_rects_in = my_arena.alloc>(2 * num_intermediate); + d_rects_out = d_rects_in + num_intermediate; - shared_instance.destroy(); size_t alloc_size_2 = max(sizeof(size_t), sizeof(T)); + size_t align_2 = max(alignof(size_t), alignof(T)); + + + shared_ptr = reinterpret_cast(my_arena.alloc_bytes(2 * num_intermediate * alloc_size_2, align_2)); - shared_instance = this->realm_malloc(2 * num_intermediate * 
alloc_size_2, my_mem); + d_src_keys_in = reinterpret_cast(shared_ptr); + d_src_keys_out = reinterpret_cast(shared_ptr) + num_intermediate; + d_coord_keys_in = reinterpret_cast(shared_ptr); + d_coord_keys_out = reinterpret_cast(shared_ptr) + num_intermediate; - d_src_keys_in = reinterpret_cast(AffineAccessor(shared_instance, 0).base); - d_src_keys_out = reinterpret_cast(AffineAccessor(shared_instance, 0).base) + num_intermediate; - d_coord_keys_in = reinterpret_cast(AffineAccessor(shared_instance, 0).base); - d_coord_keys_out = reinterpret_cast(AffineAccessor(shared_instance, 0).base) + num_intermediate; + size_t* group_ids = reinterpret_cast(shared_ptr); - RegionInstance break_points_instance = this->realm_malloc(num_intermediate * sizeof(uint8_t), my_mem); - uint8_t* break_points = reinterpret_cast(AffineAccessor(break_points_instance, 0).base); + uint8_t* break_points = my_arena.alloc(num_intermediate); - size_t* group_ids = reinterpret_cast(AffineAccessor(shared_instance, 0).base); + temp_restore = my_arena.mark(); //Now that we have disjoint rectangles, we can do our usual sort and coalesce pass size_t last = INT_MAX; @@ -1074,12 +1074,10 @@ namespace Realm { d_coord_keys_in, d_coord_keys_out, d_rects_in, d_rects_out, num_intermediate, 0, 8*sizeof(T), stream); - if (temp_bytes > orig_tmp) { - tmp_instance.destroy(); - tmp_instance = this->realm_malloc(temp_bytes, my_mem); - orig_tmp = temp_bytes; - tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); - } + + my_arena.rollback(temp_restore); + tmp_storage = reinterpret_cast(my_arena.alloc(temp_bytes)); + cub::DeviceRadixSort::SortPairs(tmp_storage, temp_bytes, d_coord_keys_in, d_coord_keys_out, d_rects_in, d_rects_out, @@ -1097,12 +1095,10 @@ namespace Realm { d_coord_keys_in, d_coord_keys_out, d_rects_in, d_rects_out, num_intermediate, 0, 8*sizeof(T), stream); - if (temp_bytes > orig_tmp) { - tmp_instance.destroy(); - tmp_instance = this->realm_malloc(temp_bytes, my_mem); - orig_tmp = 
temp_bytes; - tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); - } + + my_arena.rollback(temp_restore); + tmp_storage = reinterpret_cast(my_arena.alloc(temp_bytes)); + cub::DeviceRadixSort::SortPairs(tmp_storage, temp_bytes, d_coord_keys_in, d_coord_keys_out, d_rects_in, d_rects_out, @@ -1115,12 +1111,10 @@ namespace Realm { d_coord_keys_in, d_coord_keys_out, d_rects_in, d_rects_out, num_intermediate, 0, 8*sizeof(T), stream); - if (temp_bytes > orig_tmp) { - tmp_instance.destroy(); - tmp_instance = this->realm_malloc(temp_bytes, my_mem); - orig_tmp = temp_bytes; - tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); - } + + my_arena.rollback(temp_restore); + tmp_storage = reinterpret_cast(my_arena.alloc(temp_bytes)); + cub::DeviceRadixSort::SortPairs(tmp_storage, temp_bytes, d_coord_keys_in, d_coord_keys_out, d_rects_in, d_rects_out, @@ -1136,12 +1130,10 @@ namespace Realm { d_src_keys_in, d_src_keys_out, d_rects_in, d_rects_out, num_intermediate, 0, 8*sizeof(size_t), stream); - if (temp_bytes > orig_tmp) { - tmp_instance.destroy(); - tmp_instance = this->realm_malloc(temp_bytes, my_mem); - orig_tmp = temp_bytes; - tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); - } + + my_arena.rollback(temp_restore); + tmp_storage = reinterpret_cast(my_arena.alloc(temp_bytes)); + cub::DeviceRadixSort::SortPairs(tmp_storage, temp_bytes, d_src_keys_in, d_src_keys_out, d_rects_in, d_rects_out, @@ -1154,12 +1146,8 @@ namespace Realm { cub::DeviceScan::InclusiveSum(nullptr, temp_bytes, break_points, group_ids, num_intermediate, stream); - if (temp_bytes > orig_tmp) { - tmp_instance.destroy(); - tmp_instance = this->realm_malloc(temp_bytes, my_mem); - orig_tmp = temp_bytes; - tmp_storage = reinterpret_cast(AffineAccessor(tmp_instance, 0).base); - } + my_arena.rollback(temp_restore); + tmp_storage = reinterpret_cast(my_arena.alloc(temp_bytes)); cub::DeviceScan::InclusiveSum(tmp_storage, temp_bytes, break_points, group_ids, 
num_intermediate, stream); @@ -1177,15 +1165,26 @@ namespace Realm { CUDA_CHECK(cudaStreamSynchronize(stream), stream); } - //And... we're done - if (out_rects > 0) { - d_out_rects = d_rects_in; + if (out_rects == 2) { + d_out_rects = d_rects; + if (d_out_rects != d_rects_in) { + CUDA_CHECK(cudaMemcpyAsync(d_out_rects, d_rects_in, num_intermediate * sizeof(RectDesc), cudaMemcpyDeviceToDevice, stream), stream); + } + out_rects = num_intermediate; + } else if (out_rects == 1) { + my_arena.reset(true); + d_out_rects = my_arena.alloc>(num_intermediate); + my_arena.commit(true); + if (d_rects_in + num_intermediate >= d_out_rects) { + assert(d_rects_out < d_rects_in); + CUDA_CHECK(cudaMemcpyAsync(d_rects_out, d_rects_in, num_intermediate * sizeof(RectDesc), cudaMemcpyDeviceToDevice, stream), stream); + std::swap(d_rects_in, d_rects_out); + } + CUDA_CHECK(cudaMemcpyAsync(d_out_rects, d_rects_in, num_intermediate * sizeof(RectDesc), cudaMemcpyDeviceToDevice, stream), stream); out_rects = num_intermediate; } else { this->send_output(d_rects_in, num_intermediate, my_arena, ctr, getIndex, getMap); - rects_out_instance.destroy(); } - } /* @@ -1212,8 +1211,9 @@ namespace Realm { size_t bytes_S = total_rects * sizeof(size_t); size_t bytes_HF = total_rects * sizeof(HiFlag); size_t max_bytes = std::max({bytes_T, bytes_HF, bytes_S}); + size_t max_align = std::max({alignof(T), alignof(HiFlag), alignof(size_t)}); - char* aux_ptr = my_arena.alloc(2 * max_bytes); + char* aux_ptr = reinterpret_cast(my_arena.alloc_bytes(2 * max_bytes, max_align)); uint8_t* break_points = my_arena.alloc(total_rects); size_t* group_ids = my_arena.alloc(total_rects); From c05776f5f9ecf12c89b47b2a6a5f4a5a297d9248 Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Thu, 19 Feb 2026 18:03:37 -0800 Subject: [PATCH 15/32] byfield tiled --- src/realm/deppart/byfield.cc | 2 +- src/realm/deppart/byfield_gpu_impl.hpp | 172 ++++++++++++++++------ src/realm/deppart/image_gpu_impl.hpp | 21 +-- 
src/realm/deppart/partitions_gpu_impl.hpp | 15 +- 4 files changed, 148 insertions(+), 62 deletions(-) diff --git a/src/realm/deppart/byfield.cc b/src/realm/deppart/byfield.cc index b9d4bf5e43..ce543e1b44 100644 --- a/src/realm/deppart/byfield.cc +++ b/src/realm/deppart/byfield.cc @@ -44,7 +44,7 @@ namespace Realm { if (val) { device_size = atoi(val); } - size_t optimal_size = is.bounds.volume() * sizeof(Rect) * 100; + size_t optimal_size = is.bounds.volume() * sizeof(RectDesc); std::vector affinities; unsigned best_bandwidth = 0; Processor best_proc = Processor::NO_PROC; diff --git a/src/realm/deppart/byfield_gpu_impl.hpp b/src/realm/deppart/byfield_gpu_impl.hpp index 8765a57f11..56ab0258a2 100644 --- a/src/realm/deppart/byfield_gpu_impl.hpp +++ b/src/realm/deppart/byfield_gpu_impl.hpp @@ -32,6 +32,9 @@ void GPUByFieldMicroOp::execute() inst_space.offsets = buffer_arena.alloc(field_data.size() + 1); inst_space.num_children = field_data.size(); + Arena sys_arena; + GPUMicroOp::collapse_multi_space(field_data, inst_space, sys_arena, stream); + collapsed_space collapsed_parent; collapsed_parent.offsets = buffer_arena.alloc(2); collapsed_parent.num_children = 1; @@ -80,14 +83,125 @@ void GPUByFieldMicroOp::execute() buffer_arena.commit(false); - GPUMicroOp::collapse_multi_space(field_data, inst_space, buffer_arena, stream); + // Map colors to their output index to match send output iterator. + std::map color_indices; + for (size_t i = 0; i < colors.size(); i++) { + color_indices[colors[i]] = i; + } + + size_t num_output = 0; + RectDesc* output_start = nullptr; + size_t num_completed = 0; + size_t curr_tile = tile_size / 2; + int count = 0; + while (num_completed < inst_space.num_entries) { + try { + std::cout << "Byfield iteration " << count++ << ", completed " << num_completed << " / " << inst_space.num_entries << " entries." 
<< std::endl; + buffer_arena.start(); + if (num_completed + curr_tile > inst_space.num_entries) { + curr_tile = inst_space.num_entries - num_completed; + } - // Here we intersect the instance spaces with the parent space, and make sure we know which instance each resulting rectangle came from. - GPUMicroOp::template construct_input_rectlist>(inst_space, collapsed_parent, d_valid_rects, num_valid_rects, d_inst_counters, d_inst_prefix, buffer_arena, stream); + collapsed_space inst_space_tile = inst_space; + inst_space_tile.num_entries = curr_tile; + inst_space_tile.entries_buffer = buffer_arena.alloc>(curr_tile); + CUDA_CHECK(cudaMemcpyAsync(inst_space_tile.entries_buffer, inst_space.entries_buffer + num_completed, curr_tile * sizeof(SparsityMapEntry), cudaMemcpyHostToDevice, stream), stream); + // Here we intersect the instance spaces with the parent space, and make sure we know which instance each resulting rectangle came from. + GPUMicroOp::template construct_input_rectlist>(inst_space_tile, collapsed_parent, d_valid_rects, num_valid_rects, d_inst_counters, d_inst_prefix, buffer_arena, stream); - // Early out if we don't have any rectangles. - if (num_valid_rects == 0) { + + // Early out if we don't have any rectangles. + if (num_valid_rects == 0) { + num_completed += curr_tile; + curr_tile = tile_size / 2; + subtract_const<<>>(inst_space.offsets, field_data.size()+1, curr_tile); + KERNEL_CHECK(stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + continue; + } + + + // Prefix sum the valid rectangles by volume. + size_t total_pts; + size_t* d_prefix_rects; + + GPUMicroOp::volume_prefix_sum(d_valid_rects, num_valid_rects, d_prefix_rects, total_pts, buffer_arena, stream); + + // Now we have everything we need to actually populate our outputs. 
+ buffer_arena.flip_parity(); + assert(!buffer_arena.get_parity()); + + PointDesc* d_points = buffer_arena.alloc>(total_pts); + + // This is where the work is actually done - each thread figures out which points to read, reads it, marks a PointDesc with its color, and writes it out. + byfield_gpuPopulateBitmasksKernel<<>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, d_colors, total_pts, colors.size(), num_valid_rects, field_data.size(), d_points); + KERNEL_CHECK(stream); + + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + // Ship off the points for final processing. + size_t num_new_rects = (num_output == 0) ? 1 : 2; + assert(!buffer_arena.get_parity()); + RectDesc* d_new_rects; + this->complete_pipeline(d_points, total_pts, d_new_rects, num_new_rects, buffer_arena, + /* the Container: */ sparsity_outputs, + /* getIndex: */ [&](auto const& kv){ + // elem is a SparsityMap from the vector + return color_indices.at(kv.first); + }, + /* getMap: */ [&](auto const& kv){ + // return the SparsityMap key itself + return kv.second; + }); + + if (num_output==0) { + num_output = num_new_rects; + output_start = d_new_rects; + num_completed += curr_tile; + subtract_const<<>>(inst_space.offsets, field_data.size()+1, curr_tile); + KERNEL_CHECK(stream); + curr_tile = tile_size / 2; + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + continue; + } + + //Otherwise we merge with existing rectangles + RectDesc* d_old_rects = buffer_arena.alloc>(num_output); + assert(d_old_rects == d_new_rects + num_new_rects); + CUDA_CHECK(cudaMemcpyAsync(d_old_rects, output_start, num_output * sizeof(RectDesc), cudaMemcpyDeviceToDevice, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + size_t num_final_rects = 1; + + //Send it off for processing + this->complete_rect_pipeline(d_new_rects, num_output + num_new_rects, output_start, num_final_rects, buffer_arena, + /* the Container: */ sparsity_outputs, + /* getIndex: */ [&](auto const& kv){ + // elem is a 
SparsityMap from the vector + return color_indices.at(kv.first); + }, + /* getMap: */ [&](auto const& kv){ + // return the SparsityMap key itself + return kv.second; + }); + num_completed += curr_tile; + num_output = num_final_rects; + subtract_const<<>>(inst_space.offsets, field_data.size()+1, curr_tile); + KERNEL_CHECK(stream); + curr_tile = tile_size / 2; + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + } catch (arena_oom&) { + std::cout << "Caught arena_oom, reducing tile size from " << curr_tile << " to " << curr_tile / 2 << std::endl; + std::cout << buffer_arena.used() << " bytes used in arena." << std::endl; + curr_tile /= 2; + if (curr_tile == 0) { + throw; + } + } + } + + if (num_output == 0) { for (std::pair> it : sparsity_outputs) { SparsityMapImpl *impl = SparsityMapImpl::lookup(it.second); if (this->exclusive) { @@ -99,45 +213,15 @@ void GPUByFieldMicroOp::execute() return; } + this->send_output(output_start, num_output, buffer_arena, sparsity_outputs, + /* getIndex: */ [&](auto const& kv){ + // elem is a SparsityMap from the vector + return color_indices.at(kv.first); + }, + /* getMap: */ [&](auto const& kv){ + // return the SparsityMap key itself + return kv.second; + }); - // Prefix sum the valid rectangles by volume. - size_t total_pts; - - size_t* d_prefix_rects; - GPUMicroOp::volume_prefix_sum(d_valid_rects, num_valid_rects, d_prefix_rects, total_pts, buffer_arena, stream); - - // Now we have everything we need to actually populate our outputs. - buffer_arena.flip_parity(); - assert(!buffer_arena.get_parity()); - - RegionInstance points_instance = this->realm_malloc(total_pts * sizeof(PointDesc), zcpy_mem); - PointDesc* d_points = reinterpret_cast*>(AffineAccessor(points_instance, 0).base); - - // This is where the work is actually done - each thread figures out which points to read, reads it, marks a PointDesc with its color, and writes it out. 
- byfield_gpuPopulateBitmasksKernel<<>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, d_colors, total_pts, colors.size(), num_valid_rects, field_data.size(), d_points); - KERNEL_CHECK(stream); - - - // Map colors to their output index to match send output iterator. - std::map color_indices; - for (size_t i = 0; i < colors.size(); i++) { - color_indices[colors[i]] = i; - } - - CUDA_CHECK(cudaStreamSynchronize(stream), stream); - - // Ship off the points for final processing. - size_t out_rects = 0; - RectDesc* trash; - this->complete_pipeline(d_points, total_pts, trash, out_rects, buffer_arena, - /* the Container: */ sparsity_outputs, - /* getIndex: */ [&](auto const& kv){ - // elem is a SparsityMap from the vector - return color_indices.at(kv.first); - }, - /* getMap: */ [&](auto const& kv){ - // return the SparsityMap key itself - return kv.second; - }); } } diff --git a/src/realm/deppart/image_gpu_impl.hpp b/src/realm/deppart/image_gpu_impl.hpp index ce83e03639..b22812a00c 100644 --- a/src/realm/deppart/image_gpu_impl.hpp +++ b/src/realm/deppart/image_gpu_impl.hpp @@ -93,7 +93,6 @@ void GPUImageMicroOp::gpu_populate_rngs() uint32_t* d_src_prefix = d_src_counters + sources.size(); buffer_arena.commit(false); - size_t left = buffer_arena.used(); size_t num_output = 0; RectDesc* output_start = nullptr; @@ -102,7 +101,7 @@ void GPUImageMicroOp::gpu_populate_rngs() int count = 0; while (num_completed < inst_space.num_entries) { try { - std::cout << "Tile iteration " << count++ << ", completed " << num_completed << " / " << inst_space.num_entries << " entries." << std::endl; + std::cout << "Image Range iteration " << count++ << ", completed " << num_completed << " / " << inst_space.num_entries << " entries." 
<< std::endl; buffer_arena.start(); buffer_arena.flip_parity(); if (num_completed + curr_tile > inst_space.num_entries) { @@ -177,7 +176,7 @@ void GPUImageMicroOp::gpu_populate_rngs() CUDA_CHECK(cudaStreamSynchronize(stream), stream); - size_t num_new_rects = 2; + size_t num_new_rects = (num_output == 0) ? 1 : 2; assert(!buffer_arena.get_parity()); RectDesc* d_new_rects; @@ -197,13 +196,9 @@ void GPUImageMicroOp::gpu_populate_rngs() if (num_output==0) { //We need to place the new output at the rightmost end of the buffer - buffer_arena.flip_parity(); - buffer_arena.reset(true); - output_start = buffer_arena.alloc>(num_new_rects); - buffer_arena.commit(true); - CUDA_CHECK(cudaMemcpyAsync(output_start, d_new_rects, num_new_rects * sizeof(RectDesc), cudaMemcpyDeviceToDevice, stream), stream); num_output = num_new_rects; num_completed += curr_tile; + output_start = d_new_rects; subtract_const<<>>(inst_space.offsets, domain_transform.range_data.size()+1, curr_tile); KERNEL_CHECK(stream); curr_tile = tile_size / 2; @@ -342,7 +337,7 @@ void GPUImageMicroOp::gpu_populate_ptrs() int count = 0; while (num_completed < inst_space.num_entries) { try { - std::cout << "Tile iteration " << count++ << ", completed " << num_completed << " / " << inst_space.num_entries << " entries." << std::endl; + std::cout << "Image iteration " << count++ << ", completed " << num_completed << " / " << inst_space.num_entries << " entries." << std::endl; buffer_arena.start(); std::cout << "Amount Used: " << buffer_arena.used() << std::endl; std::cout << "Expected Amount Used: " << left + num_output * sizeof(RectDesc) << std::endl; @@ -409,7 +404,7 @@ void GPUImageMicroOp::gpu_populate_ptrs() CUDA_CHECK(cudaStreamSynchronize(stream), stream); - size_t num_new_rects = 1; + size_t num_new_rects = num_output == 0 ? 
1 : 2; assert(!buffer_arena.get_parity()); RectDesc* d_new_rects; @@ -426,13 +421,9 @@ void GPUImageMicroOp::gpu_populate_ptrs() }); if (num_output==0) { - buffer_arena.flip_parity(); - buffer_arena.reset(true); - output_start = buffer_arena.alloc>(num_new_rects); - buffer_arena.commit(true); - CUDA_CHECK(cudaMemcpyAsync(output_start, d_new_rects, num_new_rects * sizeof(RectDesc), cudaMemcpyDeviceToDevice, stream), stream); num_output = num_new_rects; num_completed += curr_tile; + output_start = d_new_rects; subtract_const<<>>(inst_space.offsets, domain_transform.ptr_data.size()+1, curr_tile); KERNEL_CHECK(stream); curr_tile = tile_size / 2; diff --git a/src/realm/deppart/partitions_gpu_impl.hpp b/src/realm/deppart/partitions_gpu_impl.hpp index 565a413fa0..6f50175ec9 100644 --- a/src/realm/deppart/partitions_gpu_impl.hpp +++ b/src/realm/deppart/partitions_gpu_impl.hpp @@ -1352,7 +1352,9 @@ namespace Realm { NVTX_DEPPART(complete_pipeline); - my_arena.flip_parity(); + if (out_rects == 2) { + my_arena.flip_parity(); + } cudaStream_t stream = Cuda::get_task_cuda_stream(); @@ -1451,12 +1453,21 @@ namespace Realm { std::swap(d_rects_in, d_rects_out); } my_arena.flip_parity(); + if (out_rects == 2) { + assert(!my_arena.get_parity()); + } else if (out_rects == 1) { + assert(my_arena.get_parity()); + my_arena.reset(true); + } d_out_rects = my_arena.alloc>(num_intermediate); + if (out_rects == 1) { + my_arena.commit(true); + } CUDA_CHECK(cudaMemcpyAsync(d_out_rects, d_rects_in, num_intermediate * sizeof(RectDesc), cudaMemcpyDeviceToDevice, stream), stream); CUDA_CHECK(cudaStreamSynchronize(stream), stream); } - if (out_rects==1) { + if (out_rects > 0) { out_rects = num_intermediate; } else { this->send_output(d_rects_in, num_intermediate, my_arena, ctr, getIndex, getMap); From 2182a0400332744c67ee495ceb3da2715c27ce50 Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Sun, 22 Feb 2026 21:46:14 -0800 Subject: [PATCH 16/32] Added host fallback --- 
src/realm/deppart/byfield_gpu_impl.hpp | 101 +- src/realm/deppart/image_gpu_impl.hpp | 113 +- src/realm/deppart/partitions.h | 2 + src/realm/deppart/partitions_gpu_impl.hpp | 112 +- src/realm/deppart/partitions_gpu_kernels.hpp | 8 +- src/realm/deppart/preimage.cc | 2 +- src/realm/deppart/preimage_gpu_impl.hpp | 819 +-- src/realm/deppart/sparsity_impl.cc | 13 + src/realm/deppart/sparsity_impl.h | 1 + tests/CMakeLists.txt | 1 + tests/benchmark.cc | 5019 ++++++++++++++++++ tests/deppart.cc | 16 +- 12 files changed, 5811 insertions(+), 396 deletions(-) create mode 100644 tests/benchmark.cc diff --git a/src/realm/deppart/byfield_gpu_impl.hpp b/src/realm/deppart/byfield_gpu_impl.hpp index 56ab0258a2..e309cf7609 100644 --- a/src/realm/deppart/byfield_gpu_impl.hpp +++ b/src/realm/deppart/byfield_gpu_impl.hpp @@ -89,11 +89,17 @@ void GPUByFieldMicroOp::execute() color_indices[colors[i]] = i; } + Memory sysmem; + assert(find_memory(sysmem, Memory::SYSTEM_MEM)); + size_t num_output = 0; RectDesc* output_start = nullptr; size_t num_completed = 0; size_t curr_tile = tile_size / 2; int count = 0; + bool host_fallback = false; + std::vector h_instances(colors.size(), RegionInstance::NO_INST); + std::vector entry_counts(colors.size(), 0); while (num_completed < inst_space.num_entries) { try { std::cout << "Byfield iteration " << count++ << ", completed " << num_completed << " / " << inst_space.num_entries << " entries." 
<< std::endl; @@ -155,7 +161,11 @@ void GPUByFieldMicroOp::execute() return kv.second; }); - if (num_output==0) { + if (host_fallback) { + this->split_output(d_new_rects, num_new_rects, h_instances, entry_counts, buffer_arena); + } + + if (num_output==0 || host_fallback) { num_output = num_new_rects; output_start = d_new_rects; num_completed += curr_tile; @@ -166,40 +176,44 @@ void GPUByFieldMicroOp::execute() continue; } - //Otherwise we merge with existing rectangles - RectDesc* d_old_rects = buffer_arena.alloc>(num_output); - assert(d_old_rects == d_new_rects + num_new_rects); - CUDA_CHECK(cudaMemcpyAsync(d_old_rects, output_start, num_output * sizeof(RectDesc), cudaMemcpyDeviceToDevice, stream), stream); - CUDA_CHECK(cudaStreamSynchronize(stream), stream); + //Otherwise we merge with existing rectangles + RectDesc* d_old_rects = buffer_arena.alloc>(num_output); + assert(d_old_rects == d_new_rects + num_new_rects); + CUDA_CHECK(cudaMemcpyAsync(d_old_rects, output_start, num_output * sizeof(RectDesc), cudaMemcpyDeviceToDevice, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); - size_t num_final_rects = 1; + size_t num_final_rects = 1; + //Send it off for processing + this->complete_rect_pipeline(d_new_rects, num_output + num_new_rects, output_start, num_final_rects, buffer_arena, + /* the Container: */ sparsity_outputs, + /* getIndex: */ [&](auto const& kv){ + // elem is a SparsityMap from the vector + return color_indices.at(kv.first); + }, + /* getMap: */ [&](auto const& kv){ + // return the SparsityMap key itself + return kv.second; + }); + num_completed += curr_tile; + num_output = num_final_rects; + subtract_const<<>>(inst_space.offsets, field_data.size()+1, curr_tile); + KERNEL_CHECK(stream); + curr_tile = tile_size / 2; + CUDA_CHECK(cudaStreamSynchronize(stream), stream); - //Send it off for processing - this->complete_rect_pipeline(d_new_rects, num_output + num_new_rects, output_start, num_final_rects, buffer_arena, - /* the 
Container: */ sparsity_outputs, - /* getIndex: */ [&](auto const& kv){ - // elem is a SparsityMap from the vector - return color_indices.at(kv.first); - }, - /* getMap: */ [&](auto const& kv){ - // return the SparsityMap key itself - return kv.second; - }); - num_completed += curr_tile; - num_output = num_final_rects; - subtract_const<<>>(inst_space.offsets, field_data.size()+1, curr_tile); - KERNEL_CHECK(stream); - curr_tile = tile_size / 2; - CUDA_CHECK(cudaStreamSynchronize(stream), stream); - } catch (arena_oom&) { - std::cout << "Caught arena_oom, reducing tile size from " << curr_tile << " to " << curr_tile / 2 << std::endl; - std::cout << buffer_arena.used() << " bytes used in arena." << std::endl; - curr_tile /= 2; - if (curr_tile == 0) { - throw; + } catch (arena_oom&) { + std::cout << "Caught arena_oom, reducing tile size from " << curr_tile << " to " << curr_tile / 2 << std::endl; + std::cout << buffer_arena.used() << " bytes used in arena." << std::endl; + curr_tile /= 2; + if (curr_tile == 0) { + host_fallback = true; + if (num_output > 0) { + this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); } + curr_tile = tile_size / 2; } } + } if (num_output == 0) { for (std::pair> it : sparsity_outputs) { @@ -213,7 +227,9 @@ void GPUByFieldMicroOp::execute() return; } - this->send_output(output_start, num_output, buffer_arena, sparsity_outputs, + if (!host_fallback) { + try { + this->send_output(output_start, num_output, buffer_arena, sparsity_outputs, /* getIndex: */ [&](auto const& kv){ // elem is a SparsityMap from the vector return color_indices.at(kv.first); @@ -222,6 +238,29 @@ void GPUByFieldMicroOp::execute() // return the SparsityMap key itself return kv.second; }); + } catch (arena_oom&) { + this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); + host_fallback = true; + } + } + + if (host_fallback) { + for (std::pair> it : sparsity_outputs) { + SparsityMapImpl *impl = 
SparsityMapImpl::lookup(it.second); + if (this->exclusive) { + impl->set_contributor_count(1); + } + size_t idx = color_indices.at(it.first); + if (entry_counts[idx] > 0) { + Rect* h_rects = reinterpret_cast *>(AffineAccessor(h_instances[idx], 0).base); + span> h_rects_span(h_rects, entry_counts[idx]); + impl->contribute_dense_rect_list(h_rects_span, false); + h_instances[idx].destroy(); + } else { + impl->contribute_nothing(); + } + } + } } } diff --git a/src/realm/deppart/image_gpu_impl.hpp b/src/realm/deppart/image_gpu_impl.hpp index b22812a00c..643845296d 100644 --- a/src/realm/deppart/image_gpu_impl.hpp +++ b/src/realm/deppart/image_gpu_impl.hpp @@ -7,6 +7,8 @@ #include #include "realm/nvtx.h" +#include + namespace Realm { //TODO: INTERSECTING INPUT/OUTPUT RECTS CAN BE DONE WITH BVH IF BECOME EXPENSIVE @@ -74,7 +76,6 @@ void GPUImageMicroOp::gpu_populate_rngs() // This will be a prefix sum over the counters, used first to figure out where to write in the emit phase, and second // to track which instance each rectangle came from in the populate phase. uint32_t* d_inst_prefix = d_inst_counters + domain_transform.range_data.size(); - size_t num_valid_rects = tile_size; collapsed_space collapsed_parent; @@ -99,6 +100,10 @@ void GPUImageMicroOp::gpu_populate_rngs() size_t num_completed = 0; size_t curr_tile = tile_size / 2; int count = 0; + + bool host_fallback = false; + std::vector h_instances(sources.size(), RegionInstance::NO_INST); + std::vector entry_counts(sources.size(), 0); while (num_completed < inst_space.num_entries) { try { std::cout << "Image Range iteration " << count++ << ", completed " << num_completed << " / " << inst_space.num_entries << " entries." 
<< std::endl; @@ -112,16 +117,17 @@ void GPUImageMicroOp::gpu_populate_rngs() inst_space_tile.entries_buffer = buffer_arena.alloc>(curr_tile); CUDA_CHECK(cudaMemcpyAsync(inst_space_tile.entries_buffer, inst_space.entries_buffer + num_completed, curr_tile * sizeof(SparsityMapEntry), cudaMemcpyHostToDevice, stream), stream); + size_t num_valid_rects; RectDesc* d_valid_rects; // Here we intersect the instance spaces with the parent space, and make sure we know which instance each resulting rectangle came from. GPUMicroOp::template construct_input_rectlist>(inst_space_tile, src_space, d_valid_rects, num_valid_rects, d_inst_counters, d_inst_prefix, buffer_arena, stream); if (num_valid_rects == 0) { num_completed += curr_tile; - curr_tile = tile_size / 2; subtract_const<<>>(inst_space.offsets, domain_transform.range_data.size()+1, curr_tile); KERNEL_CHECK(stream); CUDA_CHECK(cudaStreamSynchronize(stream), stream); + curr_tile = tile_size / 2; continue; } @@ -158,10 +164,10 @@ void GPUImageMicroOp::gpu_populate_rngs() if (num_valid_output == 0) { num_completed += curr_tile; - curr_tile = tile_size / 2; subtract_const<<>>(inst_space.offsets, domain_transform.range_data.size()+1, curr_tile); KERNEL_CHECK(stream); CUDA_CHECK(cudaStreamSynchronize(stream), stream); + curr_tile = tile_size / 2; continue; } @@ -192,8 +198,12 @@ void GPUImageMicroOp::gpu_populate_rngs() return elem; }); + if (host_fallback) { + this->split_output(d_new_rects, num_new_rects, h_instances, entry_counts, buffer_arena); + } + //Set our first set of output rectangles - if (num_output==0) { + if (num_output==0 || host_fallback) { //We need to place the new output at the rightmost end of the buffer num_output = num_new_rects; @@ -237,13 +247,30 @@ void GPUImageMicroOp::gpu_populate_rngs() std::cout << buffer_arena.used() << " bytes used in arena." 
<< std::endl; curr_tile /= 2; if (curr_tile == 0) { - throw; + host_fallback = true; + if (num_output > 0) { + this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); + } + curr_tile = tile_size / 2; } } } - CUDA_CHECK(cudaStreamSynchronize(stream), stream); - KERNEL_CHECK(stream); - this->send_output(output_start, num_output, buffer_arena, sparsity_outputs, + + if (num_output == 0) { + for (SparsityMap it : sparsity_outputs) { + SparsityMapImpl *impl = SparsityMapImpl::lookup(it); + if (this->exclusive) { + impl->gpu_finalize(); + } else { + impl->contribute_nothing(); + } + } + return; + } + + if (!host_fallback) { + try { + this->send_output(output_start, num_output, buffer_arena, sparsity_outputs, /* getIndex: */ [&](auto const& elem){ // elem is a SparsityMap from the vector return size_t(&elem - sparsity_outputs.data()); @@ -252,6 +279,28 @@ void GPUImageMicroOp::gpu_populate_rngs() // return the SparsityMap key itself return elem; }); + } catch (arena_oom&) { + this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); + host_fallback = true; + } + } + + if (host_fallback) { + for (size_t idx = 0; idx < sparsity_outputs.size(); ++idx) { + SparsityMapImpl *impl = SparsityMapImpl::lookup(sparsity_outputs[idx]); + if (this->exclusive) { + impl->set_contributor_count(1); + } + if (entry_counts[idx] > 0) { + Rect* h_rects = reinterpret_cast *>(AffineAccessor(h_instances[idx], 0).base); + span> h_rects_span(h_rects, entry_counts[idx]); + impl->contribute_dense_rect_list(h_rects_span, false); + h_instances[idx].destroy(); + } else { + impl->contribute_nothing(); + } + } + } } @@ -307,7 +356,6 @@ void GPUImageMicroOp::gpu_populate_ptrs() // This will be a prefix sum over the counters, used first to figure out where to write in the emit phase, and second // to track which instance each rectangle came from in the populate phase. 
uint32_t* d_inst_prefix = d_inst_counters + domain_transform.ptr_data.size(); - size_t num_valid_rects = tile_size; //Uniform for all tiles collapsed_space collapsed_parent; @@ -335,6 +383,9 @@ void GPUImageMicroOp::gpu_populate_ptrs() size_t num_completed = 0; size_t curr_tile = tile_size / 2; int count = 0; + bool host_fallback = false; + std::vector h_instances(sources.size(), RegionInstance::NO_INST); + std::vector entry_counts(sources.size(), 0); while (num_completed < inst_space.num_entries) { try { std::cout << "Image iteration " << count++ << ", completed " << num_completed << " / " << inst_space.num_entries << " entries." << std::endl; @@ -349,15 +400,16 @@ void GPUImageMicroOp::gpu_populate_ptrs() inst_space_tile.entries_buffer = buffer_arena.alloc>(curr_tile); CUDA_CHECK(cudaMemcpyAsync(inst_space_tile.entries_buffer, inst_space.entries_buffer + num_completed, curr_tile * sizeof(SparsityMapEntry), cudaMemcpyHostToDevice, stream), stream); + size_t num_valid_rects; RectDesc* d_valid_rects; GPUMicroOp::template construct_input_rectlist>(inst_space_tile, src_space, d_valid_rects, num_valid_rects, d_inst_counters, d_inst_prefix, buffer_arena, stream); if (num_valid_rects == 0) { num_completed += curr_tile; - curr_tile = tile_size / 2; subtract_const<<>>(inst_space.offsets, domain_transform.ptr_data.size()+1, curr_tile); KERNEL_CHECK(stream); CUDA_CHECK(cudaStreamSynchronize(stream), stream); + curr_tile = tile_size / 2; continue; } @@ -385,10 +437,10 @@ void GPUImageMicroOp::gpu_populate_ptrs() if (num_valid_points == 0) { num_completed += curr_tile; - curr_tile = tile_size / 2; subtract_const<<>>(inst_space.offsets, domain_transform.ptr_data.size()+1, curr_tile); KERNEL_CHECK(stream); CUDA_CHECK(cudaStreamSynchronize(stream), stream); + curr_tile = tile_size / 2; continue; } @@ -420,7 +472,11 @@ void GPUImageMicroOp::gpu_populate_ptrs() return elem; }); - if (num_output==0) { + if (host_fallback) { + this->split_output(d_new_rects, num_new_rects, 
h_instances, entry_counts, buffer_arena); + } + + if (num_output==0 || host_fallback) { num_output = num_new_rects; num_completed += curr_tile; output_start = d_new_rects; @@ -461,7 +517,11 @@ void GPUImageMicroOp::gpu_populate_ptrs() std::cout << buffer_arena.used() << " bytes used in arena." << std::endl; curr_tile /= 2; if (curr_tile == 0) { - throw; + host_fallback = true; + if (num_output > 0) { + this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); + } + curr_tile = tile_size / 2; } } } @@ -477,7 +537,10 @@ void GPUImageMicroOp::gpu_populate_ptrs() } return; } - this->send_output(output_start, num_output, buffer_arena, sparsity_outputs, + + if (!host_fallback) { + try { + this->send_output(output_start, num_output, buffer_arena, sparsity_outputs, /* getIndex: */ [&](auto const& elem){ // elem is a SparsityMap from the vector return size_t(&elem - sparsity_outputs.data()); @@ -486,5 +549,27 @@ void GPUImageMicroOp::gpu_populate_ptrs() // return the SparsityMap key itself return elem; }); + } catch (arena_oom&) { + this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); + host_fallback = true; + } + } + + if (host_fallback) { + for (size_t idx = 0; idx < sparsity_outputs.size(); ++idx) { + SparsityMapImpl *impl = SparsityMapImpl::lookup(sparsity_outputs[idx]); + if (this->exclusive) { + impl->set_contributor_count(1); + } + if (entry_counts[idx] > 0) { + Rect* h_rects = reinterpret_cast *>(AffineAccessor(h_instances[idx], 0).base); + span> h_rects_span(h_rects, entry_counts[idx]); + impl->contribute_dense_rect_list(h_rects_span, false); + h_instances[idx].destroy(); + } else { + impl->contribute_nothing(); + } + } + } } } \ No newline at end of file diff --git a/src/realm/deppart/partitions.h b/src/realm/deppart/partitions.h index 4a8899e251..8b67e5e642 100644 --- a/src/realm/deppart/partitions.h +++ b/src/realm/deppart/partitions.h @@ -370,6 +370,8 @@ namespace Realm { template void 
complete1d_pipeline(RectDesc* d_rects, size_t total_rects, RectDesc* &d_out_rects, size_t &out_rects, Arena &my_arena, const Container& ctr, IndexFn getIndex, MapFn getMap); + void split_output(RectDesc* d_rects, size_t total_rects, std::vector &output_instances, std::vector &output_counts, Arena &my_arena); + template void send_output(RectDesc* d_rects, size_t total_rects, Arena &my_arena, const Container& ctr, IndexFn getIndex, MapFn getMap); diff --git a/src/realm/deppart/partitions_gpu_impl.hpp b/src/realm/deppart/partitions_gpu_impl.hpp index 6f50175ec9..82abfd57d9 100644 --- a/src/realm/deppart/partitions_gpu_impl.hpp +++ b/src/realm/deppart/partitions_gpu_impl.hpp @@ -1474,6 +1474,100 @@ namespace Realm { } } + template + void GPUMicroOp::split_output(RectDesc* d_rects, size_t total_rects, std::vector &output_instances, std::vector &output_counts, Arena &my_arena) + { + NVTX_DEPPART(send_output); + + cudaStream_t stream = Cuda::get_task_cuda_stream(); + bool use_sysmem = false; + RegionInstance sys_instance = RegionInstance::NO_INST; + + Memory sysmem; + assert(find_memory(sysmem, Memory::SYSTEM_MEM)); + + Rect* final_rects; + std::vector d_starts_host(output_instances.size()), d_ends_host(output_instances.size()); + + try { + final_rects = my_arena.alloc>(total_rects); + + size_t* d_starts = my_arena.alloc(2 * output_instances.size()); + size_t* d_ends = d_starts + output_instances.size(); + + CUDA_CHECK(cudaMemsetAsync(d_starts, 0, output_instances.size()*sizeof(size_t),stream), stream); + CUDA_CHECK(cudaMemsetAsync(d_ends, 0, output_instances.size()*sizeof(size_t),stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + //Convert RectDesc to SparsityMapEntry and determine where each src's rectangles start and end. 
+ build_final_output<<>>(d_rects, nullptr, final_rects, d_starts, d_ends, total_rects); + KERNEL_CHECK(stream); + + + //Copy starts and ends back to host and handle empty partitions + + CUDA_CHECK(cudaMemcpyAsync(d_starts_host.data(), d_starts, output_instances.size() * sizeof(size_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaMemcpyAsync(d_ends_host.data(), d_ends, output_instances.size() * sizeof(size_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + } catch (arena_oom&) { + use_sysmem = true; + RegionInstance tmp_instance = this->realm_malloc(total_rects * sizeof(RectDesc), sysmem); + sys_instance = this->realm_malloc(total_rects * sizeof(Rect), sysmem); + RectDesc* h_tmp_rects = reinterpret_cast*>(tmp_instance.pointer_untyped(0, total_rects * sizeof(RectDesc))); + final_rects = reinterpret_cast*>(sys_instance.pointer_untyped(0, total_rects * sizeof(Rect))); + CUDA_CHECK(cudaMemcpyAsync(h_tmp_rects, d_rects, total_rects * sizeof(RectDesc), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + for (size_t idx = 0; idx < total_rects; idx++ ) { + final_rects[idx] = h_tmp_rects[idx].rect; + + //Checks if we're the first value for a given src + if (idx == 0 || h_tmp_rects[idx].src_idx != h_tmp_rects[idx-1].src_idx) { + d_starts_host[h_tmp_rects[idx].src_idx] = idx; + } + + //Checks if we're the last value for a given src + if (idx == total_rects-1 || h_tmp_rects[idx].src_idx != h_tmp_rects[idx+1].src_idx) { + d_ends_host[h_tmp_rects[idx].src_idx] = idx+1; + } + } + tmp_instance.destroy(); + } + + for (size_t i = 1; i < output_instances.size(); i++) { + if (d_starts_host[i] < d_ends_host[i-1]) { + d_starts_host[i] = d_ends_host[i-1]; + d_ends_host[i] = d_ends_host[i-1]; + } + } + + for (size_t i = 0; i < output_instances.size(); i++) { + if (d_ends_host[i] > d_starts_host[i]) { + size_t end = d_ends_host[i]; + size_t start = d_starts_host[i]; + if (end - 
start > 0) { + RegionInstance new_instance = this->realm_malloc(((end - start) + output_counts[i]) * sizeof(Rect), sysmem); + Rect* h_new_rects = reinterpret_cast*>(new_instance.pointer_untyped(0, ((end - start) + output_counts[i]) * sizeof(Rect))); + if (output_counts[i] > 0) { + Rect* h_old_rects = reinterpret_cast*>(output_instances[i].pointer_untyped(0, output_counts[i] * sizeof(Rect))); + std::memcpy(h_new_rects, h_old_rects, output_counts[i] * sizeof(Rect)); + output_instances[i].destroy(); + } + if (use_sysmem) { + std::memcpy(h_new_rects + output_counts[i], final_rects + start, (end - start) * sizeof(Rect)); + } else { + CUDA_CHECK(cudaMemcpyAsync(h_new_rects + output_counts[i], final_rects + start, (end - start) * sizeof(Rect), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + } + output_instances[i] = new_instance; + output_counts[i] += end - start; + } + } + } + if (use_sysmem) { + sys_instance.destroy(); + } + } + /* * Input: An array of disjoint rectangles sorted by src idx. * Output: Fills the sparsity output for each src with a host region instance @@ -1491,8 +1585,6 @@ namespace Realm { cudaStream_t stream = Cuda::get_task_cuda_stream(); - std::set output_allocs; - SparsityMapEntry* final_entries = my_arena.alloc>(total_rects); Rect* final_rects = my_arena.alloc>(total_rects); @@ -1502,9 +1594,6 @@ namespace Realm { CUDA_CHECK(cudaMemsetAsync(d_starts, 0, ctr.size()*sizeof(size_t),stream), stream); CUDA_CHECK(cudaMemsetAsync(d_ends, 0, ctr.size()*sizeof(size_t),stream), stream); - - CUDA_CHECK(cudaStreamSynchronize(stream), stream); - //Convert RectDesc to SparsityMapEntry and determine where each src's rectangles start and end. 
build_final_output<<>>(d_rects, final_entries, final_rects, d_starts, d_ends, total_rects); KERNEL_CHECK(stream); @@ -1522,6 +1611,8 @@ namespace Realm { } } + Memory sysmem; + assert(find_memory(sysmem, Memory::SYSTEM_MEM)); if (!this->exclusive) { for (auto const& elem : ctr) { size_t idx = getIndex(elem); @@ -1530,17 +1621,18 @@ namespace Realm { if (d_ends_host[idx] > d_starts_host[idx]) { size_t end = d_ends_host[idx]; size_t start = d_starts_host[idx]; - std::vector> h_rects(end - start); - CUDA_CHECK(cudaMemcpyAsync(h_rects.data(), final_rects + start, (end - start) * sizeof(Rect), cudaMemcpyDeviceToHost, stream), stream); + RegionInstance h_rects_instance = this->realm_malloc((end - start) * sizeof(Rect), sysmem); + Rect *h_rects = reinterpret_cast *>(AffineAccessor(h_rects_instance, 0).base); + CUDA_CHECK(cudaMemcpyAsync(h_rects, final_rects + start, (end - start) * sizeof(Rect), cudaMemcpyDeviceToHost, stream), stream); CUDA_CHECK(cudaStreamSynchronize(stream), stream); - impl->contribute_dense_rect_list(h_rects, false); + span> h_rects_span(h_rects, end - start); + impl->contribute_dense_rect_list(h_rects_span, false); + h_rects_instance.destroy(); } else { impl->contribute_nothing(); } } } else { - Memory sysmem; - assert(find_memory(sysmem, Memory::SYSTEM_MEM)); //Use provided lambdas to iterate over sparsity output container (map or vector) for (auto const& elem : ctr) { diff --git a/src/realm/deppart/partitions_gpu_kernels.hpp b/src/realm/deppart/partitions_gpu_kernels.hpp index 2f607930d9..b3bd280be4 100644 --- a/src/realm/deppart/partitions_gpu_kernels.hpp +++ b/src/realm/deppart/partitions_gpu_kernels.hpp @@ -794,9 +794,11 @@ void build_final_output(const RectDesc* d_rects, size_t idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= numRects) return; d_rects_out[idx] = d_rects[idx].rect; - d_entries_out[idx].bounds = d_rects[idx].rect; - d_entries_out[idx].sparsity.id = 0; - d_entries_out[idx].bitmap = 0; + if (d_entries_out != nullptr) { + 
d_entries_out[idx].bounds = d_rects[idx].rect; + d_entries_out[idx].sparsity.id = 0; + d_entries_out[idx].bitmap = 0; + } //Checks if we're the first value for a given src if (idx == 0 || d_rects[idx].src_idx != d_rects[idx-1].src_idx) { diff --git a/src/realm/deppart/preimage.cc b/src/realm/deppart/preimage.cc index 4ae8cd4ddc..b25c8b2c41 100644 --- a/src/realm/deppart/preimage.cc +++ b/src/realm/deppart/preimage.cc @@ -408,7 +408,7 @@ namespace Realm { } } bool gpu_data = !gpu_ptr_data.empty() || !gpu_rect_data.empty(); - bool opcount = cpu_ptr_data.size() + cpu_rect_data.size() + gpu_ptr_data.size() + gpu_rect_data.size(); + size_t opcount = cpu_ptr_data.size() + cpu_rect_data.size() + gpu_ptr_data.size() + gpu_rect_data.size(); bool exclusive = (gpu_data && (opcount == 1)); if (domain_transform.type == DomainTransform::DomainTransformType::STRUCTURED && !gpu_data) { diff --git a/src/realm/deppart/preimage_gpu_impl.hpp b/src/realm/deppart/preimage_gpu_impl.hpp index 3793b32458..3e464c582f 100644 --- a/src/realm/deppart/preimage_gpu_impl.hpp +++ b/src/realm/deppart/preimage_gpu_impl.hpp @@ -15,33 +15,28 @@ namespace Realm { return; } - Memory my_mem = domain_transform.range_data[0].inst.get_location(); + RegionInstance buffer = domain_transform.range_data[0].scratch_buffer; - const char* val = std::getenv("TILE_SIZE"); // or any env var - size_t tile_size = 100000000; //default - if (val) { - tile_size = atoi(val); - } - - RegionInstance fixed_buffer = this->realm_malloc(tile_size, my_mem); - Arena buffer_arena(reinterpret_cast(AffineAccessor(fixed_buffer, 0).base), tile_size); + size_t tile_size = buffer.get_layout()->bytes_used; + std::cout << "Using tile size of " << tile_size << " bytes." 
<< std::endl; + Arena buffer_arena(reinterpret_cast(AffineAccessor(buffer, 0).base), tile_size); NVTX_DEPPART(gpu_preimage); + Memory sysmem; + find_memory(sysmem, Memory::SYSTEM_MEM); + cudaStream_t stream = Cuda::get_task_cuda_stream(); collapsed_space inst_space; // We combine all of our instances into one to batch work, tracking the offsets between instances. - RegionInstance inst_offsets_instance = this->realm_malloc((domain_transform.range_data.size() + 1) * sizeof(size_t), my_mem); - inst_space.offsets = reinterpret_cast(AffineAccessor(inst_offsets_instance, 0).base); + inst_space.offsets = buffer_arena.alloc(domain_transform.range_data.size() + 1); inst_space.num_children = domain_transform.range_data.size(); - RegionInstance inst_entries_instance; + Arena sys_arena; + GPUMicroOp::collapse_multi_space(domain_transform.range_data, inst_space, sys_arena, stream); - GPUMicroOp::collapse_multi_space(domain_transform.range_data, inst_space, buffer_arena, stream); - - RegionInstance parent_entries_instance; collapsed_space collapsed_parent; // We collapse the parent space to undifferentiate between dense and sparse and match downstream APIs. @@ -50,53 +45,16 @@ namespace Realm { // This is used for count + emit: first pass counts how many rectangles survive intersection, second pass uses the counter // to figure out where to write each rectangle. - RegionInstance inst_counters_instance = this->realm_malloc((2*domain_transform.range_data.size() + 1) * sizeof(uint32_t), my_mem); - uint32_t* d_inst_counters = reinterpret_cast(AffineAccessor(inst_counters_instance, 0).base); + uint32_t* d_inst_counters = buffer_arena.alloc(2 * domain_transform.range_data.size() + 1); // This will be a prefix sum over the counters, used first to figure out where to write in the emit phase, and second // to track which instance each rectangle came from in the populate phase. 
uint32_t* d_inst_prefix = d_inst_counters + domain_transform.range_data.size(); - RegionInstance out_instance; - size_t num_valid_rects; - - Rect* d_valid_rects; - - // Here we intersect the instance spaces with the parent space, and make sure we know which instance each resulting rectangle came from. - GPUMicroOp::template construct_input_rectlist>(inst_space, collapsed_parent, d_valid_rects, num_valid_rects, d_inst_counters, d_inst_prefix, buffer_arena, stream); - inst_entries_instance.destroy(); - parent_entries_instance.destroy(); - inst_offsets_instance.destroy(); - - if (num_valid_rects == 0) { - for (auto it : sparsity_outputs) { - SparsityMapImpl *impl = SparsityMapImpl::lookup(it); - if (this->exclusive) { - impl->gpu_finalize(); - } else { - impl->contribute_nothing(); - } - } - out_instance.destroy(); - inst_counters_instance.destroy(); - return; - } - - // Prefix sum the valid rectangles by volume. - RegionInstance prefix_rects_instance; - size_t total_pts; - - size_t* d_prefix_rects; - GPUMicroOp::volume_prefix_sum(d_valid_rects, num_valid_rects, d_prefix_rects, total_pts, buffer_arena, stream); - - nvtx_range_push("cuda", "build target entries"); collapsed_space target_space; - RegionInstance offsets_instance = this->realm_malloc((targets.size()+1) * sizeof(size_t), my_mem); - target_space.offsets = reinterpret_cast(AffineAccessor(offsets_instance, 0).base); + target_space.offsets = buffer_arena.alloc(targets.size() + 1); target_space.num_children = targets.size(); - RegionInstance targets_entries_instance; - GPUMicroOp::collapse_multi_space(targets, target_space, buffer_arena, stream); Memory zcpy_mem; @@ -107,135 +65,255 @@ namespace Realm { d_accessors[i] = AffineAccessor,N,T>(domain_transform.range_data[i].inst, domain_transform.range_data[i].field_offset); } - RegionInstance points_instance; - PointDesc* d_points; - size_t num_valid_points; - - RegionInstance target_counters_instance = this->realm_malloc((2*targets.size()+1) * sizeof(uint32_t), 
my_mem); - uint32_t* d_target_counters = reinterpret_cast(AffineAccessor(target_counters_instance, 0).base); + uint32_t* d_target_counters = buffer_arena.alloc(2*targets.size() + 1); uint32_t* d_targets_prefix = d_target_counters + targets.size(); CUDA_CHECK(cudaMemsetAsync(d_target_counters, 0, targets.size() * sizeof(uint32_t), stream), stream); - if (target_space.num_entries > targets.size()) { - BVH preimage_bvh; - RegionInstance bvh_instance; - GPUMicroOp::build_bvh(target_space, preimage_bvh, buffer_arena, stream); - - preimage_gpuPopulateBitmasksPtrsKernel < N, T, N2, T2 ><<>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, preimage_bvh.root, preimage_bvh.childLeft, preimage_bvh.childRight, preimage_bvh.indices, - preimage_bvh.labels, preimage_bvh.boxes, total_pts, num_valid_rects, domain_transform.range_data.size(), preimage_bvh.num_leaves, nullptr, d_target_counters, nullptr); - KERNEL_CHECK(stream); - - std::vector h_target_counters(targets.size()+1); - h_target_counters[0] = 0; // prefix sum starts at 0 - CUDA_CHECK(cudaMemcpyAsync(h_target_counters.data()+1, d_target_counters, targets.size() * sizeof(uint32_t), cudaMemcpyDeviceToHost, stream), stream); - CUDA_CHECK(cudaStreamSynchronize(stream), stream); - for (size_t i = 0; i < targets.size(); ++i) { - h_target_counters[i+1] += h_target_counters[i]; - } + buffer_arena.commit(false); + + size_t num_output = 0; + RectDesc* output_start = nullptr; + size_t num_completed = 0; + size_t curr_tile = tile_size / 2; + int count = 0; + + bool host_fallback = false; + std::vector h_instances(targets.size(), RegionInstance::NO_INST); + std::vector entry_counts(targets.size(), 0); + while (num_completed < inst_space.num_entries) { + try { + + std::cout << "Preimage iteration " << count++ << ", completed " << num_completed << " / " << inst_space.num_entries << " entries." 
<< std::endl; + buffer_arena.start(); + std::cout << "Amount Used: " << buffer_arena.used() << std::endl; + if (num_completed + curr_tile > inst_space.num_entries) { + curr_tile = inst_space.num_entries - num_completed; + } + + collapsed_space inst_space_tile = inst_space; + inst_space_tile.num_entries = curr_tile; + inst_space_tile.entries_buffer = buffer_arena.alloc>(curr_tile); + CUDA_CHECK(cudaMemcpyAsync(inst_space_tile.entries_buffer, inst_space.entries_buffer + num_completed, curr_tile * sizeof(SparsityMapEntry), cudaMemcpyHostToDevice, stream), stream); + + size_t num_valid_rects; + Rect* d_valid_rects; + // Here we intersect the instance spaces with the parent space, and make sure we know which instance each resulting rectangle came from. + GPUMicroOp::template construct_input_rectlist>(inst_space_tile, collapsed_parent, d_valid_rects, num_valid_rects, d_inst_counters, d_inst_prefix, buffer_arena, stream); + + if (num_valid_rects == 0) { + num_completed += curr_tile; + subtract_const<<>>(inst_space.offsets, domain_transform.range_data.size()+1, curr_tile); + KERNEL_CHECK(stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + curr_tile = tile_size / 2; + continue; + } + + // Prefix sum the valid rectangles by volume. 
+ size_t total_pts; + size_t* d_prefix_rects; + GPUMicroOp::volume_prefix_sum(d_valid_rects, num_valid_rects, d_prefix_rects, total_pts, buffer_arena, stream); + + nvtx_range_push("cuda", "build target entries"); - num_valid_points = h_target_counters[targets.size()]; + PointDesc* d_points; + size_t num_valid_points; - if (num_valid_points == 0) { - for (auto it : sparsity_outputs) { - SparsityMapImpl *impl = SparsityMapImpl::lookup(it); - if (this->exclusive) { - impl->gpu_finalize(); - } else { - impl->contribute_nothing(); + CUDA_CHECK(cudaMemsetAsync(d_target_counters, 0, targets.size() * sizeof(uint32_t), stream), stream); + + if (target_space.num_entries > targets.size()) { + + BVH preimage_bvh; + GPUMicroOp::build_bvh(target_space, preimage_bvh, buffer_arena, stream); + + preimage_gpuPopulateBitmasksPtrsKernel < N, T, N2, T2 ><<>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, preimage_bvh.root, preimage_bvh.childLeft, preimage_bvh.childRight, preimage_bvh.indices, + preimage_bvh.labels, preimage_bvh.boxes, total_pts, num_valid_rects, domain_transform.range_data.size(), preimage_bvh.num_leaves, nullptr, d_target_counters, nullptr); + KERNEL_CHECK(stream); + + std::vector h_target_counters(targets.size()+1); + h_target_counters[0] = 0; // prefix sum starts at 0 + CUDA_CHECK(cudaMemcpyAsync(h_target_counters.data()+1, d_target_counters, targets.size() * sizeof(uint32_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + for (size_t i = 0; i < targets.size(); ++i) { + h_target_counters[i+1] += h_target_counters[i]; } - } - target_counters_instance.destroy(); - accessors_instance.destroy(); - targets_entries_instance.destroy(); - offsets_instance.destroy(); - prefix_rects_instance.destroy(); - out_instance.destroy(); - inst_counters_instance.destroy(); - bvh_instance.destroy(); - return; - } - CUDA_CHECK(cudaMemcpyAsync(d_targets_prefix, h_target_counters.data(), (targets.size() + 1) * sizeof(uint32_t), 
cudaMemcpyHostToDevice, stream), stream); - - points_instance = this->realm_malloc(num_valid_points * sizeof(PointDesc), my_mem); - d_points = reinterpret_cast*>(AffineAccessor(points_instance, 0).base); - - CUDA_CHECK(cudaMemsetAsync(d_target_counters, 0, (targets.size()) * sizeof(uint32_t), stream), stream); - - preimage_gpuPopulateBitmasksPtrsKernel < N, T, N2, T2 ><<>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, preimage_bvh.root, preimage_bvh.childLeft, preimage_bvh.childRight, preimage_bvh.indices, - preimage_bvh.labels, preimage_bvh.boxes, total_pts, num_valid_rects, domain_transform.range_data.size(), preimage_bvh.num_leaves, d_targets_prefix, d_target_counters, d_points); - KERNEL_CHECK(stream); - CUDA_CHECK(cudaStreamSynchronize(stream), stream); - bvh_instance.destroy(); - } else { - preimage_dense_populate_bitmasks_kernel < N, T, N2, T2 ><<< COMPUTE_GRID(total_pts), THREADS_PER_BLOCK, 0, stream>>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, target_space.entries_buffer, target_space.offsets, total_pts, - num_valid_rects, domain_transform.range_data.size(), targets.size(), nullptr, d_target_counters, nullptr); - KERNEL_CHECK(stream); - - std::vector h_target_counters(targets.size()+1); - h_target_counters[0] = 0; // prefix sum starts at 0 - CUDA_CHECK(cudaMemcpyAsync(h_target_counters.data()+1, d_target_counters, targets.size() * sizeof(uint32_t), cudaMemcpyDeviceToHost, stream), stream); - CUDA_CHECK(cudaStreamSynchronize(stream), stream); - for (size_t i = 0; i < targets.size(); ++i) { - h_target_counters[i+1] += h_target_counters[i]; - } + num_valid_points = h_target_counters[targets.size()]; - num_valid_points = h_target_counters[targets.size()]; + if (num_valid_points == 0) { + num_completed += curr_tile; + subtract_const<<>>(inst_space.offsets, domain_transform.range_data.size()+1, curr_tile); + KERNEL_CHECK(stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + curr_tile = tile_size / 2; + continue; + } - if 
(num_valid_points == 0) { - for (auto it : sparsity_outputs) { - SparsityMapImpl *impl = SparsityMapImpl::lookup(it); - if (this->exclusive) { - impl->gpu_finalize(); - } else { - impl->contribute_nothing(); + CUDA_CHECK(cudaMemcpyAsync(d_targets_prefix, h_target_counters.data(), (targets.size() + 1) * sizeof(uint32_t), cudaMemcpyHostToDevice, stream), stream); + + buffer_arena.flip_parity(); + d_points = buffer_arena.alloc>(num_valid_points); + + CUDA_CHECK(cudaMemsetAsync(d_target_counters, 0, (targets.size()) * sizeof(uint32_t), stream), stream); + + preimage_gpuPopulateBitmasksPtrsKernel < N, T, N2, T2 ><<>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, preimage_bvh.root, preimage_bvh.childLeft, preimage_bvh.childRight, preimage_bvh.indices, + preimage_bvh.labels, preimage_bvh.boxes, total_pts, num_valid_rects, domain_transform.range_data.size(), preimage_bvh.num_leaves, d_targets_prefix, d_target_counters, d_points); + KERNEL_CHECK(stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + } else { + preimage_dense_populate_bitmasks_kernel< N, T, N2, T2 ><<< COMPUTE_GRID(total_pts), THREADS_PER_BLOCK, 0, stream>>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, target_space.entries_buffer, target_space.offsets, total_pts, + num_valid_rects, domain_transform.range_data.size(), targets.size(), nullptr, d_target_counters, nullptr); + KERNEL_CHECK(stream); + + std::vector h_target_counters(targets.size()+1); + h_target_counters[0] = 0; // prefix sum starts at 0 + CUDA_CHECK(cudaMemcpyAsync(h_target_counters.data()+1, d_target_counters, targets.size() * sizeof(uint32_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + for (size_t i = 0; i < targets.size(); ++i) { + h_target_counters[i+1] += h_target_counters[i]; } + + num_valid_points = h_target_counters[targets.size()]; + + if (num_valid_points == 0) { + num_completed += curr_tile; + subtract_const<<>>(inst_space.offsets, 
domain_transform.range_data.size()+1, curr_tile); + KERNEL_CHECK(stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + curr_tile = tile_size / 2; + continue; + } + + CUDA_CHECK(cudaMemcpyAsync(d_targets_prefix, h_target_counters.data(), (targets.size() + 1) * sizeof(uint32_t), cudaMemcpyHostToDevice, stream), stream); + + buffer_arena.flip_parity(); + d_points = buffer_arena.alloc>(num_valid_points); + + CUDA_CHECK(cudaMemsetAsync(d_target_counters, 0, (targets.size()) * sizeof(uint32_t), stream), stream); + + preimage_dense_populate_bitmasks_kernel< N, T, N2, T2 ><<< COMPUTE_GRID(total_pts), THREADS_PER_BLOCK, 0, stream>>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, target_space.entries_buffer, target_space.offsets, total_pts, + num_valid_rects, domain_transform.range_data.size(), targets.size(), d_targets_prefix, d_target_counters, d_points); + KERNEL_CHECK(stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); } - target_counters_instance.destroy(); - accessors_instance.destroy(); - targets_entries_instance.destroy(); - offsets_instance.destroy(); - prefix_rects_instance.destroy(); - out_instance.destroy(); - inst_counters_instance.destroy(); - return; - } - CUDA_CHECK(cudaMemcpyAsync(d_targets_prefix, h_target_counters.data(), (targets.size() + 1) * sizeof(uint32_t), cudaMemcpyHostToDevice, stream), stream); + buffer_arena.flip_parity(); + buffer_arena.flip_parity(); + d_points = buffer_arena.alloc>(num_valid_points); + + size_t num_new_rects = num_output == 0 ? 
1 : 2; + assert(!buffer_arena.get_parity()); + RectDesc* d_new_rects; + + this->complete_pipeline(d_points, num_valid_points, d_new_rects, num_new_rects, buffer_arena, + /* the Container: */ sparsity_outputs, + /* getIndex: */ [&](auto const& elem){ + // elem is a SparsityMap from the vector + return size_t(&elem - sparsity_outputs.data()); + }, + /* getMap: */ [&](auto const& elem){ + // return the SparsityMap key itself + return elem; + }); + + if (host_fallback) { + this->split_output(d_new_rects, num_new_rects, h_instances, entry_counts, buffer_arena); + } - points_instance = this->realm_malloc(num_valid_points * sizeof(PointDesc), my_mem); - d_points = reinterpret_cast*>(AffineAccessor(points_instance, 0).base); + if (num_output==0 || host_fallback) { + num_output = num_new_rects; + num_completed += curr_tile; + output_start = d_new_rects; + subtract_const<<>>(inst_space.offsets, domain_transform.range_data.size()+1, curr_tile); + KERNEL_CHECK(stream); + curr_tile = tile_size / 2; + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + continue; + } - CUDA_CHECK(cudaMemsetAsync(d_target_counters, 0, (targets.size()) * sizeof(uint32_t), stream), stream); + RectDesc* d_old_rects = buffer_arena.alloc>(num_output); + assert(d_old_rects == d_new_rects + num_new_rects); + CUDA_CHECK(cudaMemcpyAsync(d_old_rects, output_start, num_output * sizeof(RectDesc), cudaMemcpyDeviceToDevice, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + size_t num_final_rects = 1; + + //Send it off for processing + this->complete_rect_pipeline(d_new_rects, num_output + num_new_rects, output_start, num_final_rects, buffer_arena, + /* the Container: */ sparsity_outputs, + /* getIndex: */ [&](auto const& elem){ + // elem is a SparsityMap from the vector + return size_t(&elem - sparsity_outputs.data()); + }, + /* getMap: */ [&](auto const& elem){ + // return the SparsityMap key itself + return elem; + }); + num_completed += curr_tile; + num_output = num_final_rects; + 
subtract_const<<>>(inst_space.offsets, domain_transform.range_data.size()+1, curr_tile); + KERNEL_CHECK(stream); + curr_tile = tile_size / 2; + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + } catch (arena_oom&) { + std::cout << "Caught arena_oom, reducing tile size from " << curr_tile << " to " << curr_tile / 2 << std::endl; + std::cout << buffer_arena.used() << " bytes used in arena." << std::endl; + curr_tile /= 2; + if (curr_tile == 0) { + host_fallback = true; + if (num_output > 0) { + this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); + } + curr_tile = tile_size / 2; + } + } + } + if (num_output == 0) { + for (SparsityMap it : sparsity_outputs) { + SparsityMapImpl *impl = SparsityMapImpl::lookup(it); + if (this->exclusive) { + impl->gpu_finalize(); + } else { + impl->contribute_nothing(); + } + } + return; + } - preimage_dense_populate_bitmasks_kernel < N, T, N2, T2 ><<< COMPUTE_GRID(total_pts), THREADS_PER_BLOCK, 0, stream>>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, target_space.entries_buffer, target_space.offsets, total_pts, - num_valid_rects, domain_transform.range_data.size(), targets.size(), d_targets_prefix, d_target_counters, d_points); - KERNEL_CHECK(stream); - CUDA_CHECK(cudaStreamSynchronize(stream), stream); + if (!host_fallback) { + try { + this->send_output(output_start, num_output, buffer_arena, sparsity_outputs, + /* getIndex: */ [&](auto const& elem){ + // elem is a SparsityMap from the vector + return size_t(&elem - sparsity_outputs.data()); + }, + /* getMap: */ [&](auto const& elem){ + // return the SparsityMap key itself + return elem; + }); + } catch (arena_oom&) { + this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); + host_fallback = true; + } } - target_counters_instance.destroy(); - accessors_instance.destroy(); - targets_entries_instance.destroy(); - offsets_instance.destroy(); - prefix_rects_instance.destroy(); - out_instance.destroy(); 
- inst_counters_instance.destroy(); - - size_t out_rects = 0; - RectDesc* trash; - this->complete_pipeline(d_points, num_valid_points, trash, out_rects, buffer_arena, - /* the Container: */ sparsity_outputs, - /* getIndex: */ [&](auto const& elem){ - // elem is a SparsityMap from the vector - return size_t(&elem - sparsity_outputs.data()); - }, - /* getMap: */ [&](auto const& elem){ - // return the SparsityMap key itself - return elem; - }); - - points_instance.destroy(); + if (host_fallback) { + for (size_t idx = 0; idx < sparsity_outputs.size(); ++idx) { + SparsityMapImpl *impl = SparsityMapImpl::lookup(sparsity_outputs[idx]); + if (this->exclusive) { + impl->set_contributor_count(1); + } + if (entry_counts[idx] > 0) { + Rect* h_rects = reinterpret_cast *>(AffineAccessor(h_instances[idx], 0).base); + span> h_rects_span(h_rects, entry_counts[idx]); + impl->contribute_dense_rect_list(h_rects_span, false); + h_instances[idx].destroy(); + } else { + impl->contribute_nothing(); + } + } + } } template @@ -244,33 +322,28 @@ namespace Realm { return; } - Memory my_mem = domain_transform.ptr_data[0].inst.get_location(); - - const char* val = std::getenv("TILE_SIZE"); // or any env var - size_t tile_size = 100000000; //default - if (val) { - tile_size = atoi(val); - } + RegionInstance buffer = domain_transform.ptr_data[0].scratch_buffer; - RegionInstance fixed_buffer = this->realm_malloc(tile_size, my_mem); - Arena buffer_arena(reinterpret_cast(AffineAccessor(fixed_buffer, 0).base), tile_size); + size_t tile_size = buffer.get_layout()->bytes_used; + std::cout << "Using tile size of " << tile_size << " bytes." << std::endl; + Arena buffer_arena(reinterpret_cast(AffineAccessor(buffer, 0).base), tile_size); NVTX_DEPPART(gpu_preimage); + Memory sysmem; + find_memory(sysmem, Memory::SYSTEM_MEM); + cudaStream_t stream = Cuda::get_task_cuda_stream(); collapsed_space inst_space; // We combine all of our instances into one to batch work, tracking the offsets between instances. 
- RegionInstance inst_offsets_instance = this->realm_malloc((domain_transform.ptr_data.size() + 1) * sizeof(size_t), my_mem); - inst_space.offsets = reinterpret_cast(AffineAccessor(inst_offsets_instance, 0).base); + inst_space.offsets = buffer_arena.alloc(domain_transform.ptr_data.size() + 1); inst_space.num_children = domain_transform.ptr_data.size(); - RegionInstance inst_entries_instance; - - GPUMicroOp::collapse_multi_space(domain_transform.ptr_data, inst_space, buffer_arena, stream); + Arena sys_arena; + GPUMicroOp::collapse_multi_space(domain_transform.ptr_data, inst_space, sys_arena, stream); - RegionInstance parent_entries_instance; collapsed_space collapsed_parent; // We collapse the parent space to undifferentiate between dense and sparse and match downstream APIs. @@ -279,52 +352,16 @@ namespace Realm { // This is used for count + emit: first pass counts how many rectangles survive intersection, second pass uses the counter // to figure out where to write each rectangle. - RegionInstance inst_counters_instance = this->realm_malloc((2*domain_transform.ptr_data.size() + 1) * sizeof(uint32_t), my_mem); - uint32_t* d_inst_counters = reinterpret_cast(AffineAccessor(inst_counters_instance, 0).base); + uint32_t* d_inst_counters = buffer_arena.alloc(2 * domain_transform.ptr_data.size() + 1); // This will be a prefix sum over the counters, used first to figure out where to write in the emit phase, and second // to track which instance each rectangle came from in the populate phase. uint32_t* d_inst_prefix = d_inst_counters + domain_transform.ptr_data.size(); - RegionInstance out_instance; - size_t num_valid_rects; - - Rect* d_valid_rects; - // Here we intersect the instance spaces with the parent space, and make sure we know which instance each resulting rectangle came from. 
- GPUMicroOp::template construct_input_rectlist>(inst_space, collapsed_parent, d_valid_rects, num_valid_rects, d_inst_counters, d_inst_prefix, buffer_arena, stream); - inst_entries_instance.destroy(); - parent_entries_instance.destroy(); - inst_offsets_instance.destroy(); - - if (num_valid_rects == 0) { - for (auto it : sparsity_outputs) { - SparsityMapImpl *impl = SparsityMapImpl::lookup(it); - if (this->exclusive) { - impl->gpu_finalize(); - } else { - impl->contribute_nothing(); - } - } - out_instance.destroy(); - inst_counters_instance.destroy(); - return; - } - - // Prefix sum the valid rectangles by volume. - RegionInstance prefix_rects_instance; - size_t total_pts; - - size_t* d_prefix_rects; - GPUMicroOp::volume_prefix_sum(d_valid_rects, num_valid_rects, d_prefix_rects, total_pts, buffer_arena, stream); - - nvtx_range_push("cuda", "build target entries"); collapsed_space target_space; - RegionInstance offsets_instance = this->realm_malloc((targets.size()+1) * sizeof(size_t), my_mem); - target_space.offsets = reinterpret_cast(AffineAccessor(offsets_instance, 0).base); + target_space.offsets = buffer_arena.alloc(targets.size() + 1); target_space.num_children = targets.size(); - RegionInstance targets_entries_instance; - GPUMicroOp::collapse_multi_space(targets, target_space, buffer_arena, stream); Memory zcpy_mem; @@ -335,134 +372,254 @@ namespace Realm { d_accessors[i] = AffineAccessor,N,T>(domain_transform.ptr_data[i].inst, domain_transform.ptr_data[i].field_offset); } - RegionInstance points_instance; - PointDesc* d_points; - size_t num_valid_points; - - RegionInstance target_counters_instance = this->realm_malloc((2*targets.size()+1) * sizeof(uint32_t), my_mem); - uint32_t* d_target_counters = reinterpret_cast(AffineAccessor(target_counters_instance, 0).base); + uint32_t* d_target_counters = buffer_arena.alloc(2*targets.size() + 1); uint32_t* d_targets_prefix = d_target_counters + targets.size(); CUDA_CHECK(cudaMemsetAsync(d_target_counters, 0, 
targets.size() * sizeof(uint32_t), stream), stream); - if (target_space.num_entries > targets.size()) { - BVH preimage_bvh; - RegionInstance bvh_instance; - GPUMicroOp::build_bvh(target_space, preimage_bvh, buffer_arena, stream); - - preimage_gpuPopulateBitmasksPtrsKernel < N, T, N2, T2 ><<>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, preimage_bvh.root, preimage_bvh.childLeft, preimage_bvh.childRight, preimage_bvh.indices, - preimage_bvh.labels, preimage_bvh.boxes, total_pts, num_valid_rects, domain_transform.ptr_data.size(), preimage_bvh.num_leaves, nullptr, d_target_counters, nullptr); - KERNEL_CHECK(stream); - - std::vector h_target_counters(targets.size()+1); - h_target_counters[0] = 0; // prefix sum starts at 0 - CUDA_CHECK(cudaMemcpyAsync(h_target_counters.data()+1, d_target_counters, targets.size() * sizeof(uint32_t), cudaMemcpyDeviceToHost, stream), stream); - CUDA_CHECK(cudaStreamSynchronize(stream), stream); - for (size_t i = 0; i < targets.size(); ++i) { - h_target_counters[i+1] += h_target_counters[i]; - } + buffer_arena.commit(false); + + size_t num_output = 0; + RectDesc* output_start = nullptr; + size_t num_completed = 0; + size_t curr_tile = tile_size / 2; + int count = 0; + + bool host_fallback = false; + std::vector h_instances(targets.size(), RegionInstance::NO_INST); + std::vector entry_counts(targets.size(), 0); + while (num_completed < inst_space.num_entries) { + try { + + std::cout << "Preimage iteration " << count++ << ", completed " << num_completed << " / " << inst_space.num_entries << " entries." 
<< std::endl; + buffer_arena.start(); + std::cout << "Amount Used: " << buffer_arena.used() << std::endl; + if (num_completed + curr_tile > inst_space.num_entries) { + curr_tile = inst_space.num_entries - num_completed; + } + + collapsed_space inst_space_tile = inst_space; + inst_space_tile.num_entries = curr_tile; + inst_space_tile.entries_buffer = buffer_arena.alloc>(curr_tile); + CUDA_CHECK(cudaMemcpyAsync(inst_space_tile.entries_buffer, inst_space.entries_buffer + num_completed, curr_tile * sizeof(SparsityMapEntry), cudaMemcpyHostToDevice, stream), stream); + + size_t num_valid_rects; + Rect* d_valid_rects; + // Here we intersect the instance spaces with the parent space, and make sure we know which instance each resulting rectangle came from. + GPUMicroOp::template construct_input_rectlist>(inst_space_tile, collapsed_parent, d_valid_rects, num_valid_rects, d_inst_counters, d_inst_prefix, buffer_arena, stream); + + if (num_valid_rects == 0) { + num_completed += curr_tile; + subtract_const<<>>(inst_space.offsets, domain_transform.ptr_data.size()+1, curr_tile); + KERNEL_CHECK(stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + curr_tile = tile_size / 2; + continue; + } + + // Prefix sum the valid rectangles by volume. 
+ size_t total_pts; + size_t* d_prefix_rects; + GPUMicroOp::volume_prefix_sum(d_valid_rects, num_valid_rects, d_prefix_rects, total_pts, buffer_arena, stream); + + nvtx_range_push("cuda", "build target entries"); + + PointDesc* d_points; + size_t num_valid_points; - num_valid_points = h_target_counters[targets.size()]; + CUDA_CHECK(cudaMemsetAsync(d_target_counters, 0, (targets.size()) * sizeof(uint32_t), stream), stream); - if (num_valid_points == 0) { - for (auto it : sparsity_outputs) { - SparsityMapImpl *impl = SparsityMapImpl::lookup(it); - if (this->exclusive) { - impl->gpu_finalize(); - } else { - impl->contribute_nothing(); + if (target_space.num_entries > targets.size()) { + + BVH preimage_bvh; + GPUMicroOp::build_bvh(target_space, preimage_bvh, buffer_arena, stream); + + preimage_gpuPopulateBitmasksPtrsKernel < N, T, N2, T2 ><<>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, preimage_bvh.root, preimage_bvh.childLeft, preimage_bvh.childRight, preimage_bvh.indices, + preimage_bvh.labels, preimage_bvh.boxes, total_pts, num_valid_rects, domain_transform.ptr_data.size(), preimage_bvh.num_leaves, nullptr, d_target_counters, nullptr); + KERNEL_CHECK(stream); + + std::vector h_target_counters(targets.size()+1); + h_target_counters[0] = 0; // prefix sum starts at 0 + CUDA_CHECK(cudaMemcpyAsync(h_target_counters.data()+1, d_target_counters, targets.size() * sizeof(uint32_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + for (size_t i = 0; i < targets.size(); ++i) { + h_target_counters[i+1] += h_target_counters[i]; } - } - target_counters_instance.destroy(); - accessors_instance.destroy(); - targets_entries_instance.destroy(); - offsets_instance.destroy(); - prefix_rects_instance.destroy(); - out_instance.destroy(); - inst_counters_instance.destroy(); - bvh_instance.destroy(); - return; - } - CUDA_CHECK(cudaMemcpyAsync(d_targets_prefix, h_target_counters.data(), (targets.size() + 1) * sizeof(uint32_t), 
cudaMemcpyHostToDevice, stream), stream); - - points_instance = this->realm_malloc(num_valid_points * sizeof(PointDesc), my_mem); - d_points = reinterpret_cast*>(AffineAccessor(points_instance, 0).base); - - CUDA_CHECK(cudaMemsetAsync(d_target_counters, 0, (targets.size()) * sizeof(uint32_t), stream), stream); - - preimage_gpuPopulateBitmasksPtrsKernel < N, T, N2, T2 ><<>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, preimage_bvh.root, preimage_bvh.childLeft, preimage_bvh.childRight, preimage_bvh.indices, - preimage_bvh.labels, preimage_bvh.boxes, total_pts, num_valid_rects, domain_transform.ptr_data.size(), preimage_bvh.num_leaves, d_targets_prefix, d_target_counters, d_points); - KERNEL_CHECK(stream); - CUDA_CHECK(cudaStreamSynchronize(stream), stream); - bvh_instance.destroy(); - } else { - preimage_dense_populate_bitmasks_kernel< N, T, N2, T2 ><<< COMPUTE_GRID(total_pts), THREADS_PER_BLOCK, 0, stream>>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, target_space.entries_buffer, target_space.offsets, total_pts, - num_valid_rects, domain_transform.ptr_data.size(), targets.size(), nullptr, d_target_counters, nullptr); - KERNEL_CHECK(stream); - - std::vector h_target_counters(targets.size()+1); - h_target_counters[0] = 0; // prefix sum starts at 0 - CUDA_CHECK(cudaMemcpyAsync(h_target_counters.data()+1, d_target_counters, targets.size() * sizeof(uint32_t), cudaMemcpyDeviceToHost, stream), stream); - CUDA_CHECK(cudaStreamSynchronize(stream), stream); - for (size_t i = 0; i < targets.size(); ++i) { - h_target_counters[i+1] += h_target_counters[i]; - } + num_valid_points = h_target_counters[targets.size()]; + + if (num_valid_points == 0) { + num_completed += curr_tile; + subtract_const<<>>(inst_space.offsets, domain_transform.ptr_data.size()+1, curr_tile); + KERNEL_CHECK(stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + curr_tile = tile_size / 2; + continue; + } - num_valid_points = h_target_counters[targets.size()]; + 
CUDA_CHECK(cudaMemcpyAsync(d_targets_prefix, h_target_counters.data(), (targets.size() + 1) * sizeof(uint32_t), cudaMemcpyHostToDevice, stream), stream); - if (num_valid_points == 0) { - for (auto it : sparsity_outputs) { - SparsityMapImpl *impl = SparsityMapImpl::lookup(it); - if (this->exclusive) { - impl->gpu_finalize(); - } else { - impl->contribute_nothing(); + buffer_arena.flip_parity(); + d_points = buffer_arena.alloc>(num_valid_points); + + CUDA_CHECK(cudaMemsetAsync(d_target_counters, 0, (targets.size()) * sizeof(uint32_t), stream), stream); + + preimage_gpuPopulateBitmasksPtrsKernel < N, T, N2, T2 ><<>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, preimage_bvh.root, preimage_bvh.childLeft, preimage_bvh.childRight, preimage_bvh.indices, + preimage_bvh.labels, preimage_bvh.boxes, total_pts, num_valid_rects, domain_transform.ptr_data.size(), preimage_bvh.num_leaves, d_targets_prefix, d_target_counters, d_points); + KERNEL_CHECK(stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + } else { + preimage_dense_populate_bitmasks_kernel< N, T, N2, T2 ><<< COMPUTE_GRID(total_pts), THREADS_PER_BLOCK, 0, stream>>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, target_space.entries_buffer, target_space.offsets, total_pts, + num_valid_rects, domain_transform.ptr_data.size(), targets.size(), nullptr, d_target_counters, nullptr); + KERNEL_CHECK(stream); + + std::vector h_target_counters(targets.size()+1); + h_target_counters[0] = 0; // prefix sum starts at 0 + CUDA_CHECK(cudaMemcpyAsync(h_target_counters.data()+1, d_target_counters, targets.size() * sizeof(uint32_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + for (size_t i = 0; i < targets.size(); ++i) { + h_target_counters[i+1] += h_target_counters[i]; } + + num_valid_points = h_target_counters[targets.size()]; + + if (num_valid_points == 0) { + num_completed += curr_tile; + subtract_const<<>>(inst_space.offsets, 
domain_transform.ptr_data.size()+1, curr_tile); + KERNEL_CHECK(stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + curr_tile = tile_size / 2; + continue; + } + + CUDA_CHECK(cudaMemcpyAsync(d_targets_prefix, h_target_counters.data(), (targets.size() + 1) * sizeof(uint32_t), cudaMemcpyHostToDevice, stream), stream); + + buffer_arena.flip_parity(); + d_points = buffer_arena.alloc>(num_valid_points); + + CUDA_CHECK(cudaMemsetAsync(d_target_counters, 0, (targets.size()) * sizeof(uint32_t), stream), stream); + + preimage_dense_populate_bitmasks_kernel< N, T, N2, T2 ><<< COMPUTE_GRID(total_pts), THREADS_PER_BLOCK, 0, stream>>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, target_space.entries_buffer, target_space.offsets, total_pts, + num_valid_rects, domain_transform.ptr_data.size(), targets.size(), d_targets_prefix, d_target_counters, d_points); + KERNEL_CHECK(stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); } - target_counters_instance.destroy(); - accessors_instance.destroy(); - targets_entries_instance.destroy(); - offsets_instance.destroy(); - prefix_rects_instance.destroy(); - out_instance.destroy(); - inst_counters_instance.destroy(); - return; - } - CUDA_CHECK(cudaMemcpyAsync(d_targets_prefix, h_target_counters.data(), (targets.size() + 1) * sizeof(uint32_t), cudaMemcpyHostToDevice, stream), stream); + buffer_arena.flip_parity(); + buffer_arena.flip_parity(); + d_points = buffer_arena.alloc>(num_valid_points); + + size_t num_new_rects = num_output == 0 ? 
1 : 2; + assert(!buffer_arena.get_parity()); + RectDesc* d_new_rects; + + this->complete_pipeline(d_points, num_valid_points, d_new_rects, num_new_rects, buffer_arena, + /* the Container: */ sparsity_outputs, + /* getIndex: */ [&](auto const& elem){ + // elem is a SparsityMap from the vector + return size_t(&elem - sparsity_outputs.data()); + }, + /* getMap: */ [&](auto const& elem){ + // return the SparsityMap key itself + return elem; + }); + + if (host_fallback) { + this->split_output(d_new_rects, num_new_rects, h_instances, entry_counts, buffer_arena); + } - points_instance = this->realm_malloc(num_valid_points * sizeof(PointDesc), my_mem); - d_points = reinterpret_cast*>(AffineAccessor(points_instance, 0).base); + if (num_output==0 || host_fallback) { + num_output = num_new_rects; + num_completed += curr_tile; + output_start = d_new_rects; + subtract_const<<>>(inst_space.offsets, domain_transform.ptr_data.size()+1, curr_tile); + KERNEL_CHECK(stream); + curr_tile = tile_size / 2; + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + continue; + } - CUDA_CHECK(cudaMemsetAsync(d_target_counters, 0, (targets.size()) * sizeof(uint32_t), stream), stream); + RectDesc* d_old_rects = buffer_arena.alloc>(num_output); + assert(d_old_rects == d_new_rects + num_new_rects); + CUDA_CHECK(cudaMemcpyAsync(d_old_rects, output_start, num_output * sizeof(RectDesc), cudaMemcpyDeviceToDevice, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + size_t num_final_rects = 1; + + //Send it off for processing + this->complete_rect_pipeline(d_new_rects, num_output + num_new_rects, output_start, num_final_rects, buffer_arena, + /* the Container: */ sparsity_outputs, + /* getIndex: */ [&](auto const& elem){ + // elem is a SparsityMap from the vector + return size_t(&elem - sparsity_outputs.data()); + }, + /* getMap: */ [&](auto const& elem){ + // return the SparsityMap key itself + return elem; + }); + num_completed += curr_tile; + num_output = num_final_rects; + 
subtract_const<<>>(inst_space.offsets, domain_transform.ptr_data.size()+1, curr_tile); + KERNEL_CHECK(stream); + curr_tile = tile_size / 2; + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + } catch (arena_oom&) { + std::cout << "Caught arena_oom, reducing tile size from " << curr_tile << " to " << curr_tile / 2 << std::endl; + std::cout << buffer_arena.used() << " bytes used in arena." << std::endl; + curr_tile /= 2; + if (curr_tile == 0) { + host_fallback = true; + if (num_output > 0) { + this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); + } + curr_tile = tile_size / 2; + } + } + } + if (num_output == 0) { + for (SparsityMap it : sparsity_outputs) { + SparsityMapImpl *impl = SparsityMapImpl::lookup(it); + if (this->exclusive) { + impl->gpu_finalize(); + } else { + impl->contribute_nothing(); + } + } + return; + } - preimage_dense_populate_bitmasks_kernel< N, T, N2, T2 ><<< COMPUTE_GRID(total_pts), THREADS_PER_BLOCK, 0, stream>>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, target_space.entries_buffer, target_space.offsets, total_pts, - num_valid_rects, domain_transform.ptr_data.size(), targets.size(), d_targets_prefix, d_target_counters, d_points); - KERNEL_CHECK(stream); - CUDA_CHECK(cudaStreamSynchronize(stream), stream); + if (!host_fallback) { + try { + this->send_output(output_start, num_output, buffer_arena, sparsity_outputs, + /* getIndex: */ [&](auto const& elem){ + // elem is a SparsityMap from the vector + return size_t(&elem - sparsity_outputs.data()); + }, + /* getMap: */ [&](auto const& elem){ + // return the SparsityMap key itself + return elem; + }); + } catch (arena_oom&) { + this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); + host_fallback = true; + } } - target_counters_instance.destroy(); - accessors_instance.destroy(); - targets_entries_instance.destroy(); - offsets_instance.destroy(); - prefix_rects_instance.destroy(); - out_instance.destroy(); - 
inst_counters_instance.destroy(); - - size_t out_rects = 0; - RectDesc* trash; - this->complete_pipeline(d_points, num_valid_points, trash, out_rects, buffer_arena, - /* the Container: */ sparsity_outputs, - /* getIndex: */ [&](auto const& elem){ - // elem is a SparsityMap from the vector - return size_t(&elem - sparsity_outputs.data()); - }, - /* getMap: */ [&](auto const& elem){ - // return the SparsityMap key itself - return elem; - }); - - points_instance.destroy(); + if (host_fallback) { + for (size_t idx = 0; idx < sparsity_outputs.size(); ++idx) { + SparsityMapImpl *impl = SparsityMapImpl::lookup(sparsity_outputs[idx]); + if (this->exclusive) { + impl->set_contributor_count(1); + } + if (entry_counts[idx] > 0) { + Rect* h_rects = reinterpret_cast *>(AffineAccessor(h_instances[idx], 0).base); + span> h_rects_span(h_rects, entry_counts[idx]); + impl->contribute_dense_rect_list(h_rects_span, false); + h_instances[idx].destroy(); + } else { + impl->contribute_nothing(); + } + } + } } } \ No newline at end of file diff --git a/src/realm/deppart/sparsity_impl.cc b/src/realm/deppart/sparsity_impl.cc index c674a98b32..b4938edb3b 100644 --- a/src/realm/deppart/sparsity_impl.cc +++ b/src/realm/deppart/sparsity_impl.cc @@ -1144,6 +1144,19 @@ SparsityMapImpl::~SparsityMapImpl(void) contribute_raw_rects((rects.empty() ? 
0 : &rects[0]), rects.size(), 1, disjoint, 0); } + template + void + SparsityMapImpl::contribute_dense_rect_list(const span> &rects, + bool disjoint) + { + + HybridRectangleList h_rect_list; + for (size_t i = 0; i < rects.size(); ++i) { + h_rect_list.add_rect(rects[i]); + } + contribute_dense_rect_list(h_rect_list.convert_to_vector(), disjoint); + } + template void SparsityMapImpl::contribute_raw_rects(const Rect *rects, size_t count, size_t piece_count, bool disjoint, diff --git a/src/realm/deppart/sparsity_impl.h b/src/realm/deppart/sparsity_impl.h index 2618f4decc..f9656e65b6 100644 --- a/src/realm/deppart/sparsity_impl.h +++ b/src/realm/deppart/sparsity_impl.h @@ -127,6 +127,7 @@ namespace Realm { void contribute_nothing(void); void contribute_dense_rect_list(const std::vector> &rects, bool disjoint); + void contribute_dense_rect_list(const span> &rects, bool disjoint); void contribute_raw_rects(const Rect *rects, size_t count, size_t piece_count, bool disjoint, size_t total_count); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index e166888637..bc6123b299 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -277,6 +277,7 @@ add_integration_test(transpose "${REALM_TEST_DIR}/transpose.cc") set(proc_group_ARGS -ll:cpu 4) add_integration_test(proc_group "${REALM_TEST_DIR}/proc_group.cc") add_integration_test(deppart "${REALM_TEST_DIR}/deppart.cc") +add_integration_test(benchmark "${REALM_TEST_DIR}/benchmark.cc") set(scatter_ARGS -p1 2 -p2 2) add_integration_test(scatter "${REALM_TEST_DIR}/scatter.cc") set(proc_group_ARGS -ll:cpu 4) diff --git a/tests/benchmark.cc b/tests/benchmark.cc new file mode 100644 index 0000000000..b6847f5513 --- /dev/null +++ b/tests/benchmark.cc @@ -0,0 +1,5019 @@ +/* + * Copyright 2025 Stanford University, NVIDIA Corporation + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "realm.h" + +#include +#include +#include +#include +#include +#include + +#include + +#include "osdep.h" + +#include "philox.h" + +using namespace Realm; + +#define USE_IMAGE_DIFF + +Logger log_app("app"); + +// Task IDs, some IDs are reserved so start at first available number +enum +{ + TOP_LEVEL_TASK = Processor::TASK_ID_FIRST_AVAILABLE + 0, + INIT_CIRCUIT_DATA_TASK, + INIT_BASIC_DATA_TASK, + INIT_TILE_DATA_TASK, + INIT_RANGE_DATA_TASK, + INIT_RANGE2D_DATA_TASK, + INIT_PENNANT_DATA_TASK, + INIT_MINIAERO_DATA_TASK, +}; + +enum TransformType +{ + AFFINE = 0, + TRANSLATION = 1, +}; + +namespace std { + template + std::ostream &operator<<(std::ostream &os, const std::vector &v) + { + os << v.size() << "{"; + if(v.empty()) { + os << "}"; + } else { + os << " "; + typename std::vector::const_iterator it = v.begin(); + os << *it; + ++it; + while(it != v.end()) { + os << ", " << *it; + ++it; + } + os << " }"; + } + return os; + } +}; // namespace std + +// we're going to use alarm() as a watchdog to detect deadlocks +void sigalrm_handler(int sig) +{ + fprintf(stderr, "HELP! 
Alarm triggered - likely deadlock!\n"); + exit(1); +} + +template +void dump_sparse_index_space(const char *pfx, IndexSpace is) +{ + std::cout << pfx << ": " << is << "\n"; + if(!is.sparsity.exists()) + return; + SparsityMapPublicImpl *impl = is.sparsity.impl(); + span> entries = impl->get_entries(); + for(size_t i = 0; i < entries.size(); i++) { + SparsityMapEntry entry = entries[i]; + std::cout << " " << entry.bounds; + if(entry.bitmap) + std::cout << " bitmap(" << entry.bitmap << ")"; + if(entry.sparsity.exists()) + std::cout << " sparsity(" << entry.sparsity << ")"; + std::cout << "\n"; + } +} + +static int check_empty(Event e, const std::vector> &p, const char *pfx) +{ + int errors = 0; + e.wait(); + for(size_t i = 0; i < p.size(); i++) { + p[i].make_valid().wait(); + if(p[i].volume() > 0) { + log_app.error() << "HELP! " << pfx << "[" << i << "] space " << p[i] + << " isn't empty?"; + dump_sparse_index_space(pfx, p[i]); + errors++; + } + } + return errors; +} + +class TestInterface { +public: + virtual ~TestInterface(void) {} + + virtual void print_info(void) = 0; + + virtual Event initialize_data(const std::vector &memories, + const std::vector &procs) = 0; + + virtual Event perform_partitioning(void) = 0; + + virtual int perform_dynamic_checks(void) = 0; + + virtual int check_partitioning(void) = 0; +}; + +// generic configuration settings +namespace { + int random_seed = 12345; + bool random_colors = false; + bool wait_on_events = false; + bool show_graph = false; + bool skip_check = false; + TestInterface *testcfg = 0; +}; // namespace + +template +void split_evenly(T total, T pieces, std::vector &cuts) +{ + cuts.resize(pieces + 1); + for(T i = 0; i <= pieces; i++) + cuts[i] = ((long long)total * i) / pieces; +} + +template +int find_split(const std::vector &cuts, T v) +{ + // dumb linear search + assert(v >= cuts[0]); + for(size_t i = 1; i < cuts.size(); i++) + if(v < cuts[i]) + return i - 1; + assert(false); + return 0; +} + +/* + * Basic test - create a 
graph, partition it by + * node subgraph id and then check that the partitioning + * is correct + */ +class BasicTest : public TestInterface { +public: + // graph config parameters + int num_nodes = 1000; + int num_edges = 1000; + int num_pieces = 4; + std::string filename; + + BasicTest(int argc, const char *argv[]) + { + for(int i = 1; i < argc; i++) { + + if(!strcmp(argv[i], "-p")) { + num_pieces = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-n")) { + num_nodes = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-e")) { + num_edges = atoi(argv[++i]); + continue; + } + } + + + if (num_nodes <= 0 || num_pieces <= 0 || num_edges <= 0) { + log_app.error() << "Invalid config: nodes=" << num_nodes << " edges=" << num_edges << " pieces=" << num_pieces << "\n"; + exit(1); + } + } + + struct InitDataArgs { + int index; + RegionInstance ri_nodes; + RegionInstance ri_edges; + }; + + enum PRNGStreams + { + NODE_SUBGRAPH_STREAM, + }; + + // assign subgraph ids to nodes + void random_node_data(int idx, int &subgraph) + { + if(random_colors) + subgraph = + Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_pieces); + else + subgraph = idx * num_pieces / num_nodes; + } + + void random_edge_data(int idx, int& src, int& dst) + { + src = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_nodes); + dst = Philox_2x32<>::rand_int(random_seed, idx + 1, NODE_SUBGRAPH_STREAM, num_nodes); + } + + static void init_data_task_wrapper(const void *args, size_t arglen, + const void *userdata, size_t userlen, Processor p) + { + BasicTest *me = (BasicTest *)testcfg; + me->init_data_task(args, arglen, p); + } + + //Each piece has a task to initialize its data + void init_data_task(const void *args, size_t arglen, Processor p) + { + const InitDataArgs &i_args = *(const InitDataArgs *)args; + + log_app.info() << "init task #" << i_args.index << " (ri_nodes=" << i_args.ri_nodes + << ")"; + + i_args.ri_nodes.fetch_metadata(p).wait(); + 
i_args.ri_edges.fetch_metadata(p).wait(); + + IndexSpace<1> is_nodes = i_args.ri_nodes.get_indexspace<1>(); + IndexSpace<1> is_edges = i_args.ri_edges.get_indexspace<1>(); + + log_app.debug() << "N: " << is_nodes; + log_app.debug() << "E: " << is_edges; + + //For each node in the graph, mark it with a random (or deterministic) subgraph id + { + AffineAccessor a_piece_id(i_args.ri_nodes, 0 /* offset */); + + for(int i = is_nodes.bounds.lo; i <= is_nodes.bounds.hi; i++) { + int subgraph; + random_node_data(i, subgraph); + a_piece_id.write(i, subgraph); + } + + AffineAccessor,1> a_src(i_args.ri_edges, 0 /* offset */); + AffineAccessor,1> a_dst(i_args.ri_edges, sizeof(Point<1>)/* offset */); + + for(int i = is_edges.bounds.lo; i <= is_edges.bounds.hi; i++) { + int src, dst; + random_edge_data(i, src, dst); + a_src.write(i, Point<1>(src)); + a_dst.write(i, Point<1>(dst)); + } + } + + //Optionally print out the assigned subgraph ids + if(show_graph) { + AffineAccessor a_piece_id(i_args.ri_nodes, 0 /* offset */); + + for(int i = is_nodes.bounds.lo; i <= is_nodes.bounds.hi; i++) + log_app.info() << "piece_id[" << i << "] = " << a_piece_id.read(i) << "\n"; + + AffineAccessor,1> a_src(i_args.ri_edges, 0 /* offset */); + AffineAccessor,1> a_dst(i_args.ri_edges, sizeof(Point<1>)/* offset */); + + for(int i = is_edges.bounds.lo; i <= is_edges.bounds.hi; i++) + log_app.info() << "src, dst[" << i << "] = " << a_src.read(i) << ", " << a_dst.read(i) << "\n"; + } + } + + IndexSpace<1> is_nodes, is_edges; + std::vector ri_nodes, ri_edges; + std::vector, int> > piece_id_field_data; + std::vector, Point<1> > > src_node_field_data, dst_node_field_data; + + virtual void print_info(void) + { + printf("Realm dependent partitioning test - basic: %d nodes, %d edges, %d pieces\n", + (int)num_nodes, (int) num_edges, (int)num_pieces); + } + + virtual Event initialize_data(const std::vector &memories, + const std::vector &procs) + { + // now create index space for nodes + is_nodes = Rect<1>(0, 
num_nodes - 1); + is_edges = Rect<1>(0, num_edges - 1); + + // equal partition is used to do initial population of edges and nodes + std::vector > ss_nodes_eq; + std::vector > ss_edges_eq; + + log_app.info() << "Creating equal subspaces\n"; + + is_nodes.create_equal_subspaces(num_pieces, 1, ss_nodes_eq, Realm::ProfilingRequestSet()).wait(); + is_edges.create_equal_subspaces(num_pieces, 1, ss_edges_eq, Realm::ProfilingRequestSet()).wait(); + + log_app.debug() << "Initial partitions:"; + for(size_t i = 0; i < ss_nodes_eq.size(); i++) + log_app.debug() << " Nodes #" << i << ": " << ss_nodes_eq[i]; + for(size_t i = 0; i < ss_edges_eq.size(); i++) + log_app.debug() << " Edges #" << i << ": " << ss_edges_eq[i]; + + // create instances for each of these subspaces + std::vector node_fields, edge_fields; + node_fields.push_back(sizeof(int)); // piece_id + assert(sizeof(int) == sizeof(Point<1>)); + edge_fields.push_back(sizeof(Point<1>)); // src_node + edge_fields.push_back(sizeof(Point<1>)); // dst_node + + ri_nodes.resize(num_pieces); + piece_id_field_data.resize(num_pieces); + + for(size_t i = 0; i < ss_nodes_eq.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, memories[i % memories.size()], ss_nodes_eq[i], + node_fields, 0 /*SOA*/, + Realm::ProfilingRequestSet()) + .wait(); + ri_nodes[i] = ri; + + piece_id_field_data[i].index_space = ss_nodes_eq[i]; + piece_id_field_data[i].inst = ri_nodes[i]; + piece_id_field_data[i].field_offset = 0; + } + + + // Fire off tasks to initialize data + ri_edges.resize(num_pieces); + src_node_field_data.resize(num_pieces); + dst_node_field_data.resize(num_pieces); + + for(size_t i = 0; i < ss_edges_eq.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, + memories[i % memories.size()], + ss_edges_eq[i], + edge_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + ri_edges[i] = ri; + + src_node_field_data[i].index_space = ss_edges_eq[i]; + src_node_field_data[i].inst = ri_edges[i]; + 
src_node_field_data[i].field_offset = 0 * sizeof(Point<1>); + + dst_node_field_data[i].index_space = ss_edges_eq[i]; + dst_node_field_data[i].inst = ri_edges[i]; + dst_node_field_data[i].field_offset = 1 * sizeof(Point<1>); + } + + // fire off tasks to initialize data + std::set events; + for(int i = 0; i < num_pieces; i++) { + Processor p = procs[i % procs.size()]; + InitDataArgs args; + args.index = i; + args.ri_nodes = ri_nodes[i]; + args.ri_edges = ri_edges[i]; + Event e = p.spawn(INIT_BASIC_DATA_TASK, &args, sizeof(args)); + events.insert(e); + } + + return Event::merge_events(events); + } + + // the outputs of our partitioning will be: + // p_nodes - nodes partitioned by subgraph id (from GPU) + // p_nodes_cpu - nodes partitioned by subgraph id (from CPU) + + + std::vector > p_nodes, p_rd; + std::vector > p_edges, p_preimage_edges; + + std::vector > p_nodes_cpu, p_rd_cpu; + std::vector > p_edges_cpu, p_preimage_edges_cpu; + + virtual Event perform_partitioning(void) + { + // Partition nodes by subgraph id - do this twice, once on CPU and once on GPU + // Ensure that the results are identical + + std::vector colors(num_pieces); + for(int i = 0; i < num_pieces; i++) + colors[i] = i; + + // We need a GPU memory for GPU partitioning + Memory gpu_memory; + bool found_gpu_memory = false; + Machine machine = Machine::get_machine(); + std::set all_memories; + machine.get_all_memories(all_memories); + for(Memory memory : all_memories) { + if(memory.kind() == Memory::GPU_FB_MEM) { + gpu_memory = memory; + found_gpu_memory = true; + break; + } + } + if (!found_gpu_memory) { + log_app.error() << "No GPU memory found for partitioning test\n"; + return Event::NO_EVENT; + } + std::vector edge_fields; + edge_fields.push_back(sizeof(Point<1>)); + edge_fields.push_back(sizeof(Point<1>)) ; + std::vector node_fields; + node_fields.push_back(sizeof(int)); + + std::vector, Point<1> > > src_field_data_gpu; + std::vector, Point<1> > > dst_field_data_gpu; + std::vector, int> > 
piece_field_data_gpu; + piece_field_data_gpu.resize(num_pieces); + src_field_data_gpu.resize(num_pieces); + dst_field_data_gpu.resize(num_pieces); + for (int i = 0; i < num_pieces; i++) { + RegionInstance src_gpu_instance; + RegionInstance dst_gpu_instance; + RegionInstance piece_gpu_instance; + RegionInstance::create_instance(src_gpu_instance, + gpu_memory, + src_node_field_data[i].index_space, + edge_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + RegionInstance::create_instance(dst_gpu_instance, + gpu_memory, + dst_node_field_data[i].index_space, + edge_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + RegionInstance::create_instance(piece_gpu_instance, + gpu_memory, + piece_id_field_data[i].index_space, + node_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + CopySrcDstField src_gpu_field, src_cpu_field, dst_gpu_field, dst_cpu_field, piece_gpu_field, piece_cpu_field; + src_gpu_field.inst = src_gpu_instance; + src_gpu_field.size = sizeof(Point<1>); + src_gpu_field.field_id = 0; + src_cpu_field.inst = src_node_field_data[i].inst; + src_cpu_field.size = sizeof(Point<1>); + src_cpu_field.field_id = 0; + dst_gpu_field.inst = dst_gpu_instance; + dst_gpu_field.size = sizeof(Point<1>); + dst_gpu_field.field_id = sizeof(Point<1>); + dst_cpu_field.inst = dst_node_field_data[i].inst; + dst_cpu_field.size = sizeof(Point<1>); + dst_cpu_field.field_id = sizeof(Point<1>); + piece_gpu_field.inst = piece_gpu_instance; + piece_gpu_field.size = sizeof(int); + piece_gpu_field.field_id = 0; + piece_cpu_field.inst = piece_id_field_data[i].inst; + piece_cpu_field.size = sizeof(int); + piece_cpu_field.field_id = 0; + std::vector src_cpu_data, src_gpu_data, dst_cpu_data, dst_gpu_data, piece_cpu_data, piece_gpu_data; + src_cpu_data.push_back(src_cpu_field); + dst_cpu_data.push_back(dst_cpu_field); + src_gpu_data.push_back(src_gpu_field); + dst_gpu_data.push_back(dst_gpu_field); + piece_gpu_data.push_back(piece_gpu_field); + 
piece_cpu_data.push_back(piece_cpu_field); + Event copy_event = src_node_field_data[i].index_space.copy(src_cpu_data, src_gpu_data, Realm::ProfilingRequestSet()); + copy_event.wait(); + Event second_copy_event = dst_node_field_data[i].index_space.copy(dst_cpu_data, dst_gpu_data, Realm::ProfilingRequestSet()); + second_copy_event.wait(); + Event third_copy_event = piece_id_field_data[i].index_space.copy(piece_cpu_data, piece_gpu_data, Realm::ProfilingRequestSet()); + third_copy_event.wait(); + src_field_data_gpu[i].inst = src_gpu_instance; + src_field_data_gpu[i].index_space = src_node_field_data[i].index_space; + src_field_data_gpu[i].field_offset = 0; + dst_field_data_gpu[i].inst = dst_gpu_instance; + dst_field_data_gpu[i].index_space = dst_node_field_data[i].index_space; + dst_field_data_gpu[i].field_offset = 1 * sizeof(Point<1>); + piece_field_data_gpu[i].inst = piece_gpu_instance; + piece_field_data_gpu[i].index_space = piece_id_field_data[i].index_space; + piece_field_data_gpu[i].field_offset = 0; + } + wait_on_events = true; + log_app.info() << "warming up" << Clock::current_time_in_microseconds() << "\n"; + const char* val = std::getenv("TILE_SIZE"); // or any env var + size_t tile_size = 10000000; //default + if (val) { + tile_size = atoi(val); + } + std::vector byte_fields = {sizeof(char)}; + IndexSpace<1> instance_index_space(Rect<1>(0, tile_size-1)); + IndexSpace<1> dst_index_space(Rect<1>(0, tile_size/100-1)); + for (size_t i = 0; i < piece_field_data_gpu.size(); i++) { + RegionInstance::create_instance(piece_field_data_gpu[i].scratch_buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); + } + for (size_t i = 0; i < src_field_data_gpu.size(); i++) { + RegionInstance::create_instance(src_field_data_gpu[i].scratch_buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); + } + for (size_t i = 0; i < dst_field_data_gpu.size(); i++) { + 
RegionInstance::create_instance(dst_field_data_gpu[i].scratch_buffer, gpu_memory, dst_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); + } + std::vector > p_garbage_nodes, p_garbage_edges, p_garbage_rd, p_garbage_preimage_edges; + Event e01 = is_nodes.create_subspaces_by_field(piece_field_data_gpu, + colors, + p_garbage_nodes, + Realm::ProfilingRequestSet()); + if (wait_on_events) e01.wait(); + Event e02 = is_edges.create_subspaces_by_preimage(dst_field_data_gpu, + p_garbage_nodes, + p_garbage_edges, + Realm::ProfilingRequestSet(), + e01); + if(wait_on_events) e02.wait(); + + // an image of p_edges through out_node gives us all the shared nodes, along + // with some private nodes + Event e03 = is_nodes.create_subspaces_by_image(src_field_data_gpu, + p_garbage_edges, + p_garbage_rd, + Realm::ProfilingRequestSet(), + e02); + if(wait_on_events) e03.wait(); + + Event e04 = is_edges.create_subspaces_by_preimage(dst_field_data_gpu, + p_garbage_rd, + p_garbage_preimage_edges, + Realm::ProfilingRequestSet(), + e03); + e04.wait(); + log_app.info() << "warming up complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting GPU Partitioning " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting GPU By Field " << Clock::current_time_in_microseconds() << "\n"; + Event e1 = is_nodes.create_subspaces_by_field(piece_field_data_gpu, + colors, + p_nodes, + Realm::ProfilingRequestSet()); + if(wait_on_events) e1.wait(); + log_app.info() << "GPU By Field complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting GPU Preimage " << Clock::current_time_in_microseconds() << "\n"; + // now compute p_edges based on the color of their in_node (i.e. 
a preimage) + Event e2 = is_edges.create_subspaces_by_preimage(dst_field_data_gpu, + p_nodes, + p_edges, + Realm::ProfilingRequestSet(), + e1); + if(wait_on_events) e2.wait(); + log_app.info() << "GPU Preimage complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting GPU Image " << Clock::current_time_in_microseconds() << "\n"; + + std::vector> spaces = {}; + std::vector requirements; + is_nodes.by_field_buffer_requirements(spaces, requirements); + // an image of p_edges through out_node gives us all the shared nodes, along + // with some private nodes + Event e3 = is_nodes.create_subspaces_by_image(src_field_data_gpu, + p_edges, + p_rd, + Realm::ProfilingRequestSet(), + e2); + if(wait_on_events) e3.wait(); + log_app.info() << "GPU Image complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting second GPU preimage " << Clock::current_time_in_microseconds() << "\n"; + + Event e4 = is_edges.create_subspaces_by_preimage(dst_field_data_gpu, + p_rd, + p_preimage_edges, + Realm::ProfilingRequestSet(), + e3); + e4.wait(); + log_app.info() << "Second GPU preimage complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "GPU Partitioning complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting CPU Partitioning " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting CPU By Field " << Clock::current_time_in_microseconds() << "\n"; + Event e5 = is_nodes.create_subspaces_by_field(piece_id_field_data, + colors, + p_nodes_cpu, + Realm::ProfilingRequestSet()); + if(wait_on_events) e5.wait(); + log_app.info() << "CPU By Field complete " << Clock::current_time_in_microseconds() << "\n"; + // now compute p_edges based on the color of their in_node (i.e. 
a preimage) + log_app.info() << "Starting CPU Preimage " << Clock::current_time_in_microseconds() << "\n"; + Event e6 = is_edges.create_subspaces_by_preimage(dst_node_field_data, + p_nodes_cpu, + p_edges_cpu, + Realm::ProfilingRequestSet(), + e5); + if(wait_on_events) e6.wait(); + log_app.info() << "CPU Preimage complete " << Clock::current_time_in_microseconds() << "\n"; + + // an image of p_edges through out_node gives us all the shared nodes, along + // with some private nodes + log_app.info() << "Starting CPU Image " << Clock::current_time_in_microseconds() << "\n"; + Event e7 = is_nodes.create_subspaces_by_image(src_node_field_data, + p_edges_cpu, + p_rd_cpu, + Realm::ProfilingRequestSet(), + e6); + if(wait_on_events) e7.wait(); + log_app.info() << "CPU Image complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting second CPU preimage " << Clock::current_time_in_microseconds() << "\n"; + + Event e8 = is_edges.create_subspaces_by_preimage(dst_node_field_data, + p_rd_cpu, + p_preimage_edges_cpu, + Realm::ProfilingRequestSet(), + e7); + e8.wait(); + log_app.info() << "Second CPU preimage complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "CPU Partitioning complete " << Clock::current_time_in_microseconds() << "\n"; + return e8; + } + + virtual int perform_dynamic_checks(void) + { + // Nothing to do here + return 0; + } + + virtual int check_partitioning(void) + { + int errors = 0; + + if (!p_nodes.size()) { + return 0; + } + + log_app.info() << "Checking correctness of partitioning " << "\n"; + + for(int i = 0; i < num_pieces; i++) { + for(IndexSpaceIterator<1> it(p_nodes[i]); it.valid; it.step()) { + for(PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_nodes_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! 
GPU has extra byfield point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for(IndexSpaceIterator<1> it(p_nodes_cpu[i]); it.valid; it.step()) { + for(PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_nodes[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing byfield point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_edges[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_edges_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU has extra preimage edge " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_edges_cpu[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_edges[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing preimage edge " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_rd[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_rd_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU has extra image node " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_rd_cpu[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_rd[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing image node " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_preimage_edges[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_preimage_edges_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! 
GPU has extra second preimage edge " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_preimage_edges_cpu[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_preimage_edges[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing second preimage edge " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + + } + return errors; + } +}; + +class TileTest : public TestInterface { +public: + // graph config parameters + int num_nodes = 1000; + int num_edges = 1000; + int num_pieces = 4; + int num_tiles = 1; + std::string filename; + + TileTest(int argc, const char *argv[]) + { + for(int i = 1; i < argc; i++) { + + if(!strcmp(argv[i], "-p")) { + num_pieces = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-n")) { + num_nodes = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-e")) { + num_edges = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-t")) { + num_tiles = atoi(argv[++i]); + continue; + } + } + + + if (num_nodes <= 0 || num_pieces <= 0 || num_edges <= 0) { + log_app.error() << "Invalid config: nodes=" << num_nodes << " edges=" << num_edges << " pieces=" << num_pieces << "\n"; + exit(1); + } + } + + struct InitDataArgs { + int index; + RegionInstance ri_nodes; + RegionInstance ri_edges; + }; + + enum PRNGStreams + { + NODE_SUBGRAPH_STREAM, + }; + + // assign subgraph ids to nodes + void random_node_data(int idx, int &subgraph) + { + if(random_colors) + subgraph = + Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_pieces); + else + subgraph = idx * num_pieces / num_nodes; + } + + void random_edge_data(int idx, int& src, int& dst) + { + src = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_nodes); + dst = Philox_2x32<>::rand_int(random_seed, idx + 1, NODE_SUBGRAPH_STREAM, num_nodes); + } + + static void init_data_task_wrapper(const void *args, size_t arglen, + const 
void *userdata, size_t userlen, Processor p) + { + TileTest *me = (TileTest *)testcfg; + me->init_data_task(args, arglen, p); + } + + //Each piece has a task to initialize its data + void init_data_task(const void *args, size_t arglen, Processor p) + { + const InitDataArgs &i_args = *(const InitDataArgs *)args; + + log_app.info() << "init task #" << i_args.index << " (ri_nodes=" << i_args.ri_nodes + << ")"; + + i_args.ri_nodes.fetch_metadata(p).wait(); + i_args.ri_edges.fetch_metadata(p).wait(); + + IndexSpace<1> is_nodes = i_args.ri_nodes.get_indexspace<1>(); + IndexSpace<1> is_edges = i_args.ri_edges.get_indexspace<1>(); + + log_app.debug() << "N: " << is_nodes; + log_app.debug() << "E: " << is_edges; + + //For each node in the graph, mark it with a random (or deterministic) subgraph id + { + AffineAccessor a_piece_id(i_args.ri_nodes, 0 /* offset */); + + for(int i = is_nodes.bounds.lo; i <= is_nodes.bounds.hi; i++) { + int subgraph; + random_node_data(i, subgraph); + a_piece_id.write(i, subgraph); + } + + AffineAccessor,1> a_src(i_args.ri_edges, 0 /* offset */); + AffineAccessor,1> a_dst(i_args.ri_edges, sizeof(Point<1>)/* offset */); + + for(int i = is_edges.bounds.lo; i <= is_edges.bounds.hi; i++) { + int src, dst; + random_edge_data(i, src, dst); + a_src.write(i, Point<1>(src)); + a_dst.write(i, Point<1>(dst)); + } + } + + //Optionally print out the assigned subgraph ids + if(show_graph) { + AffineAccessor a_piece_id(i_args.ri_nodes, 0 /* offset */); + + for(int i = is_nodes.bounds.lo; i <= is_nodes.bounds.hi; i++) + log_app.info() << "piece_id[" << i << "] = " << a_piece_id.read(i) << "\n"; + + AffineAccessor,1> a_src(i_args.ri_edges, 0 /* offset */); + AffineAccessor,1> a_dst(i_args.ri_edges, sizeof(Point<1>)/* offset */); + + for(int i = is_edges.bounds.lo; i <= is_edges.bounds.hi; i++) + log_app.info() << "src, dst[" << i << "] = " << a_src.read(i) << ", " << a_dst.read(i) << "\n"; + } + } + + IndexSpace<1> is_nodes, is_edges; + std::vector ri_nodes, 
ri_edges; + std::vector, int> > piece_id_field_data; + std::vector, Point<1> > > src_node_field_data, dst_node_field_data; + + virtual void print_info(void) + { + printf("Realm dependent partitioning test - tile: %d nodes, %d edges, %d pieces, %d tiles\n", + (int)num_nodes, (int) num_edges, (int)num_pieces, (int)num_tiles); + } + + virtual Event initialize_data(const std::vector &memories, + const std::vector &procs) + { + // now create index space for nodes + is_nodes = Rect<1>(0, num_nodes - 1); + is_edges = Rect<1>(0, num_edges - 1); + // equal partition is used to do initial population of edges and nodes + std::vector > ss_nodes_eq; + std::vector > ss_edges_eq; + + log_app.info() << "Creating equal subspaces\n"; + + is_nodes.create_equal_subspaces(num_pieces, 1, ss_nodes_eq, Realm::ProfilingRequestSet()).wait(); + is_edges.create_equal_subspaces(num_pieces, 1, ss_edges_eq, Realm::ProfilingRequestSet()).wait(); + + log_app.debug() << "Initial partitions:"; + for(size_t i = 0; i < ss_nodes_eq.size(); i++) + log_app.debug() << " Nodes #" << i << ": " << ss_nodes_eq[i]; + for(size_t i = 0; i < ss_edges_eq.size(); i++) + log_app.debug() << " Edges #" << i << ": " << ss_edges_eq[i]; + + // create instances for each of these subspaces + std::vector node_fields, edge_fields; + node_fields.push_back(sizeof(int)); // piece_id + assert(sizeof(int) == sizeof(Point<1>)); + edge_fields.push_back(sizeof(Point<1>)); // src_node + edge_fields.push_back(sizeof(Point<1>)); // dst_node + + ri_nodes.resize(num_pieces); + piece_id_field_data.resize(num_pieces); + + for(size_t i = 0; i < ss_nodes_eq.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, memories[i % memories.size()], ss_nodes_eq[i], + node_fields, 0 /*SOA*/, + Realm::ProfilingRequestSet()) + .wait(); + ri_nodes[i] = ri; + + piece_id_field_data[i].index_space = ss_nodes_eq[i]; + piece_id_field_data[i].inst = ri_nodes[i]; + piece_id_field_data[i].field_offset = 0; + } + + + // Fire off tasks to 
initialize data + ri_edges.resize(num_pieces); + src_node_field_data.resize(num_pieces); + dst_node_field_data.resize(num_pieces); + + for(size_t i = 0; i < ss_edges_eq.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, + memories[i % memories.size()], + ss_edges_eq[i], + edge_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + ri_edges[i] = ri; + + src_node_field_data[i].index_space = ss_edges_eq[i]; + src_node_field_data[i].inst = ri_edges[i]; + src_node_field_data[i].field_offset = 0 * sizeof(Point<1>); + + dst_node_field_data[i].index_space = ss_edges_eq[i]; + dst_node_field_data[i].inst = ri_edges[i]; + dst_node_field_data[i].field_offset = 1 * sizeof(Point<1>); + } + + // fire off tasks to initialize data + std::set events; + for(int i = 0; i < num_pieces; i++) { + Processor p = procs[i % procs.size()]; + InitDataArgs args; + args.index = i; + args.ri_nodes = ri_nodes[i]; + args.ri_edges = ri_edges[i]; + Event e = p.spawn(INIT_TILE_DATA_TASK, &args, sizeof(args)); + events.insert(e); + } + + return Event::merge_events(events); + } + + // the outputs of our partitioning will be: + // p_nodes - nodes partitioned by subgraph id (from GPU) + // p_nodes_cpu - nodes partitioned by subgraph id (from CPU) + + + std::vector > p_nodes, p_rd; + std::vector > p_edges, p_preimage_edges; + + std::vector > p_nodes_cpu, p_rd_cpu; + std::vector > p_edges_cpu, p_preimage_edges_cpu; + + virtual Event perform_partitioning(void) + { + // Partition nodes by subgraph id - do this twice, once on CPU and once on GPU + // Ensure that the results are identical + + std::vector colors(num_pieces); + for(int i = 0; i < num_pieces; i++) + colors[i] = i; + + // We need a GPU memory for GPU partitioning + Memory gpu_memory; + bool found_gpu_memory = false; + Machine machine = Machine::get_machine(); + std::set all_memories; + machine.get_all_memories(all_memories); + for(Memory memory : all_memories) { + if(memory.kind() == Memory::GPU_FB_MEM) { + gpu_memory 
= memory; + found_gpu_memory = true; + break; + } + } + if (!found_gpu_memory) { + log_app.error() << "No GPU memory found for partitioning test\n"; + return Event::NO_EVENT; + } + std::vector edge_fields; + edge_fields.push_back(sizeof(Point<1>)); + edge_fields.push_back(sizeof(Point<1>)) ; + std::vector node_fields; + node_fields.push_back(sizeof(int)); + + std::vector, Point<1> > > src_field_data_gpu; + std::vector, Point<1> > > dst_field_data_gpu; + std::vector, int> > piece_field_data_gpu; + piece_field_data_gpu.resize(num_pieces); + src_field_data_gpu.resize(num_pieces); + dst_field_data_gpu.resize(num_pieces); + for (int i = 0; i < num_pieces; i++) { + RegionInstance src_gpu_instance; + RegionInstance dst_gpu_instance; + RegionInstance piece_gpu_instance; + RegionInstance::create_instance(src_gpu_instance, + gpu_memory, + src_node_field_data[i].index_space, + edge_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + RegionInstance::create_instance(dst_gpu_instance, + gpu_memory, + dst_node_field_data[i].index_space, + edge_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + RegionInstance::create_instance(piece_gpu_instance, + gpu_memory, + piece_id_field_data[i].index_space, + node_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + CopySrcDstField src_gpu_field, src_cpu_field, dst_gpu_field, dst_cpu_field, piece_gpu_field, piece_cpu_field; + src_gpu_field.inst = src_gpu_instance; + src_gpu_field.size = sizeof(Point<1>); + src_gpu_field.field_id = 0; + src_cpu_field.inst = src_node_field_data[i].inst; + src_cpu_field.size = sizeof(Point<1>); + src_cpu_field.field_id = 0; + dst_gpu_field.inst = dst_gpu_instance; + dst_gpu_field.size = sizeof(Point<1>); + dst_gpu_field.field_id = sizeof(Point<1>); + dst_cpu_field.inst = dst_node_field_data[i].inst; + dst_cpu_field.size = sizeof(Point<1>); + dst_cpu_field.field_id = sizeof(Point<1>); + piece_gpu_field.inst = piece_gpu_instance; + piece_gpu_field.size = sizeof(int); + 
piece_gpu_field.field_id = 0; + piece_cpu_field.inst = piece_id_field_data[i].inst; + piece_cpu_field.size = sizeof(int); + piece_cpu_field.field_id = 0; + std::vector src_cpu_data, src_gpu_data, dst_cpu_data, dst_gpu_data, piece_cpu_data, piece_gpu_data; + src_cpu_data.push_back(src_cpu_field); + dst_cpu_data.push_back(dst_cpu_field); + src_gpu_data.push_back(src_gpu_field); + dst_gpu_data.push_back(dst_gpu_field); + piece_gpu_data.push_back(piece_gpu_field); + piece_cpu_data.push_back(piece_cpu_field); + Event copy_event = src_node_field_data[i].index_space.copy(src_cpu_data, src_gpu_data, Realm::ProfilingRequestSet()); + copy_event.wait(); + Event second_copy_event = dst_node_field_data[i].index_space.copy(dst_cpu_data, dst_gpu_data, Realm::ProfilingRequestSet()); + second_copy_event.wait(); + Event third_copy_event = piece_id_field_data[i].index_space.copy(piece_cpu_data, piece_gpu_data, Realm::ProfilingRequestSet()); + third_copy_event.wait(); + src_field_data_gpu[i].inst = src_gpu_instance; + src_field_data_gpu[i].index_space = src_node_field_data[i].index_space; + src_field_data_gpu[i].field_offset = 0; + dst_field_data_gpu[i].inst = dst_gpu_instance; + dst_field_data_gpu[i].index_space = dst_node_field_data[i].index_space; + dst_field_data_gpu[i].field_offset = 1 * sizeof(Point<1>); + piece_field_data_gpu[i].inst = piece_gpu_instance; + piece_field_data_gpu[i].index_space = piece_id_field_data[i].index_space; + piece_field_data_gpu[i].field_offset = 0; + } + wait_on_events = true; + log_app.info() << "warming up" << Clock::current_time_in_microseconds() << "\n"; + std::vector > p_garbage_nodes, p_garbage_edges, p_garbage_rd, p_garbage_preimage_edges; + Event e01 = is_nodes.create_subspaces_by_field(piece_field_data_gpu, + colors, + p_garbage_nodes, + Realm::ProfilingRequestSet()); + if (wait_on_events) e01.wait(); + Event e02 = is_edges.create_subspaces_by_preimage(dst_node_field_data, + p_garbage_nodes, + p_garbage_edges, + Realm::ProfilingRequestSet(), + 
e01); + if(wait_on_events) e02.wait(); + + // an image of p_edges through out_node gives us all the shared nodes, along + // with some private nodes + Event e03 = is_nodes.create_subspaces_by_image(src_field_data_gpu, + p_garbage_edges, + p_garbage_rd, + Realm::ProfilingRequestSet(), + e02); + if(wait_on_events) e03.wait(); + + Event e04 = is_edges.create_subspaces_by_preimage(dst_node_field_data, + p_garbage_rd, + p_garbage_preimage_edges, + Realm::ProfilingRequestSet(), + e03); + e04.wait(); + log_app.info() << "warming up complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting GPU Partitioning " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting GPU By Field " << Clock::current_time_in_microseconds() << "\n"; + Event e1 = is_nodes.create_subspaces_by_field(piece_field_data_gpu, + colors, + p_nodes, + Realm::ProfilingRequestSet()); + if(wait_on_events) e1.wait(); + log_app.info() << "GPU By Field complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting GPU Preimage " << Clock::current_time_in_microseconds() << "\n"; + // now compute p_edges based on the color of their in_node (i.e. 
a preimage) + Event e2 = is_edges.create_subspaces_by_preimage(dst_node_field_data, + p_nodes, + p_edges, + Realm::ProfilingRequestSet(), + e1); + if(wait_on_events) e2.wait(); + log_app.info() << "GPU Preimage complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting GPU Image " << Clock::current_time_in_microseconds() << "\n"; + + // an image of p_edges through out_node gives us all the shared nodes, along + // with some private nodes + Event e3 = is_nodes.create_subspaces_by_image(src_field_data_gpu, + p_edges, + p_rd, + Realm::ProfilingRequestSet(), + e2); + if(wait_on_events) e3.wait(); + log_app.info() << "GPU Image complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting second GPU preimage " << Clock::current_time_in_microseconds() << "\n"; + + Event e4 = is_edges.create_subspaces_by_preimage(dst_node_field_data, + p_rd, + p_preimage_edges, + Realm::ProfilingRequestSet(), + e3); + e4.wait(); + log_app.info() << "Second GPU preimage complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "GPU Partitioning complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting CPU Partitioning " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting CPU By Field " << Clock::current_time_in_microseconds() << "\n"; + Event e5 = is_nodes.create_subspaces_by_field(piece_id_field_data, + colors, + p_nodes_cpu, + Realm::ProfilingRequestSet()); + if(wait_on_events) e5.wait(); + log_app.info() << "CPU By Field complete " << Clock::current_time_in_microseconds() << "\n"; + // now compute p_edges based on the color of their in_node (i.e. 
a preimage) + log_app.info() << "Starting CPU Preimage " << Clock::current_time_in_microseconds() << "\n"; + Event e6 = is_edges.create_subspaces_by_preimage(dst_node_field_data, + p_nodes_cpu, + p_edges_cpu, + Realm::ProfilingRequestSet(), + e5); + if(wait_on_events) e6.wait(); + log_app.info() << "CPU Preimage complete " << Clock::current_time_in_microseconds() << "\n"; + + // an image of p_edges through out_node gives us all the shared nodes, along + // with some private nodes + log_app.info() << "Starting CPU Image " << Clock::current_time_in_microseconds() << "\n"; + Event e7 = is_nodes.create_subspaces_by_image(src_node_field_data, + p_edges_cpu, + p_rd_cpu, + Realm::ProfilingRequestSet(), + e6); + if(wait_on_events) e7.wait(); + log_app.info() << "CPU Image complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting second CPU preimage " << Clock::current_time_in_microseconds() << "\n"; + + Event e8 = is_edges.create_subspaces_by_preimage(dst_node_field_data, + p_rd_cpu, + p_preimage_edges_cpu, + Realm::ProfilingRequestSet(), + e7); + e8.wait(); + log_app.info() << "Second CPU preimage complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "CPU Partitioning complete " << Clock::current_time_in_microseconds() << "\n"; + return e8; + } + + virtual int perform_dynamic_checks(void) + { + // Nothing to do here + return 0; + } + + virtual int check_partitioning(void) + { + int errors = 0; + + if (!p_nodes.size()) { + return 0; + } + + log_app.info() << "Checking correctness of partitioning " << "\n"; + + for(int i = 0; i < num_pieces; i++) { + for(IndexSpaceIterator<1> it(p_nodes[i]); it.valid; it.step()) { + for(PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_nodes_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! 
GPU has extra byfield point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for(IndexSpaceIterator<1> it(p_nodes_cpu[i]); it.valid; it.step()) { + for(PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_nodes[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing byfield point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_edges[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_edges_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU has extra preimage edge " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_edges_cpu[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_edges[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing preimage edge " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_rd[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_rd_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU has extra image node " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_rd_cpu[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_rd[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing image node " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_preimage_edges[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_preimage_edges_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! 
GPU has extra second preimage edge " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_preimage_edges_cpu[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_preimage_edges[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing second preimage edge " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + + } + return errors; + } +}; + +class RangeTest : public TestInterface { +public: + // graph config parameters + int num_nodes = 1000; + int num_rects = 1000; + int max_rect_size = 10; + int num_pieces = 4; + std::string filename; + + RangeTest(int argc, const char *argv[]) + { + for(int i = 1; i < argc; i++) { + + if(!strcmp(argv[i], "-p")) { + num_pieces = atoi(argv[++i]); + continue; + } + + if(!strcmp(argv[i], "-n")) { + num_nodes = atoi(argv[++i]); + continue; + } + + if(!strcmp(argv[i], "-r")) { + num_rects = atoi(argv[++i]); + continue; + } + + if(!strcmp(argv[i], "-m")) { + max_rect_size = atoi(argv[++i]); + continue; + } + } + + + + if (num_nodes <= 0 || num_rects <= 0) { + log_app.error() << "Invalid graph dimensions in input file: rects=" << num_rects << " nodes=" << num_nodes; + exit(1); + } + + } + + + + struct InitDataArgs { + int index; + RegionInstance ri_nodes; + RegionInstance ri_rects; + }; + + enum PRNGStreams { + NODE_SUBGRAPH_STREAM, + }; + + void random_rect_data(int idx, int& subgraph) + { + if(random_colors) + subgraph = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_pieces); + else + subgraph = idx * num_pieces / num_rects; + } + + void random_node_data(int idx, int& subgraph) + { + if(true) + subgraph = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_pieces); + else + subgraph = idx * num_pieces / num_nodes; + } + + void initialize_rect_data(int idx, Rect<1> &rect, int max_rect_size = 10) + { + + int first = Philox_2x32<>::rand_int(random_seed, idx, 
NODE_SUBGRAPH_STREAM, num_nodes); + int amount = Philox_2x32<>::rand_int(random_seed, idx + 1, NODE_SUBGRAPH_STREAM, max_rect_size); + rect = Rect<1>(first, first + amount); + } + + + static void init_data_task_wrapper(const void *args, size_t arglen, + const void *userdata, size_t userlen, Processor p) + { + RangeTest *me = (RangeTest *)testcfg; + me->init_data_task(args, arglen, p); + } + + void init_data_task(const void *args, size_t arglen, Processor p) + { + const InitDataArgs& i_args = *(const InitDataArgs *)args; + + log_app.info() << "init task #" << i_args.index << " (ri_nodes=" << i_args.ri_nodes << ", ri_rects=" << i_args.ri_rects << ")"; + + i_args.ri_nodes.fetch_metadata(p).wait(); + i_args.ri_rects.fetch_metadata(p).wait(); + + IndexSpace<1> is_nodes = i_args.ri_nodes.get_indexspace<1>(); + IndexSpace<1> is_rects = i_args.ri_rects.get_indexspace<1>(); + + log_app.debug() << "N: " << is_nodes; + log_app.debug() << "E: " << is_rects; + + //Write out colors and rectangles + + { + AffineAccessor a_rect_id(i_args.ri_rects, 0 /* offset */); + + for(int i = is_rects.bounds.lo; i <= is_rects.bounds.hi; i++) { + int subgraph; + random_rect_data(i, subgraph); + a_rect_id.write(i, subgraph); + } + } + { + AffineAccessor a_piece_id(i_args.ri_nodes, 0 /* offset */); + + for(int i = is_nodes.bounds.lo; i <= is_nodes.bounds.hi; i++) { + int subgraph; + random_node_data(i, subgraph); + a_piece_id.write(i, subgraph); + } + } + + + { + + AffineAccessor, 1> a_rect_val(i_args.ri_rects, 1 * sizeof(int) /* offset */); + + for(int i = is_rects.bounds.lo; i <= is_rects.bounds.hi; i++) { + Rect<1> rect; + initialize_rect_data(i, rect, max_rect_size); + a_rect_val.write(i, rect); + } + } + + if(show_graph) { + AffineAccessor a_piece_id(i_args.ri_nodes, 0 /* offset */); + + for(int i = is_nodes.bounds.lo; i <= is_nodes.bounds.hi; i++) + log_app.info() << "node_id[" << i << "] = " << a_piece_id.read(i) << "\n"; + + AffineAccessor a_rect_id(i_args.ri_rects, 0 * sizeof(Point<1>) 
/* offset */); + + for(int i = is_rects.bounds.lo; i <= is_rects.bounds.hi; i++) + log_app.info() << "rect_id[" << i << "] = " << a_rect_id.read(i) << "\n"; + + AffineAccessor,1> a_rect_val(i_args.ri_rects, 1 * sizeof(int) /* offset */); + + for(int i = is_rects.bounds.lo; i <= is_rects.bounds.hi; i++) + log_app.info() << "rect_val[" << i << "] = " << a_rect_val.read(i) << "\n"; + } + } + + IndexSpace<1> is_nodes, is_rects; + std::vector ri_nodes; + std::vector, int> > node_id_field_data; + std::vector ri_rects; + std::vector, int> > rect_id_field_data; + std::vector, Rect<1> > > rect_val_field_data; + + virtual void print_info(void) + { + printf("Realm dependent partitioning test - ranges: %d nodes, %d rects, %d pieces\n", + (int)num_nodes, (int)num_rects, (int)num_pieces); + } + + virtual Event initialize_data(const std::vector& memories, + const std::vector& procs) + { + // now create index spaces for nodes and edges + is_nodes = Rect<1>(0, num_nodes - 1); + is_rects = Rect<1>(0, num_rects - 1); + + // equal partition is used to do initial population of edges and nodes + std::vector > ss_nodes_eq; + std::vector > ss_rects_eq; + + log_app.info() << "Creating equal subspaces" << "\n"; + + is_nodes.create_equal_subspaces(num_pieces, 1, ss_nodes_eq, Realm::ProfilingRequestSet()).wait(); + is_rects.create_equal_subspaces(num_pieces, 1, ss_rects_eq, Realm::ProfilingRequestSet()).wait(); + + log_app.debug() << "Initial partitions:"; + for(size_t i = 0; i < ss_nodes_eq.size(); i++) + log_app.debug() << " Nodes #" << i << ": " << ss_nodes_eq[i]; + for(size_t i = 0; i < ss_rects_eq.size(); i++) + log_app.debug() << " Rects #" << i << ": " << ss_rects_eq[i]; + + // create instances for each of these subspaces + std::vector node_fields, rect_fields; + node_fields.push_back(sizeof(int)); // piece_id + rect_fields.push_back(sizeof(int)); // src_node + rect_fields.push_back(sizeof(Rect<1>)); // dst_node + + ri_nodes.resize(num_pieces); + node_id_field_data.resize(num_pieces); 
+ + for(size_t i = 0; i < ss_nodes_eq.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, + memories[i % memories.size()], + ss_nodes_eq[i], + node_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + ri_nodes[i] = ri; + + node_id_field_data[i].index_space = ss_nodes_eq[i]; + node_id_field_data[i].inst = ri_nodes[i]; + node_id_field_data[i].field_offset = 0; + } + + ri_rects.resize(num_pieces); + rect_id_field_data.resize(num_pieces); + rect_val_field_data.resize(num_pieces); + + for(size_t i = 0; i < ss_rects_eq.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, + memories[i % memories.size()], + ss_rects_eq[i], + rect_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + ri_rects[i] = ri; + + rect_id_field_data[i].index_space = ss_rects_eq[i]; + rect_id_field_data[i].inst = ri_rects[i]; + rect_id_field_data[i].field_offset = 0; + + rect_val_field_data[i].index_space = ss_rects_eq[i]; + rect_val_field_data[i].inst = ri_rects[i]; + rect_val_field_data[i].field_offset = 1 * sizeof(int); + } + + // fire off tasks to initialize data + std::set events; + for(int i = 0; i < num_pieces; i++) { + Processor p = procs[i % procs.size()]; + InitDataArgs args; + args.index = i; + args.ri_nodes = ri_nodes[i]; + args.ri_rects = ri_rects[i]; + Event e = p.spawn(INIT_RANGE_DATA_TASK, &args, sizeof(args)); + events.insert(e); + } + + return Event::merge_events(events); + } + + // the outputs of our partitioning will be: + //p_colored_rects -> all of our rectangles marked with the color given by random_rect_data + //p_rects -> image range by p colored rects into nodes + + std::vector > p_colored_rects, p_rects; + std::vector > p_colored_rects_cpu, p_rects_cpu; + + virtual Event perform_partitioning(void) + { + + std::vector colors(num_pieces); + for(int i = 0; i < num_pieces; i++) + colors[i] = i; + + Memory gpu_memory; + bool found_gpu_memory = false; + Machine machine = Machine::get_machine(); + std::set all_memories; 
+ machine.get_all_memories(all_memories); + for(auto& memory : all_memories) { + if(memory.kind() == Memory::GPU_FB_MEM) { + gpu_memory = memory; + found_gpu_memory = true; + break; + } + } + assert(found_gpu_memory); + std::vector rect_fields; + rect_fields.push_back(sizeof(int)); + rect_fields.push_back(sizeof(Rect<1>)); + std::vector node_fields; + node_fields.push_back(sizeof(int)); + + std::vector, int > > node_id_data_gpu; + std::vector, int > > rect_id_data_gpu; + std::vector, Rect<1>>> rect_val_data_gpu; + node_id_data_gpu.resize(num_pieces); + rect_id_data_gpu.resize(num_pieces); + rect_val_data_gpu.resize(num_pieces); + for (int i = 0; i < num_pieces; i++) { + RegionInstance node_id_instance; + RegionInstance rect_id_instance; + RegionInstance rect_val_instance; + RegionInstance::create_instance(node_id_instance, + gpu_memory, + node_id_field_data[i].index_space, + node_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + RegionInstance::create_instance(rect_id_instance, + gpu_memory, + rect_id_field_data[i].index_space, + rect_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + RegionInstance::create_instance(rect_val_instance, + gpu_memory, + rect_val_field_data[i].index_space, + rect_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + CopySrcDstField node_id_gpu_field, node_id_cpu_field, rect_id_gpu_field, rect_id_cpu_field, rect_val_gpu_field, rect_val_cpu_field; + node_id_gpu_field.inst = node_id_instance; + node_id_gpu_field.size = sizeof(int); + node_id_gpu_field.field_id = 0; + node_id_cpu_field.inst = node_id_field_data[i].inst; + node_id_cpu_field.size = sizeof(int); + node_id_cpu_field.field_id = 0; + rect_id_gpu_field.inst = rect_id_instance; + rect_id_gpu_field.size = sizeof(int); + rect_id_gpu_field.field_id = 0; + rect_id_cpu_field.inst = rect_id_field_data[i].inst; + rect_id_cpu_field.size = sizeof(int); + rect_id_cpu_field.field_id = 0; + rect_val_gpu_field.inst = rect_val_instance; + 
rect_val_gpu_field.size = sizeof(Rect<1>); + rect_val_gpu_field.field_id = sizeof(int); + rect_val_cpu_field.inst = rect_val_field_data[i].inst; + rect_val_cpu_field.size = sizeof(Rect<1>); + rect_val_cpu_field.field_id = sizeof(int); + std::vector node_id_gpu_data, node_id_cpu_data, rect_id_gpu_data, rect_id_cpu_data, rect_val_gpu_data, rect_val_cpu_data; + node_id_gpu_data.push_back(node_id_gpu_field); + node_id_cpu_data.push_back(node_id_cpu_field); + rect_id_gpu_data.push_back(rect_id_gpu_field); + rect_id_cpu_data.push_back(rect_id_cpu_field); + rect_val_gpu_data.push_back(rect_val_gpu_field); + rect_val_cpu_data.push_back(rect_val_cpu_field); + Event copy_event = node_id_field_data[i].index_space.copy(node_id_cpu_data, node_id_gpu_data, Realm::ProfilingRequestSet()); + copy_event.wait(); + Event second_copy_event = rect_id_field_data[i].index_space.copy(rect_id_cpu_data, rect_id_gpu_data, Realm::ProfilingRequestSet()); + second_copy_event.wait(); + Event third_copy_event = rect_val_field_data[i].index_space.copy(rect_val_cpu_data, rect_val_gpu_data, Realm::ProfilingRequestSet()); + third_copy_event.wait(); + node_id_data_gpu[i].inst = node_id_instance; + node_id_data_gpu[i].index_space = node_id_field_data[i].index_space; + node_id_data_gpu[i].field_offset = 0; + rect_id_data_gpu[i].inst = rect_id_instance; + rect_id_data_gpu[i].index_space = rect_id_field_data[i].index_space; + rect_id_data_gpu[i].field_offset = 0; + rect_val_data_gpu[i].inst = rect_val_instance; + rect_val_data_gpu[i].index_space = rect_val_field_data[i].index_space; + rect_val_data_gpu[i].field_offset = sizeof(int); + } + wait_on_events = true; + std::vector> p_garbage_rects, p_garbage_colors; + log_app.info() << "WARMING UP " << "\n"; + + std::vector> field_estimate_input(rect_id_data_gpu.size()); + std::vector field_estimate_output(rect_id_data_gpu.size()); + std::vector> image_estimate_input(rect_val_data_gpu.size()); + std::vector image_estimate_output(rect_val_data_gpu.size()); + 
std::vector> subspace_input(colors.size()); + for (size_t i = 0; i < rect_id_data_gpu.size(); i++) { + field_estimate_input[i].location = rect_id_data_gpu[i].inst.get_location(); + field_estimate_input[i].space = rect_id_data_gpu[i].index_space; + } + for (size_t i = 0; i < rect_val_data_gpu.size(); i++) { + image_estimate_input[i].location = rect_val_data_gpu[i].inst.get_location(); + image_estimate_input[i].space = rect_val_data_gpu[i].index_space; + } + + is_rects.by_field_buffer_requirements(field_estimate_input, field_estimate_output); + std::vector byte_fields = {sizeof(char)}; + for (size_t i = 0; i < rect_id_data_gpu.size(); i++) { + IndexSpace<1> instance_index_space(Rect<1>(0, field_estimate_output[i].upper_bound-1)); + RegionInstance::create_instance(rect_id_data_gpu[i].scratch_buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); + } + + Event e001 = is_rects.create_subspaces_by_field(rect_id_data_gpu, + colors, + p_garbage_colors, + Realm::ProfilingRequestSet()); + if (wait_on_events) e001.wait(); + for (size_t i = 0; i < colors.size(); i++) { + subspace_input[i].space = p_garbage_colors[i]; + subspace_input[i].entries = p_garbage_colors[i].sparsity.impl()->get_entries().size(); + } + is_nodes.by_image_buffer_requirements(subspace_input, image_estimate_input, image_estimate_output); + for (size_t i = 0; i < rect_val_data_gpu.size(); i++) { + IndexSpace<1> instance_index_space(Rect<1>(0, (image_estimate_output[i].upper_bound)/12-1)); + RegionInstance::create_instance(rect_val_data_gpu[i].scratch_buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); + } + Event e002 = is_nodes.create_subspaces_by_image(rect_val_data_gpu, + p_garbage_colors, + p_garbage_rects, + Realm::ProfilingRequestSet(), + e001); + if(wait_on_events) e002.wait(); + + log_app.info() << "FINISHED WARMING UP " << "\n"; + log_app.info() << "starting GPU partitioning " << 
Clock::current_time_in_microseconds() << "\n"; + + log_app.info() << "STARTING GPU BY FIELD " << Clock::current_time_in_microseconds() << "\n"; + + Event e01 = is_rects.create_subspaces_by_field(rect_id_data_gpu, + colors, + p_colored_rects, + Realm::ProfilingRequestSet()); + if (wait_on_events) e01.wait(); + + log_app.info() << "FINISHED GPU BY FIELD " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "STARTING GPU BY IMAGE " << Clock::current_time_in_microseconds() << "\n"; + Event e02 = is_nodes.create_subspaces_by_image(rect_val_data_gpu, + p_colored_rects, + p_rects, + Realm::ProfilingRequestSet(), + e01); + if(wait_on_events) e02.wait(); + + log_app.info() << "FINISHED GPU BY IMAGE " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "STARTING CPU partitioning " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "STARTING CPU BY FIELD " << Clock::current_time_in_microseconds() << "\n"; + Event e1 = is_rects.create_subspaces_by_field(rect_id_field_data, + colors, + p_colored_rects_cpu, + Realm::ProfilingRequestSet()); + if (wait_on_events) e1.wait(); + log_app.info() << "FINISHED CPU BY FIELD " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "STARTING CPU BY IMAGE " << Clock::current_time_in_microseconds() << "\n"; + Event e2 = is_nodes.create_subspaces_by_image(rect_val_field_data, + p_colored_rects_cpu, + p_rects_cpu, + Realm::ProfilingRequestSet(), + e1); + if(wait_on_events) e2.wait(); + log_app.info() << "FINISHED CPU BY IMAGE " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "CPU Partitioning complete " << Clock::current_time_in_microseconds() << "\n"; + return e2; + } + + + + virtual int perform_dynamic_checks(void) + { + return 0; + } + + virtual int check_partitioning(void) + { + log_app.info() << "Checking correctness of partitioning " << "\n"; + int errors = 0; + + for (int i = 0; i < num_pieces; i++) { + for (IndexSpaceIterator<1> 
it(p_colored_rects[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_colored_rects_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU has extra colored rect point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_colored_rects_cpu[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if(!p_colored_rects[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing colored rect point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_rects[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_rects_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU has extra rect point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_rects_cpu[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if(!p_rects[i].contains(point.p)) { + log_app.error() << "Mismatch! 
GPU is missing rect point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + } + return errors; + } +}; + +class Range2DTest : public TestInterface { +public: + // graph config parameters + int num_nodes = 1000; + int num_rects = 1000; + int max_rect_size = 10; + int num_pieces = 4; + + Range2DTest(int argc, const char *argv[]) + { + for(int i = 1; i < argc; i++) { + + if(!strcmp(argv[i], "-p")) { + num_pieces = atoi(argv[++i]); + continue; + } + + if(!strcmp(argv[i], "-n")) { + num_nodes = atoi(argv[++i]); + continue; + } + + if (!strcmp(argv[i], "-r")) { + num_rects = atoi(argv[++i]); + continue; + } + + if (!strcmp(argv[i], "-m")) { + max_rect_size = atoi(argv[++i]); + continue; + } + } + + if (num_nodes <= 0 || num_rects <= 0) { + log_app.error() << "Invalid graph dimensions in input file: rects=" << num_rects << " nodes=" << num_nodes; + exit(1); + } + + } + + + + struct InitDataArgs { + int index; + RegionInstance ri_nodes; + RegionInstance ri_rects; + }; + + enum PRNGStreams { + NODE_SUBGRAPH_STREAM, + }; + + void random_rect_data(int idx, int& subgraph) + { + if(random_colors) + subgraph = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_pieces); + else + subgraph = idx * num_pieces / num_rects; + } + + void random_node_data(int idx, int& subgraph) + { + if(true) + subgraph = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_pieces); + else + subgraph = idx * num_pieces / num_nodes; + } + + void initialize_rect_data(int idx, Rect<2> &rect, int max_rect_size = 10) + { + + int x = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_nodes); + int y = Philox_2x32<>::rand_int(random_seed, idx + 1, NODE_SUBGRAPH_STREAM, num_nodes); + int length = Philox_2x32<>::rand_int(random_seed, idx + 2, NODE_SUBGRAPH_STREAM, max_rect_size); + int height = Philox_2x32<>::rand_int(random_seed, idx + 3, NODE_SUBGRAPH_STREAM, max_rect_size); + rect.lo[0] = x; + rect.hi[0] = x + length; + rect.lo[1] = y; + 
rect.hi[1] = y + height; + } + + + static void init_data_task_wrapper(const void *args, size_t arglen, + const void *userdata, size_t userlen, Processor p) + { + Range2DTest *me = (Range2DTest *)testcfg; + me->init_data_task(args, arglen, p); + } + + void init_data_task(const void *args, size_t arglen, Processor p) + { + const InitDataArgs& i_args = *(const InitDataArgs *)args; + + log_app.info() << "init task #" << i_args.index << " (ri_nodes=" << i_args.ri_nodes << ", ri_rects=" << i_args.ri_rects << ")"; + + i_args.ri_nodes.fetch_metadata(p).wait(); + i_args.ri_rects.fetch_metadata(p).wait(); + + IndexSpace<2> is_nodes = i_args.ri_nodes.get_indexspace<2>(); + IndexSpace<1> is_rects = i_args.ri_rects.get_indexspace<1>(); + + log_app.debug() << "N: " << is_nodes; + log_app.debug() << "E: " << is_rects; + + { + AffineAccessor a_piece_id(i_args.ri_rects, 0 /* offset */); + + for(int i = is_rects.bounds.lo; i <= is_rects.bounds.hi; i++) { + int subgraph; + random_rect_data(i, subgraph); + a_piece_id.write(i, subgraph); + } + } + { + AffineAccessor a_piece_id(i_args.ri_nodes, 0 /* offset */); + + for(int i = is_nodes.bounds.lo[0]; i <= is_nodes.bounds.hi[0]; i++) { + for (int j = is_nodes.bounds.lo[1]; j <= is_nodes.bounds.hi[1]; j++) { + int idx = i * (is_nodes.bounds.hi[1] - is_nodes.bounds.lo[1] + 1) + j; + int subgraph; + random_node_data(idx, subgraph); + a_piece_id.write(Point<2>(i, j), subgraph); + } + } + } + + + { + + AffineAccessor, 1> a_rect(i_args.ri_rects, 1 * sizeof(int) /* offset */); + + // Read edges line by line + for(int i = is_rects.bounds.lo; i <= is_rects.bounds.hi; i++) { + Rect<2> rect; + initialize_rect_data(i, rect, max_rect_size); + a_rect.write(i, rect); + } + } + + if(show_graph) { + AffineAccessor a_piece_id(i_args.ri_nodes, 0 /* offset */); + + for(int i = is_nodes.bounds.lo[0]; i <= is_nodes.bounds.hi[1]; i++) { + for (int j = is_nodes.bounds.lo[1]; j <= is_nodes.bounds.hi[1]; j++) { + Point<2> p(i, j); + log_app.info() << "node_id[" << 
p << "] = " << a_piece_id.read(p) << "\n"; + } + } + + AffineAccessor a_rect_id(i_args.ri_rects, 0 * sizeof(Point<1>) /* offset */); + + for(int i = is_rects.bounds.lo; i <= is_rects.bounds.hi; i++) + log_app.info() << "rect_id[" << i << "] = " << a_rect_id.read(i) << "\n"; + + AffineAccessor,1> a_rect_val(i_args.ri_rects, 1 * sizeof(int) /* offset */); + + for(int i = is_rects.bounds.lo; i <= is_rects.bounds.hi; i++) + log_app.info() << "rect_val[" << i << "] = " << a_rect_val.read(i) << "\n"; + } + } + + IndexSpace<1> is_rects; + IndexSpace<2> is_nodes; + std::vector ri_nodes; + std::vector, int> > node_id_field_data; + std::vector ri_rects; + std::vector, int> > rect_id_field_data; + std::vector, Rect<2> > > rect_val_field_data; + + virtual void print_info(void) + { + printf("Realm dependent partitioning test - 2D ranges: %d nodes, %d rects, %d pieces\n", + (int)num_nodes, (int)num_rects, (int)num_pieces); + } + + virtual Event initialize_data(const std::vector& memories, + const std::vector& procs) + { + // now create index spaces for nodes and edges + is_nodes = Rect<2>(Point<2>(0, 0), Point<2>(num_nodes - 1, num_nodes - 1)); + is_rects = Rect<1>(0, num_rects - 1); + + // equal partition is used to do initial population of edges and nodes + std::vector > ss_nodes_eq; + std::vector > ss_rects_eq; + + log_app.info() << "Creating equal subspaces" << "\n"; + + is_nodes.create_equal_subspaces(num_pieces, 1, ss_nodes_eq, Realm::ProfilingRequestSet()).wait(); + is_rects.create_equal_subspaces(num_pieces, 1, ss_rects_eq, Realm::ProfilingRequestSet()).wait(); + + log_app.debug() << "Initial partitions:\n"; + for(size_t i = 0; i < ss_nodes_eq.size(); i++) + log_app.debug() << " Nodes #" << i << ": " << ss_nodes_eq[i]; + for(size_t i = 0; i < ss_rects_eq.size(); i++) + log_app.debug() << " Rects #" << i << ": " << ss_rects_eq[i]; + + // create instances for each of these subspaces + std::vector node_fields, rect_fields; + node_fields.push_back(sizeof(int)); // piece_id + 
rect_fields.push_back(sizeof(int)); // src_node + rect_fields.push_back(sizeof(Rect<2>)); // dst_node + + ri_nodes.resize(num_pieces); + node_id_field_data.resize(num_pieces); + + for(size_t i = 0; i < ss_nodes_eq.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, + memories[i % memories.size()], + ss_nodes_eq[i], + node_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + ri_nodes[i] = ri; + + node_id_field_data[i].index_space = ss_nodes_eq[i]; + node_id_field_data[i].inst = ri_nodes[i]; + node_id_field_data[i].field_offset = 0; + } + + ri_rects.resize(num_pieces); + rect_id_field_data.resize(num_pieces); + rect_val_field_data.resize(num_pieces); + + for(size_t i = 0; i < ss_rects_eq.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, + memories[i % memories.size()], + ss_rects_eq[i], + rect_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + ri_rects[i] = ri; + + rect_id_field_data[i].index_space = ss_rects_eq[i]; + rect_id_field_data[i].inst = ri_rects[i]; + rect_id_field_data[i].field_offset = 0; + + rect_val_field_data[i].index_space = ss_rects_eq[i]; + rect_val_field_data[i].inst = ri_rects[i]; + rect_val_field_data[i].field_offset = 1 * sizeof(int); + } + + // fire off tasks to initialize data + std::set events; + for(int i = 0; i < num_pieces; i++) { + Processor p = procs[i % procs.size()]; + InitDataArgs args; + args.index = i; + args.ri_nodes = ri_nodes[i]; + args.ri_rects = ri_rects[i]; + Event e = p.spawn(INIT_RANGE2D_DATA_TASK, &args, sizeof(args)); + events.insert(e); + } + + return Event::merge_events(events); + } + + // the outputs of our partitioning will be: + // is_private, is_shared - subsets of is_nodes based on private/shared + // p_rd, p_wr, p_ghost - subsets of the above split by subckt + // p_edges - subsets of is_edges for each subckt + + std::vector > p_colored_rects; + std::vector> p_rects, p_intersect, p_diff; + std::vector> p_colored_rects_cpu; + std::vector> 
p_rects_cpu, p_intersect_cpu, p_diff_cpu; + + IndexSpace<2> cpu_union, gpu_union, garbage_union; + + virtual Event perform_partitioning(void) + { + // first partition nodes by subckt id (this is the independent partition, + // but not actually used by the app) + + std::vector colors(num_pieces); + for(int i = 0; i < num_pieces; i++) + colors[i] = i; + + Memory gpu_memory; + bool found_gpu_memory = false; + Machine machine = Machine::get_machine(); + std::set all_memories; + machine.get_all_memories(all_memories); + for(auto& memory : all_memories) { + if(memory.kind() == Memory::GPU_FB_MEM) { + gpu_memory = memory; + found_gpu_memory = true; + break; + } + } + assert(found_gpu_memory); + std::vector rect_fields; + rect_fields.push_back(sizeof(int)); + rect_fields.push_back(sizeof(Rect<2>)); + std::vector node_fields; + node_fields.push_back(sizeof(int)); + + std::vector, int > > node_id_data_gpu; + std::vector, int > > rect_id_data_gpu; + std::vector, Rect<2>>> rect_val_data_gpu; + node_id_data_gpu.resize(num_pieces); + rect_id_data_gpu.resize(num_pieces); + rect_val_data_gpu.resize(num_pieces); + for (int i = 0; i < num_pieces; i++) { + RegionInstance node_id_instance; + RegionInstance rect_id_instance; + RegionInstance rect_val_instance; + RegionInstance::create_instance(node_id_instance, + gpu_memory, + node_id_field_data[i].index_space, + node_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + RegionInstance::create_instance(rect_id_instance, + gpu_memory, + rect_id_field_data[i].index_space, + rect_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + RegionInstance::create_instance(rect_val_instance, + gpu_memory, + rect_val_field_data[i].index_space, + rect_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + CopySrcDstField node_id_gpu_field, node_id_cpu_field, rect_id_gpu_field, rect_id_cpu_field, rect_val_gpu_field, rect_val_cpu_field; + node_id_gpu_field.inst = node_id_instance; + node_id_gpu_field.size = sizeof(int); + 
node_id_gpu_field.field_id = 0; + node_id_cpu_field.inst = node_id_field_data[i].inst; + node_id_cpu_field.size = sizeof(int); + node_id_cpu_field.field_id = 0; + rect_id_gpu_field.inst = rect_id_instance; + rect_id_gpu_field.size = sizeof(int); + rect_id_gpu_field.field_id = 0; + rect_id_cpu_field.inst = rect_id_field_data[i].inst; + rect_id_cpu_field.size = sizeof(int); + rect_id_cpu_field.field_id = 0; + rect_val_gpu_field.inst = rect_val_instance; + rect_val_gpu_field.size = sizeof(Rect<2>); + rect_val_gpu_field.field_id = sizeof(int); + rect_val_cpu_field.inst = rect_val_field_data[i].inst; + rect_val_cpu_field.size = sizeof(Rect<2>); + rect_val_cpu_field.field_id = sizeof(int); + std::vector node_id_gpu_data, node_id_cpu_data, rect_id_gpu_data, rect_id_cpu_data, rect_val_gpu_data, rect_val_cpu_data; + node_id_gpu_data.push_back(node_id_gpu_field); + node_id_cpu_data.push_back(node_id_cpu_field); + rect_id_gpu_data.push_back(rect_id_gpu_field); + rect_id_cpu_data.push_back(rect_id_cpu_field); + rect_val_gpu_data.push_back(rect_val_gpu_field); + rect_val_cpu_data.push_back(rect_val_cpu_field); + Event copy_event = node_id_field_data[i].index_space.copy(node_id_cpu_data, node_id_gpu_data, Realm::ProfilingRequestSet()); + copy_event.wait(); + Event second_copy_event = rect_id_field_data[i].index_space.copy(rect_id_cpu_data, rect_id_gpu_data, Realm::ProfilingRequestSet()); + second_copy_event.wait(); + Event third_copy_event = rect_val_field_data[i].index_space.copy(rect_val_cpu_data, rect_val_gpu_data, Realm::ProfilingRequestSet()); + third_copy_event.wait(); + node_id_data_gpu[i].inst = node_id_instance; + node_id_data_gpu[i].index_space = node_id_field_data[i].index_space; + node_id_data_gpu[i].field_offset = 0; + rect_id_data_gpu[i].inst = rect_id_instance; + rect_id_data_gpu[i].index_space = rect_id_field_data[i].index_space; + rect_id_data_gpu[i].field_offset = 0; + rect_val_data_gpu[i].inst = rect_val_instance; + rect_val_data_gpu[i].index_space = 
rect_val_field_data[i].index_space; + rect_val_data_gpu[i].field_offset = sizeof(int); + } + wait_on_events = true; + std::vector> p_garbage_colors; + std::vector> p_garbage_rects; + log_app.info() << "WARMING UP " << "\n"; + + std::vector> field_estimate_input(rect_id_data_gpu.size()); + std::vector field_estimate_output(rect_id_data_gpu.size()); + std::vector> image_estimate_input(rect_val_data_gpu.size()); + std::vector image_estimate_output(rect_val_data_gpu.size()); + std::vector> subspace_input(colors.size()); + for (size_t i = 0; i < rect_id_data_gpu.size(); i++) { + field_estimate_input[i].location = rect_id_data_gpu[i].inst.get_location(); + field_estimate_input[i].space = rect_id_data_gpu[i].index_space; + } + for (size_t i = 0; i < rect_val_data_gpu.size(); i++) { + image_estimate_input[i].location = rect_val_data_gpu[i].inst.get_location(); + image_estimate_input[i].space = rect_val_data_gpu[i].index_space; + } + + is_rects.by_field_buffer_requirements(field_estimate_input, field_estimate_output); + std::vector byte_fields = {sizeof(char)}; + for (size_t i = 0; i < rect_id_data_gpu.size(); i++) { + IndexSpace<1> instance_index_space(Rect<1>(0, field_estimate_output[i].upper_bound-1)); + RegionInstance::create_instance(rect_id_data_gpu[i].scratch_buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); + } + + Event e001 = is_rects.create_subspaces_by_field(rect_id_data_gpu, + colors, + p_garbage_colors, + Realm::ProfilingRequestSet()); + if (wait_on_events) e001.wait(); + for (size_t i = 0; i < colors.size(); i++) { + subspace_input[i].space = p_garbage_colors[i]; + subspace_input[i].entries = p_garbage_colors[i].sparsity.impl()->get_entries().size(); + } + is_nodes.by_image_buffer_requirements(subspace_input, image_estimate_input, image_estimate_output); + for (size_t i = 0; i < rect_val_data_gpu.size(); i++) { + IndexSpace<1> instance_index_space(Rect<1>(0, (image_estimate_output[i].upper_bound*5)-1)); + 
RegionInstance::create_instance(rect_val_data_gpu[i].scratch_buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); + } + Event e002 = is_nodes.create_subspaces_by_image(rect_val_data_gpu, + p_garbage_colors, + p_garbage_rects, + Realm::ProfilingRequestSet(), + e001); + if(wait_on_events) e002.wait(); + + log_app.info() << "FINISHED WARMING UP " << "\n"; + log_app.info() << "starting GPU partitioning " << Clock::current_time_in_microseconds() << "\n"; + + log_app.info() << "STARTING GPU BY FIELD " << Clock::current_time_in_microseconds() << "\n"; + + Event e01 = is_rects.create_subspaces_by_field(rect_id_data_gpu, + colors, + p_colored_rects, + Realm::ProfilingRequestSet()); + if (wait_on_events) e01.wait(); + + log_app.info() << "FINISHED GPU BY FIELD " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "STARTING GPU BY IMAGE " << Clock::current_time_in_microseconds() << "\n"; + Event e02 = is_nodes.create_subspaces_by_image(rect_val_data_gpu, + p_colored_rects, + p_rects, + Realm::ProfilingRequestSet(), + e01); + if(wait_on_events) e02.wait(); + log_app.info() << "FINISHED GPU BY IMAGE " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "GPU Partitioning complete " << Clock::current_time_in_microseconds() << "\n"; + + log_app.info() << "STARTING CPU partitioning " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "STARTING CPU BY FIELD " << Clock::current_time_in_microseconds() << "\n"; + Event e1 = is_rects.create_subspaces_by_field(rect_id_field_data, + colors, + p_colored_rects_cpu, + Realm::ProfilingRequestSet()); + if (wait_on_events) e1.wait(); + log_app.info() << "FINISHED CPU BY FIELD " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "STARTING CPU BY IMAGE " << Clock::current_time_in_microseconds() << "\n"; + Event e2 = is_nodes.create_subspaces_by_image(rect_val_field_data, + p_colored_rects_cpu, + p_rects_cpu, + 
Realm::ProfilingRequestSet(), + e1); + if(wait_on_events) e2.wait(); + log_app.info() << "FINISHED CPU BY IMAGE " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "CPU Partitioning complete " << Clock::current_time_in_microseconds() << "\n"; + return e2; + } + + + + virtual int perform_dynamic_checks(void) + { + return 0; + } + + virtual int check_partitioning(void) + { + log_app.info() << "Checking correctness of partitioning " << "\n"; + int errors = 0; + + for (int i = 0; i < num_pieces; i++) { + for (IndexSpaceIterator<1> it(p_colored_rects[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if(!p_colored_rects_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU has extra colored rect point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_colored_rects_cpu[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_colored_rects[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing colored rect point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<2> it(p_rects[i]); it.valid; it.step()) { + for (PointInRectIterator<2> point(it.rect); point.valid; point.step()) { + if (!p_rects_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU has extra rect point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<2> it(p_rects_cpu[i]); it.valid; it.step()) { + for (PointInRectIterator<2> point(it.rect); point.valid; point.step()) { + if (!p_rects[i].contains(point.p)) { + log_app.error() << "Mismatch! 
GPU is missing rect point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + } + return errors; + } +}; + +class MiniAeroTest : public TestInterface { +public: + enum ProblemType + { + PTYPE_0, + PTYPE_1, + PTYPE_2, + }; + enum FaceType + { + BC_INTERIOR = 0, + BC_TANGENT = 1, + BC_EXTRAPOLATE = 2, + BC_INFLOW = 3, + BC_NOSLIP = 4, + BC_BLOCK_BORDER = 5, + BC_TOTAL = 6, + }; + + ProblemType problem_type = PTYPE_0; + int global_x = 4; + int global_y = 4; + int global_z = 4; + int blocks_x = 2; + int blocks_y = 2; + int blocks_z = 2; + + int n_cells; // total cell count + int n_blocks; // total block count + int n_faces; // total face count + std::vector xsplit, ysplit, zsplit; // cut planes + std::vector cells_per_block, faces_per_block; + + // can't do 64-bit index types right now, so at least get most of our 32-bit space + typedef int INDEXTYPE; + static const INDEXTYPE FIRST_INDEX = -2000000000; // easier to read than INT_MIN+1 + + MiniAeroTest(int argc, const char *argv[]) + { +#define INT_ARG(s, v) \ + if(!strcmp(argv[i], s)) { \ + v = atoi(argv[++i]); \ + continue; \ + } + for(int i = 1; i < argc; i++) { + if(!strcmp(argv[i], "-type")) { + problem_type = (ProblemType)atoi(argv[++i]); + continue; + } + INT_ARG("-gx", global_x); + INT_ARG("-gy", global_y); + INT_ARG("-gz", global_z); + INT_ARG("-bx", blocks_x); + INT_ARG("-by", blocks_y); + INT_ARG("-bz", blocks_z); + if(!strcmp(argv[i], "-g")) { + int v = atoi(argv[++i]); + global_x = global_y = global_z = v; + continue; + } + if(!strcmp(argv[i], "-b")) { + int v = atoi(argv[++i]); + blocks_x = blocks_y = blocks_z = v; + continue; + } + } +#undef INT_ARG + + // don't allow degenerate blocks + assert(global_x >= blocks_x); + assert(global_y >= blocks_y); + assert(global_z >= blocks_z); + + split_evenly(global_x, blocks_x, xsplit); + split_evenly(global_y, blocks_y, ysplit); + split_evenly(global_z, blocks_z, zsplit); + + n_blocks = blocks_x * blocks_y * blocks_z; + n_cells = 0; + n_faces = 0; 
+ for(int bz = 0; bz < blocks_z; bz++) + for(int by = 0; by < blocks_y; by++) + for(int bx = 0; bx < blocks_x; bx++) { + int nx = xsplit[bx + 1] - xsplit[bx]; + int ny = ysplit[by + 1] - ysplit[by]; + int nz = zsplit[bz + 1] - zsplit[bz]; + + int c = nx * ny * nz; + int f = (((nx + 1) * ny * nz) + (nx * (ny + 1) * nz) + (nx * ny * (nz + 1))); + cells_per_block.push_back(c); + faces_per_block.push_back(f); + + n_cells += c; + n_faces += f; + } + assert(n_cells == global_x * global_y * global_z); + assert(n_faces == (((global_x + blocks_x) * global_y * global_z) + + (global_x * (global_y + blocks_y) * global_z) + + (global_x * global_y * (global_z + blocks_z)))); + } + + virtual void print_info(void) + { + printf("Realm dependent partitioning test - miniaero: %d x %d x %d cells, %d x %d x " + "%d blocks\n", + (int)global_x, (int)global_y, (int)global_z, (int)blocks_x, (int)blocks_y, + (int)blocks_z); + } + + IndexSpace<1> is_cells, is_faces; + std::vector ri_cells; + std::vector, int>> cell_blockid_field_data; + std::vector ri_faces; + std::vector, Point<1>>> face_left_field_data; + std::vector, Point<1>>> face_right_field_data; + std::vector, int>> face_type_field_data; + + struct InitDataArgs { + int index; + RegionInstance ri_cells, ri_faces; + }; + + virtual Event initialize_data(const std::vector &memories, + const std::vector &procs) + { + // top level index spaces + is_cells = Rect<1>(FIRST_INDEX, FIRST_INDEX + n_cells - 1); + is_faces = Rect<1>(FIRST_INDEX, FIRST_INDEX + n_faces - 1); + + // weighted partitions based on the distribution we already computed + std::vector> ss_cells_w; + std::vector> ss_faces_w; + + is_cells + .create_weighted_subspaces(n_blocks, 1, cells_per_block, ss_cells_w, + Realm::ProfilingRequestSet()) + .wait(); + is_faces + .create_weighted_subspaces(n_blocks, 1, faces_per_block, ss_faces_w, + Realm::ProfilingRequestSet()) + .wait(); + + log_app.debug() << "Initial partitions:"; + for(size_t i = 0; i < ss_cells_w.size(); i++) + 
log_app.debug() << " Cells #" << i << ": " << ss_cells_w[i]; + for(size_t i = 0; i < ss_faces_w.size(); i++) + log_app.debug() << " Faces #" << i << ": " << ss_faces_w[i]; + + // create instances for each of these subspaces + std::vector cell_fields, face_fields; + cell_fields.push_back(sizeof(int)); // blockid + assert(sizeof(int) == sizeof(Point<1>)); + face_fields.push_back(sizeof(Point<1>)); // left + face_fields.push_back(sizeof(Point<1>)); // right + face_fields.push_back(sizeof(int)); // type + + ri_cells.resize(n_blocks); + cell_blockid_field_data.resize(n_blocks); + + for(size_t i = 0; i < ss_cells_w.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, memories[i % memories.size()], ss_cells_w[i], + cell_fields, 0 /*SOA*/, + Realm::ProfilingRequestSet()) + .wait(); + ri_cells[i] = ri; + + cell_blockid_field_data[i].index_space = ss_cells_w[i]; + cell_blockid_field_data[i].inst = ri_cells[i]; + cell_blockid_field_data[i].field_offset = 0; + } + + ri_faces.resize(n_blocks); + face_left_field_data.resize(n_blocks); + face_right_field_data.resize(n_blocks); + face_type_field_data.resize(n_blocks); + + for(size_t i = 0; i < ss_faces_w.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, memories[i % memories.size()], ss_faces_w[i], + face_fields, 0 /*SOA*/, + Realm::ProfilingRequestSet()) + .wait(); + ri_faces[i] = ri; + + face_left_field_data[i].index_space = ss_faces_w[i]; + face_left_field_data[i].inst = ri_faces[i]; + face_left_field_data[i].field_offset = 0 * sizeof(Point<1>); + + face_right_field_data[i].index_space = ss_faces_w[i]; + face_right_field_data[i].inst = ri_faces[i]; + face_right_field_data[i].field_offset = 1 * sizeof(Point<1>); + + face_type_field_data[i].index_space = ss_faces_w[i]; + face_type_field_data[i].inst = ri_faces[i]; + face_type_field_data[i].field_offset = 2 * sizeof(Point<1>); + } + + // fire off tasks to initialize data + std::set events; + for(int i = 0; i < n_blocks; i++) { + 
Processor p = procs[i % memories.size()]; + InitDataArgs args; + args.index = i; + args.ri_cells = ri_cells[i]; + args.ri_faces = ri_faces[i]; + Event e = p.spawn(INIT_MINIAERO_DATA_TASK, &args, sizeof(args)); + events.insert(e); + } + + return Event::merge_events(events); + } + + static void init_data_task_wrapper(const void *args, size_t arglen, + const void *userdata, size_t userlen, Processor p) + { + MiniAeroTest *me = (MiniAeroTest *)testcfg; + me->init_data_task(args, arglen, p); + } + + Point<1> global_cell_pointer(int cx, int cy, int cz) + { + INDEXTYPE p = FIRST_INDEX; + + // out of range? return -1 + if((cx < 0) || (cx >= global_x) || (cy < 0) || (cy >= global_y) || (cz < 0) || + (cz >= global_z)) + return -1; + + // first chunks in z, then y, then x + int zi = find_split(zsplit, cz); + p += global_x * global_y * zsplit[zi]; + cz -= zsplit[zi]; + int local_z = zsplit[zi + 1] - zsplit[zi]; + + int yi = find_split(ysplit, cy); + p += global_x * ysplit[yi] * local_z; + cy -= ysplit[yi]; + int local_y = ysplit[yi + 1] - ysplit[yi]; + + int xi = find_split(xsplit, cx); + p += xsplit[xi] * local_y * local_z; + cx -= xsplit[xi]; + int local_x = xsplit[xi + 1] - xsplit[xi]; + + // now local addressing within this block + p += (cx + (cy * local_x) + (cz * local_x * local_y)); + return p; + } + + void init_data_task(const void *args, size_t arglen, Processor p) + { + const InitDataArgs &i_args = *(const InitDataArgs *)args; + + i_args.ri_cells.fetch_metadata(p).wait(); + i_args.ri_faces.fetch_metadata(p).wait(); + + log_app.info() << "init task #" << i_args.index << " (ri_cells=" << i_args.ri_cells + << ", ri_faces=" << i_args.ri_faces << ")"; + + IndexSpace<1> is_cells = i_args.ri_cells.get_indexspace<1>(); + IndexSpace<1> is_faces = i_args.ri_faces.get_indexspace<1>(); + + log_app.debug() << "C: " << is_cells; + log_app.debug() << "F: " << is_faces; + + int bx = i_args.index % blocks_x; + int by = (i_args.index / blocks_x) % blocks_y; + int bz = i_args.index / 
blocks_x / blocks_y; + + size_t nx = xsplit[bx + 1] - xsplit[bx]; + size_t ny = ysplit[by + 1] - ysplit[by]; + size_t nz = zsplit[bz + 1] - zsplit[bz]; + + size_t c = nx * ny * nz; + size_t f = (((nx + 1) * ny * nz) + (nx * (ny + 1) * nz) + (nx * ny * (nz + 1))); + assert(is_cells.bounds.volume() == c); + assert(is_faces.bounds.volume() == f); + + // cells are all assigned to the local block + { + AffineAccessor a_cell_blockid(i_args.ri_cells, 0 /* offset */); + + for(int cz = zsplit[bz]; cz < zsplit[bz + 1]; cz++) + for(int cy = ysplit[by]; cy < ysplit[by + 1]; cy++) + for(int cx = xsplit[bx]; cx < xsplit[bx + 1]; cx++) { + Point<1> pz = global_cell_pointer(cx, cy, cz); + assert(is_cells.bounds.contains(pz)); + + a_cell_blockid.write(pz, i_args.index); + } + } + + // faces aren't in any globally-visible order + { + AffineAccessor, 1> a_face_left(i_args.ri_faces, + 0 * sizeof(Point<1>) /* offset */); + AffineAccessor, 1> a_face_right(i_args.ri_faces, + 1 * sizeof(Point<1>) /* offset */); + AffineAccessor a_face_type(i_args.ri_faces, + 2 * sizeof(Point<1>) /* offset */); + + Point<1> pf = is_faces.bounds.lo; + + // -- type 0 | type 1 | type 2 + // -- ------ | ------ | ------ + // -- left extrapolate | inflow | inflow + // -- right extrapolate | extrapolate | extrapolate + // -- down tangent | noslip | tangent + // -- up tangent | extrapolate | tangent + // -- back tangent | tangent | tangent + // -- front tangent | tangent | tangent + + // left/right faces first + for(int fx = xsplit[bx]; fx <= xsplit[bx + 1]; fx++) { + int ftype = BC_INTERIOR; + bool reversed = false; + if(fx == xsplit[bx]) { + // low boundary + reversed = true; + if(fx == 0) + switch(problem_type) { + case PTYPE_0: + ftype = BC_EXTRAPOLATE; + break; + case PTYPE_1: + ftype = BC_INFLOW; + break; + case PTYPE_2: + ftype = BC_INFLOW; + break; + } + else + ftype = BC_BLOCK_BORDER; + } else if(fx == xsplit[bx + 1]) { + // high boundary + if(fx == global_x) + switch(problem_type) { + case PTYPE_0: + 
ftype = BC_EXTRAPOLATE; + break; + case PTYPE_1: + ftype = BC_EXTRAPOLATE; + break; + case PTYPE_2: + ftype = BC_EXTRAPOLATE; + break; + } + else + ftype = BC_BLOCK_BORDER; + } + + for(int cz = zsplit[bz]; cz < zsplit[bz + 1]; cz++) + for(int cy = ysplit[by]; cy < ysplit[by + 1]; cy++) { + a_face_left.write(pf, global_cell_pointer(fx - (reversed ? 0 : 1), cy, cz)); + a_face_right.write(pf, global_cell_pointer(fx - (reversed ? 1 : 0), cy, cz)); + a_face_type.write(pf, ftype); + pf[0]++; + } + } + + // down/up faces next + for(int fy = ysplit[by]; fy <= ysplit[by + 1]; fy++) { + int ftype = BC_INTERIOR; + bool reversed = false; + if(fy == ysplit[by]) { + // low boundary + reversed = true; + if(fy == 0) + switch(problem_type) { + case PTYPE_0: + ftype = BC_TANGENT; + break; + case PTYPE_1: + ftype = BC_NOSLIP; + break; + case PTYPE_2: + ftype = BC_TANGENT; + break; + } + else + ftype = BC_BLOCK_BORDER; + } else if(fy == ysplit[by + 1]) { + // high boundary + if(fy == global_y) + switch(problem_type) { + case PTYPE_0: + ftype = BC_TANGENT; + break; + case PTYPE_1: + ftype = BC_EXTRAPOLATE; + break; + case PTYPE_2: + ftype = BC_TANGENT; + break; + } + else + ftype = BC_BLOCK_BORDER; + } + + for(int cz = zsplit[bz]; cz < zsplit[bz + 1]; cz++) + for(int cx = xsplit[bx]; cx < xsplit[bx + 1]; cx++) { + a_face_left.write(pf, global_cell_pointer(cx, fy - (reversed ? 0 : 1), cz)); + a_face_right.write(pf, global_cell_pointer(cx, fy - (reversed ? 
1 : 0), cz)); + a_face_type.write(pf, ftype); + pf[0]++; + } + } + + // back/front faces last + for(int fz = zsplit[bz]; fz <= zsplit[bz + 1]; fz++) { + int ftype = BC_INTERIOR; + bool reversed = false; + if(fz == zsplit[bz]) { + // low boundary + reversed = true; + if(fz == 0) + switch(problem_type) { + case PTYPE_0: + ftype = BC_TANGENT; + break; + case PTYPE_1: + ftype = BC_TANGENT; + break; + case PTYPE_2: + ftype = BC_TANGENT; + break; + } + else + ftype = BC_BLOCK_BORDER; + } else if(fz == zsplit[bz + 1]) { + // high boundary + if(fz == global_z) + switch(problem_type) { + case PTYPE_0: + ftype = BC_TANGENT; + break; + case PTYPE_1: + ftype = BC_TANGENT; + break; + case PTYPE_2: + ftype = BC_TANGENT; + break; + } + else + ftype = BC_BLOCK_BORDER; + } + + for(int cy = ysplit[by]; cy < ysplit[by + 1]; cy++) + for(int cx = xsplit[bx]; cx < xsplit[bx + 1]; cx++) { + a_face_left.write(pf, global_cell_pointer(cx, cy, fz - (reversed ? 0 : 1))); + a_face_right.write(pf, global_cell_pointer(cx, cy, fz - (reversed ? 
1 : 0))); + a_face_type.write(pf, ftype); + pf[0]++; + } + } + + assert(pf[0] == is_faces.bounds.hi[0] + 1); + } + + if(show_graph) { + AffineAccessor a_cell_blockid(i_args.ri_cells, 0 /* offset */); + + for(int i = is_cells.bounds.lo[0]; i <= is_cells.bounds.hi[0]; i++) + std::cout << "Z[" << i << "]: blockid=" << a_cell_blockid.read(i) << "\n"; + + AffineAccessor, 1> a_face_left(i_args.ri_faces, + 0 * sizeof(Point<1>) /* offset */); + AffineAccessor, 1> a_face_right(i_args.ri_faces, + 1 * sizeof(Point<1>) /* offset */); + AffineAccessor a_face_type(i_args.ri_faces, + 2 * sizeof(Point<1>) /* offset */); + + for(int i = is_faces.bounds.lo[0]; i <= is_faces.bounds.hi[0]; i++) + std::cout << "S[" << i << "]:" + << " left=" << a_face_left.read(i) << " right=" << a_face_right.read(i) + << " type=" << a_face_type.read(i) << "\n"; + } + } + + // the outputs of our partitioning will be: + // p_cells - subsets of is_cells split by block + // p_faces - subsets of_is_faces split by block (based on left cell) + // p_facetypes[6] - subsets of p_faces split further by face type + // p_ghost - subsets of is_cells reachable by each block's boundary faces + + std::vector> p_cells; + std::vector> p_faces; + std::vector>> p_facetypes; + std::vector> p_ghost; + + virtual Event perform_partitioning(void) + { + // partition cells first + std::vector colors(n_blocks); + for(int i = 0; i < n_blocks; i++) + colors[i] = i; + + Event e1 = is_cells.create_subspaces_by_field(cell_blockid_field_data, colors, + p_cells, Realm::ProfilingRequestSet()); + if(wait_on_events) + e1.wait(); + + // now a preimage to get faces + Event e2 = is_faces.create_subspaces_by_preimage( + face_left_field_data, p_cells, p_faces, Realm::ProfilingRequestSet(), e1); + if(wait_on_events) + e2.wait(); + + // now split by face type + std::set evs; + std::vector ftcolors(BC_TOTAL); + for(int i = 0; i < BC_TOTAL; i++) + ftcolors[i] = i; + p_facetypes.resize(n_blocks); + std::vector> p_border_faces(n_blocks); + + for(int 
idx = 0; idx < n_blocks; idx++) { + Event e = p_faces[idx].create_subspaces_by_field(face_type_field_data, ftcolors, + p_facetypes[idx], + Realm::ProfilingRequestSet(), e2); + if(wait_on_events) + e.wait(); + evs.insert(e); + p_border_faces[idx] = p_facetypes[idx][BC_BLOCK_BORDER]; + } + Event e3 = Event::merge_events(evs); + + // finally, the image of just the boundary faces through the right face gets us + // ghost cells + Event e4 = is_cells.create_subspaces_by_image( + face_right_field_data, p_border_faces, p_ghost, Realm::ProfilingRequestSet(), e3); + if(wait_on_events) + e4.wait(); + + return e4; + } + + virtual int perform_dynamic_checks(void) + { + int errors = 0; + + std::vector> p_int_faces, p_border_faces; + for(int idx = 0; idx < n_blocks; idx++) { + p_int_faces.push_back(p_facetypes[idx][BC_INTERIOR]); + p_border_faces.push_back(p_facetypes[idx][BC_BLOCK_BORDER]); + } + // miniaero's checks are faster with image/diff on 1 thread, but slower on 4 +#ifdef MINIAERO_USE_IMAGE_DIFF + std::vector> p_l_test, p_ri_test, p_rb_test; + Event e4 = is_cells.create_subspaces_by_image_with_difference( + face_left_field_data, p_faces, p_cells, p_l_test, Realm::ProfilingRequestSet()); + Event e5 = is_cells.create_subspaces_by_image_with_difference( + face_right_field_data, p_int_faces, p_cells, p_ri_test, + Realm::ProfilingRequestSet()); + Event e6 = is_cells.create_subspaces_by_image_with_difference( + face_right_field_data, p_border_faces, p_ghost, p_rb_test, + Realm::ProfilingRequestSet()); +#else + std::vector> p_img_left, p_img_right_i, p_img_right_b; + Event e1 = is_cells.create_subspaces_by_image( + face_left_field_data, p_faces, p_img_left, Realm::ProfilingRequestSet()); + Event e2 = is_cells.create_subspaces_by_image( + face_right_field_data, p_int_faces, p_img_right_i, Realm::ProfilingRequestSet()); + Event e3 = + is_cells.create_subspaces_by_image(face_right_field_data, p_border_faces, + p_img_right_b, Realm::ProfilingRequestSet()); + std::vector> p_l_test, 
p_ri_test, p_rb_test; + Event e4 = IndexSpace<1>::compute_differences(p_img_left, p_cells, p_l_test, + Realm::ProfilingRequestSet(), e1); + for(unsigned idx = 0; idx < p_img_left.size(); idx++) { + p_img_left[idx].destroy(e4); + } + Event e5 = IndexSpace<1>::compute_differences(p_img_right_i, p_cells, p_ri_test, + Realm::ProfilingRequestSet(), e2); + for(unsigned idx = 0; idx < p_img_right_i.size(); idx++) { + p_img_right_i[idx].destroy(e5); + } + Event e6 = IndexSpace<1>::compute_differences(p_img_right_b, p_ghost, p_rb_test, + Realm::ProfilingRequestSet(), e3); + for(unsigned idx = 0; idx < p_img_right_b.size(); idx++) { + p_img_right_b[idx].destroy(e6); + } +#endif + errors += check_empty(e4, p_l_test, "p_l_test"); + errors += check_empty(e5, p_ri_test, "p_ri_test"); + errors += check_empty(e6, p_rb_test, "p_rb_test"); + for(unsigned idx = 0; idx < p_l_test.size(); idx++) { + p_l_test[idx].destroy(e4); + } + for(unsigned idx = 0; idx < p_ri_test.size(); idx++) { + p_ri_test[idx].destroy(e5); + } + for(unsigned idx = 0; idx < p_rb_test.size(); idx++) { + p_rb_test[idx].destroy(e6); + } + + return errors; + } + + virtual int check_partitioning(void) + { + int errors = 0; + + Point<1> pc = is_cells.bounds.lo; + Point<1> pf = is_faces.bounds.lo; + + for(int blkid = 0; blkid < n_blocks; blkid++) { + int bx = blkid % blocks_x; + int by = (blkid / blocks_x) % blocks_y; + int bz = blkid / blocks_x / blocks_y; + + int nx = xsplit[bx + 1] - xsplit[bx]; + int ny = ysplit[by + 1] - ysplit[by]; + int nz = zsplit[bz + 1] - zsplit[bz]; + + // check cells + for(int i = 0; i < cells_per_block[blkid]; i++) { + for(int j = 0; j < n_blocks; j++) { + bool exp = (j == blkid); + bool act = p_cells[j].contains(pc); + if(exp != act) { + log_app.error() << "mismatch: cell " << pc << " in p_cells[" << j + << "]: exp=" << exp << " act=" << act; + errors++; + } + } + + std::set exp_ghosts; + int cx = i % nx; + int cy = (i / nx) % ny; + int cz = i / nx / ny; + if((cx == 0) && (bx > 0)) + 
exp_ghosts.insert(blkid - 1); + if((cx == (nx - 1)) && (bx < (blocks_x - 1))) + exp_ghosts.insert(blkid + 1); + if((cy == 0) && (by > 0)) + exp_ghosts.insert(blkid - blocks_x); + if((cy == (ny - 1)) && (by < (blocks_y - 1))) + exp_ghosts.insert(blkid + blocks_x); + if((cz == 0) && (bz > 0)) + exp_ghosts.insert(blkid - blocks_x * blocks_y); + if((cz == (nz - 1)) && (bz < (blocks_z - 1))) + exp_ghosts.insert(blkid + blocks_x * blocks_y); + + for(int j = 0; j < n_blocks; j++) { + bool exp = exp_ghosts.count(j) > 0; + bool act = p_ghost[j].contains(pc); + if(exp != act) { + log_app.error() << "mismatch: cell " << pc << " in p_ghost[" << j + << "]: exp=" << exp << " act=" << act; + errors++; + } + } + + pc[0]++; + } + + // check faces + for(int i = 0; i < faces_per_block[blkid]; i++) { + for(int j = 0; j < n_blocks; j++) { + bool exp = (j == blkid); + bool act = p_faces[j].contains(pf); + if(exp != act) { + log_app.error() << "mismatch: face " << pf << " in p_faces[" << j + << "]: exp=" << exp << " act=" << act; + errors++; + } + FaceType exptype = BC_INTERIOR; + // luckily the faces on the edge of a block come in chunks + int lr_faces = (nx + 1) * ny * nz; + int du_faces = nx * (ny + 1) * nz; + int bf_faces = nx * ny * (nz + 1); + assert((lr_faces + du_faces + bf_faces) == faces_per_block[blkid]); + if(i < lr_faces) { + int x = i / ny / nz; + if(x == 0) + exptype = ((bx == 0) ? ((problem_type == PTYPE_0) ? BC_EXTRAPOLATE + : (problem_type == PTYPE_1) ? BC_INFLOW + : BC_INFLOW) + : BC_BLOCK_BORDER); + if(x == nx) + exptype = + ((bx == blocks_x - 1) ? ((problem_type == PTYPE_0) ? BC_EXTRAPOLATE + : (problem_type == PTYPE_1) ? BC_EXTRAPOLATE + : BC_EXTRAPOLATE) + : BC_BLOCK_BORDER); + } else if(i < (lr_faces + du_faces)) { + int y = (i - lr_faces) / nx / nz; + if(y == 0) + exptype = ((by == 0) ? ((problem_type == PTYPE_0) ? BC_TANGENT + : (problem_type == PTYPE_1) ? BC_NOSLIP + : BC_TANGENT) + : BC_BLOCK_BORDER); + if(y == ny) + exptype = + ((by == blocks_y - 1) ? 
((problem_type == PTYPE_0) ? BC_TANGENT + : (problem_type == PTYPE_1) ? BC_EXTRAPOLATE + : BC_TANGENT) + : BC_BLOCK_BORDER); + } else { + int z = (i - lr_faces - du_faces) / nx / ny; + if(z == 0) + exptype = ((bz == 0) ? ((problem_type == PTYPE_0) ? BC_TANGENT + : (problem_type == PTYPE_1) ? BC_TANGENT + : BC_TANGENT) + : BC_BLOCK_BORDER); + if(z == nz) + exptype = ((bz == blocks_z - 1) ? ((problem_type == PTYPE_0) ? BC_TANGENT + : (problem_type == PTYPE_1) ? BC_TANGENT + : BC_TANGENT) + : BC_BLOCK_BORDER); + } + + for(int k = 0; k < BC_TOTAL; k++) { + bool exp = (j == blkid) && (k == exptype); + bool act = p_facetypes[j][k].contains(pf); + if(exp != act) { + log_app.error() << "mismatch: face " << pf << " in p_facetypes[" << j + << "][" << k << "]: exp=" << exp << " act=" << act; + errors++; + } + } + } + pf[0]++; + } + } + for(unsigned idx = 0; idx < p_cells.size(); idx++) { + p_cells[idx].destroy(); + } + for(unsigned idx = 0; idx < p_faces.size(); idx++) { + p_faces[idx].destroy(); + } + for(unsigned i = 0; i < p_facetypes.size(); i++) { + for(unsigned j = 0; j < p_facetypes[i].size(); j++) { + p_facetypes[i][j].destroy(); + } + } + for(unsigned idx = 0; idx < p_ghost.size(); idx++) { + p_ghost[idx].destroy(); + } + + return errors; + } +}; + +class CircuitTest : public TestInterface { +public: + // graph config parameters + int num_nodes = 100; + int num_edges = 10; + int num_pieces = 2; + int pct_wire_in_piece = 50; + + CircuitTest(int argc, const char *argv[]) + { + for(int i = 1; i < argc; i++) { + if(!strcmp(argv[i], "-n")) { + num_nodes = atoi(argv[++i]); + continue; + } + + if(!strcmp(argv[i], "-e")) { + num_edges = atoi(argv[++i]); + continue; + } + + if(!strcmp(argv[i], "-p")) { + num_pieces = atoi(argv[++i]); + continue; + } + } + } + + struct InitDataArgs { + int index; + RegionInstance ri_nodes, ri_edges; + }; + + enum PRNGStreams + { + NODE_SUBCKT_STREAM, + EDGE_IN_NODE_STREAM, + EDGE_OUT_NODE_STREAM1, + EDGE_OUT_NODE_STREAM2, + }; + + // nodes and 
edges are generated pseudo-randomly so that we can check the results + // without + // needing all the field data in any one place + void random_node_data(int idx, int &subckt) + { + if(random_colors) + subckt = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBCKT_STREAM, num_pieces); + else + subckt = idx * num_pieces / num_nodes; + } + + void random_edge_data(int idx, Point<1> &in_node, Point<1> &out_node) + { + if(random_colors) { + in_node = Philox_2x32<>::rand_int(random_seed, idx, EDGE_IN_NODE_STREAM, num_nodes); + out_node = + Philox_2x32<>::rand_int(random_seed, idx, EDGE_OUT_NODE_STREAM1, num_nodes); + } else { + int subckt = idx * num_pieces / num_edges; + int n_lo = subckt * num_nodes / num_pieces; + int n_hi = (subckt + 1) * num_nodes / num_pieces; + in_node = n_lo + Philox_2x32<>::rand_int(random_seed, idx, EDGE_IN_NODE_STREAM, + n_hi - n_lo); + int pct = Philox_2x32<>::rand_int(random_seed, idx, EDGE_OUT_NODE_STREAM2, 100); + if(pct < pct_wire_in_piece) + out_node = n_lo + Philox_2x32<>::rand_int(random_seed, idx, EDGE_OUT_NODE_STREAM1, + n_hi - n_lo); + else + out_node = + Philox_2x32<>::rand_int(random_seed, idx, EDGE_OUT_NODE_STREAM1, num_nodes); + } + } + + static void init_data_task_wrapper(const void *args, size_t arglen, + const void *userdata, size_t userlen, Processor p) + { + CircuitTest *me = (CircuitTest *)testcfg; + me->init_data_task(args, arglen, p); + } + + void init_data_task(const void *args, size_t arglen, Processor p) + { + const InitDataArgs &i_args = *(const InitDataArgs *)args; + + log_app.info() << "init task #" << i_args.index << " (ri_nodes=" << i_args.ri_nodes + << ", ri_edges=" << i_args.ri_edges << ")"; + + i_args.ri_nodes.fetch_metadata(p).wait(); + i_args.ri_edges.fetch_metadata(p).wait(); + + IndexSpace<1> is_nodes = i_args.ri_nodes.get_indexspace<1>(); + IndexSpace<1> is_edges = i_args.ri_edges.get_indexspace<1>(); + + log_app.debug() << "N: " << is_nodes; + log_app.debug() << "E: " << is_edges; + + { + AffineAccessor 
a_subckt_id(i_args.ri_nodes, 0 /* offset */); + + for(int i = is_nodes.bounds.lo; i <= is_nodes.bounds.hi; i++) { + int subckt; + random_node_data(i, subckt); + a_subckt_id.write(i, subckt); + } + } + + { + AffineAccessor, 1> a_in_node(i_args.ri_edges, + 0 * sizeof(Point<1>) /* offset */); + AffineAccessor, 1> a_out_node(i_args.ri_edges, + 1 * sizeof(Point<1>) /* offset */); + + for(int i = is_edges.bounds.lo; i <= is_edges.bounds.hi; i++) { + Point<1> in_node, out_node; + random_edge_data(i, in_node, out_node); + a_in_node.write(i, in_node); + a_out_node.write(i, out_node); + } + } + + if(show_graph) { + AffineAccessor a_subckt_id(i_args.ri_nodes, 0 /* offset */); + + for(int i = is_nodes.bounds.lo; i <= is_nodes.bounds.hi; i++) + std::cout << "subckt_id[" << i << "] = " << a_subckt_id.read(i) << "\n"; + + AffineAccessor, 1> a_in_node(i_args.ri_edges, + 0 * sizeof(Point<1>) /* offset */); + + for(int i = is_edges.bounds.lo; i <= is_edges.bounds.hi; i++) + std::cout << "in_node[" << i << "] = " << a_in_node.read(i) << "\n"; + + AffineAccessor, 1> a_out_node(i_args.ri_edges, + 1 * sizeof(Point<1>) /* offset */); + + for(int i = is_edges.bounds.lo; i <= is_edges.bounds.hi; i++) + std::cout << "out_node[" << i << "] = " << a_out_node.read(i) << "\n"; + } + } + + IndexSpace<1> is_nodes, is_edges; + std::vector ri_nodes; + std::vector, int>> subckt_field_data; + std::vector ri_edges; + std::vector, Point<1>>> in_node_field_data; + std::vector, Point<1>>> out_node_field_data; + + virtual void print_info(void) + { + printf("Realm dependent partitioning test - circuit: %d nodes, %d edges, %d pieces\n", + (int)num_nodes, (int)num_edges, (int)num_pieces); + } + + virtual Event initialize_data(const std::vector &memories, + const std::vector &procs) + { + // now create index spaces for nodes and edges + is_nodes = Rect<1>(0, num_nodes - 1); + is_edges = Rect<1>(0, num_edges - 1); + + // equal partition is used to do initial population of edges and nodes + std::vector> 
ss_nodes_eq; + std::vector> ss_edges_eq; + + is_nodes + .create_equal_subspaces(num_pieces, 1, ss_nodes_eq, Realm::ProfilingRequestSet()) + .wait(); + is_edges + .create_equal_subspaces(num_pieces, 1, ss_edges_eq, Realm::ProfilingRequestSet()) + .wait(); + + log_app.debug() << "Initial partitions:"; + for(size_t i = 0; i < ss_nodes_eq.size(); i++) + log_app.debug() << " Nodes #" << i << ": " << ss_nodes_eq[i]; + for(size_t i = 0; i < ss_edges_eq.size(); i++) + log_app.debug() << " Edges #" << i << ": " << ss_edges_eq[i]; + + // create instances for each of these subspaces + std::vector node_fields, edge_fields; + node_fields.push_back(sizeof(int)); // subckt_id + assert(sizeof(int) == sizeof(Point<1>)); + edge_fields.push_back(sizeof(Point<1>)); // in_node + edge_fields.push_back(sizeof(Point<1>)); // out_node + + ri_nodes.resize(num_pieces); + subckt_field_data.resize(num_pieces); + + for(size_t i = 0; i < ss_nodes_eq.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, memories[i % memories.size()], ss_nodes_eq[i], + node_fields, 0 /*SOA*/, + Realm::ProfilingRequestSet()) + .wait(); + ri_nodes[i] = ri; + + subckt_field_data[i].index_space = ss_nodes_eq[i]; + subckt_field_data[i].inst = ri_nodes[i]; + subckt_field_data[i].field_offset = 0; + } + + ri_edges.resize(num_pieces); + in_node_field_data.resize(num_pieces); + out_node_field_data.resize(num_pieces); + + for(size_t i = 0; i < ss_edges_eq.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, memories[i % memories.size()], ss_edges_eq[i], + edge_fields, 0 /*SOA*/, + Realm::ProfilingRequestSet()) + .wait(); + ri_edges[i] = ri; + + in_node_field_data[i].index_space = ss_edges_eq[i]; + in_node_field_data[i].inst = ri_edges[i]; + in_node_field_data[i].field_offset = 0 * sizeof(Point<1>); + + out_node_field_data[i].index_space = ss_edges_eq[i]; + out_node_field_data[i].inst = ri_edges[i]; + out_node_field_data[i].field_offset = 1 * sizeof(Point<1>); + } + + // fire off 
tasks to initialize data + std::set events; + for(int i = 0; i < num_pieces; i++) { + Processor p = procs[i % memories.size()]; + InitDataArgs args; + args.index = i; + args.ri_nodes = ri_nodes[i]; + args.ri_edges = ri_edges[i]; + Event e = p.spawn(INIT_CIRCUIT_DATA_TASK, &args, sizeof(args)); + events.insert(e); + } + + return Event::merge_events(events); + } + + // the outputs of our partitioning will be: + // is_private, is_shared - subsets of is_nodes based on private/shared + // p_pvt, p_shr, p_ghost - subsets of the above split by subckt + // p_edges - subsets of is_edges for each subckt + + IndexSpace<1> is_shared, is_private; + std::vector> p_pvt, p_shr, p_ghost; + std::vector> p_edges; + + virtual Event perform_partitioning(void) + { + // first partition nodes by subckt id (this is the independent partition, + // but not actually used by the app) + std::vector> p_nodes; + + std::vector colors(num_pieces); + for(int i = 0; i < num_pieces; i++) + colors[i] = i; + + Event e1 = is_nodes.create_subspaces_by_field(subckt_field_data, colors, p_nodes, + Realm::ProfilingRequestSet()); + if(wait_on_events) + e1.wait(); + + // now compute p_edges based on the color of their in_node (i.e. 
a preimage) + Event e2 = is_edges.create_subspaces_by_preimage(in_node_field_data, p_nodes, p_edges, + Realm::ProfilingRequestSet(), e1); + if(wait_on_events) + e2.wait(); + + // an image of p_edges through out_node gives us all the shared nodes, along + // with some private nodes +#ifdef USE_IMAGE_DIFF + Event e4 = is_nodes.create_subspaces_by_image_with_difference( + out_node_field_data, p_edges, p_nodes, p_ghost, Realm::ProfilingRequestSet(), e2); + if(wait_on_events) + e4.wait(); +#else + std::vector> p_extra_nodes; + + Event e3 = is_nodes.create_subspaces_by_image( + out_node_field_data, p_edges, p_extra_nodes, Realm::ProfilingRequestSet(), e2); + if(wait_on_events) + e3.wait(); + + // subtracting out those private nodes gives us p_ghost + Event e4 = IndexSpace<1>::compute_differences(p_extra_nodes, p_nodes, p_ghost, + Realm::ProfilingRequestSet(), e3); + if(wait_on_events) + e4.wait(); +#endif + + // the union of everybody's ghost nodes is is_shared + Event e5 = IndexSpace<1>::compute_union(p_ghost, is_shared, + Realm::ProfilingRequestSet(), e4); + if(wait_on_events) + e5.wait(); + + // and is_private is just the nodes of is_nodes that aren't in is_shared + Event e6 = IndexSpace<1>::compute_difference(is_nodes, is_shared, is_private, + Realm::ProfilingRequestSet(), e5); + if(wait_on_events) + e6.wait(); + + // the intersection of the original p_nodes with is_shared gives us p_shr + // (note that we can do this in parallel with the computation of is_private) + Event e7 = IndexSpace<1>::compute_intersections(p_nodes, is_shared, p_shr, + Realm::ProfilingRequestSet(), e5); + if(wait_on_events) + e7.wait(); + + // and finally, the intersection of p_nodes with is_private gives us p_pvt + Event e8 = IndexSpace<1>::compute_intersections(p_nodes, is_private, p_pvt, + Realm::ProfilingRequestSet(), e6); + if(wait_on_events) + e8.wait(); + + // all done - wait on e7 and e8, which dominate every other operation + Event e9 = Event::merge_events(e7, e8); + + for(unsigned 
idx = 0; idx < p_nodes.size(); idx++) { + p_nodes[idx].destroy(e9); + } + + return e9; + } + + virtual int perform_dynamic_checks(void) + { + int errors = 0; + // compute the intermediates for the checks - these duplicate things we + // already have, but we're not supposed to know that here + std::vector> p_pvt_and_shr, p_all; + Event e1 = IndexSpace<1>::compute_unions( + p_pvt, p_shr, p_pvt_and_shr, Realm::ProfilingRequestSet(), Event::NO_EVENT); + Event e2 = IndexSpace<1>::compute_unions(p_pvt_and_shr, p_ghost, p_all, + Realm::ProfilingRequestSet(), e1); +#ifdef USE_IMAGE_DIFF + std::vector> p_in_test, p_out_test; + Event e5 = is_nodes.create_subspaces_by_image_with_difference( + in_node_field_data, p_edges, p_pvt_and_shr, p_in_test, + Realm::ProfilingRequestSet(), e1); + Event e6 = is_nodes.create_subspaces_by_image_with_difference( + out_node_field_data, p_edges, p_all, p_out_test, Realm::ProfilingRequestSet(), + e2); +#else + std::vector> p_in_img, p_out_img; + Event e3 = + is_nodes.create_subspaces_by_image(in_node_field_data, p_edges, p_in_img, + Realm::ProfilingRequestSet(), Event::NO_EVENT); + Event e4 = + is_nodes.create_subspaces_by_image(out_node_field_data, p_edges, p_out_img, + Realm::ProfilingRequestSet(), Event::NO_EVENT); + std::vector> p_in_test, p_out_test; + Event e5 = IndexSpace<1>::compute_differences(p_in_img, p_pvt_and_shr, p_in_test, + Realm::ProfilingRequestSet(), + Event::merge_events(e1, e3)); + Event e6 = IndexSpace<1>::compute_differences(p_out_img, p_all, p_out_test, + Realm::ProfilingRequestSet(), + Event::merge_events(e2, e4)); + for(unsigned idx = 0; idx < p_in_img.size(); idx++) { + p_in_img[idx].destroy(e5); + } + for(unsigned idx = 0; idx < p_out_img.size(); idx++) { + p_out_img[idx].destroy(e6); + } +#endif + + errors += check_empty(e5, p_in_test, "p_in_test"); + errors += check_empty(e6, p_out_test, "p_out_test"); + for(unsigned idx = 0; idx < p_pvt_and_shr.size(); idx++) { + p_pvt_and_shr[idx].destroy(e5); + } + for(unsigned 
idx = 0; idx < p_all.size(); idx++) { + p_all[idx].destroy(e6); + } + for(unsigned idx = 0; idx < p_in_test.size(); idx++) { + p_in_test[idx].destroy(e5); + } + for(unsigned idx = 0; idx < p_out_test.size(); idx++) { + p_out_test[idx].destroy(e6); + } + + return errors; + } + + virtual int check_partitioning(void) + { + int errors = 0; + + // we'll make up the list of nodes we expect to be shared as we walk the edges + std::map> ghost_nodes; + +#ifdef DUMP_OUTPUT_SPACES + dump_sparse_index_space<1, int>("is_private", is_private); + dump_sparse_index_space<1, int>("is_shared", is_shared); + + for(int p = 0; p < num_pieces; p++) { + std::cout << "Piece #" << p << "\n"; + dump_sparse_index_space<1, int>("p_pvt", p_pvt[p]); + dump_sparse_index_space<1, int>("p_shr", p_shr[p]); + dump_sparse_index_space<1, int>("p_ghost", p_ghost[p]); + } +#endif + + for(int i = 0; i < num_edges; i++) { + // regenerate the random info for this edge and the two nodes it touches + Point<1> in_node, out_node; + int in_subckt, out_subckt; + random_edge_data(i, in_node, out_node); + random_node_data(in_node, in_subckt); + random_node_data(out_node, out_subckt); + + // the edge should be in exactly the p_edges for in_subckt + for(int p = 0; p < num_pieces; p++) { + bool exp = (p == in_subckt); + bool act = p_edges[p].contains(i); + if(exp != act) { + log_app.error() << "mismatch: edge " << i << " in p_edges[" << p + << "]: exp=" << exp << " act=" << act; + errors++; + } + } + + // is the output node a ghost for this wire? 
+ if(in_subckt != out_subckt) + ghost_nodes[out_node].insert(in_subckt); + } + + // now we can check the nodes + for(int i = 0; i < num_nodes; i++) { + int subckt; + random_node_data(i, subckt); + // check is_private and is_shared first + { + bool exp = ghost_nodes.count(i) == 0; + bool act = is_private.contains(i); + if(exp != act) { + log_app.error() << "mismatch: node " << i << " in is_private: exp=" << exp + << " act=" << act; + errors++; + } + } + { + bool exp = ghost_nodes.count(i) > 0; + bool act = is_shared.contains(i); + if(exp != act) { + log_app.error() << "mismatch: node " << i << " in is_shared: exp=" << exp + << " act=" << act; + errors++; + } + } + + // now check p_pvt/shr/ghost + for(int p = 0; p < num_pieces; p++) { + bool exp = (subckt == p) && (ghost_nodes.count(i) == 0); + bool act = p_pvt[p].contains(i); + if(exp != act) { + log_app.error() << "mismatch: node " << i << " in p_pvt[" << p + << "]: exp=" << exp << " act=" << act; + errors++; + } + } + for(int p = 0; p < num_pieces; p++) { + bool exp = (subckt == p) && (ghost_nodes.count(i) > 0); + bool act = p_shr[p].contains(i); + if(exp != act) { + log_app.error() << "mismatch: node " << i << " in p_shr[" << p + << "]: exp=" << exp << " act=" << act; + errors++; + } + } + for(int p = 0; p < num_pieces; p++) { + bool exp = + (subckt != p) && (ghost_nodes.count(i) > 0) && (ghost_nodes[i].count(p) > 0); + bool act = p_ghost[p].contains(i); + if(exp != act) { + log_app.error() << "mismatch: node " << i << " in p_ghost[" << p + << "]: exp=" << exp << " act=" << act; + errors++; + } + } + } + + is_shared.destroy(); + is_private.destroy(); + for(unsigned idx = 0; idx < p_pvt.size(); idx++) { + p_pvt[idx].destroy(); + } + for(unsigned idx = 0; idx < p_shr.size(); idx++) { + p_shr[idx].destroy(); + } + for(unsigned idx = 0; idx < p_ghost.size(); idx++) { + p_ghost[idx].destroy(); + } + for(unsigned idx = 0; idx < p_edges.size(); idx++) { + p_edges[idx].destroy(); + } + + return errors; + } +}; + +class 
PennantTest : public TestInterface { +public: +public: + // graph config parameters + enum MeshType + { + RectangularMesh, + }; + MeshType mesh_type = RectangularMesh; + int nzx = 10; // number of zones in x + int nzy = 10; // number of zones in y + int numpcx = 2; // number of submeshes in x + int numpcy = 2; // number of submeshes in y + + int npx, npy; // number of points in each dimension + int nz, ns, np, numpc; // total number of zones, sides, points, and pieces + std::vector zxbound, zybound; // x and y split points between submeshes + std::vector lz, ls, lp; // number of zones, sides, and points in each submesh + + // can't do 64-bit index types right now, so at least get most of our 32-bit space + typedef int INDEXTYPE; + static const INDEXTYPE FIRST_INDEX = -2000000000; // easier to read than INT_MIN+1 + + PennantTest(int argc, const char *argv[]) + { +#define INT_ARG(s, v) \ + if(!strcmp(argv[i], s)) { \ + v = atoi(argv[++i]); \ + continue; \ + } + for(int i = 1; i < argc; i++) { + INT_ARG("-nzx", nzx) + INT_ARG("-nzy", nzy) + INT_ARG("-numpcx", numpcx) + INT_ARG("-numpcy", numpcy) + if(!strcmp(argv[i], "-nz")) { + int v = atoi(argv[++i]); + nzx = nzy = v; + continue; + } + if(!strcmp(argv[i], "-numpc")) { + int v = atoi(argv[++i]); + numpcx = numpcy = v; + continue; + } + } +#undef INT_ARG + + switch(mesh_type) { + case RectangularMesh: + { + npx = nzx + 1; + npy = nzy + 1; + numpc = numpcx * numpcy; + + zxbound.resize(numpcx + 1); + for(int i = 0; i <= numpcx; i++) + zxbound[i] = (i * nzx) / numpcx; + + zybound.resize(numpcy + 1); + for(int i = 0; i <= numpcy; i++) + zybound[i] = (i * nzy) / numpcy; + + nz = ns = np = 0; + for(int pcy = 0; pcy < numpcy; pcy++) { + for(int pcx = 0; pcx < numpcx; pcx++) { + int lx = zxbound[pcx + 1] - zxbound[pcx]; + int ly = zybound[pcy + 1] - zybound[pcy]; + + int zones = lx * ly; + int sides = zones * 4; + // points are a little funny - shared edges go to the lower numbered piece + int points = ((pcx == 0) ? 
(lx + 1) : lx) * ((pcy == 0) ? (ly + 1) : ly); + + lz.push_back(zones); + ls.push_back(sides); + lp.push_back(points); + nz += zones; + ns += sides; + np += points; + } + } + + assert(nz == (nzx * nzy)); + assert(ns == (4 * nzx * nzy)); + assert(np == (npx * npy)); + + break; + } + } + } + + virtual void print_info(void) + { + printf("Realm dependent partitioning test - pennant: %d x %d zones, %d x %d pieces\n", + (int)nzx, (int)nzy, (int)numpcx, (int)numpcy); + } + + IndexSpace<1> is_zones, is_sides, is_points; + std::vector ri_zones; + std::vector, int>> zone_color_field_data; + std::vector ri_sides; + std::vector, Point<1>>> side_mapsz_field_data; + std::vector, Point<1>>> side_mapss3_field_data; + std::vector, Point<1>>> side_mapsp1_field_data; + std::vector, bool>> side_ok_field_data; + + struct InitDataArgs { + int index; + RegionInstance ri_zones, ri_sides; + }; + + virtual Event initialize_data(const std::vector &memories, + const std::vector &procs) + { + // top level index spaces + is_zones = Rect<1>(FIRST_INDEX, FIRST_INDEX + nz - 1); + is_sides = Rect<1>(FIRST_INDEX, FIRST_INDEX + ns - 1); + is_points = Rect<1>(FIRST_INDEX, FIRST_INDEX + np - 1); + + // weighted partitions based on the distribution we already computed + std::vector> ss_zones_w; + std::vector> ss_sides_w; + std::vector> ss_points_w; + + is_zones + .create_weighted_subspaces(numpc, 1, lz, ss_zones_w, Realm::ProfilingRequestSet()) + .wait(); + is_sides + .create_weighted_subspaces(numpc, 1, ls, ss_sides_w, Realm::ProfilingRequestSet()) + .wait(); + is_points + .create_weighted_subspaces(numpc, 1, lp, ss_points_w, + Realm::ProfilingRequestSet()) + .wait(); + + log_app.debug() << "Initial partitions:"; + for(size_t i = 0; i < ss_zones_w.size(); i++) + log_app.debug() << " Zones #" << i << ": " << ss_zones_w[i]; + for(size_t i = 0; i < ss_sides_w.size(); i++) + log_app.debug() << " Sides #" << i << ": " << ss_sides_w[i]; + for(size_t i = 0; i < ss_points_w.size(); i++) + log_app.debug() << " 
Points #" << i << ": " << ss_points_w[i]; + + // create instances for each of these subspaces + std::vector zone_fields, side_fields; + zone_fields.push_back(sizeof(int)); // color + assert(sizeof(int) == sizeof(Point<1>)); + side_fields.push_back(sizeof(Point<1>)); // mapsz + side_fields.push_back(sizeof(Point<1>)); // mapss3 + side_fields.push_back(sizeof(Point<1>)); // mapsp1 + side_fields.push_back(sizeof(bool)); // ok + + ri_zones.resize(numpc); + zone_color_field_data.resize(numpc); + + for(size_t i = 0; i < ss_zones_w.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, memories[i % memories.size()], ss_zones_w[i], + zone_fields, 0 /*SOA*/, + Realm::ProfilingRequestSet()) + .wait(); + ri_zones[i] = ri; + + zone_color_field_data[i].index_space = ss_zones_w[i]; + zone_color_field_data[i].inst = ri_zones[i]; + zone_color_field_data[i].field_offset = 0; + } + + ri_sides.resize(numpc); + side_mapsz_field_data.resize(numpc); + side_mapss3_field_data.resize(numpc); + side_mapsp1_field_data.resize(numpc); + side_ok_field_data.resize(numpc); + + for(size_t i = 0; i < ss_sides_w.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, memories[i % memories.size()], ss_sides_w[i], + side_fields, 0 /*SOA*/, + Realm::ProfilingRequestSet()) + .wait(); + ri_sides[i] = ri; + + side_mapsz_field_data[i].index_space = ss_sides_w[i]; + side_mapsz_field_data[i].inst = ri_sides[i]; + side_mapsz_field_data[i].field_offset = 0 * sizeof(Point<1>); + + side_mapss3_field_data[i].index_space = ss_sides_w[i]; + side_mapss3_field_data[i].inst = ri_sides[i]; + side_mapss3_field_data[i].field_offset = 1 * sizeof(Point<1>); + + side_mapsp1_field_data[i].index_space = ss_sides_w[i]; + side_mapsp1_field_data[i].inst = ri_sides[i]; + side_mapsp1_field_data[i].field_offset = 2 * sizeof(Point<1>); + + side_ok_field_data[i].index_space = ss_sides_w[i]; + side_ok_field_data[i].inst = ri_sides[i]; + side_ok_field_data[i].field_offset = 3 * sizeof(Point<1>); + } 
+ + // fire off tasks to initialize data + std::set events; + for(int i = 0; i < numpc; i++) { + Processor p = procs[i % memories.size()]; + InitDataArgs args; + args.index = i; + args.ri_zones = ri_zones[i]; + args.ri_sides = ri_sides[i]; + Event e = p.spawn(INIT_PENNANT_DATA_TASK, &args, sizeof(args)); + events.insert(e); + } + + return Event::merge_events(events); + } + + static void init_data_task_wrapper(const void *args, size_t arglen, + const void *userdata, size_t userlen, Processor p) + { + PennantTest *me = (PennantTest *)testcfg; + me->init_data_task(args, arglen, p); + } + + Point<1> global_point_pointer(int py, int px) const + { + int pp = FIRST_INDEX; + + // start by steping over whole y slabs - again be careful that the extra slab belongs + // to pcy == 0 + int dy; + if(py > zybound[1]) { + int pcy = 1; + while(py > zybound[pcy + 1]) + pcy++; + int slabs = zybound[pcy] + 1; + pp += npx * slabs; + py -= slabs; + dy = zybound[pcy + 1] - zybound[pcy]; + } else { + dy = zybound[1] + 1; + } + + // now chunks in x, using just the y width of this row of chunks + int dx; + if(px > zxbound[1]) { + int pcx = 1; + while(px > zxbound[pcx + 1]) + pcx++; + int strips = zxbound[pcx] + 1; + pp += dy * strips; + px -= strips; + dx = zxbound[pcx + 1] - zxbound[pcx]; + } else { + dx = zxbound[1] + 1; + } + + // finally, px and py are now local and are handled easily + pp += py * dx + px; + + return pp; + } + + void init_data_task(const void *args, size_t arglen, Processor p) + { + const InitDataArgs &i_args = *(const InitDataArgs *)args; + + log_app.info() << "init task #" << i_args.index << " (ri_zones=" << i_args.ri_zones + << ", ri_sides=" << i_args.ri_sides << ")"; + + i_args.ri_zones.fetch_metadata(p).wait(); + i_args.ri_sides.fetch_metadata(p).wait(); + + IndexSpace<1> is_zones = i_args.ri_zones.get_indexspace<1>(); + IndexSpace<1> is_sides = i_args.ri_sides.get_indexspace<1>(); + + log_app.debug() << "Z: " << is_zones; + log_app.debug() << "S: " << is_sides; + + 
int pcx = i_args.index % numpcx; + int pcy = i_args.index / numpcx; + + int zxlo = zxbound[pcx]; + int zxhi = zxbound[pcx + 1]; + int zylo = zybound[pcy]; + int zyhi = zybound[pcy + 1]; + + { + AffineAccessor a_zone_color(i_args.ri_zones, 0 /* offset */); + AffineAccessor, 1> a_side_mapsz(i_args.ri_sides, + 0 * sizeof(Point<1>) /* offset */); + AffineAccessor, 1> a_side_mapss3(i_args.ri_sides, + 1 * sizeof(Point<1>) /* offset */); + AffineAccessor, 1> a_side_mapsp1(i_args.ri_sides, + 2 * sizeof(Point<1>) /* offset */); + AffineAccessor a_side_ok(i_args.ri_sides, + 3 * sizeof(Point<1>) /* offset */); + + Point<1> pz = is_zones.bounds.lo; + Point<1> ps = is_sides.bounds.lo; + + for(int zy = zylo; zy < zyhi; zy++) { + for(int zx = zxlo; zx < zxhi; zx++) { + // get 4 side pointers + Point<1> ps0 = ps; + ps[0]++; + Point<1> ps1 = ps; + ps[0]++; + Point<1> ps2 = ps; + ps[0]++; + Point<1> ps3 = ps; + ps[0]++; + + // point pointers are ugly because they can be in neighbors - use a helper + Point<1> pp0 = global_point_pointer(zy, zx); // go CCW + Point<1> pp1 = global_point_pointer(zy + 1, zx); + Point<1> pp2 = global_point_pointer(zy + 1, zx + 1); + Point<1> pp3 = global_point_pointer(zy, zx + 1); + + a_zone_color.write(pz, i_args.index); + + a_side_mapsz.write(ps0, pz); + a_side_mapsz.write(ps1, pz); + a_side_mapsz.write(ps2, pz); + a_side_mapsz.write(ps3, pz); + + a_side_mapss3.write(ps0, ps1); + a_side_mapss3.write(ps1, ps2); + a_side_mapss3.write(ps2, ps3); + a_side_mapss3.write(ps3, ps0); + + a_side_mapsp1.write(ps0, pp0); + a_side_mapsp1.write(ps1, pp1); + a_side_mapsp1.write(ps2, pp2); + a_side_mapsp1.write(ps3, pp3); + + a_side_ok.write(ps0, true); + a_side_ok.write(ps1, true); + a_side_ok.write(ps2, true); + a_side_ok.write(ps3, true); + + pz[0]++; + } + } + assert(pz[0] == is_zones.bounds.hi[0] + 1); + assert(ps[0] == is_sides.bounds.hi[0] + 1); + } + + if(show_graph) { + AffineAccessor a_zone_color(i_args.ri_zones, 0 /* offset */); + + for(int i = 
is_zones.bounds.lo; i <= is_zones.bounds.hi; i++) + std::cout << "Z[" << i << "]: color=" << a_zone_color.read(i) << "\n"; + + AffineAccessor, 1> a_side_mapsz(i_args.ri_sides, + 0 * sizeof(Point<1>) /* offset */); + AffineAccessor, 1> a_side_mapss3(i_args.ri_sides, + 1 * sizeof(Point<1>) /* offset */); + AffineAccessor, 1> a_side_mapsp1(i_args.ri_sides, + 2 * sizeof(Point<1>) /* offset */); + AffineAccessor a_side_ok(i_args.ri_sides, + 3 * sizeof(Point<1>) /* offset */); + + for(int i = is_sides.bounds.lo; i <= is_sides.bounds.hi; i++) + std::cout << "S[" << i << "]:" + << " mapsz=" << a_side_mapsz.read(i) + << " mapss3=" << a_side_mapss3.read(i) + << " mapsp1=" << a_side_mapsp1.read(i) << " ok=" << a_side_ok.read(i) + << "\n"; + } + } + + // the outputs of our partitioning will be: + // p_zones - subsets of is_zones split by piece + // p_sides - subsets of is_sides split by piece (with bad sides removed) + // p_points - subsets of is_points by piece (aliased) + + std::vector> p_zones; + std::vector> p_sides; + std::vector> p_points; + + virtual Event perform_partitioning(void) + { + // first get the set of bad sides (i.e. 
ok == false) + IndexSpace<1> bad_sides; + + Event e1 = is_sides.create_subspace_by_field(side_ok_field_data, false, bad_sides, + Realm::ProfilingRequestSet()); + if(wait_on_events) + e1.wait(); + + // map the bad sides through to bad zones + IndexSpace<1> bad_zones; + Event e2 = is_zones.create_subspace_by_image( + side_mapsz_field_data, bad_sides, bad_zones, Realm::ProfilingRequestSet(), e1); + if(wait_on_events) + e2.wait(); + bad_sides.destroy(e2); + + // subtract bad zones to get good zones + IndexSpace<1> good_zones; + Event e3 = IndexSpace<1>::compute_difference(is_zones, bad_zones, good_zones, + Realm::ProfilingRequestSet(), e2); + if(wait_on_events) + e3.wait(); + bad_zones.destroy(e3); + + // now do actual partitions with just good zones + std::vector colors(numpc); + for(int i = 0; i < numpc; i++) + colors[i] = i; + + Event e4 = good_zones.create_subspaces_by_field( + zone_color_field_data, colors, p_zones, Realm::ProfilingRequestSet(), e3); + if(wait_on_events) + e4.wait(); + good_zones.destroy(e4); + + // preimage of zones is sides + Event e5 = is_sides.create_subspaces_by_preimage( + side_mapsz_field_data, p_zones, p_sides, Realm::ProfilingRequestSet(), e4); + if(wait_on_events) + e5.wait(); + + // and image of sides->mapsp1 is points + Event e6 = is_points.create_subspaces_by_image( + side_mapsp1_field_data, p_sides, p_points, Realm::ProfilingRequestSet(), e5); + if(wait_on_events) + e6.wait(); + + return e6; + } + + virtual int perform_dynamic_checks(void) + { + int errors = 0; + + // pennant's checks are actually slower with the fused image/diff +#ifdef PENNANT_USE_IMAGE_DIFF + std::vector> p_z_test, p_p_test, p_s_test; + Event e4 = is_zones.create_subspaces_by_image_with_difference( + side_mapsz_field_data, p_sides, p_zones, p_z_test, Realm::ProfilingRequestSet()); + Event e5 = is_points.create_subspaces_by_image_with_difference( + side_mapsp1_field_data, p_sides, p_points, p_p_test, + Realm::ProfilingRequestSet()); + Event e6 = 
is_sides.create_subspaces_by_image_with_difference( + side_mapss3_field_data, p_sides, p_sides, p_s_test, Realm::ProfilingRequestSet()); +#else + std::vector> p_img_mapsz, p_img_mapsp1, p_img_mapss3; + Event e1 = is_zones.create_subspaces_by_image( + side_mapsz_field_data, p_sides, p_img_mapsz, Realm::ProfilingRequestSet()); + Event e2 = is_points.create_subspaces_by_image( + side_mapsp1_field_data, p_sides, p_img_mapsp1, Realm::ProfilingRequestSet()); + Event e3 = is_sides.create_subspaces_by_image( + side_mapss3_field_data, p_sides, p_img_mapss3, Realm::ProfilingRequestSet()); + std::vector> p_z_test, p_p_test, p_s_test; + Event e4 = IndexSpace<1>::compute_differences(p_img_mapsz, p_zones, p_z_test, + Realm::ProfilingRequestSet(), e1); + for(unsigned idx = 0; idx < p_img_mapsz.size(); idx++) { + p_img_mapsz[idx].destroy(e4); + } + Event e5 = IndexSpace<1>::compute_differences(p_img_mapsp1, p_points, p_p_test, + Realm::ProfilingRequestSet(), e2); + for(unsigned idx = 0; idx < p_img_mapsp1.size(); idx++) { + p_img_mapsp1[idx].destroy(e5); + } + Event e6 = IndexSpace<1>::compute_differences(p_img_mapss3, p_sides, p_s_test, + Realm::ProfilingRequestSet(), e3); + for(unsigned idx = 0; idx < p_img_mapss3.size(); idx++) { + p_img_mapss3[idx].destroy(e6); + } +#endif + errors += check_empty(e4, p_z_test, "p_z_test"); + errors += check_empty(e5, p_p_test, "p_p_test"); + errors += check_empty(e6, p_s_test, "p_s_test"); + for(unsigned idx = 0; idx < p_z_test.size(); idx++) { + p_z_test[idx].destroy(e4); + } + for(unsigned idx = 0; idx < p_p_test.size(); idx++) { + p_p_test[idx].destroy(e5); + } + for(unsigned idx = 0; idx < p_s_test.size(); idx++) { + p_s_test[idx].destroy(e6); + } + + return errors; + } + + virtual int check_partitioning(void) + { + int errors = 0; + + for(int pcy = 0; pcy < numpcy; pcy++) { + for(int pcx = 0; pcx < numpcx; pcx++) { + int idx = pcy * numpcx + pcx; + + int lx = zxbound[pcx + 1] - zxbound[pcx]; + int ly = zybound[pcy + 1] - zybound[pcy]; + + 
int exp_zones = lx * ly; + int exp_sides = exp_zones * 4; + int exp_points = (lx + 1) * (ly + 1); // easier because of aliasing + + int act_zones = p_zones[idx].volume(); + int act_sides = p_sides[idx].volume(); + int act_points = p_points[idx].volume(); + + if(exp_zones != act_zones) { + log_app.error() << "Piece #" << idx + << ": zone count mismatch: exp = " << exp_zones + << ", act = " << act_zones; + errors++; + } + if(exp_sides != act_sides) { + log_app.error() << "Piece #" << idx + << ": side count mismatch: exp = " << exp_sides + << ", act = " << act_sides; + errors++; + } + if(exp_points != act_points) { + log_app.error() << "Piece #" << idx + << ": point count mismatch: exp = " << exp_points + << ", act = " << act_points; + errors++; + } + } + } + + // check zones + Point<1> pz = is_zones.bounds.lo; + for(int pc = 0; pc < numpc; pc++) { + for(int i = 0; i < lz[pc]; i++) { + for(int j = 0; j < numpc; j++) { + bool exp = (j == pc); + bool act = p_zones[j].contains(pz); + if(exp != act) { + log_app.error() << "mismatch: zone " << pz << " in p_zones[" << j + << "]: exp=" << exp << " act=" << act; + errors++; + } + } + pz[0]++; + } + } + + // check sides + Point<1> ps = is_sides.bounds.lo; + for(int pc = 0; pc < numpc; pc++) { + for(int i = 0; i < ls[pc]; i++) { + for(int j = 0; j < numpc; j++) { + bool exp = (j == pc); + bool act = p_sides[j].contains(ps); + if(exp != act) { + log_app.error() << "mismatch: side " << ps << " in p_sides[" << j + << "]: exp=" << exp << " act=" << act; + errors++; + } + } + ps[0]++; + } + } + + // check points (trickier due to ghosting) + for(int py = 0; py < npy; py++) + for(int px = 0; px < npx; px++) { + Point<1> pp = global_point_pointer(py, px); + for(int pc = 0; pc < numpc; pc++) { + int pcy = pc / numpcx; + int pcx = pc % numpcx; + bool exp = ((py >= zybound[pcy]) && (py <= zybound[pcy + 1]) && + (px >= zxbound[pcx]) && (px <= zxbound[pcx + 1])); + bool act = p_points[pc].contains(pp); + if(exp != act) { + log_app.error() 
<< "mismatch: point " << pp << " in p_points[" << pc + << "]: exp=" << exp << " act=" << act; + errors++; + } + } + } + + for(unsigned idx = 0; idx < p_zones.size(); idx++) { + p_zones[idx].destroy(); + } + for(unsigned idx = 0; idx < p_sides.size(); idx++) { + p_sides[idx].destroy(); + } + for(unsigned idx = 0; idx < p_points.size(); idx++) { + p_points[idx].destroy(); + } + + return errors; + } +}; + +template > +class RandStream { +public: + RandStream(unsigned _seed) + : seed(_seed) + , idx(0) + {} + + void setpos(unsigned long long _idx) { idx = _idx; } + void adjpos(long long _adj) { idx += _adj; } + + unsigned rand_int(unsigned n) + { + unsigned v = PRNG::rand_int(seed, idx >> 32, idx, n); + idx++; + return v; + } + + float rand_float(void) + { + float v = PRNG::rand_float(seed, idx >> 32, idx); + idx++; + return v; + } + + unsigned seed; + unsigned long long idx; +}; + +template +FT randval(RandStream<> &rs); + +template <> +float randval(RandStream<> &rs) +{ + return rs.rand_float(); +} + +template <> +int randval(RandStream<> &rs) +{ + return rs.rand_int(INT_MAX); +} + +template +class RandomTest : public TestInterface { +public: + RandomTest(int argc, const char *argv[]); + virtual ~RandomTest(void); + + virtual void print_info(void); + + virtual Event initialize_data(const std::vector &memories, + const std::vector &procs); + + virtual Event perform_partitioning(void); + + virtual int perform_dynamic_checks(void); + + virtual int check_partitioning(void); + + void fill_instance_data(IndexSpace ibounds, RegionInstance inst); + +protected: + T1 base1_min, base1_max, extent1_min, extent1_max; + T2 base2_min, base2_max, extent2_min, extent2_max; + int num_pieces, num_colors; + + Rect bounds1; + Rect bounds2; + IndexSpace root1; + IndexSpace root2; + std::vector colors; + std::vector ri_data1; + std::vector, FT>> fd_vals1; + std::vector, Point>> fd_ptrs1; +}; + +template +RandomTest::RandomTest(int argc, const char *argv[]) + : base1_min(0) + , base1_max(0) 
+ , extent1_min(4) + , extent1_max(6) + , base2_min(0) + , base2_max(0) + , extent2_min(4) + , extent2_max(6) + , num_pieces(2) + , num_colors(4) +{ + RandStream<> rs(random_seed + 0); + + for(int i = 0; i < N1; i++) { + bounds1.lo[i] = base1_min + rs.rand_int(base1_max - base1_min + 1); + bounds1.hi[i] = + (bounds1.lo[i] + extent1_min + rs.rand_int(extent1_max - extent1_min + 1)); + } + for(int i = 0; i < N2; i++) { + bounds2.lo[i] = base2_min + rs.rand_int(base2_max - base2_min + 1); + bounds2.hi[i] = + (bounds2.lo[i] + extent2_min + rs.rand_int(extent2_max - extent2_min + 1)); + } + + colors.resize(num_colors); + for(int i = 0; i < num_colors; i++) + colors[i] = randval(rs); +} + +template +RandomTest::~RandomTest(void) +{} + +template +void RandomTest::print_info(void) +{ + printf("Realm dependent partitioning test - random\n"); +} + +template +void RandomTest::fill_instance_data(IndexSpace ibounds, + RegionInstance inst) +{ + { + // start with value field + AffineAccessor a_vals(inst, 0); + + // iterate over all points in root1 with initial random values + RandStream<> rs1(random_seed + 1); + for(PointInRectIterator pir(bounds1); pir.valid; pir.step()) { + FT v = colors[rs1.rand_int(colors.size())]; + if(ibounds.contains(pir.p)) + a_vals.write(pir.p, v); + } + + // print results + for(PointInRectIterator pir(bounds1); pir.valid; pir.step()) { + if(ibounds.contains(pir.p)) + log_app.debug() << "v[" << pir.p << "] = " << a_vals.read(pir.p); + } + } + + { + // now pointer field + AffineAccessor, N1, T1> a_ptrs(inst, 0 + sizeof(FT)); + + // iterate over all points in root1 with initial random values + RandStream<> rs2(random_seed + 2); + for(PointInRectIterator pir(bounds1); pir.valid; pir.step()) { + Point p2; + for(int i = 0; i < N2; i++) + p2[i] = bounds2.lo[i] + rs2.rand_int(bounds2.hi[i] - bounds2.lo[i] + 1); + if(ibounds.contains(pir.p)) + a_ptrs.write(pir.p, p2); + } + + // print results + for(PointInRectIterator pir(bounds1); pir.valid; pir.step()) { + 
if(ibounds.contains(pir.p)) + log_app.debug() << "p[" << pir.p << "] = " << a_ptrs.read(pir.p); + } + } +} + +template +Event RandomTest::initialize_data(const std::vector &memories, + const std::vector &procs) +{ + root1 = IndexSpace(bounds1); + root2 = IndexSpace(bounds2); + log_app.debug() << "root1 = " << root1; + log_app.debug() << "root2 = " << root2; + + // create instances to hold actual data + size_t num_insts = memories.size(); + log_app.debug() << "procs: " << procs; + log_app.debug() << "mems: " << memories; + std::vector> ss_inst1; + root1.create_equal_subspaces(num_insts, 1, ss_inst1, Realm::ProfilingRequestSet()) + .wait(); + + std::vector field_sizes; + field_sizes.push_back(sizeof(FT)); + field_sizes.push_back(sizeof(Point)); + + ri_data1.resize(num_insts); + fd_vals1.resize(num_insts); + fd_ptrs1.resize(num_insts); + + for(size_t i = 0; i < num_insts; i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, memories[i], ss_inst1[i], field_sizes, 0 /*SOA*/, + Realm::ProfilingRequestSet()) + .wait(); + log_app.debug() << "inst[" << i << "] = " << ri << " (" << ss_inst1[i] << ")"; + ri_data1[i] = ri; + + fd_vals1[i].index_space = ss_inst1[i]; + fd_vals1[i].inst = ri; + fd_vals1[i].field_offset = 0; + + fd_ptrs1[i].index_space = ss_inst1[i]; + fd_ptrs1[i].inst = ri; + fd_ptrs1[i].field_offset = 0 + sizeof(FT); + } + + log_app.debug() << "colors = " << colors; + + for(size_t i = 0; i < num_insts; i++) { + fill_instance_data(root1 /*ss_inst1[i]*/, ri_data1[i]); + } + + return Event::NO_EVENT; +} + +template +Event RandomTest::perform_partitioning(void) +{ + // start by filtering root1 by color + std::vector piece_colors(colors.begin(), colors.begin() + num_pieces); + std::vector> ss_by_color; + Event e1 = root1.create_subspaces_by_field(fd_vals1, piece_colors, ss_by_color, + ProfilingRequestSet()); + e1.wait(); + + for(int i = 0; i < num_pieces; i++) { + log_app.debug() << "bycolor[" << i << "] (" << colors[i] << ") = " << ss_by_color[i]; + 
dump_sparse_index_space("", ss_by_color[i]); + } + + // images + std::vector> ss_images; + Event e2 = root2.create_subspaces_by_image(fd_ptrs1, ss_by_color, ss_images, + ProfilingRequestSet(), e1); + + e2.wait(); + + for(int i = 0; i < num_pieces; i++) { + log_app.debug() << "image[" << i << "] = " << ss_images[i]; + dump_sparse_index_space("", ss_images[i]); + } + + // preimages + std::vector> ss_preimages; + Event e3 = root1.create_subspaces_by_preimage(fd_ptrs1, ss_images, ss_preimages, + ProfilingRequestSet(), e2); + + e3.wait(); + + for(int i = 0; i < num_pieces; i++) { + log_app.debug() << "preimage[" << i << "] = " << ss_preimages[i]; + dump_sparse_index_space("", ss_preimages[i]); + ss_by_color[i].destroy(); + ss_images[i].destroy(); + ss_preimages[i].destroy(); + } + + return Event::NO_EVENT; +} + +template +int RandomTest::perform_dynamic_checks(void) +{ + return 0; +} + +template +int RandomTest::check_partitioning(void) +{ + return 0; +} + +void top_level_task(const void *args, size_t arglen, const void *userdata, size_t userlen, + Processor p) +{ + int errors = 0; + + testcfg->print_info(); + + // find all the system memories - we'll stride our data across them + // for each memory, we'll need one CPU that can do the initialization of the data + std::vector sysmems; + std::vector procs; + + Machine machine = Machine::get_machine(); + { + std::set all_memories; + machine.get_all_memories(all_memories); + for(std::set::const_iterator it = all_memories.begin(); + it != all_memories.end(); it++) { + Memory m = *it; + + // skip memories with no capacity for creating instances + if(m.capacity() == 0) + continue; + + if(m.kind() == Memory::SYSTEM_MEM) { + sysmems.push_back(m); + std::set pset; + machine.get_shared_processors(m, pset); + Processor p = Processor::NO_PROC; + for(std::set::const_iterator it2 = pset.begin(); it2 != pset.end(); + it2++) { + if(it2->kind() == Processor::LOC_PROC) { + p = *it2; + break; + } + } + assert(p.exists()); + 
procs.push_back(p); + log_app.debug() << "System mem #" << (sysmems.size() - 1) << " = " + << *sysmems.rbegin() << " (" << *procs.rbegin() << ")"; + } + } + } + assert(sysmems.size() > 0); + + { + Realm::TimeStamp ts("initialization", true, &log_app); + + Event e = testcfg->initialize_data(sysmems, procs); + // wait for all initialization to be done + e.wait(); + } + + // now actual partitioning work + { + Realm::TimeStamp ts("dependent partitioning work", true, &log_app); + + Event e = testcfg->perform_partitioning(); + + e.wait(); + } + + // dynamic checks (which would be eliminated by compiler) + { + Realm::TimeStamp ts("dynamic checks", true, &log_app); + errors += testcfg->perform_dynamic_checks(); + } + + if(!skip_check) { + log_app.print() << "checking correctness of partitioning"; + Realm::TimeStamp ts("verification", true, &log_app); + errors += testcfg->check_partitioning(); + } + + if(errors > 0) { + printf("Exiting with errors\n"); + exit(1); + } + + printf("all done!\n"); +} + +template +class RandomAffineTest : public TestInterface { +public: + RandomAffineTest(int argc, const char *argv[], + const std::vector &transforms); + virtual ~RandomAffineTest(void); + + virtual void print_info(void); + + virtual Event initialize_data(const std::vector &memories, + const std::vector &procs); + + virtual Event perform_partitioning(void); + + virtual int perform_dynamic_checks(void); + + virtual int check_partitioning(void); + + void fill_instance_data(IndexSpace ibounds, RegionInstance inst); + + int verify_results(const IndexSpace &root, const TRANSFORM &transform, + const std::vector>> &images, + const std::vector>> &preimages); + +protected: + std::vector transforms; + T1 base1_min, base1_max, extent1_min, extent1_max; + T2 base2_min, base2_max, extent2_min, extent2_max; + int num_pieces, num_colors; + + // std::vector> transforms; + + std::vector>> dense_images; + std::vector>> sparse_images; + + std::vector> ss_by_color; + + std::vector>> dense_preimages; 
+ std::vector>> sparse_preimages; + + Rect bounds1; + Rect bounds2; + IndexSpace root1; + IndexSpace root2; + IndexSpace root2_sparse; + std::vector colors; + std::vector ri_data1; + std::vector, FT>> fd_vals1; +}; + +template +RandomAffineTest::RandomAffineTest( + int argc, const char *argv[], const std::vector &_transforms) + : transforms(_transforms) + , base1_min(0) + , base1_max(0) + , extent1_min(4) + , extent1_max(6) + , base2_min(0) + , base2_max(0) + , extent2_min(4) + , extent2_max(6) + , num_pieces(2) + , num_colors(4) +{ + RandStream<> rs(random_seed + 2); + + for(int i = 0; i < N1; i++) { + bounds1.lo[i] = base1_min + rs.rand_int(base1_max - base1_min + 1); + bounds1.hi[i] = + (bounds1.lo[i] + extent1_min + rs.rand_int(extent1_max - extent1_min + 1)); + } + for(int i = 0; i < N2; i++) { + bounds2.lo[i] = base2_min + rs.rand_int(base2_max - base2_min + 1); + bounds2.hi[i] = + (bounds2.lo[i] + extent2_min + rs.rand_int(extent2_max - extent2_min + 1)); + } + + colors.resize(num_colors); + + for(int i = 0; i < num_colors; i++) + colors[i] = randval(rs); + + dense_images.resize(transforms.size()); + sparse_images.resize(transforms.size()); + + dense_preimages.resize(transforms.size()); + sparse_preimages.resize(transforms.size()); +} + +template +RandomAffineTest::~RandomAffineTest(void) +{} + +template +void RandomAffineTest::print_info(void) +{ + printf("Realm dependent partitioning test - random affine\n"); +} + +template +void RandomAffineTest::fill_instance_data( + IndexSpace ibounds, RegionInstance inst) +{ + { + // start with value field + AffineAccessor a_vals(inst, 0); + + // iterate over all points in root1 with initial random values + RandStream<> rs1(random_seed + 1); + for(PointInRectIterator pir(bounds1); pir.valid; pir.step()) { + FT v = colors[rs1.rand_int(2)]; + if(ibounds.contains(pir.p)) + a_vals.write(pir.p, v); + } + + // print results + for(PointInRectIterator pir(bounds1); pir.valid; pir.step()) { + if(ibounds.contains(pir.p)) + 
log_app.debug() << "v[" << pir.p << "] = " << a_vals.read(pir.p); + } + } +} + +template +Event RandomAffineTest::initialize_data( + const std::vector &memories, const std::vector &procs) +{ + std::vector> sparse_points; + int index = 0; + for(PointInRectIterator pir(bounds2); pir.valid; pir.step()) { + if(index % 2 == 0) { + sparse_points.push_back(pir.p); + } + index++; + } + SparsityMap sparse_map = + SparsityMap::construct(sparse_points, true, true); + + root1 = IndexSpace(bounds1); + root2 = IndexSpace(bounds2); + root2_sparse = IndexSpace(bounds2, sparse_map); + + log_app.debug() << "root1 = " << root1; + log_app.debug() << "root2 = " << root2; + log_app.debug() << "root2_sparse = " << root2_sparse; + + // create instances to hold actual data + size_t num_insts = memories.size(); + log_app.debug() << "procs: " << procs; + log_app.debug() << "mems: " << memories; + std::vector> ss_inst1; + root1.create_equal_subspaces(num_insts, 1, ss_inst1, Realm::ProfilingRequestSet()) + .wait(); + + std::vector field_sizes; + field_sizes.push_back(sizeof(FT)); + field_sizes.push_back(sizeof(Point)); + + ri_data1.resize(num_insts); + fd_vals1.resize(num_insts); + + for(size_t i = 0; i < num_insts; i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, memories[i], ss_inst1[i], field_sizes, 0 /*SOA*/, + Realm::ProfilingRequestSet()) + .wait(); + log_app.debug() << "inst[" << i << "] = " << ri << " (" << ss_inst1[i] << ")"; + ri_data1[i] = ri; + + fd_vals1[i].index_space = ss_inst1[i]; + fd_vals1[i].inst = ri; + fd_vals1[i].field_offset = 0; + } + + log_app.debug() << "colors = " << colors; + + for(size_t i = 0; i < num_insts; i++) { + fill_instance_data(root1 /*ss_inst1[i]*/, ri_data1[i]); + } + + return Event::NO_EVENT; +} + +template +Event RandomAffineTest::perform_partitioning(void) +{ + // start by filtering root1 by color + std::vector piece_colors(colors.begin(), colors.begin() + num_pieces); + + Event e1 = root1.create_subspaces_by_field(fd_vals1, 
piece_colors, ss_by_color, + ProfilingRequestSet()); + e1.wait(); + + for(int i = 0; i < num_pieces; i++) { + log_app.debug() << "bycolor[" << i << "] (" << colors[i] << ") = " << ss_by_color[i]; + dump_sparse_index_space("", ss_by_color[i]); + } + + for(size_t idx = 0; idx < transforms.size(); idx++) { + log_app.debug() << "Compute images for transform idx=" << idx; + + unsigned long long start_time = Clock::current_time_in_nanoseconds(); + // images + Event e2 = root2.create_subspaces_by_image( + transforms[idx], ss_by_color, dense_images[idx], ProfilingRequestSet(), e1); + e2.wait(); + + log_app.debug() << "affine image time=" + << (Clock::current_time_in_nanoseconds() - start_time); + + for(int i = 0; i < num_pieces; i++) { + log_app.debug() << "image[" << i << "] = " << dense_images[idx][i]; + dump_sparse_index_space("", dense_images[idx][i]); + } + + start_time = Clock::current_time_in_nanoseconds(); + Event e3 = root2_sparse.create_subspaces_by_image( + transforms[idx], ss_by_color, sparse_images[idx], ProfilingRequestSet(), e2); + + e3.wait(); + log_app.debug() << "affine sparse image time=" + << (Clock::current_time_in_nanoseconds() - start_time); + + for(int i = 0; i < num_pieces; i++) { + log_app.debug() << "sparse_image1[" << i << "] = " << sparse_images[idx][i]; + dump_sparse_index_space("", sparse_images[idx][i]); + } + + // preimages + Event e4 = root1.create_subspaces_by_preimage(transforms[idx], dense_images[idx], + dense_preimages[idx], + ProfilingRequestSet(), e3); + e4.wait(); + + for(int i = 0; i < num_pieces; i++) { + log_app.debug() << "dense_preimage[" << i << "] = " << dense_preimages[idx][i]; + dump_sparse_index_space("", dense_preimages[idx][i]); + } + + Event e5 = root1.create_subspaces_by_preimage(transforms[idx], sparse_images[idx], + sparse_preimages[idx], + ProfilingRequestSet(), e4); + e5.wait(); + + for(int i = 0; i < num_pieces; i++) { + log_app.debug() << "sparse_preimage[" << i << "] = " << sparse_preimages[idx][i]; + 
dump_sparse_index_space("", sparse_preimages[idx][i]); + } + } + + return Event::NO_EVENT; +} + +template +int RandomAffineTest::perform_dynamic_checks(void) +{ + return 0; +} + +template +int RandomAffineTest::verify_results( + const IndexSpace &root, const TRANSFORM &transform, + const std::vector>> &images, + const std::vector>> &preimages) +{ + for(size_t idx = 0; idx < transforms.size(); idx++) { + assert(ss_by_color.size() == images[idx].size() && + images[idx].size() == preimages[idx].size()); + int image_total = 0; + for(const auto &image : images[idx]) { + for(IndexSpaceIterator it2(image); it2.valid; it2.step()) { + image_total += it2.rect.volume(); + } + } + + int preimage_total = 0; + for(const auto &preimage : preimages[idx]) { + for(IndexSpaceIterator it2(preimage); it2.valid; it2.step()) { + preimage_total += it2.rect.volume(); + } + } + + if(image_total != preimage_total) + return 1; + + for(size_t i = 0; i < ss_by_color.size(); i++) { + for(IndexSpaceIterator it(ss_by_color[i]); it.valid; it.step()) { + for(PointInRectIterator point(it.rect); point.valid; point.step()) { + auto target_point = transforms[idx][point.p]; + if(root.contains(target_point)) { + if(!images[idx][i].contains(target_point)) { + return 1; + } + if(!preimages[idx][i].contains(point.p)) { + return 1; + } + } + } + } + } + } + return 0; +} + +template +int RandomAffineTest::check_partitioning(void) +{ + int result = 0; + for(size_t i = 0; i < transforms.size(); i++) { + if(verify_results(root2, transforms[i], dense_images, dense_preimages) || + verify_results(root2_sparse, transforms[i], sparse_images, sparse_preimages)) { + result++; + } + } + root1.destroy(); + root2.destroy(); + root2_sparse.destroy(); + for(unsigned i = 0; i < dense_images.size(); i++) { + for(unsigned j = 0; j < dense_images[i].size(); j++) { + dense_images[i][j].destroy(); + } + } + for(unsigned i = 0; i < sparse_images.size(); i++) { + for(unsigned j = 0; j < sparse_images[i].size(); j++) { + 
sparse_images[i][j].destroy(); + } + } + for(unsigned i = 0; i < dense_preimages.size(); i++) { + for(unsigned j = 0; j < dense_preimages[i].size(); j++) { + dense_preimages[i][j].destroy(); + } + } + for(unsigned i = 0; i < sparse_preimages.size(); i++) { + for(unsigned j = 0; j < sparse_preimages[i].size(); j++) { + sparse_preimages[i][j].destroy(); + } + } + return result; +} + +template +std::vector> create_translate_transforms(int size) +{ + RandStream<> rs(random_seed + 2); + std::vector> transforms; + { + TranslationTransform translate; + translate.offset = Point::ZEROES(); + for(int i = 0; i < N2; i++) { + translate.offset[i] = rs.rand_int(size - 1); + } + transforms.push_back(translate); + } + return transforms; +} + +template +std::vector> create_affine_transforms() +{ + std::vector> transforms; + + { + AffineTransform transpose; + for(int i = 0; i < N2; i++) { + for(int j = 0; j < N1; j++) { + transpose.transform[i][j] = (i == N1 - j - 1); + } + } + transpose.offset = Point::ZEROES(); + transforms.push_back(transpose); + } + + { + AffineTransform translate; + for(int i = 0; i < N2; i++) { + for(int j = 0; j < N1; j++) { + translate.transform[i][j] = (i == j); + } + } + translate.offset = Point::ZEROES(); + transforms.push_back(translate); + } + + { + AffineTransform scale; + for(int i = 0; i < N2; i++) { + for(int j = 0; j < N1; j++) { + scale.transform[i][j] = (i == j) ? 2 : 0; + } + } + scale.offset = Point::ZEROES(); + transforms.push_back(scale); + } + + { + AffineTransform shear; + for(int i = 0; i < N2; i++) { + for(int j = 0; j < N1; j++) { + shear.transform[i][j] = (i == j); + } + shear.transform[i][i + 1] = 1; + } + shear.offset = Point::ZEROES(); + transforms.push_back(shear); + } + + { + AffineTransform reflect; + for(int i = 0; i < N2; i++) { + for(int j = 0; j < N1; j++) { + reflect.transform[i][j] = (i == j) ? 
-1 : 0; + } + } + reflect.offset = Point::ZEROES(); + // transforms.push_back(reflect); + } + return transforms; +} + +TestInterface *run_structured_test(TransformType type, int argc, char **argv) +{ + switch(type) { + case TransformType::AFFINE: + return new RandomAffineTest<2, int, 2, int, int, AffineTransform<2, 2, int>>( + argc, const_cast(argv), + create_affine_transforms<2, int, 2, int, int>()); + case TransformType::TRANSLATION: + return new RandomAffineTest<2, int, 2, int, int, TranslationTransform<2, int>>( + argc, const_cast(argv), + create_translate_transforms<2, int, 2, int, int>(4)); + } + return nullptr; +} + +int main(int argc, char **argv) +{ + Runtime rt; + + rt.init(&argc, &argv); + + // parse global options + for(int i = 1; i < argc; i++) { + if(!strcmp(argv[i], "-seed")) { + random_seed = atoi(argv[++i]); + continue; + } + + if(!strcmp(argv[i], "-random")) { + random_colors = true; + continue; + } + + if(!strcmp(argv[i], "-wait")) { + wait_on_events = true; + continue; + } + + if(!strcmp(argv[i], "-show")) { + show_graph = true; + continue; + } + + if(!strcmp(argv[i], "-nocheck")) { + skip_check = true; + continue; + } + + // test cases consume the rest of the args + if(!strcmp(argv[i], "circuit")) { + testcfg = new CircuitTest(argc - i, const_cast(argv + i)); + break; + } + + if(!strcmp(argv[i], "basic")) { + testcfg = new BasicTest(argc - i, const_cast(argv + i)); + break; + } + + if(!strcmp(argv[i], "tile")) { + testcfg = new TileTest(argc - i, const_cast(argv + i)); + break; + } + + if (!strcmp(argv[i], "range")) { + testcfg = new RangeTest(argc - i, const_cast(argv + i)); + break; + } + + if (!strcmp(argv[i], "multi")) { + testcfg = new Range2DTest(argc - i, const_cast(argv + i)); + break; + } + + if(!strcmp(argv[i], "pennant")) { + testcfg = new PennantTest(argc - i, const_cast(argv + i)); + break; + } + + if(!strcmp(argv[i], "miniaero")) { + testcfg = new MiniAeroTest(argc - i, const_cast(argv + i)); + break; + } + + if(!strcmp(argv[i], 
"random")) { + testcfg = new RandomTest<1, int, 2, int, int>(argc - i, + const_cast(argv + i)); + break; + } + + if(!strcmp(argv[i], "affine")) { + TransformType type = TransformType::AFFINE; + if(i < argc - 1 && !strcmp(argv[++i], "-type")) { + type = static_cast(atoi(argv[++i])); + } + testcfg = run_structured_test(type, argc, argv); + break; + } + + // printf("unknown parameter: %s\n", argv[i]); + } + + // if no test specified, use circuit (with default parameters) + if(!testcfg) { + testcfg = new CircuitTest(0, 0); + } + + rt.register_task(TOP_LEVEL_TASK, top_level_task); + rt.register_task(INIT_CIRCUIT_DATA_TASK, CircuitTest::init_data_task_wrapper); + rt.register_task(INIT_PENNANT_DATA_TASK, PennantTest::init_data_task_wrapper); + rt.register_task(INIT_BASIC_DATA_TASK, BasicTest::init_data_task_wrapper); + rt.register_task(INIT_TILE_DATA_TASK, TileTest::init_data_task_wrapper); + rt.register_task(INIT_RANGE_DATA_TASK, RangeTest::init_data_task_wrapper); + rt.register_task(INIT_RANGE2D_DATA_TASK, Range2DTest::init_data_task_wrapper); + rt.register_task(INIT_MINIAERO_DATA_TASK, MiniAeroTest::init_data_task_wrapper); + + signal(SIGALRM, sigalrm_handler); + + Processor p = Machine::ProcessorQuery(Machine::get_machine()) + .only_kind(Processor::LOC_PROC) + .first(); + assert(p.exists()); + + // collective launch of a single task - everybody gets the same finish + // event + Event e = rt.collective_spawn(p, TOP_LEVEL_TASK, 0, 0); + + // request shutdown once that task is complete + rt.shutdown(e); + + // now sleep this thread until that shutdown actually happens + rt.wait_for_shutdown(); + + delete testcfg; + + return 0; +} diff --git a/tests/deppart.cc b/tests/deppart.cc index 8fde66845d..b6847f5513 100644 --- a/tests/deppart.cc +++ b/tests/deppart.cc @@ -502,25 +502,29 @@ class BasicTest : public TestInterface { wait_on_events = true; log_app.info() << "warming up" << Clock::current_time_in_microseconds() << "\n"; const char* val = std::getenv("TILE_SIZE"); // or 
any env var - size_t tile_size = 100000000; //default + size_t tile_size = 10000000; //default if (val) { tile_size = atoi(val); } std::vector byte_fields = {sizeof(char)}; IndexSpace<1> instance_index_space(Rect<1>(0, tile_size-1)); + IndexSpace<1> dst_index_space(Rect<1>(0, tile_size/100-1)); for (size_t i = 0; i < piece_field_data_gpu.size(); i++) { RegionInstance::create_instance(piece_field_data_gpu[i].scratch_buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); } for (size_t i = 0; i < src_field_data_gpu.size(); i++) { RegionInstance::create_instance(src_field_data_gpu[i].scratch_buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); } + for (size_t i = 0; i < dst_field_data_gpu.size(); i++) { + RegionInstance::create_instance(dst_field_data_gpu[i].scratch_buffer, gpu_memory, dst_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); + } std::vector > p_garbage_nodes, p_garbage_edges, p_garbage_rd, p_garbage_preimage_edges; Event e01 = is_nodes.create_subspaces_by_field(piece_field_data_gpu, colors, p_garbage_nodes, Realm::ProfilingRequestSet()); if (wait_on_events) e01.wait(); - Event e02 = is_edges.create_subspaces_by_preimage(dst_node_field_data, + Event e02 = is_edges.create_subspaces_by_preimage(dst_field_data_gpu, p_garbage_nodes, p_garbage_edges, Realm::ProfilingRequestSet(), @@ -536,7 +540,7 @@ class BasicTest : public TestInterface { e02); if(wait_on_events) e03.wait(); - Event e04 = is_edges.create_subspaces_by_preimage(dst_node_field_data, + Event e04 = is_edges.create_subspaces_by_preimage(dst_field_data_gpu, p_garbage_rd, p_garbage_preimage_edges, Realm::ProfilingRequestSet(), @@ -553,7 +557,7 @@ class BasicTest : public TestInterface { log_app.info() << "GPU By Field complete " << Clock::current_time_in_microseconds() << "\n"; log_app.info() << "Starting GPU Preimage " << Clock::current_time_in_microseconds() << "\n"; // now compute p_edges based on 
the color of their in_node (i.e. a preimage) - Event e2 = is_edges.create_subspaces_by_preimage(dst_node_field_data, + Event e2 = is_edges.create_subspaces_by_preimage(dst_field_data_gpu, p_nodes, p_edges, Realm::ProfilingRequestSet(), @@ -576,7 +580,7 @@ class BasicTest : public TestInterface { log_app.info() << "GPU Image complete " << Clock::current_time_in_microseconds() << "\n"; log_app.info() << "Starting second GPU preimage " << Clock::current_time_in_microseconds() << "\n"; - Event e4 = is_edges.create_subspaces_by_preimage(dst_node_field_data, + Event e4 = is_edges.create_subspaces_by_preimage(dst_field_data_gpu, p_rd, p_preimage_edges, Realm::ProfilingRequestSet(), @@ -1647,7 +1651,7 @@ class RangeTest : public TestInterface { } is_nodes.by_image_buffer_requirements(subspace_input, image_estimate_input, image_estimate_output); for (size_t i = 0; i < rect_val_data_gpu.size(); i++) { - IndexSpace<1> instance_index_space(Rect<1>(0, (image_estimate_output[i].upper_bound)/4-1)); + IndexSpace<1> instance_index_space(Rect<1>(0, (image_estimate_output[i].upper_bound)/12-1)); RegionInstance::create_instance(rect_val_data_gpu[i].scratch_buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); } Event e002 = is_nodes.create_subspaces_by_image(rect_val_data_gpu, From 1fc63681df000fe7706324aed709821ac9b584b3 Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Mon, 23 Feb 2026 01:41:38 -0800 Subject: [PATCH 17/32] benchmarks done for byfield and image --- tests/benchmark.cc | 4793 ++++---------------------------------------- 1 file changed, 362 insertions(+), 4431 deletions(-) diff --git a/tests/benchmark.cc b/tests/benchmark.cc index b6847f5513..9615a3bcbc 100644 --- a/tests/benchmark.cc +++ b/tests/benchmark.cc @@ -40,13 +40,12 @@ Logger log_app("app"); enum { TOP_LEVEL_TASK = Processor::TASK_ID_FIRST_AVAILABLE + 0, - INIT_CIRCUIT_DATA_TASK, - INIT_BASIC_DATA_TASK, - INIT_TILE_DATA_TASK, - INIT_RANGE_DATA_TASK, - 
INIT_RANGE2D_DATA_TASK, - INIT_PENNANT_DATA_TASK, - INIT_MINIAERO_DATA_TASK, + INIT_BYFIELD_DATA_TASK, + INIT_IMAGE_DATA_TASK, +}; + +enum TestType { + BYFIELD = 0 }; enum TransformType @@ -84,41 +83,6 @@ void sigalrm_handler(int sig) exit(1); } -template -void dump_sparse_index_space(const char *pfx, IndexSpace is) -{ - std::cout << pfx << ": " << is << "\n"; - if(!is.sparsity.exists()) - return; - SparsityMapPublicImpl *impl = is.sparsity.impl(); - span> entries = impl->get_entries(); - for(size_t i = 0; i < entries.size(); i++) { - SparsityMapEntry entry = entries[i]; - std::cout << " " << entry.bounds; - if(entry.bitmap) - std::cout << " bitmap(" << entry.bitmap << ")"; - if(entry.sparsity.exists()) - std::cout << " sparsity(" << entry.sparsity << ")"; - std::cout << "\n"; - } -} - -static int check_empty(Event e, const std::vector> &p, const char *pfx) -{ - int errors = 0; - e.wait(); - for(size_t i = 0; i < p.size(); i++) { - p[i].make_valid().wait(); - if(p[i].volume() > 0) { - log_app.error() << "HELP! 
" << pfx << "[" << i << "] space " << p[i] - << " isn't empty?"; - dump_sparse_index_space(pfx, p[i]); - errors++; - } - } - return errors; -} - class TestInterface { public: virtual ~TestInterface(void) {} @@ -142,43 +106,64 @@ namespace { bool wait_on_events = false; bool show_graph = false; bool skip_check = false; + int dimension1 = 1; + int dimension2 = 1; + TestType test_type = BYFIELD; + TestInterface *testcfg = 0; }; // namespace -template -void split_evenly(T total, T pieces, std::vector &cuts) +template +Event copy_piece(FieldDataDescriptor src_data, FieldDataDescriptor &dst_data, const std::vector &fields, size_t field_idx, Memory dst_memory) { - cuts.resize(pieces + 1); - for(T i = 0; i <= pieces; i++) - cuts[i] = ((long long)total * i) / pieces; + size_t offset = 0; + for (size_t i = 0; i < field_idx; i++) { + offset += fields[i]; + } + size_t size = fields[field_idx]; + RegionInstance::create_instance(dst_data.inst, + dst_memory, + src_data.index_space, + fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + CopySrcDstField src_field, dst_field; + src_field.inst = src_data.inst; + src_field.size = size; + src_field.field_id = offset; + dst_field.inst = dst_data.inst; + dst_field.size = size; + dst_field.field_id = offset; + dst_data.index_space = src_data.index_space; + dst_data.field_offset = src_data.field_offset; + std::vector src_fields = {src_field}; + std::vector dst_fields = {dst_field}; + return src_data.index_space.copy(src_fields, dst_fields, Realm::ProfilingRequestSet()); } -template -int find_split(const std::vector &cuts, T v) -{ - // dumb linear search - assert(v >= cuts[0]); - for(size_t i = 1; i < cuts.size(); i++) - if(v < cuts[i]) - return i - 1; - assert(false); - return 0; +Event alloc_piece(RegionInstance &result, size_t size, Memory location) { + assert(location != Memory::NO_MEMORY); + assert(size > 0); + std::vector byte_fields = {sizeof(char)}; + IndexSpace<1> instance_index_space(Rect<1>(0, size-1)); + return 
RegionInstance::create_instance(result, location, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()); } /* - * Basic test - create a graph, partition it by + * Byfield test - create a graph, partition it by * node subgraph id and then check that the partitioning * is correct */ -class BasicTest : public TestInterface { +template +class ByfieldTest : public TestInterface { public: // graph config parameters int num_nodes = 1000; - int num_edges = 1000; int num_pieces = 4; + int num_colors = 4; std::string filename; - BasicTest(int argc, const char *argv[]) + ByfieldTest(int argc, const char *argv[]) { for(int i = 1; i < argc; i++) { @@ -190,23 +175,22 @@ class BasicTest : public TestInterface { num_nodes = atoi(argv[++i]); continue; } - if(!strcmp(argv[i], "-e")) { - num_edges = atoi(argv[++i]); + if(!strcmp(argv[i], "-c")) { + num_colors = atoi(argv[++i]); continue; } } - if (num_nodes <= 0 || num_pieces <= 0 || num_edges <= 0) { - log_app.error() << "Invalid config: nodes=" << num_nodes << " edges=" << num_edges << " pieces=" << num_pieces << "\n"; + if (num_nodes <= 0 || num_pieces <= 0 || num_colors <= 0) { + log_app.error() << "Invalid config: nodes=" << num_nodes << " colors=" << num_colors << " pieces=" << num_pieces << "\n"; exit(1); } } struct InitDataArgs { int index; - RegionInstance ri_nodes; - RegionInstance ri_edges; + RegionInstance ri_colors; }; enum PRNGStreams @@ -215,25 +199,18 @@ class BasicTest : public TestInterface { }; // assign subgraph ids to nodes - void random_node_data(int idx, int &subgraph) + void color_point(int idx, int& color) { if(random_colors) - subgraph = - Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_pieces); - else - subgraph = idx * num_pieces / num_nodes; - } - - void random_edge_data(int idx, int& src, int& dst) - { - src = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_nodes); - dst = Philox_2x32<>::rand_int(random_seed, idx + 1, NODE_SUBGRAPH_STREAM, num_nodes); 
+ color = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_colors); + else + color = (idx * num_colors / num_nodes) % num_colors; } static void init_data_task_wrapper(const void *args, size_t arglen, const void *userdata, size_t userlen, Processor p) { - BasicTest *me = (BasicTest *)testcfg; + ByfieldTest *me = (ByfieldTest *)testcfg; me->init_data_task(args, arglen, p); } @@ -242,95 +219,68 @@ class BasicTest : public TestInterface { { const InitDataArgs &i_args = *(const InitDataArgs *)args; - log_app.info() << "init task #" << i_args.index << " (ri_nodes=" << i_args.ri_nodes + log_app.info() << "init task #" << i_args.index << " (ri_nodes=" << i_args.ri_colors << ")"; - i_args.ri_nodes.fetch_metadata(p).wait(); - i_args.ri_edges.fetch_metadata(p).wait(); + i_args.ri_colors.fetch_metadata(p).wait(); - IndexSpace<1> is_nodes = i_args.ri_nodes.get_indexspace<1>(); - IndexSpace<1> is_edges = i_args.ri_edges.get_indexspace<1>(); + IndexSpace colors_space = i_args.ri_colors.template get_indexspace(); - log_app.debug() << "N: " << is_nodes; - log_app.debug() << "E: " << is_edges; + log_app.debug() << "N: " << is_colors; //For each node in the graph, mark it with a random (or deterministic) subgraph id { - AffineAccessor a_piece_id(i_args.ri_nodes, 0 /* offset */); - - for(int i = is_nodes.bounds.lo; i <= is_nodes.bounds.hi; i++) { - int subgraph; - random_node_data(i, subgraph); - a_piece_id.write(i, subgraph); - } - - AffineAccessor,1> a_src(i_args.ri_edges, 0 /* offset */); - AffineAccessor,1> a_dst(i_args.ri_edges, sizeof(Point<1>)/* offset */); - - for(int i = is_edges.bounds.lo; i <= is_edges.bounds.hi; i++) { - int src, dst; - random_edge_data(i, src, dst); - a_src.write(i, Point<1>(src)); - a_dst.write(i, Point<1>(dst)); + AffineAccessor a_piece_id(i_args.ri_colors, 0 /* offset */); + + for (IndexSpaceIterator it(is_colors); it.valid; it.step()) { + for (PointInRectIterator point(it.rect); point.valid; point.step()) { + int idx = 0; + int 
stride = 1; + for (int d = 0; d < N; d++) { + idx += (point.p[d] - is_colors.bounds.lo[d]) * stride; + stride *= (is_colors.bounds.hi[d] - is_colors.bounds.lo[d] + 1); + } + int subgraph; + color_point(idx, subgraph); + a_piece_id.write(point.p, subgraph); + } } } - - //Optionally print out the assigned subgraph ids - if(show_graph) { - AffineAccessor a_piece_id(i_args.ri_nodes, 0 /* offset */); - - for(int i = is_nodes.bounds.lo; i <= is_nodes.bounds.hi; i++) - log_app.info() << "piece_id[" << i << "] = " << a_piece_id.read(i) << "\n"; - - AffineAccessor,1> a_src(i_args.ri_edges, 0 /* offset */); - AffineAccessor,1> a_dst(i_args.ri_edges, sizeof(Point<1>)/* offset */); - - for(int i = is_edges.bounds.lo; i <= is_edges.bounds.hi; i++) - log_app.info() << "src, dst[" << i << "] = " << a_src.read(i) << ", " << a_dst.read(i) << "\n"; - } } - IndexSpace<1> is_nodes, is_edges; - std::vector ri_nodes, ri_edges; - std::vector, int> > piece_id_field_data; - std::vector, Point<1> > > src_node_field_data, dst_node_field_data; + IndexSpace is_colors; + std::vector ri_colors; + std::vector, int> > piece_id_field_data; virtual void print_info(void) { - printf("Realm dependent partitioning test - basic: %d nodes, %d edges, %d pieces\n", - (int)num_nodes, (int) num_edges, (int)num_pieces); + printf("Realm %dD Byfield dependent partitioning test: %d nodes, %d colors, %d pieces\n", (int) N, + (int)num_nodes, (int) num_colors, (int)num_pieces); } virtual Event initialize_data(const std::vector &memories, const std::vector &procs) { // now create index space for nodes - is_nodes = Rect<1>(0, num_nodes - 1); - is_edges = Rect<1>(0, num_edges - 1); + Point lo, hi; + for (int d = 0; d < N; d++) { + lo[d] = 0; + hi[d] = num_nodes - 1; + } + is_colors = Rect(lo, hi); // equal partition is used to do initial population of edges and nodes - std::vector > ss_nodes_eq; - std::vector > ss_edges_eq; + std::vector > ss_nodes_eq; log_app.info() << "Creating equal subspaces\n"; - 
is_nodes.create_equal_subspaces(num_pieces, 1, ss_nodes_eq, Realm::ProfilingRequestSet()).wait(); - is_edges.create_equal_subspaces(num_pieces, 1, ss_edges_eq, Realm::ProfilingRequestSet()).wait(); - - log_app.debug() << "Initial partitions:"; - for(size_t i = 0; i < ss_nodes_eq.size(); i++) - log_app.debug() << " Nodes #" << i << ": " << ss_nodes_eq[i]; - for(size_t i = 0; i < ss_edges_eq.size(); i++) - log_app.debug() << " Edges #" << i << ": " << ss_edges_eq[i]; + is_colors.create_equal_subspaces(num_pieces, 1, ss_nodes_eq, Realm::ProfilingRequestSet()).wait(); // create instances for each of these subspaces - std::vector node_fields, edge_fields; - node_fields.push_back(sizeof(int)); // piece_id - assert(sizeof(int) == sizeof(Point<1>)); - edge_fields.push_back(sizeof(Point<1>)); // src_node - edge_fields.push_back(sizeof(Point<1>)); // dst_node - - ri_nodes.resize(num_pieces); + std::vector node_fields; + node_fields.push_back(sizeof(int)); + + ri_colors.resize(num_pieces); piece_id_field_data.resize(num_pieces); for(size_t i = 0; i < ss_nodes_eq.size(); i++) { @@ -339,47 +289,21 @@ class BasicTest : public TestInterface { node_fields, 0 /*SOA*/, Realm::ProfilingRequestSet()) .wait(); - ri_nodes[i] = ri; + ri_colors[i] = ri; piece_id_field_data[i].index_space = ss_nodes_eq[i]; - piece_id_field_data[i].inst = ri_nodes[i]; + piece_id_field_data[i].inst = ri_colors[i]; piece_id_field_data[i].field_offset = 0; } - - // Fire off tasks to initialize data - ri_edges.resize(num_pieces); - src_node_field_data.resize(num_pieces); - dst_node_field_data.resize(num_pieces); - - for(size_t i = 0; i < ss_edges_eq.size(); i++) { - RegionInstance ri; - RegionInstance::create_instance(ri, - memories[i % memories.size()], - ss_edges_eq[i], - edge_fields, - 0 /*SOA*/, - Realm::ProfilingRequestSet()).wait(); - ri_edges[i] = ri; - - src_node_field_data[i].index_space = ss_edges_eq[i]; - src_node_field_data[i].inst = ri_edges[i]; - src_node_field_data[i].field_offset = 0 * 
sizeof(Point<1>); - - dst_node_field_data[i].index_space = ss_edges_eq[i]; - dst_node_field_data[i].inst = ri_edges[i]; - dst_node_field_data[i].field_offset = 1 * sizeof(Point<1>); - } - // fire off tasks to initialize data std::set events; for(int i = 0; i < num_pieces; i++) { Processor p = procs[i % procs.size()]; InitDataArgs args; args.index = i; - args.ri_nodes = ri_nodes[i]; - args.ri_edges = ri_edges[i]; - Event e = p.spawn(INIT_BASIC_DATA_TASK, &args, sizeof(args)); + args.ri_colors = ri_colors[i]; + Event e = p.spawn(INIT_BYFIELD_DATA_TASK, &args, sizeof(args)); events.insert(e); } @@ -391,19 +315,15 @@ class BasicTest : public TestInterface { // p_nodes_cpu - nodes partitioned by subgraph id (from CPU) - std::vector > p_nodes, p_rd; - std::vector > p_edges, p_preimage_edges; - - std::vector > p_nodes_cpu, p_rd_cpu; - std::vector > p_edges_cpu, p_preimage_edges_cpu; + std::vector > p_nodes, p_garbage_nodes, p_nodes_cpu; virtual Event perform_partitioning(void) { // Partition nodes by subgraph id - do this twice, once on CPU and once on GPU // Ensure that the results are identical - std::vector colors(num_pieces); - for(int i = 0; i < num_pieces; i++) + std::vector colors(num_colors); + for(int i = 0; i < num_colors; i++) colors[i] = i; // We need a GPU memory for GPU partitioning @@ -423,210 +343,52 @@ class BasicTest : public TestInterface { log_app.error() << "No GPU memory found for partitioning test\n"; return Event::NO_EVENT; } - std::vector edge_fields; - edge_fields.push_back(sizeof(Point<1>)); - edge_fields.push_back(sizeof(Point<1>)) ; + + std::vector node_fields; node_fields.push_back(sizeof(int)); - std::vector, Point<1> > > src_field_data_gpu; - std::vector, Point<1> > > dst_field_data_gpu; - std::vector, int> > piece_field_data_gpu; + std::vector, int> > piece_field_data_gpu; piece_field_data_gpu.resize(num_pieces); - src_field_data_gpu.resize(num_pieces); - dst_field_data_gpu.resize(num_pieces); + for (int i = 0; i < num_pieces; i++) { - 
RegionInstance src_gpu_instance; - RegionInstance dst_gpu_instance; - RegionInstance piece_gpu_instance; - RegionInstance::create_instance(src_gpu_instance, - gpu_memory, - src_node_field_data[i].index_space, - edge_fields, - 0 /*SOA*/, - Realm::ProfilingRequestSet()).wait(); - RegionInstance::create_instance(dst_gpu_instance, - gpu_memory, - dst_node_field_data[i].index_space, - edge_fields, - 0 /*SOA*/, - Realm::ProfilingRequestSet()).wait(); - RegionInstance::create_instance(piece_gpu_instance, - gpu_memory, - piece_id_field_data[i].index_space, - node_fields, - 0 /*SOA*/, - Realm::ProfilingRequestSet()).wait(); - CopySrcDstField src_gpu_field, src_cpu_field, dst_gpu_field, dst_cpu_field, piece_gpu_field, piece_cpu_field; - src_gpu_field.inst = src_gpu_instance; - src_gpu_field.size = sizeof(Point<1>); - src_gpu_field.field_id = 0; - src_cpu_field.inst = src_node_field_data[i].inst; - src_cpu_field.size = sizeof(Point<1>); - src_cpu_field.field_id = 0; - dst_gpu_field.inst = dst_gpu_instance; - dst_gpu_field.size = sizeof(Point<1>); - dst_gpu_field.field_id = sizeof(Point<1>); - dst_cpu_field.inst = dst_node_field_data[i].inst; - dst_cpu_field.size = sizeof(Point<1>); - dst_cpu_field.field_id = sizeof(Point<1>); - piece_gpu_field.inst = piece_gpu_instance; - piece_gpu_field.size = sizeof(int); - piece_gpu_field.field_id = 0; - piece_cpu_field.inst = piece_id_field_data[i].inst; - piece_cpu_field.size = sizeof(int); - piece_cpu_field.field_id = 0; - std::vector src_cpu_data, src_gpu_data, dst_cpu_data, dst_gpu_data, piece_cpu_data, piece_gpu_data; - src_cpu_data.push_back(src_cpu_field); - dst_cpu_data.push_back(dst_cpu_field); - src_gpu_data.push_back(src_gpu_field); - dst_gpu_data.push_back(dst_gpu_field); - piece_gpu_data.push_back(piece_gpu_field); - piece_cpu_data.push_back(piece_cpu_field); - Event copy_event = src_node_field_data[i].index_space.copy(src_cpu_data, src_gpu_data, Realm::ProfilingRequestSet()); - copy_event.wait(); - Event second_copy_event = 
dst_node_field_data[i].index_space.copy(dst_cpu_data, dst_gpu_data, Realm::ProfilingRequestSet()); - second_copy_event.wait(); - Event third_copy_event = piece_id_field_data[i].index_space.copy(piece_cpu_data, piece_gpu_data, Realm::ProfilingRequestSet()); - third_copy_event.wait(); - src_field_data_gpu[i].inst = src_gpu_instance; - src_field_data_gpu[i].index_space = src_node_field_data[i].index_space; - src_field_data_gpu[i].field_offset = 0; - dst_field_data_gpu[i].inst = dst_gpu_instance; - dst_field_data_gpu[i].index_space = dst_node_field_data[i].index_space; - dst_field_data_gpu[i].field_offset = 1 * sizeof(Point<1>); - piece_field_data_gpu[i].inst = piece_gpu_instance; - piece_field_data_gpu[i].index_space = piece_id_field_data[i].index_space; - piece_field_data_gpu[i].field_offset = 0; + copy_piece(piece_id_field_data[i], piece_field_data_gpu[i], node_fields, 0, gpu_memory).wait(); } - wait_on_events = true; - log_app.info() << "warming up" << Clock::current_time_in_microseconds() << "\n"; - const char* val = std::getenv("TILE_SIZE"); // or any env var - size_t tile_size = 10000000; //default - if (val) { - tile_size = atoi(val); - } - std::vector byte_fields = {sizeof(char)}; - IndexSpace<1> instance_index_space(Rect<1>(0, tile_size-1)); - IndexSpace<1> dst_index_space(Rect<1>(0, tile_size/100-1)); - for (size_t i = 0; i < piece_field_data_gpu.size(); i++) { - RegionInstance::create_instance(piece_field_data_gpu[i].scratch_buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); - } - for (size_t i = 0; i < src_field_data_gpu.size(); i++) { - RegionInstance::create_instance(src_field_data_gpu[i].scratch_buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); - } - for (size_t i = 0; i < dst_field_data_gpu.size(); i++) { - RegionInstance::create_instance(dst_field_data_gpu[i].scratch_buffer, gpu_memory, dst_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); - } - 
std::vector > p_garbage_nodes, p_garbage_edges, p_garbage_rd, p_garbage_preimage_edges; - Event e01 = is_nodes.create_subspaces_by_field(piece_field_data_gpu, + + std::vector> byfield_inputs(num_pieces); + std::vector byfield_requirements(num_pieces); + + for (int i = 0; i < num_pieces; i++) { + byfield_inputs[i].location = piece_field_data_gpu[i].inst.get_location(); + byfield_inputs[i].space = piece_field_data_gpu[i].index_space; + } + + is_colors.by_field_buffer_requirements(byfield_inputs, byfield_requirements); + + for (int i = 0; i < num_pieces; i++) { + alloc_piece(piece_field_data_gpu[i].scratch_buffer, byfield_requirements[i].upper_bound, gpu_memory).wait(); + } + + wait_on_events = true; + log_app.info() << "warming up" << Clock::current_time_in_microseconds() << "\n"; + Event warmup = is_colors.create_subspaces_by_field(piece_field_data_gpu, colors, p_garbage_nodes, Realm::ProfilingRequestSet()); - if (wait_on_events) e01.wait(); - Event e02 = is_edges.create_subspaces_by_preimage(dst_field_data_gpu, - p_garbage_nodes, - p_garbage_edges, - Realm::ProfilingRequestSet(), - e01); - if(wait_on_events) e02.wait(); - - // an image of p_edges through out_node gives us all the shared nodes, along - // with some private nodes - Event e03 = is_nodes.create_subspaces_by_image(src_field_data_gpu, - p_garbage_edges, - p_garbage_rd, - Realm::ProfilingRequestSet(), - e02); - if(wait_on_events) e03.wait(); - - Event e04 = is_edges.create_subspaces_by_preimage(dst_field_data_gpu, - p_garbage_rd, - p_garbage_preimage_edges, - Realm::ProfilingRequestSet(), - e03); - e04.wait(); - log_app.info() << "warming up complete " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "Starting GPU Partitioning " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "Starting GPU By Field " << Clock::current_time_in_microseconds() << "\n"; - Event e1 = is_nodes.create_subspaces_by_field(piece_field_data_gpu, - colors, - p_nodes, - 
Realm::ProfilingRequestSet()); - if(wait_on_events) e1.wait(); - log_app.info() << "GPU By Field complete " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "Starting GPU Preimage " << Clock::current_time_in_microseconds() << "\n"; - // now compute p_edges based on the color of their in_node (i.e. a preimage) - Event e2 = is_edges.create_subspaces_by_preimage(dst_field_data_gpu, - p_nodes, - p_edges, - Realm::ProfilingRequestSet(), - e1); - if(wait_on_events) e2.wait(); - log_app.info() << "GPU Preimage complete " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "Starting GPU Image " << Clock::current_time_in_microseconds() << "\n"; - - std::vector> spaces = {}; - std::vector requirements; - is_nodes.by_field_buffer_requirements(spaces, requirements); - // an image of p_edges through out_node gives us all the shared nodes, along - // with some private nodes - Event e3 = is_nodes.create_subspaces_by_image(src_field_data_gpu, - p_edges, - p_rd, - Realm::ProfilingRequestSet(), - e2); - if(wait_on_events) e3.wait(); - log_app.info() << "GPU Image complete " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "Starting second GPU preimage " << Clock::current_time_in_microseconds() << "\n"; - - Event e4 = is_edges.create_subspaces_by_preimage(dst_field_data_gpu, - p_rd, - p_preimage_edges, - Realm::ProfilingRequestSet(), - e3); - e4.wait(); - log_app.info() << "Second GPU preimage complete " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "GPU Partitioning complete " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "Starting CPU Partitioning " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "Starting CPU By Field " << Clock::current_time_in_microseconds() << "\n"; - Event e5 = is_nodes.create_subspaces_by_field(piece_id_field_data, - colors, - p_nodes_cpu, - Realm::ProfilingRequestSet()); - if(wait_on_events) e5.wait(); - log_app.info() << "CPU By 
Field complete " << Clock::current_time_in_microseconds() << "\n"; - // now compute p_edges based on the color of their in_node (i.e. a preimage) - log_app.info() << "Starting CPU Preimage " << Clock::current_time_in_microseconds() << "\n"; - Event e6 = is_edges.create_subspaces_by_preimage(dst_node_field_data, - p_nodes_cpu, - p_edges_cpu, - Realm::ProfilingRequestSet(), - e5); - if(wait_on_events) e6.wait(); - log_app.info() << "CPU Preimage complete " << Clock::current_time_in_microseconds() << "\n"; - - // an image of p_edges through out_node gives us all the shared nodes, along - // with some private nodes - log_app.info() << "Starting CPU Image " << Clock::current_time_in_microseconds() << "\n"; - Event e7 = is_nodes.create_subspaces_by_image(src_node_field_data, - p_edges_cpu, - p_rd_cpu, - Realm::ProfilingRequestSet(), - e6); - if(wait_on_events) e7.wait(); - log_app.info() << "CPU Image complete " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "Starting second CPU preimage " << Clock::current_time_in_microseconds() << "\n"; - - Event e8 = is_edges.create_subspaces_by_preimage(dst_node_field_data, - p_rd_cpu, - p_preimage_edges_cpu, - Realm::ProfilingRequestSet(), - e7); - e8.wait(); - log_app.info() << "Second CPU preimage complete " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "CPU Partitioning complete " << Clock::current_time_in_microseconds() << "\n"; - return e8; + warmup.wait(); + + Event gpu_call = is_colors.create_subspaces_by_field(piece_field_data_gpu, + colors, + p_nodes, + Realm::ProfilingRequestSet()); + + Event cpu_call = is_colors.create_subspaces_by_field(piece_id_field_data, + colors, + p_nodes_cpu, + Realm::ProfilingRequestSet()); + + return Event::merge_events({gpu_call, cpu_call}); + } virtual int perform_dynamic_checks(void) @@ -640,14 +402,14 @@ class BasicTest : public TestInterface { int errors = 0; if (!p_nodes.size()) { - return 0; + return p_nodes.size() == p_nodes_cpu.size(); } 
log_app.info() << "Checking correctness of partitioning " << "\n"; for(int i = 0; i < num_pieces; i++) { - for(IndexSpaceIterator<1> it(p_nodes[i]); it.valid; it.step()) { - for(PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + for(IndexSpaceIterator it(p_nodes[i]); it.valid; it.step()) { + for(PointInRectIterator point(it.rect); point.valid; point.step()) { if (!p_nodes_cpu[i].contains(point.p)) { log_app.error() << "Mismatch! GPU has extra byfield point " << point.p << " on piece " << i << "\n"; @@ -655,8 +417,8 @@ class BasicTest : public TestInterface { } } } - for(IndexSpaceIterator<1> it(p_nodes_cpu[i]); it.valid; it.step()) { - for(PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + for(IndexSpaceIterator it(p_nodes_cpu[i]); it.valid; it.step()) { + for(PointInRectIterator point(it.rect); point.valid; point.step()) { if (!p_nodes[i].contains(point.p)) { log_app.error() << "Mismatch! GPU is missing byfield point " << point.p << " on piece " << i << "\n"; @@ -664,76 +426,23 @@ class BasicTest : public TestInterface { } } } - for (IndexSpaceIterator<1> it(p_edges[i]); it.valid; it.step()) { - for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { - if (!p_edges_cpu[i].contains(point.p)) { - log_app.error() << "Mismatch! GPU has extra preimage edge " << point.p - << " on piece " << i << "\n"; - errors++; - } - } - } - for (IndexSpaceIterator<1> it(p_edges_cpu[i]); it.valid; it.step()) { - for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { - if (!p_edges[i].contains(point.p)) { - log_app.error() << "Mismatch! GPU is missing preimage edge " << point.p - << " on piece " << i << "\n"; - errors++; - } - } - } - for (IndexSpaceIterator<1> it(p_rd[i]); it.valid; it.step()) { - for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { - if (!p_rd_cpu[i].contains(point.p)) { - log_app.error() << "Mismatch! 
GPU has extra image node " << point.p - << " on piece " << i << "\n"; - errors++; - } - } - } - for (IndexSpaceIterator<1> it(p_rd_cpu[i]); it.valid; it.step()) { - for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { - if (!p_rd[i].contains(point.p)) { - log_app.error() << "Mismatch! GPU is missing image node " << point.p - << " on piece " << i << "\n"; - errors++; - } - } - } - for (IndexSpaceIterator<1> it(p_preimage_edges[i]); it.valid; it.step()) { - for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { - if (!p_preimage_edges_cpu[i].contains(point.p)) { - log_app.error() << "Mismatch! GPU has extra second preimage edge " << point.p - << " on piece " << i << "\n"; - errors++; - } - } - } - for (IndexSpaceIterator<1> it(p_preimage_edges_cpu[i]); it.valid; it.step()) { - for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { - if (!p_preimage_edges[i].contains(point.p)) { - log_app.error() << "Mismatch! GPU is missing second preimage edge " << point.p - << " on piece " << i << "\n"; - errors++; - } - } - } } return errors; } }; -class TileTest : public TestInterface { +template +class ImageTest : public TestInterface { public: // graph config parameters int num_nodes = 1000; int num_edges = 1000; + int num_sources = 4; int num_pieces = 4; - int num_tiles = 1; std::string filename; - TileTest(int argc, const char *argv[]) + ImageTest(int argc, const char *argv[]) { for(int i = 1; i < argc; i++) { @@ -749,15 +458,15 @@ class TileTest : public TestInterface { num_edges = atoi(argv[++i]); continue; } - if(!strcmp(argv[i], "-t")) { - num_tiles = atoi(argv[++i]); + if(!strcmp(argv[i], "-s")) { + num_sources = atoi(argv[++i]); continue; } } - if (num_nodes <= 0 || num_pieces <= 0 || num_edges <= 0) { - log_app.error() << "Invalid config: nodes=" << num_nodes << " edges=" << num_edges << " pieces=" << num_pieces << "\n"; + if (num_nodes <= 0 || num_pieces <= 0 || num_edges <= 0 || num_sources <= 0) { + 
log_app.error() << "Invalid config: nodes=" << num_nodes << " colors=" << num_edges << " pieces=" << num_pieces << " sources=" << num_sources << "\n"; exit(1); } } @@ -765,7 +474,6 @@ class TileTest : public TestInterface { struct InitDataArgs { int index; RegionInstance ri_nodes; - RegionInstance ri_edges; }; enum PRNGStreams @@ -774,25 +482,20 @@ class TileTest : public TestInterface { }; // assign subgraph ids to nodes - void random_node_data(int idx, int &subgraph) - { - if(random_colors) - subgraph = - Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_pieces); - else - subgraph = idx * num_pieces / num_nodes; - } - - void random_edge_data(int idx, int& src, int& dst) + void chase_point(int idx, Point& color) { - src = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_nodes); - dst = Philox_2x32<>::rand_int(random_seed, idx + 1, NODE_SUBGRAPH_STREAM, num_nodes); + for (int d = 0; d < N1; d++) { + if(random_colors) + color[d] = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_edges); + else + color[d] = (idx * num_edges / num_nodes) % num_edges; + } } static void init_data_task_wrapper(const void *args, size_t arglen, const void *userdata, size_t userlen, Processor p) { - TileTest *me = (TileTest *)testcfg; + ImageTest *me = (ImageTest *)testcfg; me->init_data_task(args, arglen, p); } @@ -805,128 +508,84 @@ class TileTest : public TestInterface { << ")"; i_args.ri_nodes.fetch_metadata(p).wait(); - i_args.ri_edges.fetch_metadata(p).wait(); - IndexSpace<1> is_nodes = i_args.ri_nodes.get_indexspace<1>(); - IndexSpace<1> is_edges = i_args.ri_edges.get_indexspace<1>(); + IndexSpace nodes_space = i_args.ri_nodes.template get_indexspace(); log_app.debug() << "N: " << is_nodes; - log_app.debug() << "E: " << is_edges; //For each node in the graph, mark it with a random (or deterministic) subgraph id { - AffineAccessor a_piece_id(i_args.ri_nodes, 0 /* offset */); - - for(int i = is_nodes.bounds.lo; i <= 
is_nodes.bounds.hi; i++) { - int subgraph; - random_node_data(i, subgraph); - a_piece_id.write(i, subgraph); - } - - AffineAccessor,1> a_src(i_args.ri_edges, 0 /* offset */); - AffineAccessor,1> a_dst(i_args.ri_edges, sizeof(Point<1>)/* offset */); - - for(int i = is_edges.bounds.lo; i <= is_edges.bounds.hi; i++) { - int src, dst; - random_edge_data(i, src, dst); - a_src.write(i, Point<1>(src)); - a_dst.write(i, Point<1>(dst)); + AffineAccessor, N2> a_point(i_args.ri_nodes, 0 /* offset */); + + for (IndexSpaceIterator it(is_nodes); it.valid; it.step()) { + for (PointInRectIterator point(it.rect); point.valid; point.step()) { + int idx = 0; + int stride = 1; + for (int d = 0; d < N2; d++) { + idx += (point.p[d] - is_nodes.bounds.lo[d]) * stride; + stride *= (is_nodes.bounds.hi[d] - is_nodes.bounds.lo[d] + 1); + } + Point destination; + chase_point(idx, destination); + a_point.write(point.p, destination); + } } } - - //Optionally print out the assigned subgraph ids - if(show_graph) { - AffineAccessor a_piece_id(i_args.ri_nodes, 0 /* offset */); - - for(int i = is_nodes.bounds.lo; i <= is_nodes.bounds.hi; i++) - log_app.info() << "piece_id[" << i << "] = " << a_piece_id.read(i) << "\n"; - - AffineAccessor,1> a_src(i_args.ri_edges, 0 /* offset */); - AffineAccessor,1> a_dst(i_args.ri_edges, sizeof(Point<1>)/* offset */); - - for(int i = is_edges.bounds.lo; i <= is_edges.bounds.hi; i++) - log_app.info() << "src, dst[" << i << "] = " << a_src.read(i) << ", " << a_dst.read(i) << "\n"; - } } - IndexSpace<1> is_nodes, is_edges; - std::vector ri_nodes, ri_edges; - std::vector, int> > piece_id_field_data; - std::vector, Point<1> > > src_node_field_data, dst_node_field_data; + IndexSpace is_nodes; + IndexSpace is_edges; + std::vector ri_nodes; + std::vector, Point> > point_field_data; virtual void print_info(void) { - printf("Realm dependent partitioning test - tile: %d nodes, %d edges, %d pieces, %d tiles\n", - (int)num_nodes, (int) num_edges, (int)num_pieces, 
(int)num_tiles); + printf("Realm %dD -> %dD Image dependent partitioning test: %d nodes, %d edges, %d pieces ,%d sources\n", (int) N2, (int) N1, + (int)num_nodes, (int) num_edges, (int)num_pieces, (int) num_sources); } virtual Event initialize_data(const std::vector &memories, const std::vector &procs) { // now create index space for nodes - is_nodes = Rect<1>(0, num_nodes - 1); - is_edges = Rect<1>(0, num_edges - 1); + Point node_lo, node_hi; + for (int d = 0; d < N2; d++) { + node_lo[d] = 0; + node_hi[d] = num_nodes - 1; + } + is_nodes = Rect(node_lo, node_hi); + + Point edge_lo, edge_hi; + for (int d = 0; d < N1; d++) { + edge_lo[d] = 0; + edge_hi[d] = num_edges - 1; + } + is_edges = Rect(edge_lo, edge_hi); + // equal partition is used to do initial population of edges and nodes - std::vector > ss_nodes_eq; - std::vector > ss_edges_eq; + std::vector > ss_nodes_eq; log_app.info() << "Creating equal subspaces\n"; is_nodes.create_equal_subspaces(num_pieces, 1, ss_nodes_eq, Realm::ProfilingRequestSet()).wait(); - is_edges.create_equal_subspaces(num_pieces, 1, ss_edges_eq, Realm::ProfilingRequestSet()).wait(); - - log_app.debug() << "Initial partitions:"; - for(size_t i = 0; i < ss_nodes_eq.size(); i++) - log_app.debug() << " Nodes #" << i << ": " << ss_nodes_eq[i]; - for(size_t i = 0; i < ss_edges_eq.size(); i++) - log_app.debug() << " Edges #" << i << ": " << ss_edges_eq[i]; // create instances for each of these subspaces - std::vector node_fields, edge_fields; - node_fields.push_back(sizeof(int)); // piece_id - assert(sizeof(int) == sizeof(Point<1>)); - edge_fields.push_back(sizeof(Point<1>)); // src_node - edge_fields.push_back(sizeof(Point<1>)); // dst_node + std::vector node_fields; + node_fields.push_back(sizeof(Point)); ri_nodes.resize(num_pieces); - piece_id_field_data.resize(num_pieces); + point_field_data.resize(num_pieces); for(size_t i = 0; i < ss_nodes_eq.size(); i++) { RegionInstance ri; RegionInstance::create_instance(ri, memories[i % 
memories.size()], ss_nodes_eq[i], node_fields, 0 /*SOA*/, - Realm::ProfilingRequestSet()) - .wait(); + Realm::ProfilingRequestSet()).wait(); ri_nodes[i] = ri; - piece_id_field_data[i].index_space = ss_nodes_eq[i]; - piece_id_field_data[i].inst = ri_nodes[i]; - piece_id_field_data[i].field_offset = 0; - } - - - // Fire off tasks to initialize data - ri_edges.resize(num_pieces); - src_node_field_data.resize(num_pieces); - dst_node_field_data.resize(num_pieces); - - for(size_t i = 0; i < ss_edges_eq.size(); i++) { - RegionInstance ri; - RegionInstance::create_instance(ri, - memories[i % memories.size()], - ss_edges_eq[i], - edge_fields, - 0 /*SOA*/, - Realm::ProfilingRequestSet()).wait(); - ri_edges[i] = ri; - - src_node_field_data[i].index_space = ss_edges_eq[i]; - src_node_field_data[i].inst = ri_edges[i]; - src_node_field_data[i].field_offset = 0 * sizeof(Point<1>); - - dst_node_field_data[i].index_space = ss_edges_eq[i]; - dst_node_field_data[i].inst = ri_edges[i]; - dst_node_field_data[i].field_offset = 1 * sizeof(Point<1>); + point_field_data[i].index_space = ss_nodes_eq[i]; + point_field_data[i].inst = ri_nodes[i]; + point_field_data[i].field_offset = 0; } // fire off tasks to initialize data @@ -936,8 +595,7 @@ class TileTest : public TestInterface { InitDataArgs args; args.index = i; args.ri_nodes = ri_nodes[i]; - args.ri_edges = ri_edges[i]; - Event e = p.spawn(INIT_TILE_DATA_TASK, &args, sizeof(args)); + Event e = p.spawn(INIT_IMAGE_DATA_TASK, &args, sizeof(args)); events.insert(e); } @@ -949,20 +607,16 @@ class TileTest : public TestInterface { // p_nodes_cpu - nodes partitioned by subgraph id (from CPU) - std::vector > p_nodes, p_rd; - std::vector > p_edges, p_preimage_edges; - - std::vector > p_nodes_cpu, p_rd_cpu; - std::vector > p_edges_cpu, p_preimage_edges_cpu; + std::vector > p_edges, p_garbage_edges, p_edges_cpu; virtual Event perform_partitioning(void) { // Partition nodes by subgraph id - do this twice, once on CPU and once on GPU // Ensure that 
the results are identical - std::vector colors(num_pieces); - for(int i = 0; i < num_pieces; i++) - colors[i] = i; + std::vector> sources(num_pieces); + for(int i = 0; i < num_sources; i++) + sources[i] = point_field_data[i % num_pieces].index_space; // We need a GPU memory for GPU partitioning Memory gpu_memory; @@ -981,190 +635,58 @@ class TileTest : public TestInterface { log_app.error() << "No GPU memory found for partitioning test\n"; return Event::NO_EVENT; } - std::vector edge_fields; - edge_fields.push_back(sizeof(Point<1>)); - edge_fields.push_back(sizeof(Point<1>)) ; + + std::vector node_fields; - node_fields.push_back(sizeof(int)); + node_fields.push_back(sizeof(Point)); + + std::vector, Point>> point_field_data_gpu; + point_field_data_gpu.resize(num_pieces); - std::vector, Point<1> > > src_field_data_gpu; - std::vector, Point<1> > > dst_field_data_gpu; - std::vector, int> > piece_field_data_gpu; - piece_field_data_gpu.resize(num_pieces); - src_field_data_gpu.resize(num_pieces); - dst_field_data_gpu.resize(num_pieces); for (int i = 0; i < num_pieces; i++) { - RegionInstance src_gpu_instance; - RegionInstance dst_gpu_instance; - RegionInstance piece_gpu_instance; - RegionInstance::create_instance(src_gpu_instance, - gpu_memory, - src_node_field_data[i].index_space, - edge_fields, - 0 /*SOA*/, - Realm::ProfilingRequestSet()).wait(); - RegionInstance::create_instance(dst_gpu_instance, - gpu_memory, - dst_node_field_data[i].index_space, - edge_fields, - 0 /*SOA*/, - Realm::ProfilingRequestSet()).wait(); - RegionInstance::create_instance(piece_gpu_instance, - gpu_memory, - piece_id_field_data[i].index_space, - node_fields, - 0 /*SOA*/, - Realm::ProfilingRequestSet()).wait(); - CopySrcDstField src_gpu_field, src_cpu_field, dst_gpu_field, dst_cpu_field, piece_gpu_field, piece_cpu_field; - src_gpu_field.inst = src_gpu_instance; - src_gpu_field.size = sizeof(Point<1>); - src_gpu_field.field_id = 0; - src_cpu_field.inst = src_node_field_data[i].inst; - 
src_cpu_field.size = sizeof(Point<1>); - src_cpu_field.field_id = 0; - dst_gpu_field.inst = dst_gpu_instance; - dst_gpu_field.size = sizeof(Point<1>); - dst_gpu_field.field_id = sizeof(Point<1>); - dst_cpu_field.inst = dst_node_field_data[i].inst; - dst_cpu_field.size = sizeof(Point<1>); - dst_cpu_field.field_id = sizeof(Point<1>); - piece_gpu_field.inst = piece_gpu_instance; - piece_gpu_field.size = sizeof(int); - piece_gpu_field.field_id = 0; - piece_cpu_field.inst = piece_id_field_data[i].inst; - piece_cpu_field.size = sizeof(int); - piece_cpu_field.field_id = 0; - std::vector src_cpu_data, src_gpu_data, dst_cpu_data, dst_gpu_data, piece_cpu_data, piece_gpu_data; - src_cpu_data.push_back(src_cpu_field); - dst_cpu_data.push_back(dst_cpu_field); - src_gpu_data.push_back(src_gpu_field); - dst_gpu_data.push_back(dst_gpu_field); - piece_gpu_data.push_back(piece_gpu_field); - piece_cpu_data.push_back(piece_cpu_field); - Event copy_event = src_node_field_data[i].index_space.copy(src_cpu_data, src_gpu_data, Realm::ProfilingRequestSet()); - copy_event.wait(); - Event second_copy_event = dst_node_field_data[i].index_space.copy(dst_cpu_data, dst_gpu_data, Realm::ProfilingRequestSet()); - second_copy_event.wait(); - Event third_copy_event = piece_id_field_data[i].index_space.copy(piece_cpu_data, piece_gpu_data, Realm::ProfilingRequestSet()); - third_copy_event.wait(); - src_field_data_gpu[i].inst = src_gpu_instance; - src_field_data_gpu[i].index_space = src_node_field_data[i].index_space; - src_field_data_gpu[i].field_offset = 0; - dst_field_data_gpu[i].inst = dst_gpu_instance; - dst_field_data_gpu[i].index_space = dst_node_field_data[i].index_space; - dst_field_data_gpu[i].field_offset = 1 * sizeof(Point<1>); - piece_field_data_gpu[i].inst = piece_gpu_instance; - piece_field_data_gpu[i].index_space = piece_id_field_data[i].index_space; - piece_field_data_gpu[i].field_offset = 0; + copy_piece(point_field_data[i], point_field_data_gpu[i], node_fields, 0, gpu_memory).wait(); 
} - wait_on_events = true; - log_app.info() << "warming up" << Clock::current_time_in_microseconds() << "\n"; - std::vector > p_garbage_nodes, p_garbage_edges, p_garbage_rd, p_garbage_preimage_edges; - Event e01 = is_nodes.create_subspaces_by_field(piece_field_data_gpu, - colors, - p_garbage_nodes, - Realm::ProfilingRequestSet()); - if (wait_on_events) e01.wait(); - Event e02 = is_edges.create_subspaces_by_preimage(dst_node_field_data, - p_garbage_nodes, - p_garbage_edges, - Realm::ProfilingRequestSet(), - e01); - if(wait_on_events) e02.wait(); - - // an image of p_edges through out_node gives us all the shared nodes, along - // with some private nodes - Event e03 = is_nodes.create_subspaces_by_image(src_field_data_gpu, + + std::vector> image_inputs(num_pieces); + std::vector> image_subspaces(num_sources); + std::vector image_requirements(num_pieces); + + for (int i = 0; i < num_pieces; i++) { + image_inputs[i].location = point_field_data_gpu[i].inst.get_location(); + image_inputs[i].space = point_field_data_gpu[i].index_space; + } + + for (int i = 0; i < num_sources; i++) { + image_subspaces[i].space = sources[i]; + image_subspaces[i].entries = sources[i].dense() ? 
1 : sources[i].sparsity.impl()->get_entries().size(); + } + + is_edges.by_image_buffer_requirements(image_subspaces, image_inputs, image_requirements); + + for (int i = 0; i < num_pieces; i++) { + alloc_piece(point_field_data_gpu[i].scratch_buffer, image_requirements[i].upper_bound, gpu_memory).wait(); + } + + wait_on_events = true; + log_app.info() << "warming up" << Clock::current_time_in_microseconds() << "\n"; + Event warmup = is_edges.create_subspaces_by_image(point_field_data_gpu, + sources, p_garbage_edges, - p_garbage_rd, - Realm::ProfilingRequestSet(), - e02); - if(wait_on_events) e03.wait(); - - Event e04 = is_edges.create_subspaces_by_preimage(dst_node_field_data, - p_garbage_rd, - p_garbage_preimage_edges, - Realm::ProfilingRequestSet(), - e03); - e04.wait(); - log_app.info() << "warming up complete " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "Starting GPU Partitioning " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "Starting GPU By Field " << Clock::current_time_in_microseconds() << "\n"; - Event e1 = is_nodes.create_subspaces_by_field(piece_field_data_gpu, - colors, - p_nodes, - Realm::ProfilingRequestSet()); - if(wait_on_events) e1.wait(); - log_app.info() << "GPU By Field complete " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "Starting GPU Preimage " << Clock::current_time_in_microseconds() << "\n"; - // now compute p_edges based on the color of their in_node (i.e. 
a preimage) - Event e2 = is_edges.create_subspaces_by_preimage(dst_node_field_data, - p_nodes, - p_edges, - Realm::ProfilingRequestSet(), - e1); - if(wait_on_events) e2.wait(); - log_app.info() << "GPU Preimage complete " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "Starting GPU Image " << Clock::current_time_in_microseconds() << "\n"; - - // an image of p_edges through out_node gives us all the shared nodes, along - // with some private nodes - Event e3 = is_nodes.create_subspaces_by_image(src_field_data_gpu, - p_edges, - p_rd, - Realm::ProfilingRequestSet(), - e2); - if(wait_on_events) e3.wait(); - log_app.info() << "GPU Image complete " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "Starting second GPU preimage " << Clock::current_time_in_microseconds() << "\n"; - - Event e4 = is_edges.create_subspaces_by_preimage(dst_node_field_data, - p_rd, - p_preimage_edges, - Realm::ProfilingRequestSet(), - e3); - e4.wait(); - log_app.info() << "Second GPU preimage complete " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "GPU Partitioning complete " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "Starting CPU Partitioning " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "Starting CPU By Field " << Clock::current_time_in_microseconds() << "\n"; - Event e5 = is_nodes.create_subspaces_by_field(piece_id_field_data, - colors, - p_nodes_cpu, - Realm::ProfilingRequestSet()); - if(wait_on_events) e5.wait(); - log_app.info() << "CPU By Field complete " << Clock::current_time_in_microseconds() << "\n"; - // now compute p_edges based on the color of their in_node (i.e. 
a preimage) - log_app.info() << "Starting CPU Preimage " << Clock::current_time_in_microseconds() << "\n"; - Event e6 = is_edges.create_subspaces_by_preimage(dst_node_field_data, - p_nodes_cpu, - p_edges_cpu, - Realm::ProfilingRequestSet(), - e5); - if(wait_on_events) e6.wait(); - log_app.info() << "CPU Preimage complete " << Clock::current_time_in_microseconds() << "\n"; - - // an image of p_edges through out_node gives us all the shared nodes, along - // with some private nodes - log_app.info() << "Starting CPU Image " << Clock::current_time_in_microseconds() << "\n"; - Event e7 = is_nodes.create_subspaces_by_image(src_node_field_data, - p_edges_cpu, - p_rd_cpu, - Realm::ProfilingRequestSet(), - e6); - if(wait_on_events) e7.wait(); - log_app.info() << "CPU Image complete " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "Starting second CPU preimage " << Clock::current_time_in_microseconds() << "\n"; - - Event e8 = is_edges.create_subspaces_by_preimage(dst_node_field_data, - p_rd_cpu, - p_preimage_edges_cpu, - Realm::ProfilingRequestSet(), - e7); - e8.wait(); - log_app.info() << "Second CPU preimage complete " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "CPU Partitioning complete " << Clock::current_time_in_microseconds() << "\n"; - return e8; + Realm::ProfilingRequestSet()); + warmup.wait(); + + Event gpu_call = is_edges.create_subspaces_by_image(point_field_data_gpu, + sources, + p_edges, + Realm::ProfilingRequestSet()); + + Event cpu_call = is_edges.create_subspaces_by_image(point_field_data, + sources, + p_edges_cpu, + Realm::ProfilingRequestSet()); + + return Event::merge_events({gpu_call, cpu_call}); + } virtual int perform_dynamic_checks(void) @@ -1177,3273 +699,103 @@ class TileTest : public TestInterface { { int errors = 0; - if (!p_nodes.size()) { - return 0; + if (!p_edges.size()) { + return p_edges.size() == p_edges_cpu.size(); } log_app.info() << "Checking correctness of partitioning " << "\n"; for(int 
i = 0; i < num_pieces; i++) { - for(IndexSpaceIterator<1> it(p_nodes[i]); it.valid; it.step()) { - for(PointInRectIterator<1> point(it.rect); point.valid; point.step()) { - if (!p_nodes_cpu[i].contains(point.p)) { - log_app.error() << "Mismatch! GPU has extra byfield point " << point.p + for(IndexSpaceIterator it(p_edges[i]); it.valid; it.step()) { + for(PointInRectIterator point(it.rect); point.valid; point.step()) { + if (!p_edges_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU has extra image point " << point.p << " on piece " << i << "\n"; errors++; } } } - for(IndexSpaceIterator<1> it(p_nodes_cpu[i]); it.valid; it.step()) { - for(PointInRectIterator<1> point(it.rect); point.valid; point.step()) { - if (!p_nodes[i].contains(point.p)) { - log_app.error() << "Mismatch! GPU is missing byfield point " << point.p + for(IndexSpaceIterator it(p_edges_cpu[i]); it.valid; it.step()) { + for(PointInRectIterator point(it.rect); point.valid; point.step()) { + if (!p_edges[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing image point " << point.p << " on piece " << i << "\n"; errors++; } } } - for (IndexSpaceIterator<1> it(p_edges[i]); it.valid; it.step()) { - for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { - if (!p_edges_cpu[i].contains(point.p)) { - log_app.error() << "Mismatch! GPU has extra preimage edge " << point.p - << " on piece " << i << "\n"; - errors++; - } - } - } - for (IndexSpaceIterator<1> it(p_edges_cpu[i]); it.valid; it.step()) { - for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { - if (!p_edges[i].contains(point.p)) { - log_app.error() << "Mismatch! GPU is missing preimage edge " << point.p - << " on piece " << i << "\n"; - errors++; - } - } - } - for (IndexSpaceIterator<1> it(p_rd[i]); it.valid; it.step()) { - for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { - if (!p_rd_cpu[i].contains(point.p)) { - log_app.error() << "Mismatch! 
GPU has extra image node " << point.p - << " on piece " << i << "\n"; - errors++; - } - } - } - for (IndexSpaceIterator<1> it(p_rd_cpu[i]); it.valid; it.step()) { - for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { - if (!p_rd[i].contains(point.p)) { - log_app.error() << "Mismatch! GPU is missing image node " << point.p - << " on piece " << i << "\n"; - errors++; - } - } - } - for (IndexSpaceIterator<1> it(p_preimage_edges[i]); it.valid; it.step()) { - for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { - if (!p_preimage_edges_cpu[i].contains(point.p)) { - log_app.error() << "Mismatch! GPU has extra second preimage edge " << point.p - << " on piece " << i << "\n"; - errors++; - } - } - } - for (IndexSpaceIterator<1> it(p_preimage_edges_cpu[i]); it.valid; it.step()) { - for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { - if (!p_preimage_edges[i].contains(point.p)) { - log_app.error() << "Mismatch! GPU is missing second preimage edge " << point.p - << " on piece " << i << "\n"; - errors++; - } - } - } } return errors; } }; -class RangeTest : public TestInterface { -public: - // graph config parameters - int num_nodes = 1000; - int num_rects = 1000; - int max_rect_size = 10; - int num_pieces = 4; - std::string filename; +void top_level_task(const void *args, size_t arglen, const void *userdata, size_t userlen, + Processor p) +{ + int errors = 0; - RangeTest(int argc, const char *argv[]) - { - for(int i = 1; i < argc; i++) { + testcfg->print_info(); - if(!strcmp(argv[i], "-p")) { - num_pieces = atoi(argv[++i]); - continue; - } + // find all the system memories - we'll stride our data across them + // for each memory, we'll need one CPU that can do the initialization of the data + std::vector sysmems; + std::vector procs; - if(!strcmp(argv[i], "-n")) { - num_nodes = atoi(argv[++i]); - continue; - } + Machine machine = Machine::get_machine(); + { + std::set all_memories; + 
machine.get_all_memories(all_memories); + for(std::set::const_iterator it = all_memories.begin(); + it != all_memories.end(); it++) { + Memory m = *it; - if(!strcmp(argv[i], "-r")) { - num_rects = atoi(argv[++i]); + // skip memories with no capacity for creating instances + if(m.capacity() == 0) continue; - } - if(!strcmp(argv[i], "-m")) { - max_rect_size = atoi(argv[++i]); - continue; + if(m.kind() == Memory::SYSTEM_MEM) { + sysmems.push_back(m); + std::set pset; + machine.get_shared_processors(m, pset); + Processor p = Processor::NO_PROC; + for(std::set::const_iterator it2 = pset.begin(); it2 != pset.end(); + it2++) { + if(it2->kind() == Processor::LOC_PROC) { + p = *it2; + break; + } + } + assert(p.exists()); + procs.push_back(p); + log_app.debug() << "System mem #" << (sysmems.size() - 1) << " = " + << *sysmems.rbegin() << " (" << *procs.rbegin() << ")"; } } - - - - if (num_nodes <= 0 || num_rects <= 0) { - log_app.error() << "Invalid graph dimensions in input file: rects=" << num_rects << " nodes=" << num_nodes; - exit(1); - } - } + assert(sysmems.size() > 0); + { + Realm::TimeStamp ts("initialization", true, &log_app); + Event e = testcfg->initialize_data(sysmems, procs); + // wait for all initialization to be done + e.wait(); + } - struct InitDataArgs { - int index; - RegionInstance ri_nodes; - RegionInstance ri_rects; - }; + // now actual partitioning work + { + Realm::TimeStamp ts("dependent partitioning work", true, &log_app); - enum PRNGStreams { - NODE_SUBGRAPH_STREAM, - }; + Event e = testcfg->perform_partitioning(); - void random_rect_data(int idx, int& subgraph) - { - if(random_colors) - subgraph = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_pieces); - else - subgraph = idx * num_pieces / num_rects; + e.wait(); } - void random_node_data(int idx, int& subgraph) + // dynamic checks (which would be eliminated by compiler) { - if(true) - subgraph = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_pieces); - else 
- subgraph = idx * num_pieces / num_nodes; - } - - void initialize_rect_data(int idx, Rect<1> &rect, int max_rect_size = 10) - { - - int first = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_nodes); - int amount = Philox_2x32<>::rand_int(random_seed, idx + 1, NODE_SUBGRAPH_STREAM, max_rect_size); - rect = Rect<1>(first, first + amount); - } - - - static void init_data_task_wrapper(const void *args, size_t arglen, - const void *userdata, size_t userlen, Processor p) - { - RangeTest *me = (RangeTest *)testcfg; - me->init_data_task(args, arglen, p); - } - - void init_data_task(const void *args, size_t arglen, Processor p) - { - const InitDataArgs& i_args = *(const InitDataArgs *)args; - - log_app.info() << "init task #" << i_args.index << " (ri_nodes=" << i_args.ri_nodes << ", ri_rects=" << i_args.ri_rects << ")"; - - i_args.ri_nodes.fetch_metadata(p).wait(); - i_args.ri_rects.fetch_metadata(p).wait(); - - IndexSpace<1> is_nodes = i_args.ri_nodes.get_indexspace<1>(); - IndexSpace<1> is_rects = i_args.ri_rects.get_indexspace<1>(); - - log_app.debug() << "N: " << is_nodes; - log_app.debug() << "E: " << is_rects; - - //Write out colors and rectangles - - { - AffineAccessor a_rect_id(i_args.ri_rects, 0 /* offset */); - - for(int i = is_rects.bounds.lo; i <= is_rects.bounds.hi; i++) { - int subgraph; - random_rect_data(i, subgraph); - a_rect_id.write(i, subgraph); - } - } - { - AffineAccessor a_piece_id(i_args.ri_nodes, 0 /* offset */); - - for(int i = is_nodes.bounds.lo; i <= is_nodes.bounds.hi; i++) { - int subgraph; - random_node_data(i, subgraph); - a_piece_id.write(i, subgraph); - } - } - - - { - - AffineAccessor, 1> a_rect_val(i_args.ri_rects, 1 * sizeof(int) /* offset */); - - for(int i = is_rects.bounds.lo; i <= is_rects.bounds.hi; i++) { - Rect<1> rect; - initialize_rect_data(i, rect, max_rect_size); - a_rect_val.write(i, rect); - } - } - - if(show_graph) { - AffineAccessor a_piece_id(i_args.ri_nodes, 0 /* offset */); - - for(int i = 
is_nodes.bounds.lo; i <= is_nodes.bounds.hi; i++) - log_app.info() << "node_id[" << i << "] = " << a_piece_id.read(i) << "\n"; - - AffineAccessor a_rect_id(i_args.ri_rects, 0 * sizeof(Point<1>) /* offset */); - - for(int i = is_rects.bounds.lo; i <= is_rects.bounds.hi; i++) - log_app.info() << "rect_id[" << i << "] = " << a_rect_id.read(i) << "\n"; - - AffineAccessor,1> a_rect_val(i_args.ri_rects, 1 * sizeof(int) /* offset */); - - for(int i = is_rects.bounds.lo; i <= is_rects.bounds.hi; i++) - log_app.info() << "rect_val[" << i << "] = " << a_rect_val.read(i) << "\n"; - } - } - - IndexSpace<1> is_nodes, is_rects; - std::vector ri_nodes; - std::vector, int> > node_id_field_data; - std::vector ri_rects; - std::vector, int> > rect_id_field_data; - std::vector, Rect<1> > > rect_val_field_data; - - virtual void print_info(void) - { - printf("Realm dependent partitioning test - ranges: %d nodes, %d rects, %d pieces\n", - (int)num_nodes, (int)num_rects, (int)num_pieces); - } - - virtual Event initialize_data(const std::vector& memories, - const std::vector& procs) - { - // now create index spaces for nodes and edges - is_nodes = Rect<1>(0, num_nodes - 1); - is_rects = Rect<1>(0, num_rects - 1); - - // equal partition is used to do initial population of edges and nodes - std::vector > ss_nodes_eq; - std::vector > ss_rects_eq; - - log_app.info() << "Creating equal subspaces" << "\n"; - - is_nodes.create_equal_subspaces(num_pieces, 1, ss_nodes_eq, Realm::ProfilingRequestSet()).wait(); - is_rects.create_equal_subspaces(num_pieces, 1, ss_rects_eq, Realm::ProfilingRequestSet()).wait(); - - log_app.debug() << "Initial partitions:"; - for(size_t i = 0; i < ss_nodes_eq.size(); i++) - log_app.debug() << " Nodes #" << i << ": " << ss_nodes_eq[i]; - for(size_t i = 0; i < ss_rects_eq.size(); i++) - log_app.debug() << " Rects #" << i << ": " << ss_rects_eq[i]; - - // create instances for each of these subspaces - std::vector node_fields, rect_fields; - 
node_fields.push_back(sizeof(int)); // piece_id - rect_fields.push_back(sizeof(int)); // src_node - rect_fields.push_back(sizeof(Rect<1>)); // dst_node - - ri_nodes.resize(num_pieces); - node_id_field_data.resize(num_pieces); - - for(size_t i = 0; i < ss_nodes_eq.size(); i++) { - RegionInstance ri; - RegionInstance::create_instance(ri, - memories[i % memories.size()], - ss_nodes_eq[i], - node_fields, - 0 /*SOA*/, - Realm::ProfilingRequestSet()).wait(); - ri_nodes[i] = ri; - - node_id_field_data[i].index_space = ss_nodes_eq[i]; - node_id_field_data[i].inst = ri_nodes[i]; - node_id_field_data[i].field_offset = 0; - } - - ri_rects.resize(num_pieces); - rect_id_field_data.resize(num_pieces); - rect_val_field_data.resize(num_pieces); - - for(size_t i = 0; i < ss_rects_eq.size(); i++) { - RegionInstance ri; - RegionInstance::create_instance(ri, - memories[i % memories.size()], - ss_rects_eq[i], - rect_fields, - 0 /*SOA*/, - Realm::ProfilingRequestSet()).wait(); - ri_rects[i] = ri; - - rect_id_field_data[i].index_space = ss_rects_eq[i]; - rect_id_field_data[i].inst = ri_rects[i]; - rect_id_field_data[i].field_offset = 0; - - rect_val_field_data[i].index_space = ss_rects_eq[i]; - rect_val_field_data[i].inst = ri_rects[i]; - rect_val_field_data[i].field_offset = 1 * sizeof(int); - } - - // fire off tasks to initialize data - std::set events; - for(int i = 0; i < num_pieces; i++) { - Processor p = procs[i % procs.size()]; - InitDataArgs args; - args.index = i; - args.ri_nodes = ri_nodes[i]; - args.ri_rects = ri_rects[i]; - Event e = p.spawn(INIT_RANGE_DATA_TASK, &args, sizeof(args)); - events.insert(e); - } - - return Event::merge_events(events); - } - - // the outputs of our partitioning will be: - //p_colored_rects -> all of our rectangles marked with the color given by random_rect_data - //p_rects -> image range by p colored rects into nodes - - std::vector > p_colored_rects, p_rects; - std::vector > p_colored_rects_cpu, p_rects_cpu; - - virtual Event 
perform_partitioning(void) - { - - std::vector colors(num_pieces); - for(int i = 0; i < num_pieces; i++) - colors[i] = i; - - Memory gpu_memory; - bool found_gpu_memory = false; - Machine machine = Machine::get_machine(); - std::set all_memories; - machine.get_all_memories(all_memories); - for(auto& memory : all_memories) { - if(memory.kind() == Memory::GPU_FB_MEM) { - gpu_memory = memory; - found_gpu_memory = true; - break; - } - } - assert(found_gpu_memory); - std::vector rect_fields; - rect_fields.push_back(sizeof(int)); - rect_fields.push_back(sizeof(Rect<1>)); - std::vector node_fields; - node_fields.push_back(sizeof(int)); - - std::vector, int > > node_id_data_gpu; - std::vector, int > > rect_id_data_gpu; - std::vector, Rect<1>>> rect_val_data_gpu; - node_id_data_gpu.resize(num_pieces); - rect_id_data_gpu.resize(num_pieces); - rect_val_data_gpu.resize(num_pieces); - for (int i = 0; i < num_pieces; i++) { - RegionInstance node_id_instance; - RegionInstance rect_id_instance; - RegionInstance rect_val_instance; - RegionInstance::create_instance(node_id_instance, - gpu_memory, - node_id_field_data[i].index_space, - node_fields, - 0 /*SOA*/, - Realm::ProfilingRequestSet()).wait(); - RegionInstance::create_instance(rect_id_instance, - gpu_memory, - rect_id_field_data[i].index_space, - rect_fields, - 0 /*SOA*/, - Realm::ProfilingRequestSet()).wait(); - RegionInstance::create_instance(rect_val_instance, - gpu_memory, - rect_val_field_data[i].index_space, - rect_fields, - 0 /*SOA*/, - Realm::ProfilingRequestSet()).wait(); - CopySrcDstField node_id_gpu_field, node_id_cpu_field, rect_id_gpu_field, rect_id_cpu_field, rect_val_gpu_field, rect_val_cpu_field; - node_id_gpu_field.inst = node_id_instance; - node_id_gpu_field.size = sizeof(int); - node_id_gpu_field.field_id = 0; - node_id_cpu_field.inst = node_id_field_data[i].inst; - node_id_cpu_field.size = sizeof(int); - node_id_cpu_field.field_id = 0; - rect_id_gpu_field.inst = rect_id_instance; - rect_id_gpu_field.size = 
sizeof(int); - rect_id_gpu_field.field_id = 0; - rect_id_cpu_field.inst = rect_id_field_data[i].inst; - rect_id_cpu_field.size = sizeof(int); - rect_id_cpu_field.field_id = 0; - rect_val_gpu_field.inst = rect_val_instance; - rect_val_gpu_field.size = sizeof(Rect<1>); - rect_val_gpu_field.field_id = sizeof(int); - rect_val_cpu_field.inst = rect_val_field_data[i].inst; - rect_val_cpu_field.size = sizeof(Rect<1>); - rect_val_cpu_field.field_id = sizeof(int); - std::vector node_id_gpu_data, node_id_cpu_data, rect_id_gpu_data, rect_id_cpu_data, rect_val_gpu_data, rect_val_cpu_data; - node_id_gpu_data.push_back(node_id_gpu_field); - node_id_cpu_data.push_back(node_id_cpu_field); - rect_id_gpu_data.push_back(rect_id_gpu_field); - rect_id_cpu_data.push_back(rect_id_cpu_field); - rect_val_gpu_data.push_back(rect_val_gpu_field); - rect_val_cpu_data.push_back(rect_val_cpu_field); - Event copy_event = node_id_field_data[i].index_space.copy(node_id_cpu_data, node_id_gpu_data, Realm::ProfilingRequestSet()); - copy_event.wait(); - Event second_copy_event = rect_id_field_data[i].index_space.copy(rect_id_cpu_data, rect_id_gpu_data, Realm::ProfilingRequestSet()); - second_copy_event.wait(); - Event third_copy_event = rect_val_field_data[i].index_space.copy(rect_val_cpu_data, rect_val_gpu_data, Realm::ProfilingRequestSet()); - third_copy_event.wait(); - node_id_data_gpu[i].inst = node_id_instance; - node_id_data_gpu[i].index_space = node_id_field_data[i].index_space; - node_id_data_gpu[i].field_offset = 0; - rect_id_data_gpu[i].inst = rect_id_instance; - rect_id_data_gpu[i].index_space = rect_id_field_data[i].index_space; - rect_id_data_gpu[i].field_offset = 0; - rect_val_data_gpu[i].inst = rect_val_instance; - rect_val_data_gpu[i].index_space = rect_val_field_data[i].index_space; - rect_val_data_gpu[i].field_offset = sizeof(int); - } - wait_on_events = true; - std::vector> p_garbage_rects, p_garbage_colors; - log_app.info() << "WARMING UP " << "\n"; - - std::vector> 
field_estimate_input(rect_id_data_gpu.size()); - std::vector field_estimate_output(rect_id_data_gpu.size()); - std::vector> image_estimate_input(rect_val_data_gpu.size()); - std::vector image_estimate_output(rect_val_data_gpu.size()); - std::vector> subspace_input(colors.size()); - for (size_t i = 0; i < rect_id_data_gpu.size(); i++) { - field_estimate_input[i].location = rect_id_data_gpu[i].inst.get_location(); - field_estimate_input[i].space = rect_id_data_gpu[i].index_space; - } - for (size_t i = 0; i < rect_val_data_gpu.size(); i++) { - image_estimate_input[i].location = rect_val_data_gpu[i].inst.get_location(); - image_estimate_input[i].space = rect_val_data_gpu[i].index_space; - } - - is_rects.by_field_buffer_requirements(field_estimate_input, field_estimate_output); - std::vector byte_fields = {sizeof(char)}; - for (size_t i = 0; i < rect_id_data_gpu.size(); i++) { - IndexSpace<1> instance_index_space(Rect<1>(0, field_estimate_output[i].upper_bound-1)); - RegionInstance::create_instance(rect_id_data_gpu[i].scratch_buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); - } - - Event e001 = is_rects.create_subspaces_by_field(rect_id_data_gpu, - colors, - p_garbage_colors, - Realm::ProfilingRequestSet()); - if (wait_on_events) e001.wait(); - for (size_t i = 0; i < colors.size(); i++) { - subspace_input[i].space = p_garbage_colors[i]; - subspace_input[i].entries = p_garbage_colors[i].sparsity.impl()->get_entries().size(); - } - is_nodes.by_image_buffer_requirements(subspace_input, image_estimate_input, image_estimate_output); - for (size_t i = 0; i < rect_val_data_gpu.size(); i++) { - IndexSpace<1> instance_index_space(Rect<1>(0, (image_estimate_output[i].upper_bound)/12-1)); - RegionInstance::create_instance(rect_val_data_gpu[i].scratch_buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); - } - Event e002 = is_nodes.create_subspaces_by_image(rect_val_data_gpu, - 
p_garbage_colors, - p_garbage_rects, - Realm::ProfilingRequestSet(), - e001); - if(wait_on_events) e002.wait(); - - log_app.info() << "FINISHED WARMING UP " << "\n"; - log_app.info() << "starting GPU partitioning " << Clock::current_time_in_microseconds() << "\n"; - - log_app.info() << "STARTING GPU BY FIELD " << Clock::current_time_in_microseconds() << "\n"; - - Event e01 = is_rects.create_subspaces_by_field(rect_id_data_gpu, - colors, - p_colored_rects, - Realm::ProfilingRequestSet()); - if (wait_on_events) e01.wait(); - - log_app.info() << "FINISHED GPU BY FIELD " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "STARTING GPU BY IMAGE " << Clock::current_time_in_microseconds() << "\n"; - Event e02 = is_nodes.create_subspaces_by_image(rect_val_data_gpu, - p_colored_rects, - p_rects, - Realm::ProfilingRequestSet(), - e01); - if(wait_on_events) e02.wait(); - - log_app.info() << "FINISHED GPU BY IMAGE " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "STARTING CPU partitioning " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "STARTING CPU BY FIELD " << Clock::current_time_in_microseconds() << "\n"; - Event e1 = is_rects.create_subspaces_by_field(rect_id_field_data, - colors, - p_colored_rects_cpu, - Realm::ProfilingRequestSet()); - if (wait_on_events) e1.wait(); - log_app.info() << "FINISHED CPU BY FIELD " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "STARTING CPU BY IMAGE " << Clock::current_time_in_microseconds() << "\n"; - Event e2 = is_nodes.create_subspaces_by_image(rect_val_field_data, - p_colored_rects_cpu, - p_rects_cpu, - Realm::ProfilingRequestSet(), - e1); - if(wait_on_events) e2.wait(); - log_app.info() << "FINISHED CPU BY IMAGE " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "CPU Partitioning complete " << Clock::current_time_in_microseconds() << "\n"; - return e2; - } - - - - virtual int perform_dynamic_checks(void) - { - return 0; - } - - 
virtual int check_partitioning(void) - { - log_app.info() << "Checking correctness of partitioning " << "\n"; - int errors = 0; - - for (int i = 0; i < num_pieces; i++) { - for (IndexSpaceIterator<1> it(p_colored_rects[i]); it.valid; it.step()) { - for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { - if (!p_colored_rects_cpu[i].contains(point.p)) { - log_app.error() << "Mismatch! GPU has extra colored rect point " << point.p - << " on piece " << i << "\n"; - errors++; - } - } - } - for (IndexSpaceIterator<1> it(p_colored_rects_cpu[i]); it.valid; it.step()) { - for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { - if(!p_colored_rects[i].contains(point.p)) { - log_app.error() << "Mismatch! GPU is missing colored rect point " << point.p - << " on piece " << i << "\n"; - errors++; - } - } - } - for (IndexSpaceIterator<1> it(p_rects[i]); it.valid; it.step()) { - for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { - if (!p_rects_cpu[i].contains(point.p)) { - log_app.error() << "Mismatch! GPU has extra rect point " << point.p - << " on piece " << i << "\n"; - errors++; - } - } - } - for (IndexSpaceIterator<1> it(p_rects_cpu[i]); it.valid; it.step()) { - for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { - if(!p_rects[i].contains(point.p)) { - log_app.error() << "Mismatch! 
GPU is missing rect point " << point.p - << " on piece " << i << "\n"; - errors++; - } - } - } - } - return errors; - } -}; - -class Range2DTest : public TestInterface { -public: - // graph config parameters - int num_nodes = 1000; - int num_rects = 1000; - int max_rect_size = 10; - int num_pieces = 4; - - Range2DTest(int argc, const char *argv[]) - { - for(int i = 1; i < argc; i++) { - - if(!strcmp(argv[i], "-p")) { - num_pieces = atoi(argv[++i]); - continue; - } - - if(!strcmp(argv[i], "-n")) { - num_nodes = atoi(argv[++i]); - continue; - } - - if (!strcmp(argv[i], "-r")) { - num_rects = atoi(argv[++i]); - continue; - } - - if (!strcmp(argv[i], "-m")) { - max_rect_size = atoi(argv[++i]); - continue; - } - } - - if (num_nodes <= 0 || num_rects <= 0) { - log_app.error() << "Invalid graph dimensions in input file: rects=" << num_rects << " nodes=" << num_nodes; - exit(1); - } - - } - - - - struct InitDataArgs { - int index; - RegionInstance ri_nodes; - RegionInstance ri_rects; - }; - - enum PRNGStreams { - NODE_SUBGRAPH_STREAM, - }; - - void random_rect_data(int idx, int& subgraph) - { - if(random_colors) - subgraph = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_pieces); - else - subgraph = idx * num_pieces / num_rects; - } - - void random_node_data(int idx, int& subgraph) - { - if(true) - subgraph = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_pieces); - else - subgraph = idx * num_pieces / num_nodes; - } - - void initialize_rect_data(int idx, Rect<2> &rect, int max_rect_size = 10) - { - - int x = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_nodes); - int y = Philox_2x32<>::rand_int(random_seed, idx + 1, NODE_SUBGRAPH_STREAM, num_nodes); - int length = Philox_2x32<>::rand_int(random_seed, idx + 2, NODE_SUBGRAPH_STREAM, max_rect_size); - int height = Philox_2x32<>::rand_int(random_seed, idx + 3, NODE_SUBGRAPH_STREAM, max_rect_size); - rect.lo[0] = x; - rect.hi[0] = x + length; - rect.lo[1] = y; - 
rect.hi[1] = y + height; - } - - - static void init_data_task_wrapper(const void *args, size_t arglen, - const void *userdata, size_t userlen, Processor p) - { - Range2DTest *me = (Range2DTest *)testcfg; - me->init_data_task(args, arglen, p); - } - - void init_data_task(const void *args, size_t arglen, Processor p) - { - const InitDataArgs& i_args = *(const InitDataArgs *)args; - - log_app.info() << "init task #" << i_args.index << " (ri_nodes=" << i_args.ri_nodes << ", ri_rects=" << i_args.ri_rects << ")"; - - i_args.ri_nodes.fetch_metadata(p).wait(); - i_args.ri_rects.fetch_metadata(p).wait(); - - IndexSpace<2> is_nodes = i_args.ri_nodes.get_indexspace<2>(); - IndexSpace<1> is_rects = i_args.ri_rects.get_indexspace<1>(); - - log_app.debug() << "N: " << is_nodes; - log_app.debug() << "E: " << is_rects; - - { - AffineAccessor a_piece_id(i_args.ri_rects, 0 /* offset */); - - for(int i = is_rects.bounds.lo; i <= is_rects.bounds.hi; i++) { - int subgraph; - random_rect_data(i, subgraph); - a_piece_id.write(i, subgraph); - } - } - { - AffineAccessor a_piece_id(i_args.ri_nodes, 0 /* offset */); - - for(int i = is_nodes.bounds.lo[0]; i <= is_nodes.bounds.hi[0]; i++) { - for (int j = is_nodes.bounds.lo[1]; j <= is_nodes.bounds.hi[1]; j++) { - int idx = i * (is_nodes.bounds.hi[1] - is_nodes.bounds.lo[1] + 1) + j; - int subgraph; - random_node_data(idx, subgraph); - a_piece_id.write(Point<2>(i, j), subgraph); - } - } - } - - - { - - AffineAccessor, 1> a_rect(i_args.ri_rects, 1 * sizeof(int) /* offset */); - - // Read edges line by line - for(int i = is_rects.bounds.lo; i <= is_rects.bounds.hi; i++) { - Rect<2> rect; - initialize_rect_data(i, rect, max_rect_size); - a_rect.write(i, rect); - } - } - - if(show_graph) { - AffineAccessor a_piece_id(i_args.ri_nodes, 0 /* offset */); - - for(int i = is_nodes.bounds.lo[0]; i <= is_nodes.bounds.hi[1]; i++) { - for (int j = is_nodes.bounds.lo[1]; j <= is_nodes.bounds.hi[1]; j++) { - Point<2> p(i, j); - log_app.info() << "node_id[" << 
p << "] = " << a_piece_id.read(p) << "\n"; - } - } - - AffineAccessor a_rect_id(i_args.ri_rects, 0 * sizeof(Point<1>) /* offset */); - - for(int i = is_rects.bounds.lo; i <= is_rects.bounds.hi; i++) - log_app.info() << "rect_id[" << i << "] = " << a_rect_id.read(i) << "\n"; - - AffineAccessor,1> a_rect_val(i_args.ri_rects, 1 * sizeof(int) /* offset */); - - for(int i = is_rects.bounds.lo; i <= is_rects.bounds.hi; i++) - log_app.info() << "rect_val[" << i << "] = " << a_rect_val.read(i) << "\n"; - } - } - - IndexSpace<1> is_rects; - IndexSpace<2> is_nodes; - std::vector ri_nodes; - std::vector, int> > node_id_field_data; - std::vector ri_rects; - std::vector, int> > rect_id_field_data; - std::vector, Rect<2> > > rect_val_field_data; - - virtual void print_info(void) - { - printf("Realm dependent partitioning test - 2D ranges: %d nodes, %d rects, %d pieces\n", - (int)num_nodes, (int)num_rects, (int)num_pieces); - } - - virtual Event initialize_data(const std::vector& memories, - const std::vector& procs) - { - // now create index spaces for nodes and edges - is_nodes = Rect<2>(Point<2>(0, 0), Point<2>(num_nodes - 1, num_nodes - 1)); - is_rects = Rect<1>(0, num_rects - 1); - - // equal partition is used to do initial population of edges and nodes - std::vector > ss_nodes_eq; - std::vector > ss_rects_eq; - - log_app.info() << "Creating equal subspaces" << "\n"; - - is_nodes.create_equal_subspaces(num_pieces, 1, ss_nodes_eq, Realm::ProfilingRequestSet()).wait(); - is_rects.create_equal_subspaces(num_pieces, 1, ss_rects_eq, Realm::ProfilingRequestSet()).wait(); - - log_app.debug() << "Initial partitions:\n"; - for(size_t i = 0; i < ss_nodes_eq.size(); i++) - log_app.debug() << " Nodes #" << i << ": " << ss_nodes_eq[i]; - for(size_t i = 0; i < ss_rects_eq.size(); i++) - log_app.debug() << " Rects #" << i << ": " << ss_rects_eq[i]; - - // create instances for each of these subspaces - std::vector node_fields, rect_fields; - node_fields.push_back(sizeof(int)); // piece_id - 
rect_fields.push_back(sizeof(int)); // src_node - rect_fields.push_back(sizeof(Rect<2>)); // dst_node - - ri_nodes.resize(num_pieces); - node_id_field_data.resize(num_pieces); - - for(size_t i = 0; i < ss_nodes_eq.size(); i++) { - RegionInstance ri; - RegionInstance::create_instance(ri, - memories[i % memories.size()], - ss_nodes_eq[i], - node_fields, - 0 /*SOA*/, - Realm::ProfilingRequestSet()).wait(); - ri_nodes[i] = ri; - - node_id_field_data[i].index_space = ss_nodes_eq[i]; - node_id_field_data[i].inst = ri_nodes[i]; - node_id_field_data[i].field_offset = 0; - } - - ri_rects.resize(num_pieces); - rect_id_field_data.resize(num_pieces); - rect_val_field_data.resize(num_pieces); - - for(size_t i = 0; i < ss_rects_eq.size(); i++) { - RegionInstance ri; - RegionInstance::create_instance(ri, - memories[i % memories.size()], - ss_rects_eq[i], - rect_fields, - 0 /*SOA*/, - Realm::ProfilingRequestSet()).wait(); - ri_rects[i] = ri; - - rect_id_field_data[i].index_space = ss_rects_eq[i]; - rect_id_field_data[i].inst = ri_rects[i]; - rect_id_field_data[i].field_offset = 0; - - rect_val_field_data[i].index_space = ss_rects_eq[i]; - rect_val_field_data[i].inst = ri_rects[i]; - rect_val_field_data[i].field_offset = 1 * sizeof(int); - } - - // fire off tasks to initialize data - std::set events; - for(int i = 0; i < num_pieces; i++) { - Processor p = procs[i % procs.size()]; - InitDataArgs args; - args.index = i; - args.ri_nodes = ri_nodes[i]; - args.ri_rects = ri_rects[i]; - Event e = p.spawn(INIT_RANGE2D_DATA_TASK, &args, sizeof(args)); - events.insert(e); - } - - return Event::merge_events(events); - } - - // the outputs of our partitioning will be: - // is_private, is_shared - subsets of is_nodes based on private/shared - // p_rd, p_wr, p_ghost - subsets of the above split by subckt - // p_edges - subsets of is_edges for each subckt - - std::vector > p_colored_rects; - std::vector> p_rects, p_intersect, p_diff; - std::vector> p_colored_rects_cpu; - std::vector> 
p_rects_cpu, p_intersect_cpu, p_diff_cpu; - - IndexSpace<2> cpu_union, gpu_union, garbage_union; - - virtual Event perform_partitioning(void) - { - // first partition nodes by subckt id (this is the independent partition, - // but not actually used by the app) - - std::vector colors(num_pieces); - for(int i = 0; i < num_pieces; i++) - colors[i] = i; - - Memory gpu_memory; - bool found_gpu_memory = false; - Machine machine = Machine::get_machine(); - std::set all_memories; - machine.get_all_memories(all_memories); - for(auto& memory : all_memories) { - if(memory.kind() == Memory::GPU_FB_MEM) { - gpu_memory = memory; - found_gpu_memory = true; - break; - } - } - assert(found_gpu_memory); - std::vector rect_fields; - rect_fields.push_back(sizeof(int)); - rect_fields.push_back(sizeof(Rect<2>)); - std::vector node_fields; - node_fields.push_back(sizeof(int)); - - std::vector, int > > node_id_data_gpu; - std::vector, int > > rect_id_data_gpu; - std::vector, Rect<2>>> rect_val_data_gpu; - node_id_data_gpu.resize(num_pieces); - rect_id_data_gpu.resize(num_pieces); - rect_val_data_gpu.resize(num_pieces); - for (int i = 0; i < num_pieces; i++) { - RegionInstance node_id_instance; - RegionInstance rect_id_instance; - RegionInstance rect_val_instance; - RegionInstance::create_instance(node_id_instance, - gpu_memory, - node_id_field_data[i].index_space, - node_fields, - 0 /*SOA*/, - Realm::ProfilingRequestSet()).wait(); - RegionInstance::create_instance(rect_id_instance, - gpu_memory, - rect_id_field_data[i].index_space, - rect_fields, - 0 /*SOA*/, - Realm::ProfilingRequestSet()).wait(); - RegionInstance::create_instance(rect_val_instance, - gpu_memory, - rect_val_field_data[i].index_space, - rect_fields, - 0 /*SOA*/, - Realm::ProfilingRequestSet()).wait(); - CopySrcDstField node_id_gpu_field, node_id_cpu_field, rect_id_gpu_field, rect_id_cpu_field, rect_val_gpu_field, rect_val_cpu_field; - node_id_gpu_field.inst = node_id_instance; - node_id_gpu_field.size = sizeof(int); - 
node_id_gpu_field.field_id = 0; - node_id_cpu_field.inst = node_id_field_data[i].inst; - node_id_cpu_field.size = sizeof(int); - node_id_cpu_field.field_id = 0; - rect_id_gpu_field.inst = rect_id_instance; - rect_id_gpu_field.size = sizeof(int); - rect_id_gpu_field.field_id = 0; - rect_id_cpu_field.inst = rect_id_field_data[i].inst; - rect_id_cpu_field.size = sizeof(int); - rect_id_cpu_field.field_id = 0; - rect_val_gpu_field.inst = rect_val_instance; - rect_val_gpu_field.size = sizeof(Rect<2>); - rect_val_gpu_field.field_id = sizeof(int); - rect_val_cpu_field.inst = rect_val_field_data[i].inst; - rect_val_cpu_field.size = sizeof(Rect<2>); - rect_val_cpu_field.field_id = sizeof(int); - std::vector node_id_gpu_data, node_id_cpu_data, rect_id_gpu_data, rect_id_cpu_data, rect_val_gpu_data, rect_val_cpu_data; - node_id_gpu_data.push_back(node_id_gpu_field); - node_id_cpu_data.push_back(node_id_cpu_field); - rect_id_gpu_data.push_back(rect_id_gpu_field); - rect_id_cpu_data.push_back(rect_id_cpu_field); - rect_val_gpu_data.push_back(rect_val_gpu_field); - rect_val_cpu_data.push_back(rect_val_cpu_field); - Event copy_event = node_id_field_data[i].index_space.copy(node_id_cpu_data, node_id_gpu_data, Realm::ProfilingRequestSet()); - copy_event.wait(); - Event second_copy_event = rect_id_field_data[i].index_space.copy(rect_id_cpu_data, rect_id_gpu_data, Realm::ProfilingRequestSet()); - second_copy_event.wait(); - Event third_copy_event = rect_val_field_data[i].index_space.copy(rect_val_cpu_data, rect_val_gpu_data, Realm::ProfilingRequestSet()); - third_copy_event.wait(); - node_id_data_gpu[i].inst = node_id_instance; - node_id_data_gpu[i].index_space = node_id_field_data[i].index_space; - node_id_data_gpu[i].field_offset = 0; - rect_id_data_gpu[i].inst = rect_id_instance; - rect_id_data_gpu[i].index_space = rect_id_field_data[i].index_space; - rect_id_data_gpu[i].field_offset = 0; - rect_val_data_gpu[i].inst = rect_val_instance; - rect_val_data_gpu[i].index_space = 
rect_val_field_data[i].index_space; - rect_val_data_gpu[i].field_offset = sizeof(int); - } - wait_on_events = true; - std::vector> p_garbage_colors; - std::vector> p_garbage_rects; - log_app.info() << "WARMING UP " << "\n"; - - std::vector> field_estimate_input(rect_id_data_gpu.size()); - std::vector field_estimate_output(rect_id_data_gpu.size()); - std::vector> image_estimate_input(rect_val_data_gpu.size()); - std::vector image_estimate_output(rect_val_data_gpu.size()); - std::vector> subspace_input(colors.size()); - for (size_t i = 0; i < rect_id_data_gpu.size(); i++) { - field_estimate_input[i].location = rect_id_data_gpu[i].inst.get_location(); - field_estimate_input[i].space = rect_id_data_gpu[i].index_space; - } - for (size_t i = 0; i < rect_val_data_gpu.size(); i++) { - image_estimate_input[i].location = rect_val_data_gpu[i].inst.get_location(); - image_estimate_input[i].space = rect_val_data_gpu[i].index_space; - } - - is_rects.by_field_buffer_requirements(field_estimate_input, field_estimate_output); - std::vector byte_fields = {sizeof(char)}; - for (size_t i = 0; i < rect_id_data_gpu.size(); i++) { - IndexSpace<1> instance_index_space(Rect<1>(0, field_estimate_output[i].upper_bound-1)); - RegionInstance::create_instance(rect_id_data_gpu[i].scratch_buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); - } - - Event e001 = is_rects.create_subspaces_by_field(rect_id_data_gpu, - colors, - p_garbage_colors, - Realm::ProfilingRequestSet()); - if (wait_on_events) e001.wait(); - for (size_t i = 0; i < colors.size(); i++) { - subspace_input[i].space = p_garbage_colors[i]; - subspace_input[i].entries = p_garbage_colors[i].sparsity.impl()->get_entries().size(); - } - is_nodes.by_image_buffer_requirements(subspace_input, image_estimate_input, image_estimate_output); - for (size_t i = 0; i < rect_val_data_gpu.size(); i++) { - IndexSpace<1> instance_index_space(Rect<1>(0, (image_estimate_output[i].upper_bound*5)-1)); - 
RegionInstance::create_instance(rect_val_data_gpu[i].scratch_buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); - } - Event e002 = is_nodes.create_subspaces_by_image(rect_val_data_gpu, - p_garbage_colors, - p_garbage_rects, - Realm::ProfilingRequestSet(), - e001); - if(wait_on_events) e002.wait(); - - log_app.info() << "FINISHED WARMING UP " << "\n"; - log_app.info() << "starting GPU partitioning " << Clock::current_time_in_microseconds() << "\n"; - - log_app.info() << "STARTING GPU BY FIELD " << Clock::current_time_in_microseconds() << "\n"; - - Event e01 = is_rects.create_subspaces_by_field(rect_id_data_gpu, - colors, - p_colored_rects, - Realm::ProfilingRequestSet()); - if (wait_on_events) e01.wait(); - - log_app.info() << "FINISHED GPU BY FIELD " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "STARTING GPU BY IMAGE " << Clock::current_time_in_microseconds() << "\n"; - Event e02 = is_nodes.create_subspaces_by_image(rect_val_data_gpu, - p_colored_rects, - p_rects, - Realm::ProfilingRequestSet(), - e01); - if(wait_on_events) e02.wait(); - log_app.info() << "FINISHED GPU BY IMAGE " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "GPU Partitioning complete " << Clock::current_time_in_microseconds() << "\n"; - - log_app.info() << "STARTING CPU partitioning " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "STARTING CPU BY FIELD " << Clock::current_time_in_microseconds() << "\n"; - Event e1 = is_rects.create_subspaces_by_field(rect_id_field_data, - colors, - p_colored_rects_cpu, - Realm::ProfilingRequestSet()); - if (wait_on_events) e1.wait(); - log_app.info() << "FINISHED CPU BY FIELD " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "STARTING CPU BY IMAGE " << Clock::current_time_in_microseconds() << "\n"; - Event e2 = is_nodes.create_subspaces_by_image(rect_val_field_data, - p_colored_rects_cpu, - p_rects_cpu, - 
Realm::ProfilingRequestSet(), - e1); - if(wait_on_events) e2.wait(); - log_app.info() << "FINISHED CPU BY IMAGE " << Clock::current_time_in_microseconds() << "\n"; - log_app.info() << "CPU Partitioning complete " << Clock::current_time_in_microseconds() << "\n"; - return e2; - } - - - - virtual int perform_dynamic_checks(void) - { - return 0; - } - - virtual int check_partitioning(void) - { - log_app.info() << "Checking correctness of partitioning " << "\n"; - int errors = 0; - - for (int i = 0; i < num_pieces; i++) { - for (IndexSpaceIterator<1> it(p_colored_rects[i]); it.valid; it.step()) { - for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { - if(!p_colored_rects_cpu[i].contains(point.p)) { - log_app.error() << "Mismatch! GPU has extra colored rect point " << point.p - << " on piece " << i << "\n"; - errors++; - } - } - } - for (IndexSpaceIterator<1> it(p_colored_rects_cpu[i]); it.valid; it.step()) { - for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { - if (!p_colored_rects[i].contains(point.p)) { - log_app.error() << "Mismatch! GPU is missing colored rect point " << point.p - << " on piece " << i << "\n"; - errors++; - } - } - } - for (IndexSpaceIterator<2> it(p_rects[i]); it.valid; it.step()) { - for (PointInRectIterator<2> point(it.rect); point.valid; point.step()) { - if (!p_rects_cpu[i].contains(point.p)) { - log_app.error() << "Mismatch! GPU has extra rect point " << point.p - << " on piece " << i << "\n"; - errors++; - } - } - } - for (IndexSpaceIterator<2> it(p_rects_cpu[i]); it.valid; it.step()) { - for (PointInRectIterator<2> point(it.rect); point.valid; point.step()) { - if (!p_rects[i].contains(point.p)) { - log_app.error() << "Mismatch! 
GPU is missing rect point " << point.p - << " on piece " << i << "\n"; - errors++; - } - } - } - } - return errors; - } -}; - -class MiniAeroTest : public TestInterface { -public: - enum ProblemType - { - PTYPE_0, - PTYPE_1, - PTYPE_2, - }; - enum FaceType - { - BC_INTERIOR = 0, - BC_TANGENT = 1, - BC_EXTRAPOLATE = 2, - BC_INFLOW = 3, - BC_NOSLIP = 4, - BC_BLOCK_BORDER = 5, - BC_TOTAL = 6, - }; - - ProblemType problem_type = PTYPE_0; - int global_x = 4; - int global_y = 4; - int global_z = 4; - int blocks_x = 2; - int blocks_y = 2; - int blocks_z = 2; - - int n_cells; // total cell count - int n_blocks; // total block count - int n_faces; // total face count - std::vector xsplit, ysplit, zsplit; // cut planes - std::vector cells_per_block, faces_per_block; - - // can't do 64-bit index types right now, so at least get most of our 32-bit space - typedef int INDEXTYPE; - static const INDEXTYPE FIRST_INDEX = -2000000000; // easier to read than INT_MIN+1 - - MiniAeroTest(int argc, const char *argv[]) - { -#define INT_ARG(s, v) \ - if(!strcmp(argv[i], s)) { \ - v = atoi(argv[++i]); \ - continue; \ - } - for(int i = 1; i < argc; i++) { - if(!strcmp(argv[i], "-type")) { - problem_type = (ProblemType)atoi(argv[++i]); - continue; - } - INT_ARG("-gx", global_x); - INT_ARG("-gy", global_y); - INT_ARG("-gz", global_z); - INT_ARG("-bx", blocks_x); - INT_ARG("-by", blocks_y); - INT_ARG("-bz", blocks_z); - if(!strcmp(argv[i], "-g")) { - int v = atoi(argv[++i]); - global_x = global_y = global_z = v; - continue; - } - if(!strcmp(argv[i], "-b")) { - int v = atoi(argv[++i]); - blocks_x = blocks_y = blocks_z = v; - continue; - } - } -#undef INT_ARG - - // don't allow degenerate blocks - assert(global_x >= blocks_x); - assert(global_y >= blocks_y); - assert(global_z >= blocks_z); - - split_evenly(global_x, blocks_x, xsplit); - split_evenly(global_y, blocks_y, ysplit); - split_evenly(global_z, blocks_z, zsplit); - - n_blocks = blocks_x * blocks_y * blocks_z; - n_cells = 0; - n_faces = 0; 
- for(int bz = 0; bz < blocks_z; bz++) - for(int by = 0; by < blocks_y; by++) - for(int bx = 0; bx < blocks_x; bx++) { - int nx = xsplit[bx + 1] - xsplit[bx]; - int ny = ysplit[by + 1] - ysplit[by]; - int nz = zsplit[bz + 1] - zsplit[bz]; - - int c = nx * ny * nz; - int f = (((nx + 1) * ny * nz) + (nx * (ny + 1) * nz) + (nx * ny * (nz + 1))); - cells_per_block.push_back(c); - faces_per_block.push_back(f); - - n_cells += c; - n_faces += f; - } - assert(n_cells == global_x * global_y * global_z); - assert(n_faces == (((global_x + blocks_x) * global_y * global_z) + - (global_x * (global_y + blocks_y) * global_z) + - (global_x * global_y * (global_z + blocks_z)))); - } - - virtual void print_info(void) - { - printf("Realm dependent partitioning test - miniaero: %d x %d x %d cells, %d x %d x " - "%d blocks\n", - (int)global_x, (int)global_y, (int)global_z, (int)blocks_x, (int)blocks_y, - (int)blocks_z); - } - - IndexSpace<1> is_cells, is_faces; - std::vector ri_cells; - std::vector, int>> cell_blockid_field_data; - std::vector ri_faces; - std::vector, Point<1>>> face_left_field_data; - std::vector, Point<1>>> face_right_field_data; - std::vector, int>> face_type_field_data; - - struct InitDataArgs { - int index; - RegionInstance ri_cells, ri_faces; - }; - - virtual Event initialize_data(const std::vector &memories, - const std::vector &procs) - { - // top level index spaces - is_cells = Rect<1>(FIRST_INDEX, FIRST_INDEX + n_cells - 1); - is_faces = Rect<1>(FIRST_INDEX, FIRST_INDEX + n_faces - 1); - - // weighted partitions based on the distribution we already computed - std::vector> ss_cells_w; - std::vector> ss_faces_w; - - is_cells - .create_weighted_subspaces(n_blocks, 1, cells_per_block, ss_cells_w, - Realm::ProfilingRequestSet()) - .wait(); - is_faces - .create_weighted_subspaces(n_blocks, 1, faces_per_block, ss_faces_w, - Realm::ProfilingRequestSet()) - .wait(); - - log_app.debug() << "Initial partitions:"; - for(size_t i = 0; i < ss_cells_w.size(); i++) - 
log_app.debug() << " Cells #" << i << ": " << ss_cells_w[i]; - for(size_t i = 0; i < ss_faces_w.size(); i++) - log_app.debug() << " Faces #" << i << ": " << ss_faces_w[i]; - - // create instances for each of these subspaces - std::vector cell_fields, face_fields; - cell_fields.push_back(sizeof(int)); // blockid - assert(sizeof(int) == sizeof(Point<1>)); - face_fields.push_back(sizeof(Point<1>)); // left - face_fields.push_back(sizeof(Point<1>)); // right - face_fields.push_back(sizeof(int)); // type - - ri_cells.resize(n_blocks); - cell_blockid_field_data.resize(n_blocks); - - for(size_t i = 0; i < ss_cells_w.size(); i++) { - RegionInstance ri; - RegionInstance::create_instance(ri, memories[i % memories.size()], ss_cells_w[i], - cell_fields, 0 /*SOA*/, - Realm::ProfilingRequestSet()) - .wait(); - ri_cells[i] = ri; - - cell_blockid_field_data[i].index_space = ss_cells_w[i]; - cell_blockid_field_data[i].inst = ri_cells[i]; - cell_blockid_field_data[i].field_offset = 0; - } - - ri_faces.resize(n_blocks); - face_left_field_data.resize(n_blocks); - face_right_field_data.resize(n_blocks); - face_type_field_data.resize(n_blocks); - - for(size_t i = 0; i < ss_faces_w.size(); i++) { - RegionInstance ri; - RegionInstance::create_instance(ri, memories[i % memories.size()], ss_faces_w[i], - face_fields, 0 /*SOA*/, - Realm::ProfilingRequestSet()) - .wait(); - ri_faces[i] = ri; - - face_left_field_data[i].index_space = ss_faces_w[i]; - face_left_field_data[i].inst = ri_faces[i]; - face_left_field_data[i].field_offset = 0 * sizeof(Point<1>); - - face_right_field_data[i].index_space = ss_faces_w[i]; - face_right_field_data[i].inst = ri_faces[i]; - face_right_field_data[i].field_offset = 1 * sizeof(Point<1>); - - face_type_field_data[i].index_space = ss_faces_w[i]; - face_type_field_data[i].inst = ri_faces[i]; - face_type_field_data[i].field_offset = 2 * sizeof(Point<1>); - } - - // fire off tasks to initialize data - std::set events; - for(int i = 0; i < n_blocks; i++) { - 
Processor p = procs[i % memories.size()]; - InitDataArgs args; - args.index = i; - args.ri_cells = ri_cells[i]; - args.ri_faces = ri_faces[i]; - Event e = p.spawn(INIT_MINIAERO_DATA_TASK, &args, sizeof(args)); - events.insert(e); - } - - return Event::merge_events(events); - } - - static void init_data_task_wrapper(const void *args, size_t arglen, - const void *userdata, size_t userlen, Processor p) - { - MiniAeroTest *me = (MiniAeroTest *)testcfg; - me->init_data_task(args, arglen, p); - } - - Point<1> global_cell_pointer(int cx, int cy, int cz) - { - INDEXTYPE p = FIRST_INDEX; - - // out of range? return -1 - if((cx < 0) || (cx >= global_x) || (cy < 0) || (cy >= global_y) || (cz < 0) || - (cz >= global_z)) - return -1; - - // first chunks in z, then y, then x - int zi = find_split(zsplit, cz); - p += global_x * global_y * zsplit[zi]; - cz -= zsplit[zi]; - int local_z = zsplit[zi + 1] - zsplit[zi]; - - int yi = find_split(ysplit, cy); - p += global_x * ysplit[yi] * local_z; - cy -= ysplit[yi]; - int local_y = ysplit[yi + 1] - ysplit[yi]; - - int xi = find_split(xsplit, cx); - p += xsplit[xi] * local_y * local_z; - cx -= xsplit[xi]; - int local_x = xsplit[xi + 1] - xsplit[xi]; - - // now local addressing within this block - p += (cx + (cy * local_x) + (cz * local_x * local_y)); - return p; - } - - void init_data_task(const void *args, size_t arglen, Processor p) - { - const InitDataArgs &i_args = *(const InitDataArgs *)args; - - i_args.ri_cells.fetch_metadata(p).wait(); - i_args.ri_faces.fetch_metadata(p).wait(); - - log_app.info() << "init task #" << i_args.index << " (ri_cells=" << i_args.ri_cells - << ", ri_faces=" << i_args.ri_faces << ")"; - - IndexSpace<1> is_cells = i_args.ri_cells.get_indexspace<1>(); - IndexSpace<1> is_faces = i_args.ri_faces.get_indexspace<1>(); - - log_app.debug() << "C: " << is_cells; - log_app.debug() << "F: " << is_faces; - - int bx = i_args.index % blocks_x; - int by = (i_args.index / blocks_x) % blocks_y; - int bz = i_args.index / 
blocks_x / blocks_y; - - size_t nx = xsplit[bx + 1] - xsplit[bx]; - size_t ny = ysplit[by + 1] - ysplit[by]; - size_t nz = zsplit[bz + 1] - zsplit[bz]; - - size_t c = nx * ny * nz; - size_t f = (((nx + 1) * ny * nz) + (nx * (ny + 1) * nz) + (nx * ny * (nz + 1))); - assert(is_cells.bounds.volume() == c); - assert(is_faces.bounds.volume() == f); - - // cells are all assigned to the local block - { - AffineAccessor a_cell_blockid(i_args.ri_cells, 0 /* offset */); - - for(int cz = zsplit[bz]; cz < zsplit[bz + 1]; cz++) - for(int cy = ysplit[by]; cy < ysplit[by + 1]; cy++) - for(int cx = xsplit[bx]; cx < xsplit[bx + 1]; cx++) { - Point<1> pz = global_cell_pointer(cx, cy, cz); - assert(is_cells.bounds.contains(pz)); - - a_cell_blockid.write(pz, i_args.index); - } - } - - // faces aren't in any globally-visible order - { - AffineAccessor, 1> a_face_left(i_args.ri_faces, - 0 * sizeof(Point<1>) /* offset */); - AffineAccessor, 1> a_face_right(i_args.ri_faces, - 1 * sizeof(Point<1>) /* offset */); - AffineAccessor a_face_type(i_args.ri_faces, - 2 * sizeof(Point<1>) /* offset */); - - Point<1> pf = is_faces.bounds.lo; - - // -- type 0 | type 1 | type 2 - // -- ------ | ------ | ------ - // -- left extrapolate | inflow | inflow - // -- right extrapolate | extrapolate | extrapolate - // -- down tangent | noslip | tangent - // -- up tangent | extrapolate | tangent - // -- back tangent | tangent | tangent - // -- front tangent | tangent | tangent - - // left/right faces first - for(int fx = xsplit[bx]; fx <= xsplit[bx + 1]; fx++) { - int ftype = BC_INTERIOR; - bool reversed = false; - if(fx == xsplit[bx]) { - // low boundary - reversed = true; - if(fx == 0) - switch(problem_type) { - case PTYPE_0: - ftype = BC_EXTRAPOLATE; - break; - case PTYPE_1: - ftype = BC_INFLOW; - break; - case PTYPE_2: - ftype = BC_INFLOW; - break; - } - else - ftype = BC_BLOCK_BORDER; - } else if(fx == xsplit[bx + 1]) { - // high boundary - if(fx == global_x) - switch(problem_type) { - case PTYPE_0: - 
ftype = BC_EXTRAPOLATE; - break; - case PTYPE_1: - ftype = BC_EXTRAPOLATE; - break; - case PTYPE_2: - ftype = BC_EXTRAPOLATE; - break; - } - else - ftype = BC_BLOCK_BORDER; - } - - for(int cz = zsplit[bz]; cz < zsplit[bz + 1]; cz++) - for(int cy = ysplit[by]; cy < ysplit[by + 1]; cy++) { - a_face_left.write(pf, global_cell_pointer(fx - (reversed ? 0 : 1), cy, cz)); - a_face_right.write(pf, global_cell_pointer(fx - (reversed ? 1 : 0), cy, cz)); - a_face_type.write(pf, ftype); - pf[0]++; - } - } - - // down/up faces next - for(int fy = ysplit[by]; fy <= ysplit[by + 1]; fy++) { - int ftype = BC_INTERIOR; - bool reversed = false; - if(fy == ysplit[by]) { - // low boundary - reversed = true; - if(fy == 0) - switch(problem_type) { - case PTYPE_0: - ftype = BC_TANGENT; - break; - case PTYPE_1: - ftype = BC_NOSLIP; - break; - case PTYPE_2: - ftype = BC_TANGENT; - break; - } - else - ftype = BC_BLOCK_BORDER; - } else if(fy == ysplit[by + 1]) { - // high boundary - if(fy == global_y) - switch(problem_type) { - case PTYPE_0: - ftype = BC_TANGENT; - break; - case PTYPE_1: - ftype = BC_EXTRAPOLATE; - break; - case PTYPE_2: - ftype = BC_TANGENT; - break; - } - else - ftype = BC_BLOCK_BORDER; - } - - for(int cz = zsplit[bz]; cz < zsplit[bz + 1]; cz++) - for(int cx = xsplit[bx]; cx < xsplit[bx + 1]; cx++) { - a_face_left.write(pf, global_cell_pointer(cx, fy - (reversed ? 0 : 1), cz)); - a_face_right.write(pf, global_cell_pointer(cx, fy - (reversed ? 
1 : 0), cz)); - a_face_type.write(pf, ftype); - pf[0]++; - } - } - - // back/front faces last - for(int fz = zsplit[bz]; fz <= zsplit[bz + 1]; fz++) { - int ftype = BC_INTERIOR; - bool reversed = false; - if(fz == zsplit[bz]) { - // low boundary - reversed = true; - if(fz == 0) - switch(problem_type) { - case PTYPE_0: - ftype = BC_TANGENT; - break; - case PTYPE_1: - ftype = BC_TANGENT; - break; - case PTYPE_2: - ftype = BC_TANGENT; - break; - } - else - ftype = BC_BLOCK_BORDER; - } else if(fz == zsplit[bz + 1]) { - // high boundary - if(fz == global_z) - switch(problem_type) { - case PTYPE_0: - ftype = BC_TANGENT; - break; - case PTYPE_1: - ftype = BC_TANGENT; - break; - case PTYPE_2: - ftype = BC_TANGENT; - break; - } - else - ftype = BC_BLOCK_BORDER; - } - - for(int cy = ysplit[by]; cy < ysplit[by + 1]; cy++) - for(int cx = xsplit[bx]; cx < xsplit[bx + 1]; cx++) { - a_face_left.write(pf, global_cell_pointer(cx, cy, fz - (reversed ? 0 : 1))); - a_face_right.write(pf, global_cell_pointer(cx, cy, fz - (reversed ? 
1 : 0))); - a_face_type.write(pf, ftype); - pf[0]++; - } - } - - assert(pf[0] == is_faces.bounds.hi[0] + 1); - } - - if(show_graph) { - AffineAccessor a_cell_blockid(i_args.ri_cells, 0 /* offset */); - - for(int i = is_cells.bounds.lo[0]; i <= is_cells.bounds.hi[0]; i++) - std::cout << "Z[" << i << "]: blockid=" << a_cell_blockid.read(i) << "\n"; - - AffineAccessor, 1> a_face_left(i_args.ri_faces, - 0 * sizeof(Point<1>) /* offset */); - AffineAccessor, 1> a_face_right(i_args.ri_faces, - 1 * sizeof(Point<1>) /* offset */); - AffineAccessor a_face_type(i_args.ri_faces, - 2 * sizeof(Point<1>) /* offset */); - - for(int i = is_faces.bounds.lo[0]; i <= is_faces.bounds.hi[0]; i++) - std::cout << "S[" << i << "]:" - << " left=" << a_face_left.read(i) << " right=" << a_face_right.read(i) - << " type=" << a_face_type.read(i) << "\n"; - } - } - - // the outputs of our partitioning will be: - // p_cells - subsets of is_cells split by block - // p_faces - subsets of_is_faces split by block (based on left cell) - // p_facetypes[6] - subsets of p_faces split further by face type - // p_ghost - subsets of is_cells reachable by each block's boundary faces - - std::vector> p_cells; - std::vector> p_faces; - std::vector>> p_facetypes; - std::vector> p_ghost; - - virtual Event perform_partitioning(void) - { - // partition cells first - std::vector colors(n_blocks); - for(int i = 0; i < n_blocks; i++) - colors[i] = i; - - Event e1 = is_cells.create_subspaces_by_field(cell_blockid_field_data, colors, - p_cells, Realm::ProfilingRequestSet()); - if(wait_on_events) - e1.wait(); - - // now a preimage to get faces - Event e2 = is_faces.create_subspaces_by_preimage( - face_left_field_data, p_cells, p_faces, Realm::ProfilingRequestSet(), e1); - if(wait_on_events) - e2.wait(); - - // now split by face type - std::set evs; - std::vector ftcolors(BC_TOTAL); - for(int i = 0; i < BC_TOTAL; i++) - ftcolors[i] = i; - p_facetypes.resize(n_blocks); - std::vector> p_border_faces(n_blocks); - - for(int 
idx = 0; idx < n_blocks; idx++) { - Event e = p_faces[idx].create_subspaces_by_field(face_type_field_data, ftcolors, - p_facetypes[idx], - Realm::ProfilingRequestSet(), e2); - if(wait_on_events) - e.wait(); - evs.insert(e); - p_border_faces[idx] = p_facetypes[idx][BC_BLOCK_BORDER]; - } - Event e3 = Event::merge_events(evs); - - // finally, the image of just the boundary faces through the right face gets us - // ghost cells - Event e4 = is_cells.create_subspaces_by_image( - face_right_field_data, p_border_faces, p_ghost, Realm::ProfilingRequestSet(), e3); - if(wait_on_events) - e4.wait(); - - return e4; - } - - virtual int perform_dynamic_checks(void) - { - int errors = 0; - - std::vector> p_int_faces, p_border_faces; - for(int idx = 0; idx < n_blocks; idx++) { - p_int_faces.push_back(p_facetypes[idx][BC_INTERIOR]); - p_border_faces.push_back(p_facetypes[idx][BC_BLOCK_BORDER]); - } - // miniaero's checks are faster with image/diff on 1 thread, but slower on 4 -#ifdef MINIAERO_USE_IMAGE_DIFF - std::vector> p_l_test, p_ri_test, p_rb_test; - Event e4 = is_cells.create_subspaces_by_image_with_difference( - face_left_field_data, p_faces, p_cells, p_l_test, Realm::ProfilingRequestSet()); - Event e5 = is_cells.create_subspaces_by_image_with_difference( - face_right_field_data, p_int_faces, p_cells, p_ri_test, - Realm::ProfilingRequestSet()); - Event e6 = is_cells.create_subspaces_by_image_with_difference( - face_right_field_data, p_border_faces, p_ghost, p_rb_test, - Realm::ProfilingRequestSet()); -#else - std::vector> p_img_left, p_img_right_i, p_img_right_b; - Event e1 = is_cells.create_subspaces_by_image( - face_left_field_data, p_faces, p_img_left, Realm::ProfilingRequestSet()); - Event e2 = is_cells.create_subspaces_by_image( - face_right_field_data, p_int_faces, p_img_right_i, Realm::ProfilingRequestSet()); - Event e3 = - is_cells.create_subspaces_by_image(face_right_field_data, p_border_faces, - p_img_right_b, Realm::ProfilingRequestSet()); - std::vector> p_l_test, 
p_ri_test, p_rb_test; - Event e4 = IndexSpace<1>::compute_differences(p_img_left, p_cells, p_l_test, - Realm::ProfilingRequestSet(), e1); - for(unsigned idx = 0; idx < p_img_left.size(); idx++) { - p_img_left[idx].destroy(e4); - } - Event e5 = IndexSpace<1>::compute_differences(p_img_right_i, p_cells, p_ri_test, - Realm::ProfilingRequestSet(), e2); - for(unsigned idx = 0; idx < p_img_right_i.size(); idx++) { - p_img_right_i[idx].destroy(e5); - } - Event e6 = IndexSpace<1>::compute_differences(p_img_right_b, p_ghost, p_rb_test, - Realm::ProfilingRequestSet(), e3); - for(unsigned idx = 0; idx < p_img_right_b.size(); idx++) { - p_img_right_b[idx].destroy(e6); - } -#endif - errors += check_empty(e4, p_l_test, "p_l_test"); - errors += check_empty(e5, p_ri_test, "p_ri_test"); - errors += check_empty(e6, p_rb_test, "p_rb_test"); - for(unsigned idx = 0; idx < p_l_test.size(); idx++) { - p_l_test[idx].destroy(e4); - } - for(unsigned idx = 0; idx < p_ri_test.size(); idx++) { - p_ri_test[idx].destroy(e5); - } - for(unsigned idx = 0; idx < p_rb_test.size(); idx++) { - p_rb_test[idx].destroy(e6); - } - - return errors; - } - - virtual int check_partitioning(void) - { - int errors = 0; - - Point<1> pc = is_cells.bounds.lo; - Point<1> pf = is_faces.bounds.lo; - - for(int blkid = 0; blkid < n_blocks; blkid++) { - int bx = blkid % blocks_x; - int by = (blkid / blocks_x) % blocks_y; - int bz = blkid / blocks_x / blocks_y; - - int nx = xsplit[bx + 1] - xsplit[bx]; - int ny = ysplit[by + 1] - ysplit[by]; - int nz = zsplit[bz + 1] - zsplit[bz]; - - // check cells - for(int i = 0; i < cells_per_block[blkid]; i++) { - for(int j = 0; j < n_blocks; j++) { - bool exp = (j == blkid); - bool act = p_cells[j].contains(pc); - if(exp != act) { - log_app.error() << "mismatch: cell " << pc << " in p_cells[" << j - << "]: exp=" << exp << " act=" << act; - errors++; - } - } - - std::set exp_ghosts; - int cx = i % nx; - int cy = (i / nx) % ny; - int cz = i / nx / ny; - if((cx == 0) && (bx > 0)) - 
exp_ghosts.insert(blkid - 1); - if((cx == (nx - 1)) && (bx < (blocks_x - 1))) - exp_ghosts.insert(blkid + 1); - if((cy == 0) && (by > 0)) - exp_ghosts.insert(blkid - blocks_x); - if((cy == (ny - 1)) && (by < (blocks_y - 1))) - exp_ghosts.insert(blkid + blocks_x); - if((cz == 0) && (bz > 0)) - exp_ghosts.insert(blkid - blocks_x * blocks_y); - if((cz == (nz - 1)) && (bz < (blocks_z - 1))) - exp_ghosts.insert(blkid + blocks_x * blocks_y); - - for(int j = 0; j < n_blocks; j++) { - bool exp = exp_ghosts.count(j) > 0; - bool act = p_ghost[j].contains(pc); - if(exp != act) { - log_app.error() << "mismatch: cell " << pc << " in p_ghost[" << j - << "]: exp=" << exp << " act=" << act; - errors++; - } - } - - pc[0]++; - } - - // check faces - for(int i = 0; i < faces_per_block[blkid]; i++) { - for(int j = 0; j < n_blocks; j++) { - bool exp = (j == blkid); - bool act = p_faces[j].contains(pf); - if(exp != act) { - log_app.error() << "mismatch: face " << pf << " in p_faces[" << j - << "]: exp=" << exp << " act=" << act; - errors++; - } - FaceType exptype = BC_INTERIOR; - // luckily the faces on the edge of a block come in chunks - int lr_faces = (nx + 1) * ny * nz; - int du_faces = nx * (ny + 1) * nz; - int bf_faces = nx * ny * (nz + 1); - assert((lr_faces + du_faces + bf_faces) == faces_per_block[blkid]); - if(i < lr_faces) { - int x = i / ny / nz; - if(x == 0) - exptype = ((bx == 0) ? ((problem_type == PTYPE_0) ? BC_EXTRAPOLATE - : (problem_type == PTYPE_1) ? BC_INFLOW - : BC_INFLOW) - : BC_BLOCK_BORDER); - if(x == nx) - exptype = - ((bx == blocks_x - 1) ? ((problem_type == PTYPE_0) ? BC_EXTRAPOLATE - : (problem_type == PTYPE_1) ? BC_EXTRAPOLATE - : BC_EXTRAPOLATE) - : BC_BLOCK_BORDER); - } else if(i < (lr_faces + du_faces)) { - int y = (i - lr_faces) / nx / nz; - if(y == 0) - exptype = ((by == 0) ? ((problem_type == PTYPE_0) ? BC_TANGENT - : (problem_type == PTYPE_1) ? BC_NOSLIP - : BC_TANGENT) - : BC_BLOCK_BORDER); - if(y == ny) - exptype = - ((by == blocks_y - 1) ? 
((problem_type == PTYPE_0) ? BC_TANGENT - : (problem_type == PTYPE_1) ? BC_EXTRAPOLATE - : BC_TANGENT) - : BC_BLOCK_BORDER); - } else { - int z = (i - lr_faces - du_faces) / nx / ny; - if(z == 0) - exptype = ((bz == 0) ? ((problem_type == PTYPE_0) ? BC_TANGENT - : (problem_type == PTYPE_1) ? BC_TANGENT - : BC_TANGENT) - : BC_BLOCK_BORDER); - if(z == nz) - exptype = ((bz == blocks_z - 1) ? ((problem_type == PTYPE_0) ? BC_TANGENT - : (problem_type == PTYPE_1) ? BC_TANGENT - : BC_TANGENT) - : BC_BLOCK_BORDER); - } - - for(int k = 0; k < BC_TOTAL; k++) { - bool exp = (j == blkid) && (k == exptype); - bool act = p_facetypes[j][k].contains(pf); - if(exp != act) { - log_app.error() << "mismatch: face " << pf << " in p_facetypes[" << j - << "][" << k << "]: exp=" << exp << " act=" << act; - errors++; - } - } - } - pf[0]++; - } - } - for(unsigned idx = 0; idx < p_cells.size(); idx++) { - p_cells[idx].destroy(); - } - for(unsigned idx = 0; idx < p_faces.size(); idx++) { - p_faces[idx].destroy(); - } - for(unsigned i = 0; i < p_facetypes.size(); i++) { - for(unsigned j = 0; j < p_facetypes[i].size(); j++) { - p_facetypes[i][j].destroy(); - } - } - for(unsigned idx = 0; idx < p_ghost.size(); idx++) { - p_ghost[idx].destroy(); - } - - return errors; - } -}; - -class CircuitTest : public TestInterface { -public: - // graph config parameters - int num_nodes = 100; - int num_edges = 10; - int num_pieces = 2; - int pct_wire_in_piece = 50; - - CircuitTest(int argc, const char *argv[]) - { - for(int i = 1; i < argc; i++) { - if(!strcmp(argv[i], "-n")) { - num_nodes = atoi(argv[++i]); - continue; - } - - if(!strcmp(argv[i], "-e")) { - num_edges = atoi(argv[++i]); - continue; - } - - if(!strcmp(argv[i], "-p")) { - num_pieces = atoi(argv[++i]); - continue; - } - } - } - - struct InitDataArgs { - int index; - RegionInstance ri_nodes, ri_edges; - }; - - enum PRNGStreams - { - NODE_SUBCKT_STREAM, - EDGE_IN_NODE_STREAM, - EDGE_OUT_NODE_STREAM1, - EDGE_OUT_NODE_STREAM2, - }; - - // nodes and 
edges are generated pseudo-randomly so that we can check the results - // without - // needing all the field data in any one place - void random_node_data(int idx, int &subckt) - { - if(random_colors) - subckt = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBCKT_STREAM, num_pieces); - else - subckt = idx * num_pieces / num_nodes; - } - - void random_edge_data(int idx, Point<1> &in_node, Point<1> &out_node) - { - if(random_colors) { - in_node = Philox_2x32<>::rand_int(random_seed, idx, EDGE_IN_NODE_STREAM, num_nodes); - out_node = - Philox_2x32<>::rand_int(random_seed, idx, EDGE_OUT_NODE_STREAM1, num_nodes); - } else { - int subckt = idx * num_pieces / num_edges; - int n_lo = subckt * num_nodes / num_pieces; - int n_hi = (subckt + 1) * num_nodes / num_pieces; - in_node = n_lo + Philox_2x32<>::rand_int(random_seed, idx, EDGE_IN_NODE_STREAM, - n_hi - n_lo); - int pct = Philox_2x32<>::rand_int(random_seed, idx, EDGE_OUT_NODE_STREAM2, 100); - if(pct < pct_wire_in_piece) - out_node = n_lo + Philox_2x32<>::rand_int(random_seed, idx, EDGE_OUT_NODE_STREAM1, - n_hi - n_lo); - else - out_node = - Philox_2x32<>::rand_int(random_seed, idx, EDGE_OUT_NODE_STREAM1, num_nodes); - } - } - - static void init_data_task_wrapper(const void *args, size_t arglen, - const void *userdata, size_t userlen, Processor p) - { - CircuitTest *me = (CircuitTest *)testcfg; - me->init_data_task(args, arglen, p); - } - - void init_data_task(const void *args, size_t arglen, Processor p) - { - const InitDataArgs &i_args = *(const InitDataArgs *)args; - - log_app.info() << "init task #" << i_args.index << " (ri_nodes=" << i_args.ri_nodes - << ", ri_edges=" << i_args.ri_edges << ")"; - - i_args.ri_nodes.fetch_metadata(p).wait(); - i_args.ri_edges.fetch_metadata(p).wait(); - - IndexSpace<1> is_nodes = i_args.ri_nodes.get_indexspace<1>(); - IndexSpace<1> is_edges = i_args.ri_edges.get_indexspace<1>(); - - log_app.debug() << "N: " << is_nodes; - log_app.debug() << "E: " << is_edges; - - { - AffineAccessor 
a_subckt_id(i_args.ri_nodes, 0 /* offset */); - - for(int i = is_nodes.bounds.lo; i <= is_nodes.bounds.hi; i++) { - int subckt; - random_node_data(i, subckt); - a_subckt_id.write(i, subckt); - } - } - - { - AffineAccessor, 1> a_in_node(i_args.ri_edges, - 0 * sizeof(Point<1>) /* offset */); - AffineAccessor, 1> a_out_node(i_args.ri_edges, - 1 * sizeof(Point<1>) /* offset */); - - for(int i = is_edges.bounds.lo; i <= is_edges.bounds.hi; i++) { - Point<1> in_node, out_node; - random_edge_data(i, in_node, out_node); - a_in_node.write(i, in_node); - a_out_node.write(i, out_node); - } - } - - if(show_graph) { - AffineAccessor a_subckt_id(i_args.ri_nodes, 0 /* offset */); - - for(int i = is_nodes.bounds.lo; i <= is_nodes.bounds.hi; i++) - std::cout << "subckt_id[" << i << "] = " << a_subckt_id.read(i) << "\n"; - - AffineAccessor, 1> a_in_node(i_args.ri_edges, - 0 * sizeof(Point<1>) /* offset */); - - for(int i = is_edges.bounds.lo; i <= is_edges.bounds.hi; i++) - std::cout << "in_node[" << i << "] = " << a_in_node.read(i) << "\n"; - - AffineAccessor, 1> a_out_node(i_args.ri_edges, - 1 * sizeof(Point<1>) /* offset */); - - for(int i = is_edges.bounds.lo; i <= is_edges.bounds.hi; i++) - std::cout << "out_node[" << i << "] = " << a_out_node.read(i) << "\n"; - } - } - - IndexSpace<1> is_nodes, is_edges; - std::vector ri_nodes; - std::vector, int>> subckt_field_data; - std::vector ri_edges; - std::vector, Point<1>>> in_node_field_data; - std::vector, Point<1>>> out_node_field_data; - - virtual void print_info(void) - { - printf("Realm dependent partitioning test - circuit: %d nodes, %d edges, %d pieces\n", - (int)num_nodes, (int)num_edges, (int)num_pieces); - } - - virtual Event initialize_data(const std::vector &memories, - const std::vector &procs) - { - // now create index spaces for nodes and edges - is_nodes = Rect<1>(0, num_nodes - 1); - is_edges = Rect<1>(0, num_edges - 1); - - // equal partition is used to do initial population of edges and nodes - std::vector> 
ss_nodes_eq; - std::vector> ss_edges_eq; - - is_nodes - .create_equal_subspaces(num_pieces, 1, ss_nodes_eq, Realm::ProfilingRequestSet()) - .wait(); - is_edges - .create_equal_subspaces(num_pieces, 1, ss_edges_eq, Realm::ProfilingRequestSet()) - .wait(); - - log_app.debug() << "Initial partitions:"; - for(size_t i = 0; i < ss_nodes_eq.size(); i++) - log_app.debug() << " Nodes #" << i << ": " << ss_nodes_eq[i]; - for(size_t i = 0; i < ss_edges_eq.size(); i++) - log_app.debug() << " Edges #" << i << ": " << ss_edges_eq[i]; - - // create instances for each of these subspaces - std::vector node_fields, edge_fields; - node_fields.push_back(sizeof(int)); // subckt_id - assert(sizeof(int) == sizeof(Point<1>)); - edge_fields.push_back(sizeof(Point<1>)); // in_node - edge_fields.push_back(sizeof(Point<1>)); // out_node - - ri_nodes.resize(num_pieces); - subckt_field_data.resize(num_pieces); - - for(size_t i = 0; i < ss_nodes_eq.size(); i++) { - RegionInstance ri; - RegionInstance::create_instance(ri, memories[i % memories.size()], ss_nodes_eq[i], - node_fields, 0 /*SOA*/, - Realm::ProfilingRequestSet()) - .wait(); - ri_nodes[i] = ri; - - subckt_field_data[i].index_space = ss_nodes_eq[i]; - subckt_field_data[i].inst = ri_nodes[i]; - subckt_field_data[i].field_offset = 0; - } - - ri_edges.resize(num_pieces); - in_node_field_data.resize(num_pieces); - out_node_field_data.resize(num_pieces); - - for(size_t i = 0; i < ss_edges_eq.size(); i++) { - RegionInstance ri; - RegionInstance::create_instance(ri, memories[i % memories.size()], ss_edges_eq[i], - edge_fields, 0 /*SOA*/, - Realm::ProfilingRequestSet()) - .wait(); - ri_edges[i] = ri; - - in_node_field_data[i].index_space = ss_edges_eq[i]; - in_node_field_data[i].inst = ri_edges[i]; - in_node_field_data[i].field_offset = 0 * sizeof(Point<1>); - - out_node_field_data[i].index_space = ss_edges_eq[i]; - out_node_field_data[i].inst = ri_edges[i]; - out_node_field_data[i].field_offset = 1 * sizeof(Point<1>); - } - - // fire off 
tasks to initialize data - std::set events; - for(int i = 0; i < num_pieces; i++) { - Processor p = procs[i % memories.size()]; - InitDataArgs args; - args.index = i; - args.ri_nodes = ri_nodes[i]; - args.ri_edges = ri_edges[i]; - Event e = p.spawn(INIT_CIRCUIT_DATA_TASK, &args, sizeof(args)); - events.insert(e); - } - - return Event::merge_events(events); - } - - // the outputs of our partitioning will be: - // is_private, is_shared - subsets of is_nodes based on private/shared - // p_pvt, p_shr, p_ghost - subsets of the above split by subckt - // p_edges - subsets of is_edges for each subckt - - IndexSpace<1> is_shared, is_private; - std::vector> p_pvt, p_shr, p_ghost; - std::vector> p_edges; - - virtual Event perform_partitioning(void) - { - // first partition nodes by subckt id (this is the independent partition, - // but not actually used by the app) - std::vector> p_nodes; - - std::vector colors(num_pieces); - for(int i = 0; i < num_pieces; i++) - colors[i] = i; - - Event e1 = is_nodes.create_subspaces_by_field(subckt_field_data, colors, p_nodes, - Realm::ProfilingRequestSet()); - if(wait_on_events) - e1.wait(); - - // now compute p_edges based on the color of their in_node (i.e. 
a preimage) - Event e2 = is_edges.create_subspaces_by_preimage(in_node_field_data, p_nodes, p_edges, - Realm::ProfilingRequestSet(), e1); - if(wait_on_events) - e2.wait(); - - // an image of p_edges through out_node gives us all the shared nodes, along - // with some private nodes -#ifdef USE_IMAGE_DIFF - Event e4 = is_nodes.create_subspaces_by_image_with_difference( - out_node_field_data, p_edges, p_nodes, p_ghost, Realm::ProfilingRequestSet(), e2); - if(wait_on_events) - e4.wait(); -#else - std::vector> p_extra_nodes; - - Event e3 = is_nodes.create_subspaces_by_image( - out_node_field_data, p_edges, p_extra_nodes, Realm::ProfilingRequestSet(), e2); - if(wait_on_events) - e3.wait(); - - // subtracting out those private nodes gives us p_ghost - Event e4 = IndexSpace<1>::compute_differences(p_extra_nodes, p_nodes, p_ghost, - Realm::ProfilingRequestSet(), e3); - if(wait_on_events) - e4.wait(); -#endif - - // the union of everybody's ghost nodes is is_shared - Event e5 = IndexSpace<1>::compute_union(p_ghost, is_shared, - Realm::ProfilingRequestSet(), e4); - if(wait_on_events) - e5.wait(); - - // and is_private is just the nodes of is_nodes that aren't in is_shared - Event e6 = IndexSpace<1>::compute_difference(is_nodes, is_shared, is_private, - Realm::ProfilingRequestSet(), e5); - if(wait_on_events) - e6.wait(); - - // the intersection of the original p_nodes with is_shared gives us p_shr - // (note that we can do this in parallel with the computation of is_private) - Event e7 = IndexSpace<1>::compute_intersections(p_nodes, is_shared, p_shr, - Realm::ProfilingRequestSet(), e5); - if(wait_on_events) - e7.wait(); - - // and finally, the intersection of p_nodes with is_private gives us p_pvt - Event e8 = IndexSpace<1>::compute_intersections(p_nodes, is_private, p_pvt, - Realm::ProfilingRequestSet(), e6); - if(wait_on_events) - e8.wait(); - - // all done - wait on e7 and e8, which dominate every other operation - Event e9 = Event::merge_events(e7, e8); - - for(unsigned 
idx = 0; idx < p_nodes.size(); idx++) { - p_nodes[idx].destroy(e9); - } - - return e9; - } - - virtual int perform_dynamic_checks(void) - { - int errors = 0; - // compute the intermediates for the checks - these duplicate things we - // already have, but we're not supposed to know that here - std::vector> p_pvt_and_shr, p_all; - Event e1 = IndexSpace<1>::compute_unions( - p_pvt, p_shr, p_pvt_and_shr, Realm::ProfilingRequestSet(), Event::NO_EVENT); - Event e2 = IndexSpace<1>::compute_unions(p_pvt_and_shr, p_ghost, p_all, - Realm::ProfilingRequestSet(), e1); -#ifdef USE_IMAGE_DIFF - std::vector> p_in_test, p_out_test; - Event e5 = is_nodes.create_subspaces_by_image_with_difference( - in_node_field_data, p_edges, p_pvt_and_shr, p_in_test, - Realm::ProfilingRequestSet(), e1); - Event e6 = is_nodes.create_subspaces_by_image_with_difference( - out_node_field_data, p_edges, p_all, p_out_test, Realm::ProfilingRequestSet(), - e2); -#else - std::vector> p_in_img, p_out_img; - Event e3 = - is_nodes.create_subspaces_by_image(in_node_field_data, p_edges, p_in_img, - Realm::ProfilingRequestSet(), Event::NO_EVENT); - Event e4 = - is_nodes.create_subspaces_by_image(out_node_field_data, p_edges, p_out_img, - Realm::ProfilingRequestSet(), Event::NO_EVENT); - std::vector> p_in_test, p_out_test; - Event e5 = IndexSpace<1>::compute_differences(p_in_img, p_pvt_and_shr, p_in_test, - Realm::ProfilingRequestSet(), - Event::merge_events(e1, e3)); - Event e6 = IndexSpace<1>::compute_differences(p_out_img, p_all, p_out_test, - Realm::ProfilingRequestSet(), - Event::merge_events(e2, e4)); - for(unsigned idx = 0; idx < p_in_img.size(); idx++) { - p_in_img[idx].destroy(e5); - } - for(unsigned idx = 0; idx < p_out_img.size(); idx++) { - p_out_img[idx].destroy(e6); - } -#endif - - errors += check_empty(e5, p_in_test, "p_in_test"); - errors += check_empty(e6, p_out_test, "p_out_test"); - for(unsigned idx = 0; idx < p_pvt_and_shr.size(); idx++) { - p_pvt_and_shr[idx].destroy(e5); - } - for(unsigned 
idx = 0; idx < p_all.size(); idx++) { - p_all[idx].destroy(e6); - } - for(unsigned idx = 0; idx < p_in_test.size(); idx++) { - p_in_test[idx].destroy(e5); - } - for(unsigned idx = 0; idx < p_out_test.size(); idx++) { - p_out_test[idx].destroy(e6); - } - - return errors; - } - - virtual int check_partitioning(void) - { - int errors = 0; - - // we'll make up the list of nodes we expect to be shared as we walk the edges - std::map> ghost_nodes; - -#ifdef DUMP_OUTPUT_SPACES - dump_sparse_index_space<1, int>("is_private", is_private); - dump_sparse_index_space<1, int>("is_shared", is_shared); - - for(int p = 0; p < num_pieces; p++) { - std::cout << "Piece #" << p << "\n"; - dump_sparse_index_space<1, int>("p_pvt", p_pvt[p]); - dump_sparse_index_space<1, int>("p_shr", p_shr[p]); - dump_sparse_index_space<1, int>("p_ghost", p_ghost[p]); - } -#endif - - for(int i = 0; i < num_edges; i++) { - // regenerate the random info for this edge and the two nodes it touches - Point<1> in_node, out_node; - int in_subckt, out_subckt; - random_edge_data(i, in_node, out_node); - random_node_data(in_node, in_subckt); - random_node_data(out_node, out_subckt); - - // the edge should be in exactly the p_edges for in_subckt - for(int p = 0; p < num_pieces; p++) { - bool exp = (p == in_subckt); - bool act = p_edges[p].contains(i); - if(exp != act) { - log_app.error() << "mismatch: edge " << i << " in p_edges[" << p - << "]: exp=" << exp << " act=" << act; - errors++; - } - } - - // is the output node a ghost for this wire? 
- if(in_subckt != out_subckt) - ghost_nodes[out_node].insert(in_subckt); - } - - // now we can check the nodes - for(int i = 0; i < num_nodes; i++) { - int subckt; - random_node_data(i, subckt); - // check is_private and is_shared first - { - bool exp = ghost_nodes.count(i) == 0; - bool act = is_private.contains(i); - if(exp != act) { - log_app.error() << "mismatch: node " << i << " in is_private: exp=" << exp - << " act=" << act; - errors++; - } - } - { - bool exp = ghost_nodes.count(i) > 0; - bool act = is_shared.contains(i); - if(exp != act) { - log_app.error() << "mismatch: node " << i << " in is_shared: exp=" << exp - << " act=" << act; - errors++; - } - } - - // now check p_pvt/shr/ghost - for(int p = 0; p < num_pieces; p++) { - bool exp = (subckt == p) && (ghost_nodes.count(i) == 0); - bool act = p_pvt[p].contains(i); - if(exp != act) { - log_app.error() << "mismatch: node " << i << " in p_pvt[" << p - << "]: exp=" << exp << " act=" << act; - errors++; - } - } - for(int p = 0; p < num_pieces; p++) { - bool exp = (subckt == p) && (ghost_nodes.count(i) > 0); - bool act = p_shr[p].contains(i); - if(exp != act) { - log_app.error() << "mismatch: node " << i << " in p_shr[" << p - << "]: exp=" << exp << " act=" << act; - errors++; - } - } - for(int p = 0; p < num_pieces; p++) { - bool exp = - (subckt != p) && (ghost_nodes.count(i) > 0) && (ghost_nodes[i].count(p) > 0); - bool act = p_ghost[p].contains(i); - if(exp != act) { - log_app.error() << "mismatch: node " << i << " in p_ghost[" << p - << "]: exp=" << exp << " act=" << act; - errors++; - } - } - } - - is_shared.destroy(); - is_private.destroy(); - for(unsigned idx = 0; idx < p_pvt.size(); idx++) { - p_pvt[idx].destroy(); - } - for(unsigned idx = 0; idx < p_shr.size(); idx++) { - p_shr[idx].destroy(); - } - for(unsigned idx = 0; idx < p_ghost.size(); idx++) { - p_ghost[idx].destroy(); - } - for(unsigned idx = 0; idx < p_edges.size(); idx++) { - p_edges[idx].destroy(); - } - - return errors; - } -}; - -class 
PennantTest : public TestInterface { -public: -public: - // graph config parameters - enum MeshType - { - RectangularMesh, - }; - MeshType mesh_type = RectangularMesh; - int nzx = 10; // number of zones in x - int nzy = 10; // number of zones in y - int numpcx = 2; // number of submeshes in x - int numpcy = 2; // number of submeshes in y - - int npx, npy; // number of points in each dimension - int nz, ns, np, numpc; // total number of zones, sides, points, and pieces - std::vector zxbound, zybound; // x and y split points between submeshes - std::vector lz, ls, lp; // number of zones, sides, and points in each submesh - - // can't do 64-bit index types right now, so at least get most of our 32-bit space - typedef int INDEXTYPE; - static const INDEXTYPE FIRST_INDEX = -2000000000; // easier to read than INT_MIN+1 - - PennantTest(int argc, const char *argv[]) - { -#define INT_ARG(s, v) \ - if(!strcmp(argv[i], s)) { \ - v = atoi(argv[++i]); \ - continue; \ - } - for(int i = 1; i < argc; i++) { - INT_ARG("-nzx", nzx) - INT_ARG("-nzy", nzy) - INT_ARG("-numpcx", numpcx) - INT_ARG("-numpcy", numpcy) - if(!strcmp(argv[i], "-nz")) { - int v = atoi(argv[++i]); - nzx = nzy = v; - continue; - } - if(!strcmp(argv[i], "-numpc")) { - int v = atoi(argv[++i]); - numpcx = numpcy = v; - continue; - } - } -#undef INT_ARG - - switch(mesh_type) { - case RectangularMesh: - { - npx = nzx + 1; - npy = nzy + 1; - numpc = numpcx * numpcy; - - zxbound.resize(numpcx + 1); - for(int i = 0; i <= numpcx; i++) - zxbound[i] = (i * nzx) / numpcx; - - zybound.resize(numpcy + 1); - for(int i = 0; i <= numpcy; i++) - zybound[i] = (i * nzy) / numpcy; - - nz = ns = np = 0; - for(int pcy = 0; pcy < numpcy; pcy++) { - for(int pcx = 0; pcx < numpcx; pcx++) { - int lx = zxbound[pcx + 1] - zxbound[pcx]; - int ly = zybound[pcy + 1] - zybound[pcy]; - - int zones = lx * ly; - int sides = zones * 4; - // points are a little funny - shared edges go to the lower numbered piece - int points = ((pcx == 0) ? 
(lx + 1) : lx) * ((pcy == 0) ? (ly + 1) : ly); - - lz.push_back(zones); - ls.push_back(sides); - lp.push_back(points); - nz += zones; - ns += sides; - np += points; - } - } - - assert(nz == (nzx * nzy)); - assert(ns == (4 * nzx * nzy)); - assert(np == (npx * npy)); - - break; - } - } - } - - virtual void print_info(void) - { - printf("Realm dependent partitioning test - pennant: %d x %d zones, %d x %d pieces\n", - (int)nzx, (int)nzy, (int)numpcx, (int)numpcy); - } - - IndexSpace<1> is_zones, is_sides, is_points; - std::vector ri_zones; - std::vector, int>> zone_color_field_data; - std::vector ri_sides; - std::vector, Point<1>>> side_mapsz_field_data; - std::vector, Point<1>>> side_mapss3_field_data; - std::vector, Point<1>>> side_mapsp1_field_data; - std::vector, bool>> side_ok_field_data; - - struct InitDataArgs { - int index; - RegionInstance ri_zones, ri_sides; - }; - - virtual Event initialize_data(const std::vector &memories, - const std::vector &procs) - { - // top level index spaces - is_zones = Rect<1>(FIRST_INDEX, FIRST_INDEX + nz - 1); - is_sides = Rect<1>(FIRST_INDEX, FIRST_INDEX + ns - 1); - is_points = Rect<1>(FIRST_INDEX, FIRST_INDEX + np - 1); - - // weighted partitions based on the distribution we already computed - std::vector> ss_zones_w; - std::vector> ss_sides_w; - std::vector> ss_points_w; - - is_zones - .create_weighted_subspaces(numpc, 1, lz, ss_zones_w, Realm::ProfilingRequestSet()) - .wait(); - is_sides - .create_weighted_subspaces(numpc, 1, ls, ss_sides_w, Realm::ProfilingRequestSet()) - .wait(); - is_points - .create_weighted_subspaces(numpc, 1, lp, ss_points_w, - Realm::ProfilingRequestSet()) - .wait(); - - log_app.debug() << "Initial partitions:"; - for(size_t i = 0; i < ss_zones_w.size(); i++) - log_app.debug() << " Zones #" << i << ": " << ss_zones_w[i]; - for(size_t i = 0; i < ss_sides_w.size(); i++) - log_app.debug() << " Sides #" << i << ": " << ss_sides_w[i]; - for(size_t i = 0; i < ss_points_w.size(); i++) - log_app.debug() << " 
Points #" << i << ": " << ss_points_w[i]; - - // create instances for each of these subspaces - std::vector zone_fields, side_fields; - zone_fields.push_back(sizeof(int)); // color - assert(sizeof(int) == sizeof(Point<1>)); - side_fields.push_back(sizeof(Point<1>)); // mapsz - side_fields.push_back(sizeof(Point<1>)); // mapss3 - side_fields.push_back(sizeof(Point<1>)); // mapsp1 - side_fields.push_back(sizeof(bool)); // ok - - ri_zones.resize(numpc); - zone_color_field_data.resize(numpc); - - for(size_t i = 0; i < ss_zones_w.size(); i++) { - RegionInstance ri; - RegionInstance::create_instance(ri, memories[i % memories.size()], ss_zones_w[i], - zone_fields, 0 /*SOA*/, - Realm::ProfilingRequestSet()) - .wait(); - ri_zones[i] = ri; - - zone_color_field_data[i].index_space = ss_zones_w[i]; - zone_color_field_data[i].inst = ri_zones[i]; - zone_color_field_data[i].field_offset = 0; - } - - ri_sides.resize(numpc); - side_mapsz_field_data.resize(numpc); - side_mapss3_field_data.resize(numpc); - side_mapsp1_field_data.resize(numpc); - side_ok_field_data.resize(numpc); - - for(size_t i = 0; i < ss_sides_w.size(); i++) { - RegionInstance ri; - RegionInstance::create_instance(ri, memories[i % memories.size()], ss_sides_w[i], - side_fields, 0 /*SOA*/, - Realm::ProfilingRequestSet()) - .wait(); - ri_sides[i] = ri; - - side_mapsz_field_data[i].index_space = ss_sides_w[i]; - side_mapsz_field_data[i].inst = ri_sides[i]; - side_mapsz_field_data[i].field_offset = 0 * sizeof(Point<1>); - - side_mapss3_field_data[i].index_space = ss_sides_w[i]; - side_mapss3_field_data[i].inst = ri_sides[i]; - side_mapss3_field_data[i].field_offset = 1 * sizeof(Point<1>); - - side_mapsp1_field_data[i].index_space = ss_sides_w[i]; - side_mapsp1_field_data[i].inst = ri_sides[i]; - side_mapsp1_field_data[i].field_offset = 2 * sizeof(Point<1>); - - side_ok_field_data[i].index_space = ss_sides_w[i]; - side_ok_field_data[i].inst = ri_sides[i]; - side_ok_field_data[i].field_offset = 3 * sizeof(Point<1>); - } 
- - // fire off tasks to initialize data - std::set events; - for(int i = 0; i < numpc; i++) { - Processor p = procs[i % memories.size()]; - InitDataArgs args; - args.index = i; - args.ri_zones = ri_zones[i]; - args.ri_sides = ri_sides[i]; - Event e = p.spawn(INIT_PENNANT_DATA_TASK, &args, sizeof(args)); - events.insert(e); - } - - return Event::merge_events(events); - } - - static void init_data_task_wrapper(const void *args, size_t arglen, - const void *userdata, size_t userlen, Processor p) - { - PennantTest *me = (PennantTest *)testcfg; - me->init_data_task(args, arglen, p); - } - - Point<1> global_point_pointer(int py, int px) const - { - int pp = FIRST_INDEX; - - // start by steping over whole y slabs - again be careful that the extra slab belongs - // to pcy == 0 - int dy; - if(py > zybound[1]) { - int pcy = 1; - while(py > zybound[pcy + 1]) - pcy++; - int slabs = zybound[pcy] + 1; - pp += npx * slabs; - py -= slabs; - dy = zybound[pcy + 1] - zybound[pcy]; - } else { - dy = zybound[1] + 1; - } - - // now chunks in x, using just the y width of this row of chunks - int dx; - if(px > zxbound[1]) { - int pcx = 1; - while(px > zxbound[pcx + 1]) - pcx++; - int strips = zxbound[pcx] + 1; - pp += dy * strips; - px -= strips; - dx = zxbound[pcx + 1] - zxbound[pcx]; - } else { - dx = zxbound[1] + 1; - } - - // finally, px and py are now local and are handled easily - pp += py * dx + px; - - return pp; - } - - void init_data_task(const void *args, size_t arglen, Processor p) - { - const InitDataArgs &i_args = *(const InitDataArgs *)args; - - log_app.info() << "init task #" << i_args.index << " (ri_zones=" << i_args.ri_zones - << ", ri_sides=" << i_args.ri_sides << ")"; - - i_args.ri_zones.fetch_metadata(p).wait(); - i_args.ri_sides.fetch_metadata(p).wait(); - - IndexSpace<1> is_zones = i_args.ri_zones.get_indexspace<1>(); - IndexSpace<1> is_sides = i_args.ri_sides.get_indexspace<1>(); - - log_app.debug() << "Z: " << is_zones; - log_app.debug() << "S: " << is_sides; - - 
int pcx = i_args.index % numpcx; - int pcy = i_args.index / numpcx; - - int zxlo = zxbound[pcx]; - int zxhi = zxbound[pcx + 1]; - int zylo = zybound[pcy]; - int zyhi = zybound[pcy + 1]; - - { - AffineAccessor a_zone_color(i_args.ri_zones, 0 /* offset */); - AffineAccessor, 1> a_side_mapsz(i_args.ri_sides, - 0 * sizeof(Point<1>) /* offset */); - AffineAccessor, 1> a_side_mapss3(i_args.ri_sides, - 1 * sizeof(Point<1>) /* offset */); - AffineAccessor, 1> a_side_mapsp1(i_args.ri_sides, - 2 * sizeof(Point<1>) /* offset */); - AffineAccessor a_side_ok(i_args.ri_sides, - 3 * sizeof(Point<1>) /* offset */); - - Point<1> pz = is_zones.bounds.lo; - Point<1> ps = is_sides.bounds.lo; - - for(int zy = zylo; zy < zyhi; zy++) { - for(int zx = zxlo; zx < zxhi; zx++) { - // get 4 side pointers - Point<1> ps0 = ps; - ps[0]++; - Point<1> ps1 = ps; - ps[0]++; - Point<1> ps2 = ps; - ps[0]++; - Point<1> ps3 = ps; - ps[0]++; - - // point pointers are ugly because they can be in neighbors - use a helper - Point<1> pp0 = global_point_pointer(zy, zx); // go CCW - Point<1> pp1 = global_point_pointer(zy + 1, zx); - Point<1> pp2 = global_point_pointer(zy + 1, zx + 1); - Point<1> pp3 = global_point_pointer(zy, zx + 1); - - a_zone_color.write(pz, i_args.index); - - a_side_mapsz.write(ps0, pz); - a_side_mapsz.write(ps1, pz); - a_side_mapsz.write(ps2, pz); - a_side_mapsz.write(ps3, pz); - - a_side_mapss3.write(ps0, ps1); - a_side_mapss3.write(ps1, ps2); - a_side_mapss3.write(ps2, ps3); - a_side_mapss3.write(ps3, ps0); - - a_side_mapsp1.write(ps0, pp0); - a_side_mapsp1.write(ps1, pp1); - a_side_mapsp1.write(ps2, pp2); - a_side_mapsp1.write(ps3, pp3); - - a_side_ok.write(ps0, true); - a_side_ok.write(ps1, true); - a_side_ok.write(ps2, true); - a_side_ok.write(ps3, true); - - pz[0]++; - } - } - assert(pz[0] == is_zones.bounds.hi[0] + 1); - assert(ps[0] == is_sides.bounds.hi[0] + 1); - } - - if(show_graph) { - AffineAccessor a_zone_color(i_args.ri_zones, 0 /* offset */); - - for(int i = 
is_zones.bounds.lo; i <= is_zones.bounds.hi; i++) - std::cout << "Z[" << i << "]: color=" << a_zone_color.read(i) << "\n"; - - AffineAccessor, 1> a_side_mapsz(i_args.ri_sides, - 0 * sizeof(Point<1>) /* offset */); - AffineAccessor, 1> a_side_mapss3(i_args.ri_sides, - 1 * sizeof(Point<1>) /* offset */); - AffineAccessor, 1> a_side_mapsp1(i_args.ri_sides, - 2 * sizeof(Point<1>) /* offset */); - AffineAccessor a_side_ok(i_args.ri_sides, - 3 * sizeof(Point<1>) /* offset */); - - for(int i = is_sides.bounds.lo; i <= is_sides.bounds.hi; i++) - std::cout << "S[" << i << "]:" - << " mapsz=" << a_side_mapsz.read(i) - << " mapss3=" << a_side_mapss3.read(i) - << " mapsp1=" << a_side_mapsp1.read(i) << " ok=" << a_side_ok.read(i) - << "\n"; - } - } - - // the outputs of our partitioning will be: - // p_zones - subsets of is_zones split by piece - // p_sides - subsets of is_sides split by piece (with bad sides removed) - // p_points - subsets of is_points by piece (aliased) - - std::vector> p_zones; - std::vector> p_sides; - std::vector> p_points; - - virtual Event perform_partitioning(void) - { - // first get the set of bad sides (i.e. 
ok == false) - IndexSpace<1> bad_sides; - - Event e1 = is_sides.create_subspace_by_field(side_ok_field_data, false, bad_sides, - Realm::ProfilingRequestSet()); - if(wait_on_events) - e1.wait(); - - // map the bad sides through to bad zones - IndexSpace<1> bad_zones; - Event e2 = is_zones.create_subspace_by_image( - side_mapsz_field_data, bad_sides, bad_zones, Realm::ProfilingRequestSet(), e1); - if(wait_on_events) - e2.wait(); - bad_sides.destroy(e2); - - // subtract bad zones to get good zones - IndexSpace<1> good_zones; - Event e3 = IndexSpace<1>::compute_difference(is_zones, bad_zones, good_zones, - Realm::ProfilingRequestSet(), e2); - if(wait_on_events) - e3.wait(); - bad_zones.destroy(e3); - - // now do actual partitions with just good zones - std::vector colors(numpc); - for(int i = 0; i < numpc; i++) - colors[i] = i; - - Event e4 = good_zones.create_subspaces_by_field( - zone_color_field_data, colors, p_zones, Realm::ProfilingRequestSet(), e3); - if(wait_on_events) - e4.wait(); - good_zones.destroy(e4); - - // preimage of zones is sides - Event e5 = is_sides.create_subspaces_by_preimage( - side_mapsz_field_data, p_zones, p_sides, Realm::ProfilingRequestSet(), e4); - if(wait_on_events) - e5.wait(); - - // and image of sides->mapsp1 is points - Event e6 = is_points.create_subspaces_by_image( - side_mapsp1_field_data, p_sides, p_points, Realm::ProfilingRequestSet(), e5); - if(wait_on_events) - e6.wait(); - - return e6; - } - - virtual int perform_dynamic_checks(void) - { - int errors = 0; - - // pennant's checks are actually slower with the fused image/diff -#ifdef PENNANT_USE_IMAGE_DIFF - std::vector> p_z_test, p_p_test, p_s_test; - Event e4 = is_zones.create_subspaces_by_image_with_difference( - side_mapsz_field_data, p_sides, p_zones, p_z_test, Realm::ProfilingRequestSet()); - Event e5 = is_points.create_subspaces_by_image_with_difference( - side_mapsp1_field_data, p_sides, p_points, p_p_test, - Realm::ProfilingRequestSet()); - Event e6 = 
is_sides.create_subspaces_by_image_with_difference( - side_mapss3_field_data, p_sides, p_sides, p_s_test, Realm::ProfilingRequestSet()); -#else - std::vector> p_img_mapsz, p_img_mapsp1, p_img_mapss3; - Event e1 = is_zones.create_subspaces_by_image( - side_mapsz_field_data, p_sides, p_img_mapsz, Realm::ProfilingRequestSet()); - Event e2 = is_points.create_subspaces_by_image( - side_mapsp1_field_data, p_sides, p_img_mapsp1, Realm::ProfilingRequestSet()); - Event e3 = is_sides.create_subspaces_by_image( - side_mapss3_field_data, p_sides, p_img_mapss3, Realm::ProfilingRequestSet()); - std::vector> p_z_test, p_p_test, p_s_test; - Event e4 = IndexSpace<1>::compute_differences(p_img_mapsz, p_zones, p_z_test, - Realm::ProfilingRequestSet(), e1); - for(unsigned idx = 0; idx < p_img_mapsz.size(); idx++) { - p_img_mapsz[idx].destroy(e4); - } - Event e5 = IndexSpace<1>::compute_differences(p_img_mapsp1, p_points, p_p_test, - Realm::ProfilingRequestSet(), e2); - for(unsigned idx = 0; idx < p_img_mapsp1.size(); idx++) { - p_img_mapsp1[idx].destroy(e5); - } - Event e6 = IndexSpace<1>::compute_differences(p_img_mapss3, p_sides, p_s_test, - Realm::ProfilingRequestSet(), e3); - for(unsigned idx = 0; idx < p_img_mapss3.size(); idx++) { - p_img_mapss3[idx].destroy(e6); - } -#endif - errors += check_empty(e4, p_z_test, "p_z_test"); - errors += check_empty(e5, p_p_test, "p_p_test"); - errors += check_empty(e6, p_s_test, "p_s_test"); - for(unsigned idx = 0; idx < p_z_test.size(); idx++) { - p_z_test[idx].destroy(e4); - } - for(unsigned idx = 0; idx < p_p_test.size(); idx++) { - p_p_test[idx].destroy(e5); - } - for(unsigned idx = 0; idx < p_s_test.size(); idx++) { - p_s_test[idx].destroy(e6); - } - - return errors; - } - - virtual int check_partitioning(void) - { - int errors = 0; - - for(int pcy = 0; pcy < numpcy; pcy++) { - for(int pcx = 0; pcx < numpcx; pcx++) { - int idx = pcy * numpcx + pcx; - - int lx = zxbound[pcx + 1] - zxbound[pcx]; - int ly = zybound[pcy + 1] - zybound[pcy]; - - 
int exp_zones = lx * ly; - int exp_sides = exp_zones * 4; - int exp_points = (lx + 1) * (ly + 1); // easier because of aliasing - - int act_zones = p_zones[idx].volume(); - int act_sides = p_sides[idx].volume(); - int act_points = p_points[idx].volume(); - - if(exp_zones != act_zones) { - log_app.error() << "Piece #" << idx - << ": zone count mismatch: exp = " << exp_zones - << ", act = " << act_zones; - errors++; - } - if(exp_sides != act_sides) { - log_app.error() << "Piece #" << idx - << ": side count mismatch: exp = " << exp_sides - << ", act = " << act_sides; - errors++; - } - if(exp_points != act_points) { - log_app.error() << "Piece #" << idx - << ": point count mismatch: exp = " << exp_points - << ", act = " << act_points; - errors++; - } - } - } - - // check zones - Point<1> pz = is_zones.bounds.lo; - for(int pc = 0; pc < numpc; pc++) { - for(int i = 0; i < lz[pc]; i++) { - for(int j = 0; j < numpc; j++) { - bool exp = (j == pc); - bool act = p_zones[j].contains(pz); - if(exp != act) { - log_app.error() << "mismatch: zone " << pz << " in p_zones[" << j - << "]: exp=" << exp << " act=" << act; - errors++; - } - } - pz[0]++; - } - } - - // check sides - Point<1> ps = is_sides.bounds.lo; - for(int pc = 0; pc < numpc; pc++) { - for(int i = 0; i < ls[pc]; i++) { - for(int j = 0; j < numpc; j++) { - bool exp = (j == pc); - bool act = p_sides[j].contains(ps); - if(exp != act) { - log_app.error() << "mismatch: side " << ps << " in p_sides[" << j - << "]: exp=" << exp << " act=" << act; - errors++; - } - } - ps[0]++; - } - } - - // check points (trickier due to ghosting) - for(int py = 0; py < npy; py++) - for(int px = 0; px < npx; px++) { - Point<1> pp = global_point_pointer(py, px); - for(int pc = 0; pc < numpc; pc++) { - int pcy = pc / numpcx; - int pcx = pc % numpcx; - bool exp = ((py >= zybound[pcy]) && (py <= zybound[pcy + 1]) && - (px >= zxbound[pcx]) && (px <= zxbound[pcx + 1])); - bool act = p_points[pc].contains(pp); - if(exp != act) { - log_app.error() 
<< "mismatch: point " << pp << " in p_points[" << pc - << "]: exp=" << exp << " act=" << act; - errors++; - } - } - } - - for(unsigned idx = 0; idx < p_zones.size(); idx++) { - p_zones[idx].destroy(); - } - for(unsigned idx = 0; idx < p_sides.size(); idx++) { - p_sides[idx].destroy(); - } - for(unsigned idx = 0; idx < p_points.size(); idx++) { - p_points[idx].destroy(); - } - - return errors; - } -}; - -template > -class RandStream { -public: - RandStream(unsigned _seed) - : seed(_seed) - , idx(0) - {} - - void setpos(unsigned long long _idx) { idx = _idx; } - void adjpos(long long _adj) { idx += _adj; } - - unsigned rand_int(unsigned n) - { - unsigned v = PRNG::rand_int(seed, idx >> 32, idx, n); - idx++; - return v; - } - - float rand_float(void) - { - float v = PRNG::rand_float(seed, idx >> 32, idx); - idx++; - return v; - } - - unsigned seed; - unsigned long long idx; -}; - -template -FT randval(RandStream<> &rs); - -template <> -float randval(RandStream<> &rs) -{ - return rs.rand_float(); -} - -template <> -int randval(RandStream<> &rs) -{ - return rs.rand_int(INT_MAX); -} - -template -class RandomTest : public TestInterface { -public: - RandomTest(int argc, const char *argv[]); - virtual ~RandomTest(void); - - virtual void print_info(void); - - virtual Event initialize_data(const std::vector &memories, - const std::vector &procs); - - virtual Event perform_partitioning(void); - - virtual int perform_dynamic_checks(void); - - virtual int check_partitioning(void); - - void fill_instance_data(IndexSpace ibounds, RegionInstance inst); - -protected: - T1 base1_min, base1_max, extent1_min, extent1_max; - T2 base2_min, base2_max, extent2_min, extent2_max; - int num_pieces, num_colors; - - Rect bounds1; - Rect bounds2; - IndexSpace root1; - IndexSpace root2; - std::vector colors; - std::vector ri_data1; - std::vector, FT>> fd_vals1; - std::vector, Point>> fd_ptrs1; -}; - -template -RandomTest::RandomTest(int argc, const char *argv[]) - : base1_min(0) - , base1_max(0) 
- , extent1_min(4) - , extent1_max(6) - , base2_min(0) - , base2_max(0) - , extent2_min(4) - , extent2_max(6) - , num_pieces(2) - , num_colors(4) -{ - RandStream<> rs(random_seed + 0); - - for(int i = 0; i < N1; i++) { - bounds1.lo[i] = base1_min + rs.rand_int(base1_max - base1_min + 1); - bounds1.hi[i] = - (bounds1.lo[i] + extent1_min + rs.rand_int(extent1_max - extent1_min + 1)); - } - for(int i = 0; i < N2; i++) { - bounds2.lo[i] = base2_min + rs.rand_int(base2_max - base2_min + 1); - bounds2.hi[i] = - (bounds2.lo[i] + extent2_min + rs.rand_int(extent2_max - extent2_min + 1)); - } - - colors.resize(num_colors); - for(int i = 0; i < num_colors; i++) - colors[i] = randval(rs); -} - -template -RandomTest::~RandomTest(void) -{} - -template -void RandomTest::print_info(void) -{ - printf("Realm dependent partitioning test - random\n"); -} - -template -void RandomTest::fill_instance_data(IndexSpace ibounds, - RegionInstance inst) -{ - { - // start with value field - AffineAccessor a_vals(inst, 0); - - // iterate over all points in root1 with initial random values - RandStream<> rs1(random_seed + 1); - for(PointInRectIterator pir(bounds1); pir.valid; pir.step()) { - FT v = colors[rs1.rand_int(colors.size())]; - if(ibounds.contains(pir.p)) - a_vals.write(pir.p, v); - } - - // print results - for(PointInRectIterator pir(bounds1); pir.valid; pir.step()) { - if(ibounds.contains(pir.p)) - log_app.debug() << "v[" << pir.p << "] = " << a_vals.read(pir.p); - } - } - - { - // now pointer field - AffineAccessor, N1, T1> a_ptrs(inst, 0 + sizeof(FT)); - - // iterate over all points in root1 with initial random values - RandStream<> rs2(random_seed + 2); - for(PointInRectIterator pir(bounds1); pir.valid; pir.step()) { - Point p2; - for(int i = 0; i < N2; i++) - p2[i] = bounds2.lo[i] + rs2.rand_int(bounds2.hi[i] - bounds2.lo[i] + 1); - if(ibounds.contains(pir.p)) - a_ptrs.write(pir.p, p2); - } - - // print results - for(PointInRectIterator pir(bounds1); pir.valid; pir.step()) { - 
if(ibounds.contains(pir.p)) - log_app.debug() << "p[" << pir.p << "] = " << a_ptrs.read(pir.p); - } - } -} - -template -Event RandomTest::initialize_data(const std::vector &memories, - const std::vector &procs) -{ - root1 = IndexSpace(bounds1); - root2 = IndexSpace(bounds2); - log_app.debug() << "root1 = " << root1; - log_app.debug() << "root2 = " << root2; - - // create instances to hold actual data - size_t num_insts = memories.size(); - log_app.debug() << "procs: " << procs; - log_app.debug() << "mems: " << memories; - std::vector> ss_inst1; - root1.create_equal_subspaces(num_insts, 1, ss_inst1, Realm::ProfilingRequestSet()) - .wait(); - - std::vector field_sizes; - field_sizes.push_back(sizeof(FT)); - field_sizes.push_back(sizeof(Point)); - - ri_data1.resize(num_insts); - fd_vals1.resize(num_insts); - fd_ptrs1.resize(num_insts); - - for(size_t i = 0; i < num_insts; i++) { - RegionInstance ri; - RegionInstance::create_instance(ri, memories[i], ss_inst1[i], field_sizes, 0 /*SOA*/, - Realm::ProfilingRequestSet()) - .wait(); - log_app.debug() << "inst[" << i << "] = " << ri << " (" << ss_inst1[i] << ")"; - ri_data1[i] = ri; - - fd_vals1[i].index_space = ss_inst1[i]; - fd_vals1[i].inst = ri; - fd_vals1[i].field_offset = 0; - - fd_ptrs1[i].index_space = ss_inst1[i]; - fd_ptrs1[i].inst = ri; - fd_ptrs1[i].field_offset = 0 + sizeof(FT); - } - - log_app.debug() << "colors = " << colors; - - for(size_t i = 0; i < num_insts; i++) { - fill_instance_data(root1 /*ss_inst1[i]*/, ri_data1[i]); - } - - return Event::NO_EVENT; -} - -template -Event RandomTest::perform_partitioning(void) -{ - // start by filtering root1 by color - std::vector piece_colors(colors.begin(), colors.begin() + num_pieces); - std::vector> ss_by_color; - Event e1 = root1.create_subspaces_by_field(fd_vals1, piece_colors, ss_by_color, - ProfilingRequestSet()); - e1.wait(); - - for(int i = 0; i < num_pieces; i++) { - log_app.debug() << "bycolor[" << i << "] (" << colors[i] << ") = " << ss_by_color[i]; - 
dump_sparse_index_space("", ss_by_color[i]); - } - - // images - std::vector> ss_images; - Event e2 = root2.create_subspaces_by_image(fd_ptrs1, ss_by_color, ss_images, - ProfilingRequestSet(), e1); - - e2.wait(); - - for(int i = 0; i < num_pieces; i++) { - log_app.debug() << "image[" << i << "] = " << ss_images[i]; - dump_sparse_index_space("", ss_images[i]); - } - - // preimages - std::vector> ss_preimages; - Event e3 = root1.create_subspaces_by_preimage(fd_ptrs1, ss_images, ss_preimages, - ProfilingRequestSet(), e2); - - e3.wait(); - - for(int i = 0; i < num_pieces; i++) { - log_app.debug() << "preimage[" << i << "] = " << ss_preimages[i]; - dump_sparse_index_space("", ss_preimages[i]); - ss_by_color[i].destroy(); - ss_images[i].destroy(); - ss_preimages[i].destroy(); - } - - return Event::NO_EVENT; -} - -template -int RandomTest::perform_dynamic_checks(void) -{ - return 0; -} - -template -int RandomTest::check_partitioning(void) -{ - return 0; -} - -void top_level_task(const void *args, size_t arglen, const void *userdata, size_t userlen, - Processor p) -{ - int errors = 0; - - testcfg->print_info(); - - // find all the system memories - we'll stride our data across them - // for each memory, we'll need one CPU that can do the initialization of the data - std::vector sysmems; - std::vector procs; - - Machine machine = Machine::get_machine(); - { - std::set all_memories; - machine.get_all_memories(all_memories); - for(std::set::const_iterator it = all_memories.begin(); - it != all_memories.end(); it++) { - Memory m = *it; - - // skip memories with no capacity for creating instances - if(m.capacity() == 0) - continue; - - if(m.kind() == Memory::SYSTEM_MEM) { - sysmems.push_back(m); - std::set pset; - machine.get_shared_processors(m, pset); - Processor p = Processor::NO_PROC; - for(std::set::const_iterator it2 = pset.begin(); it2 != pset.end(); - it2++) { - if(it2->kind() == Processor::LOC_PROC) { - p = *it2; - break; - } - } - assert(p.exists()); - 
procs.push_back(p); - log_app.debug() << "System mem #" << (sysmems.size() - 1) << " = " - << *sysmems.rbegin() << " (" << *procs.rbegin() << ")"; - } - } - } - assert(sysmems.size() > 0); - - { - Realm::TimeStamp ts("initialization", true, &log_app); - - Event e = testcfg->initialize_data(sysmems, procs); - // wait for all initialization to be done - e.wait(); - } - - // now actual partitioning work - { - Realm::TimeStamp ts("dependent partitioning work", true, &log_app); - - Event e = testcfg->perform_partitioning(); - - e.wait(); - } - - // dynamic checks (which would be eliminated by compiler) - { - Realm::TimeStamp ts("dynamic checks", true, &log_app); - errors += testcfg->perform_dynamic_checks(); + Realm::TimeStamp ts("dynamic checks", true, &log_app); + errors += testcfg->perform_dynamic_checks(); } if(!skip_check) { @@ -4460,440 +812,46 @@ void top_level_task(const void *args, size_t arglen, const void *userdata, size_ printf("all done!\n"); } -template -class RandomAffineTest : public TestInterface { -public: - RandomAffineTest(int argc, const char *argv[], - const std::vector &transforms); - virtual ~RandomAffineTest(void); - - virtual void print_info(void); - - virtual Event initialize_data(const std::vector &memories, - const std::vector &procs); - - virtual Event perform_partitioning(void); - - virtual int perform_dynamic_checks(void); - - virtual int check_partitioning(void); - - void fill_instance_data(IndexSpace ibounds, RegionInstance inst); - - int verify_results(const IndexSpace &root, const TRANSFORM &transform, - const std::vector>> &images, - const std::vector>> &preimages); - -protected: - std::vector transforms; - T1 base1_min, base1_max, extent1_min, extent1_max; - T2 base2_min, base2_max, extent2_min, extent2_max; - int num_pieces, num_colors; - - // std::vector> transforms; - - std::vector>> dense_images; - std::vector>> sparse_images; - - std::vector> ss_by_color; - - std::vector>> dense_preimages; - std::vector>> sparse_preimages; - - 
Rect bounds1; - Rect bounds2; - IndexSpace root1; - IndexSpace root2; - IndexSpace root2_sparse; - std::vector colors; - std::vector ri_data1; - std::vector, FT>> fd_vals1; -}; - -template -RandomAffineTest::RandomAffineTest( - int argc, const char *argv[], const std::vector &_transforms) - : transforms(_transforms) - , base1_min(0) - , base1_max(0) - , extent1_min(4) - , extent1_max(6) - , base2_min(0) - , base2_max(0) - , extent2_min(4) - , extent2_max(6) - , num_pieces(2) - , num_colors(4) -{ - RandStream<> rs(random_seed + 2); - - for(int i = 0; i < N1; i++) { - bounds1.lo[i] = base1_min + rs.rand_int(base1_max - base1_min + 1); - bounds1.hi[i] = - (bounds1.lo[i] + extent1_min + rs.rand_int(extent1_max - extent1_min + 1)); - } - for(int i = 0; i < N2; i++) { - bounds2.lo[i] = base2_min + rs.rand_int(base2_max - base2_min + 1); - bounds2.hi[i] = - (bounds2.lo[i] + extent2_min + rs.rand_int(extent2_max - extent2_min + 1)); - } - - colors.resize(num_colors); - - for(int i = 0; i < num_colors; i++) - colors[i] = randval(rs); - - dense_images.resize(transforms.size()); - sparse_images.resize(transforms.size()); - - dense_preimages.resize(transforms.size()); - sparse_preimages.resize(transforms.size()); -} - -template -RandomAffineTest::~RandomAffineTest(void) -{} - -template -void RandomAffineTest::print_info(void) -{ - printf("Realm dependent partitioning test - random affine\n"); -} - -template -void RandomAffineTest::fill_instance_data( - IndexSpace ibounds, RegionInstance inst) -{ - { - // start with value field - AffineAccessor a_vals(inst, 0); - - // iterate over all points in root1 with initial random values - RandStream<> rs1(random_seed + 1); - for(PointInRectIterator pir(bounds1); pir.valid; pir.step()) { - FT v = colors[rs1.rand_int(2)]; - if(ibounds.contains(pir.p)) - a_vals.write(pir.p, v); - } - - // print results - for(PointInRectIterator pir(bounds1); pir.valid; pir.step()) { - if(ibounds.contains(pir.p)) - log_app.debug() << "v[" << pir.p << "] = " 
<< a_vals.read(pir.p); - } - } -} - -template -Event RandomAffineTest::initialize_data( - const std::vector &memories, const std::vector &procs) -{ - std::vector> sparse_points; - int index = 0; - for(PointInRectIterator pir(bounds2); pir.valid; pir.step()) { - if(index % 2 == 0) { - sparse_points.push_back(pir.p); - } - index++; - } - SparsityMap sparse_map = - SparsityMap::construct(sparse_points, true, true); - - root1 = IndexSpace(bounds1); - root2 = IndexSpace(bounds2); - root2_sparse = IndexSpace(bounds2, sparse_map); - - log_app.debug() << "root1 = " << root1; - log_app.debug() << "root2 = " << root2; - log_app.debug() << "root2_sparse = " << root2_sparse; - - // create instances to hold actual data - size_t num_insts = memories.size(); - log_app.debug() << "procs: " << procs; - log_app.debug() << "mems: " << memories; - std::vector> ss_inst1; - root1.create_equal_subspaces(num_insts, 1, ss_inst1, Realm::ProfilingRequestSet()) - .wait(); - - std::vector field_sizes; - field_sizes.push_back(sizeof(FT)); - field_sizes.push_back(sizeof(Point)); - - ri_data1.resize(num_insts); - fd_vals1.resize(num_insts); - - for(size_t i = 0; i < num_insts; i++) { - RegionInstance ri; - RegionInstance::create_instance(ri, memories[i], ss_inst1[i], field_sizes, 0 /*SOA*/, - Realm::ProfilingRequestSet()) - .wait(); - log_app.debug() << "inst[" << i << "] = " << ri << " (" << ss_inst1[i] << ")"; - ri_data1[i] = ri; - - fd_vals1[i].index_space = ss_inst1[i]; - fd_vals1[i].inst = ri; - fd_vals1[i].field_offset = 0; - } - - log_app.debug() << "colors = " << colors; - - for(size_t i = 0; i < num_insts; i++) { - fill_instance_data(root1 /*ss_inst1[i]*/, ri_data1[i]); - } - - return Event::NO_EVENT; -} - -template -Event RandomAffineTest::perform_partitioning(void) -{ - // start by filtering root1 by color - std::vector piece_colors(colors.begin(), colors.begin() + num_pieces); - - Event e1 = root1.create_subspaces_by_field(fd_vals1, piece_colors, ss_by_color, - ProfilingRequestSet()); 
- e1.wait(); - - for(int i = 0; i < num_pieces; i++) { - log_app.debug() << "bycolor[" << i << "] (" << colors[i] << ") = " << ss_by_color[i]; - dump_sparse_index_space("", ss_by_color[i]); - } - - for(size_t idx = 0; idx < transforms.size(); idx++) { - log_app.debug() << "Compute images for transform idx=" << idx; - - unsigned long long start_time = Clock::current_time_in_nanoseconds(); - // images - Event e2 = root2.create_subspaces_by_image( - transforms[idx], ss_by_color, dense_images[idx], ProfilingRequestSet(), e1); - e2.wait(); - - log_app.debug() << "affine image time=" - << (Clock::current_time_in_nanoseconds() - start_time); - - for(int i = 0; i < num_pieces; i++) { - log_app.debug() << "image[" << i << "] = " << dense_images[idx][i]; - dump_sparse_index_space("", dense_images[idx][i]); - } - - start_time = Clock::current_time_in_nanoseconds(); - Event e3 = root2_sparse.create_subspaces_by_image( - transforms[idx], ss_by_color, sparse_images[idx], ProfilingRequestSet(), e2); - - e3.wait(); - log_app.debug() << "affine sparse image time=" - << (Clock::current_time_in_nanoseconds() - start_time); - - for(int i = 0; i < num_pieces; i++) { - log_app.debug() << "sparse_image1[" << i << "] = " << sparse_images[idx][i]; - dump_sparse_index_space("", sparse_images[idx][i]); - } - - // preimages - Event e4 = root1.create_subspaces_by_preimage(transforms[idx], dense_images[idx], - dense_preimages[idx], - ProfilingRequestSet(), e3); - e4.wait(); - - for(int i = 0; i < num_pieces; i++) { - log_app.debug() << "dense_preimage[" << i << "] = " << dense_preimages[idx][i]; - dump_sparse_index_space("", dense_preimages[idx][i]); - } - - Event e5 = root1.create_subspaces_by_preimage(transforms[idx], sparse_images[idx], - sparse_preimages[idx], - ProfilingRequestSet(), e4); - e5.wait(); - - for(int i = 0; i < num_pieces; i++) { - log_app.debug() << "sparse_preimage[" << i << "] = " << sparse_preimages[idx][i]; - dump_sparse_index_space("", sparse_preimages[idx][i]); - } - } 
- - return Event::NO_EVENT; -} - -template -int RandomAffineTest::perform_dynamic_checks(void) -{ - return 0; -} - -template -int RandomAffineTest::verify_results( - const IndexSpace &root, const TRANSFORM &transform, - const std::vector>> &images, - const std::vector>> &preimages) -{ - for(size_t idx = 0; idx < transforms.size(); idx++) { - assert(ss_by_color.size() == images[idx].size() && - images[idx].size() == preimages[idx].size()); - int image_total = 0; - for(const auto &image : images[idx]) { - for(IndexSpaceIterator it2(image); it2.valid; it2.step()) { - image_total += it2.rect.volume(); - } - } - - int preimage_total = 0; - for(const auto &preimage : preimages[idx]) { - for(IndexSpaceIterator it2(preimage); it2.valid; it2.step()) { - preimage_total += it2.rect.volume(); - } - } +// Constructor function-pointer type +using CtorFn = TestInterface* (*)(int, const char** argv); - if(image_total != preimage_total) - return 1; - - for(size_t i = 0; i < ss_by_color.size(); i++) { - for(IndexSpaceIterator it(ss_by_color[i]); it.valid; it.step()) { - for(PointInRectIterator point(it.rect); point.valid; point.step()) { - auto target_point = transforms[idx][point.p]; - if(root.contains(target_point)) { - if(!images[idx][i].contains(target_point)) { - return 1; - } - if(!preimages[idx][i].contains(point.p)) { - return 1; - } - } - } - } - } - } - return 0; +// ---- Byfield constructors ---- +template +static TestInterface* make_byfield(int argc, const char** argv) { + return new ByfieldTest(argc, argv); } -template -int RandomAffineTest::check_partitioning(void) -{ - int result = 0; - for(size_t i = 0; i < transforms.size(); i++) { - if(verify_results(root2, transforms[i], dense_images, dense_preimages) || - verify_results(root2_sparse, transforms[i], sparse_images, sparse_preimages)) { - result++; - } - } - root1.destroy(); - root2.destroy(); - root2_sparse.destroy(); - for(unsigned i = 0; i < dense_images.size(); i++) { - for(unsigned j = 0; j < 
dense_images[i].size(); j++) { - dense_images[i][j].destroy(); - } - } - for(unsigned i = 0; i < sparse_images.size(); i++) { - for(unsigned j = 0; j < sparse_images[i].size(); j++) { - sparse_images[i][j].destroy(); - } - } - for(unsigned i = 0; i < dense_preimages.size(); i++) { - for(unsigned j = 0; j < dense_preimages[i].size(); j++) { - dense_preimages[i][j].destroy(); - } - } - for(unsigned i = 0; i < sparse_preimages.size(); i++) { - for(unsigned j = 0; j < sparse_preimages[i].size(); j++) { - sparse_preimages[i][j].destroy(); - } - } - return result; -} +static constexpr CtorFn BYFIELD_CTORS[3] = { + &make_byfield<1>, + &make_byfield<2>, + &make_byfield<3>, +}; -template -std::vector> create_translate_transforms(int size) -{ - RandStream<> rs(random_seed + 2); - std::vector> transforms; - { - TranslationTransform translate; - translate.offset = Point::ZEROES(); - for(int i = 0; i < N2; i++) { - translate.offset[i] = rs.rand_int(size - 1); - } - transforms.push_back(translate); - } - return transforms; +// ---- Image constructors ---- +template +static TestInterface* make_image(int argc, const char** argv) { + return new ImageTest(argc, argv); } -template -std::vector> create_affine_transforms() -{ - std::vector> transforms; - - { - AffineTransform transpose; - for(int i = 0; i < N2; i++) { - for(int j = 0; j < N1; j++) { - transpose.transform[i][j] = (i == N1 - j - 1); - } - } - transpose.offset = Point::ZEROES(); - transforms.push_back(transpose); - } - - { - AffineTransform translate; - for(int i = 0; i < N2; i++) { - for(int j = 0; j < N1; j++) { - translate.transform[i][j] = (i == j); - } - } - translate.offset = Point::ZEROES(); - transforms.push_back(translate); - } - - { - AffineTransform scale; - for(int i = 0; i < N2; i++) { - for(int j = 0; j < N1; j++) { - scale.transform[i][j] = (i == j) ? 
2 : 0; - } - } - scale.offset = Point::ZEROES(); - transforms.push_back(scale); - } +static constexpr CtorFn IMAGE_CTORS[3][3] = { + { &make_image<1,1>, &make_image<1,2>, &make_image<1,3> }, + { &make_image<2,1>, &make_image<2,2>, &make_image<2,3> }, + { &make_image<3,1>, &make_image<3,2>, &make_image<3,3> }, +}; - { - AffineTransform shear; - for(int i = 0; i < N2; i++) { - for(int j = 0; j < N1; j++) { - shear.transform[i][j] = (i == j); - } - shear.transform[i][i + 1] = 1; - } - shear.offset = Point::ZEROES(); - transforms.push_back(shear); - } +using TaskWrapperFn = void (*)(const void*, size_t, const void*, size_t, Processor); - { - AffineTransform reflect; - for(int i = 0; i < N2; i++) { - for(int j = 0; j < N1; j++) { - reflect.transform[i][j] = (i == j) ? -1 : 0; - } - } - reflect.offset = Point::ZEROES(); - // transforms.push_back(reflect); - } - return transforms; -} +static constexpr TaskWrapperFn BYFIELD_INIT_TBL[3] = { + &ByfieldTest<1>::init_data_task_wrapper, + &ByfieldTest<2>::init_data_task_wrapper, + &ByfieldTest<3>::init_data_task_wrapper, +}; -TestInterface *run_structured_test(TransformType type, int argc, char **argv) -{ - switch(type) { - case TransformType::AFFINE: - return new RandomAffineTest<2, int, 2, int, int, AffineTransform<2, 2, int>>( - argc, const_cast(argv), - create_affine_transforms<2, int, 2, int, int>()); - case TransformType::TRANSLATION: - return new RandomAffineTest<2, int, 2, int, int, TranslationTransform<2, int>>( - argc, const_cast(argv), - create_translate_transforms<2, int, 2, int, int>(4)); - } - return nullptr; -} +static constexpr TaskWrapperFn IMAGE_INIT_TBL[3][3] = { + { &ImageTest<1,1>::init_data_task_wrapper, &ImageTest<1,2>::init_data_task_wrapper, &ImageTest<1,3>::init_data_task_wrapper }, + { &ImageTest<2,1>::init_data_task_wrapper, &ImageTest<2,2>::init_data_task_wrapper, &ImageTest<2,3>::init_data_task_wrapper }, + { &ImageTest<3,1>::init_data_task_wrapper, &ImageTest<3,2>::init_data_task_wrapper, 
&ImageTest<3,3>::init_data_task_wrapper }, +}; int main(int argc, char **argv) { @@ -4928,54 +886,28 @@ int main(int argc, char **argv) continue; } - // test cases consume the rest of the args - if(!strcmp(argv[i], "circuit")) { - testcfg = new CircuitTest(argc - i, const_cast(argv + i)); - break; - } - - if(!strcmp(argv[i], "basic")) { - testcfg = new BasicTest(argc - i, const_cast(argv + i)); - break; - } - - if(!strcmp(argv[i], "tile")) { - testcfg = new TileTest(argc - i, const_cast(argv + i)); - break; - } - - if (!strcmp(argv[i], "range")) { - testcfg = new RangeTest(argc - i, const_cast(argv + i)); - break; - } - - if (!strcmp(argv[i], "multi")) { - testcfg = new Range2DTest(argc - i, const_cast(argv + i)); - break; + if(!strcmp(argv[i], "-d1")) { + dimension1 = atoi(argv[++i]); + continue; } - if(!strcmp(argv[i], "pennant")) { - testcfg = new PennantTest(argc - i, const_cast(argv + i)); - break; + if(!strcmp(argv[i], "-d2")) { + dimension2 = atoi(argv[++i]); + continue; } - if(!strcmp(argv[i], "miniaero")) { - testcfg = new MiniAeroTest(argc - i, const_cast(argv + i)); - break; - } + if(!strcmp(argv[i], "byfield")) { + if (dimension1 < 1 || dimension1 > 3) + assert(false && "invalid dimension"); - if(!strcmp(argv[i], "random")) { - testcfg = new RandomTest<1, int, 2, int, int>(argc - i, - const_cast(argv + i)); + testcfg = BYFIELD_CTORS[dimension1 - 1](argc - i, const_cast(argv + i)); break; } - if(!strcmp(argv[i], "affine")) { - TransformType type = TransformType::AFFINE; - if(i < argc - 1 && !strcmp(argv[++i], "-type")) { - type = static_cast(atoi(argv[++i])); - } - testcfg = run_structured_test(type, argc, argv); + if(!strcmp(argv[i], "image")) { + if (dimension1 < 1 || dimension1 > 3 || dimension2 < 1 || dimension2 > 3) + assert(false && "invalid dimension"); + testcfg = IMAGE_CTORS[dimension1 - 1][dimension2 - 1](argc - i, const_cast(argv + i)); break; } @@ -4984,17 +916,16 @@ int main(int argc, char **argv) // if no test specified, use circuit (with 
default parameters) if(!testcfg) { - testcfg = new CircuitTest(0, 0); + assert(false); } rt.register_task(TOP_LEVEL_TASK, top_level_task); - rt.register_task(INIT_CIRCUIT_DATA_TASK, CircuitTest::init_data_task_wrapper); - rt.register_task(INIT_PENNANT_DATA_TASK, PennantTest::init_data_task_wrapper); - rt.register_task(INIT_BASIC_DATA_TASK, BasicTest::init_data_task_wrapper); - rt.register_task(INIT_TILE_DATA_TASK, TileTest::init_data_task_wrapper); - rt.register_task(INIT_RANGE_DATA_TASK, RangeTest::init_data_task_wrapper); - rt.register_task(INIT_RANGE2D_DATA_TASK, Range2DTest::init_data_task_wrapper); - rt.register_task(INIT_MINIAERO_DATA_TASK, MiniAeroTest::init_data_task_wrapper); + + if (dimension1 < 1 || dimension1 > 3 || dimension2 < 1 || dimension2 > 3) + assert(false && "invalid dimension"); + + rt.register_task(INIT_BYFIELD_DATA_TASK, BYFIELD_INIT_TBL[dimension1 - 1]); + rt.register_task(INIT_IMAGE_DATA_TASK, IMAGE_INIT_TBL[dimension1 - 1][dimension2 - 1]); signal(SIGALRM, sigalrm_handler); From 3434f6a0246e6fe0060dae02ded9449bc50c0fdf Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Mon, 9 Mar 2026 00:37:51 -0700 Subject: [PATCH 18/32] implemented cpu bvh --- src/realm/deppart/byfield.cc | 2 +- src/realm/deppart/byfield_gpu_impl.hpp | 18 +- src/realm/deppart/image.cc | 5 +- src/realm/deppart/image.h | 2 + src/realm/deppart/image_gpu_impl.hpp | 38 +- src/realm/deppart/partitions.cc | 1 - src/realm/deppart/partitions.h | 5 + src/realm/deppart/partitions_gpu_impl.hpp | 108 ++- src/realm/deppart/preimage.cc | 24 +- src/realm/deppart/preimage_gpu_impl.hpp | 48 +- src/realm/deppart/preimage_gpu_tmpl.cu | 10 - src/realm/deppart/sparsity_impl.cc | 331 ++++++- src/realm/indexspace.inl | 90 +- src/realm/sparsity.h | 66 +- tests/benchmark.cc | 1031 +++++++++++++++++---- tests/unit_tests/sparsity_map_test.cc | 2 +- 16 files changed, 1513 insertions(+), 268 deletions(-) diff --git a/src/realm/deppart/byfield.cc b/src/realm/deppart/byfield.cc index 
ce543e1b44..7c1fe148c1 100644 --- a/src/realm/deppart/byfield.cc +++ b/src/realm/deppart/byfield.cc @@ -44,7 +44,7 @@ namespace Realm { if (val) { device_size = atoi(val); } - size_t optimal_size = is.bounds.volume() * sizeof(RectDesc); + size_t optimal_size = is.bounds.volume() * 10 * sizeof(RectDesc); std::vector affinities; unsigned best_bandwidth = 0; Processor best_proc = Processor::NO_PROC; diff --git a/src/realm/deppart/byfield_gpu_impl.hpp b/src/realm/deppart/byfield_gpu_impl.hpp index e309cf7609..849556a53d 100644 --- a/src/realm/deppart/byfield_gpu_impl.hpp +++ b/src/realm/deppart/byfield_gpu_impl.hpp @@ -27,7 +27,7 @@ void GPUByFieldMicroOp::execute() size_t tile_size = field_data[0].scratch_buffer.get_layout()->bytes_used; - Arena buffer_arena(reinterpret_cast(AffineAccessor(field_data[0].scratch_buffer, 0).base), tile_size); + Arena buffer_arena(field_data[0].scratch_buffer.pointer_untyped(0, tile_size), tile_size); inst_space.offsets = buffer_arena.alloc(field_data.size() + 1); inst_space.num_children = field_data.size(); @@ -203,14 +203,18 @@ void GPUByFieldMicroOp::execute() } catch (arena_oom&) { std::cout << "Caught arena_oom, reducing tile size from " << curr_tile << " to " << curr_tile / 2 << std::endl; - std::cout << buffer_arena.used() << " bytes used in arena." 
<< std::endl; curr_tile /= 2; if (curr_tile == 0) { - host_fallback = true; - if (num_output > 0) { - this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); + if (host_fallback) { + GPUMicroOp::shatter_rects(inst_space, num_completed); + curr_tile = 1; + } else { + host_fallback = true; + if (num_output > 0) { + this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); + } + curr_tile = tile_size / 2; } - curr_tile = tile_size / 2; } } } @@ -254,7 +258,7 @@ void GPUByFieldMicroOp::execute() if (entry_counts[idx] > 0) { Rect* h_rects = reinterpret_cast *>(AffineAccessor(h_instances[idx], 0).base); span> h_rects_span(h_rects, entry_counts[idx]); - impl->contribute_dense_rect_list(h_rects_span, false); + impl->contribute_dense_rect_list(h_rects_span, true); h_instances[idx].destroy(); } else { impl->contribute_nothing(); diff --git a/src/realm/deppart/image.cc b/src/realm/deppart/image.cc index b0dcd4383a..8d37d81969 100644 --- a/src/realm/deppart/image.cc +++ b/src/realm/deppart/image.cc @@ -38,9 +38,12 @@ namespace Realm { std::vector& requirements) const { size_t minimal_size = 0; size_t source_entries = 0; - bool bvh = true; + bool bvh = false; for (auto subspace : source_spaces) { source_entries += subspace.entries == 0 ? 
1 : subspace.entries; + if (subspace.entries > 1) { + bvh = true; + } } minimal_size += sizeof(Rect) * source_entries; if (this->dense()) { diff --git a/src/realm/deppart/image.h b/src/realm/deppart/image.h index ab81ecafae..4eed6da566 100644 --- a/src/realm/deppart/image.h +++ b/src/realm/deppart/image.h @@ -171,6 +171,8 @@ namespace Realm { void add_sparsity_output(IndexSpace _source, SparsityMap _sparsity); + bool is_image_microop() const override { return true; } + protected: IndexSpace parent_space; DomainTransform domain_transform; diff --git a/src/realm/deppart/image_gpu_impl.hpp b/src/realm/deppart/image_gpu_impl.hpp index 643845296d..83f907d922 100644 --- a/src/realm/deppart/image_gpu_impl.hpp +++ b/src/realm/deppart/image_gpu_impl.hpp @@ -7,8 +7,6 @@ #include #include "realm/nvtx.h" -#include - namespace Realm { //TODO: INTERSECTING INPUT/OUTPUT RECTS CAN BE DONE WITH BVH IF BECOME EXPENSIVE @@ -46,12 +44,12 @@ void GPUImageMicroOp::gpu_populate_rngs() return; } - NVTX_DEPPART(gpu_image); + NVTX_DEPPART(gpu_image_range); RegionInstance buffer = domain_transform.range_data[0].scratch_buffer; size_t tile_size = buffer.get_layout()->bytes_used; std::cout << "Using tile size of " << tile_size << " bytes." << std::endl; - Arena buffer_arena(reinterpret_cast(AffineAccessor(buffer, 0).base), tile_size); + Arena buffer_arena(buffer.pointer_untyped(0, tile_size), tile_size); cudaStream_t stream = Cuda::get_task_cuda_stream(); @@ -244,14 +242,18 @@ void GPUImageMicroOp::gpu_populate_rngs() } catch (arena_oom&) { std::cout << "Caught arena_oom, reducing tile size from " << curr_tile << " to " << curr_tile / 2 << std::endl; - std::cout << buffer_arena.used() << " bytes used in arena." 
<< std::endl; curr_tile /= 2; if (curr_tile == 0) { - host_fallback = true; - if (num_output > 0) { - this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); + if (host_fallback) { + GPUMicroOp::shatter_rects(inst_space, num_completed); + curr_tile = 1; + } else { + host_fallback = true; + if (num_output > 0) { + this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); + } + curr_tile = tile_size / 2; } - curr_tile = tile_size / 2; } } } @@ -331,7 +333,7 @@ void GPUImageMicroOp::gpu_populate_ptrs() size_t tile_size = buffer.get_layout()->bytes_used; std::cout << "Using tile size of " << tile_size << " bytes." << std::endl; - Arena buffer_arena(reinterpret_cast(AffineAccessor(buffer, 0).base), tile_size); + Arena buffer_arena(buffer.pointer_untyped(0, tile_size), tile_size); collapsed_space src_space; src_space.offsets = buffer_arena.alloc(sources.size()+1); @@ -390,8 +392,6 @@ void GPUImageMicroOp::gpu_populate_ptrs() try { std::cout << "Image iteration " << count++ << ", completed " << num_completed << " / " << inst_space.num_entries << " entries." << std::endl; buffer_arena.start(); - std::cout << "Amount Used: " << buffer_arena.used() << std::endl; - std::cout << "Expected Amount Used: " << left + num_output * sizeof(RectDesc) << std::endl; if (num_completed + curr_tile > inst_space.num_entries) { curr_tile = inst_space.num_entries - num_completed; } @@ -514,14 +514,18 @@ void GPUImageMicroOp::gpu_populate_ptrs() } catch (arena_oom&) { std::cout << "Caught arena_oom, reducing tile size from " << curr_tile << " to " << curr_tile / 2 << std::endl; - std::cout << buffer_arena.used() << " bytes used in arena." 
<< std::endl; curr_tile /= 2; if (curr_tile == 0) { - host_fallback = true; - if (num_output > 0) { - this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); + if (host_fallback) { + GPUMicroOp::shatter_rects(inst_space, num_completed); + curr_tile = 1; + } else { + host_fallback = true; + if (num_output > 0) { + this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); + } + curr_tile = tile_size / 2; } - curr_tile = tile_size / 2; } } } diff --git a/src/realm/deppart/partitions.cc b/src/realm/deppart/partitions.cc index f342519f71..1c16670c47 100644 --- a/src/realm/deppart/partitions.cc +++ b/src/realm/deppart/partitions.cc @@ -18,7 +18,6 @@ // index space partitioning for Realm #include "realm/deppart/partitions.h" - #include "realm/profiling.h" #include "realm/runtime_impl.h" diff --git a/src/realm/deppart/partitions.h b/src/realm/deppart/partitions.h index 8b67e5e642..68e5b40084 100644 --- a/src/realm/deppart/partitions.h +++ b/src/realm/deppart/partitions.h @@ -81,6 +81,7 @@ namespace Realm { size_t* offsets; size_t num_children; Rect bounds; + RegionInstance h_instance = RegionInstance::NO_INST; }; // Stores everything necessary to query a BVH @@ -348,6 +349,8 @@ namespace Realm { virtual void execute(void) = 0; + static void shatter_rects(collapsed_space & inst_space, size_t &num_completed); + template static void collapse_multi_space(const std::vector& field_data, collapsed_space &out_space, Arena &my_arena, cudaStream_t stream); @@ -375,6 +378,8 @@ namespace Realm { template void send_output(RectDesc* d_rects, size_t total_rects, Arena &my_arena, const Container& ctr, IndexFn getIndex, MapFn getMap); + virtual bool is_image_microop() const { return false; } + bool exclusive = false; }; diff --git a/src/realm/deppart/partitions_gpu_impl.hpp b/src/realm/deppart/partitions_gpu_impl.hpp index 82abfd57d9..0827f1844c 100644 --- a/src/realm/deppart/partitions_gpu_impl.hpp +++ 
b/src/realm/deppart/partitions_gpu_impl.hpp @@ -40,10 +40,18 @@ //NVTX macros to only add ranges if defined. #ifdef REALM_USE_NVTX - #define NVTX_CAT(a,b) a##b +#include - #define NVTX_DEPPART(message) \ - nvtxScopedRange NVTX_CAT(nvtx_, message)("cuda", #message, 0) +inline int32_t next_nvtx_payload() { + static std::atomic counter{0}; + return counter.fetch_add(1, std::memory_order_relaxed); +} + +#define NVTX_CAT2(a, b) a##b +#define NVTX_CAT(a, b) NVTX_CAT2(a, b) + +#define NVTX_DEPPART(message) \ + nvtxScopedRange NVTX_CAT(nvtx_, __LINE__)("cuda", #message, next_nvtx_payload()) #else @@ -98,12 +106,93 @@ namespace Realm { return found; } + template + void GPUMicroOp::shatter_rects(collapsed_space & inst_space, size_t &num_completed) { + + NVTX_DEPPART(shatter_rects); + cudaStream_t stream = Cuda::get_task_cuda_stream(); + size_t new_size = (inst_space.entries_buffer[num_completed].bounds.volume() + 1) / 2; + assert(new_size > 0); + size_t num_new_entries = 0; + std::vector offsets(inst_space.num_children + 1); + std::vector new_offsets(inst_space.num_children + 1); + CUDA_CHECK(cudaMemcpyAsync(offsets.data(), inst_space.offsets, (inst_space.num_children + 1) * sizeof(size_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + for (size_t i = 0; i < inst_space.num_children; ++i) { + new_offsets[i] = num_new_entries; + if (offsets[i+1] <= num_completed) { + continue; + } + for (size_t j = offsets[i]; j < offsets[i+1]; ++j) { + if (j >= num_completed) { + num_new_entries += (inst_space.entries_buffer[j].bounds.volume() + new_size - 1) / new_size; + } + } + } + new_offsets[inst_space.num_children] = num_new_entries; + CUDA_CHECK(cudaMemcpyAsync(inst_space.offsets, new_offsets.data(), (inst_space.num_children + 1) * sizeof(size_t), cudaMemcpyHostToDevice, stream), stream); + RegionInstance new_entries_buffer = realm_malloc(num_new_entries * sizeof(SparsityMapEntry), inst_space.h_instance.get_location()); + 
SparsityMapEntry *new_entries_ptr = reinterpret_cast *>(new_entries_buffer.pointer_untyped(0, num_new_entries * sizeof(SparsityMapEntry))); + + size_t write_loc = 0; + for (size_t i = num_completed; i < inst_space.num_entries; i++) { + Rect bounds = inst_space.entries_buffer[i].bounds; + if (bounds.volume() <= new_size) { + new_entries_ptr[write_loc] = inst_space.entries_buffer[i]; + write_loc++; + continue; + } + size_t count = (bounds.volume() + new_size - 1) / new_size; + // split in the largest dimension available + int split_dim = 0; + T total = std::max(bounds.hi[0] - bounds.lo[0] + 1, T(0)); + if(N > 1) { + for(int d = 1; d < N; d++) { + T extent = std::max(bounds.hi[d] - bounds.lo[d] + 1, T(0)); + if(extent > total) { + total = extent; + split_dim = d; + } + } + } + T px = bounds.lo[split_dim]; + // have to divide before multiplying to avoid overflow + T base_span_size = total / count; + T base_span_rem = total - (base_span_size * count); + T leftover = 0; + for(size_t j = 0; j < count; j++) { + new_entries_ptr[write_loc] = inst_space.entries_buffer[i]; + T nx = px + (base_span_size - 1); + if(base_span_rem != 0) { + leftover += base_span_rem; + if(leftover >= T(count)) { + nx += 1; + leftover -= count; + } + } + new_entries_ptr[write_loc].bounds.lo[split_dim] = px; + new_entries_ptr[write_loc].bounds.hi[split_dim] = nx; + px = nx + 1; + write_loc++; + } + } + + num_completed = 0; + inst_space.entries_buffer = new_entries_ptr; + inst_space.num_entries = num_new_entries; + inst_space.h_instance.destroy(); + inst_space.h_instance = new_entries_buffer; + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + } + //Given a list of spaces, compacts them all into one collapsed_space template template void GPUMicroOp::collapse_multi_space(const std::vector& spaces, collapsed_space &out_space, Arena &my_arena, cudaStream_t stream) { + NVTX_DEPPART(collapse_multi_space); out_space.bounds = Rect::make_empty(); char *val = std::getenv("SHATTER_SIZE"); // or any env 
var @@ -197,6 +286,8 @@ namespace Realm { CUDA_CHECK(cudaMemcpyAsync(out_space.entries_buffer, h_entries, out_space.num_entries * sizeof(SparsityMapEntry), cudaMemcpyHostToDevice, stream), stream); CUDA_CHECK(cudaStreamSynchronize(stream), stream); h_instance.destroy(); + } else { + out_space.h_instance = h_instance; } CUDA_CHECK(cudaStreamSynchronize(stream), stream); @@ -206,6 +297,8 @@ namespace Realm { template void GPUMicroOp::collapse_parent_space(const IndexSpace& parent_space, collapsed_space &out_space, Arena &my_arena, cudaStream_t stream) { + + NVTX_DEPPART(collapse_parent_space); if (parent_space.dense()) { SparsityMapEntry entry; entry.bounds = parent_space.bounds; @@ -229,6 +322,7 @@ namespace Realm { template void GPUMicroOp::build_bvh(const collapsed_space &space, BVH &result, Arena &my_arena, cudaStream_t stream) { + NVTX_DEPPART(build_bvh); //We want to keep the entire BVH that we return in one instance for convenience. size_t indices_instance_size = space.num_entries * sizeof(uint64_t); size_t labels_instance_size = space.offsets == nullptr ? 
0 : space.num_entries * sizeof(size_t); @@ -329,6 +423,7 @@ namespace Realm { void GPUMicroOp::construct_input_rectlist(const collapsed_space &lhs, const collapsed_space &rhs, out_t* &d_valid_rects, size_t& out_size, uint32_t* counters, uint32_t* out_offsets, Arena &my_arena, cudaStream_t stream) { + NVTX_DEPPART(construct_input_rectlist); CUDA_CHECK(cudaMemsetAsync(counters, 0, (lhs.num_children) * sizeof(uint32_t), stream), stream); BVH my_bvh; @@ -388,6 +483,8 @@ namespace Realm { template void GPUMicroOp::volume_prefix_sum(const out_t* d_rects, size_t total_rects, size_t* &d_prefix_rects, size_t& num_pts, Arena &my_arena, cudaStream_t stream) { + + NVTX_DEPPART(volume_prefix_sum); d_prefix_rects = my_arena.alloc(total_rects+1); CUDA_CHECK(cudaMemsetAsync(d_prefix_rects, 0, sizeof(size_t), stream), stream); @@ -1477,7 +1574,7 @@ namespace Realm { template void GPUMicroOp::split_output(RectDesc* d_rects, size_t total_rects, std::vector &output_instances, std::vector &output_counts, Arena &my_arena) { - NVTX_DEPPART(send_output); + NVTX_DEPPART(split_output); cudaStream_t stream = Cuda::get_task_cuda_stream(); bool use_sysmem = false; @@ -1626,7 +1723,8 @@ namespace Realm { CUDA_CHECK(cudaMemcpyAsync(h_rects, final_rects + start, (end - start) * sizeof(Rect), cudaMemcpyDeviceToHost, stream), stream); CUDA_CHECK(cudaStreamSynchronize(stream), stream); span> h_rects_span(h_rects, end - start); - impl->contribute_dense_rect_list(h_rects_span, false); + bool disjoint = !this->is_image_microop(); + impl->contribute_dense_rect_list(h_rects_span, disjoint); h_rects_instance.destroy(); } else { impl->contribute_nothing(); diff --git a/src/realm/deppart/preimage.cc b/src/realm/deppart/preimage.cc index b25c8b2c41..4feaa585e4 100644 --- a/src/realm/deppart/preimage.cc +++ b/src/realm/deppart/preimage.cc @@ -43,6 +43,9 @@ namespace Realm { bool bvh = false; for (auto subspace : target_spaces) { source_entries += subspace.entries == 0 ? 
1 : subspace.entries; + if (subspace.entries > 1) { + bvh = true; + } } minimal_size += sizeof(Rect) * source_entries; if (this->dense()) { @@ -54,9 +57,9 @@ namespace Realm { minimal_size += (source_entries * sizeof(uint64_t)) + (source_entries * sizeof(size_t)) + - ((2*source_entries - 1) * sizeof(Rect)) + + ((2*source_entries - 1) * sizeof(Rect)) + (2 * (2*source_entries - 1) * sizeof(int)) + - sizeof(Rect) + + sizeof(Rect) + (2 * source_entries * sizeof(uint64_t)) + (source_entries * sizeof(uint64_t)); } @@ -72,7 +75,7 @@ namespace Realm { device_size = atoi(val); } minimal_size = max(minimal_size, device_size); - size_t optimal_size = is.bounds.volume() * sizeof(Rect) * target_spaces.size() + minimal_size; + size_t optimal_size = is.bounds.volume() * sizeof(Rect) * target_spaces.size() * 10 + minimal_size; std::vector affinities; unsigned best_bandwidth = 0; Processor best_proc = Processor::NO_PROC; @@ -216,6 +219,13 @@ namespace Realm { { TimeStamp ts("PreimageMicroOp::execute", true, &log_uop_timing); std::map *> rect_map; + if (is_ranged || N2 > 1) { + for (const IndexSpace& target : targets) { + if (!target.dense()) { + target.sparsity.impl()->request_bvh(); + } + } + } if(is_ranged) populate_bitmasks_ranges(rect_map); @@ -737,6 +747,14 @@ namespace Realm { TimeStamp ts("PreimageMicroOp::execute", true, &log_uop_timing); std::map *> rect_map; + if (N2 > 1) { + for (const IndexSpace& target : targets) { + if (!target.dense()) { + target.sparsity.impl()->request_bvh(); + } + } + } + populate_bitmasks(rect_map); #ifdef DEBUG_PARTITIONING std::cout << rect_map.size() << " non-empty preimages present in instance " diff --git a/src/realm/deppart/preimage_gpu_impl.hpp b/src/realm/deppart/preimage_gpu_impl.hpp index 3e464c582f..960e427beb 100644 --- a/src/realm/deppart/preimage_gpu_impl.hpp +++ b/src/realm/deppart/preimage_gpu_impl.hpp @@ -19,9 +19,9 @@ namespace Realm { size_t tile_size = buffer.get_layout()->bytes_used; std::cout << "Using tile size of " << 
tile_size << " bytes." << std::endl; - Arena buffer_arena(reinterpret_cast(AffineAccessor(buffer, 0).base), tile_size); + Arena buffer_arena(buffer.pointer_untyped(0, tile_size), tile_size); - NVTX_DEPPART(gpu_preimage); + NVTX_DEPPART(gpu_preimage_range); Memory sysmem; find_memory(sysmem, Memory::SYSTEM_MEM); @@ -85,7 +85,6 @@ namespace Realm { std::cout << "Preimage iteration " << count++ << ", completed " << num_completed << " / " << inst_space.num_entries << " entries." << std::endl; buffer_arena.start(); - std::cout << "Amount Used: " << buffer_arena.used() << std::endl; if (num_completed + curr_tile > inst_space.num_entries) { curr_tile = inst_space.num_entries - num_completed; } @@ -114,8 +113,6 @@ namespace Realm { size_t* d_prefix_rects; GPUMicroOp::volume_prefix_sum(d_valid_rects, num_valid_rects, d_prefix_rects, total_pts, buffer_arena, stream); - nvtx_range_push("cuda", "build target entries"); - PointDesc* d_points; size_t num_valid_points; @@ -258,14 +255,18 @@ namespace Realm { } catch (arena_oom&) { std::cout << "Caught arena_oom, reducing tile size from " << curr_tile << " to " << curr_tile / 2 << std::endl; - std::cout << buffer_arena.used() << " bytes used in arena." 
<< std::endl; curr_tile /= 2; if (curr_tile == 0) { - host_fallback = true; - if (num_output > 0) { - this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); + if (host_fallback) { + GPUMicroOp::shatter_rects(inst_space, num_completed); + curr_tile = 1; + } else { + host_fallback = true; + if (num_output > 0) { + this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); + } + curr_tile = tile_size / 2; } - curr_tile = tile_size / 2; } } } @@ -307,7 +308,7 @@ namespace Realm { if (entry_counts[idx] > 0) { Rect* h_rects = reinterpret_cast *>(AffineAccessor(h_instances[idx], 0).base); span> h_rects_span(h_rects, entry_counts[idx]); - impl->contribute_dense_rect_list(h_rects_span, false); + impl->contribute_dense_rect_list(h_rects_span, true); h_instances[idx].destroy(); } else { impl->contribute_nothing(); @@ -326,15 +327,15 @@ namespace Realm { size_t tile_size = buffer.get_layout()->bytes_used; std::cout << "Using tile size of " << tile_size << " bytes." << std::endl; - Arena buffer_arena(reinterpret_cast(AffineAccessor(buffer, 0).base), tile_size); - - NVTX_DEPPART(gpu_preimage); + Arena buffer_arena(buffer.pointer_untyped(0, tile_size), tile_size); Memory sysmem; find_memory(sysmem, Memory::SYSTEM_MEM); cudaStream_t stream = Cuda::get_task_cuda_stream(); + NVTX_DEPPART(gpu_preimage); + collapsed_space inst_space; // We combine all of our instances into one to batch work, tracking the offsets between instances. @@ -392,7 +393,6 @@ namespace Realm { std::cout << "Preimage iteration " << count++ << ", completed " << num_completed << " / " << inst_space.num_entries << " entries." 
<< std::endl; buffer_arena.start(); - std::cout << "Amount Used: " << buffer_arena.used() << std::endl; if (num_completed + curr_tile > inst_space.num_entries) { curr_tile = inst_space.num_entries - num_completed; } @@ -421,8 +421,6 @@ namespace Realm { size_t* d_prefix_rects; GPUMicroOp::volume_prefix_sum(d_valid_rects, num_valid_rects, d_prefix_rects, total_pts, buffer_arena, stream); - nvtx_range_push("cuda", "build target entries"); - PointDesc* d_points; size_t num_valid_points; @@ -565,14 +563,18 @@ namespace Realm { } catch (arena_oom&) { std::cout << "Caught arena_oom, reducing tile size from " << curr_tile << " to " << curr_tile / 2 << std::endl; - std::cout << buffer_arena.used() << " bytes used in arena." << std::endl; curr_tile /= 2; if (curr_tile == 0) { - host_fallback = true; - if (num_output > 0) { - this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); + if (host_fallback) { + GPUMicroOp::shatter_rects(inst_space, num_completed); + curr_tile = 1; + } else { + host_fallback = true; + if (num_output > 0) { + this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); + } + curr_tile = tile_size / 2; } - curr_tile = tile_size / 2; } } } @@ -614,7 +616,7 @@ namespace Realm { if (entry_counts[idx] > 0) { Rect* h_rects = reinterpret_cast *>(AffineAccessor(h_instances[idx], 0).base); span> h_rects_span(h_rects, entry_counts[idx]); - impl->contribute_dense_rect_list(h_rects_span, false); + impl->contribute_dense_rect_list(h_rects_span, true); h_instances[idx].destroy(); } else { impl->contribute_nothing(); diff --git a/src/realm/deppart/preimage_gpu_tmpl.cu b/src/realm/deppart/preimage_gpu_tmpl.cu index eb532a5a1d..be634fcc34 100644 --- a/src/realm/deppart/preimage_gpu_tmpl.cu +++ b/src/realm/deppart/preimage_gpu_tmpl.cu @@ -13,8 +13,6 @@ * limitations under the License. 
*/ -// per‐dimension instantiator for the GPU version of -// ImageMicroOp<…>::gpu_populate_bitmasks_ptrs #define REALM_TEMPLATES_ONLY #include "realm/deppart/preimage_gpu_kernels.hpp" @@ -48,14 +46,6 @@ namespace Realm { #define N1 INST_N1 #define N2 INST_N2 - // Replace MyBitmask with whatever bitmask‐type you actually use - // (it must have an `as_vector.rects` member that your code touches). - // - // This explicitly instantiates: - // template void - // ImageMicroOp::gpu_populate_bitmasks_ptrs( - // std::map&); - // #define DO_DOUBLE(T1,T2) \ template class GPUPreimageMicroOp; \ template class PreimageMicroOp; diff --git a/src/realm/deppart/sparsity_impl.cc b/src/realm/deppart/sparsity_impl.cc index b4938edb3b..a1a511b744 100644 --- a/src/realm/deppart/sparsity_impl.cc +++ b/src/realm/deppart/sparsity_impl.cc @@ -883,6 +883,334 @@ namespace Realm { } } + template + int SparsityMapPublicImpl::choose_bvh_split_axis( + const std::vector& entry_ids, + size_t lo, size_t hi) const + { + assert(lo < hi); + + Rect bbox = entries[entry_ids[lo]].bounds; + for(size_t i = lo + 1; i < hi; i++) + bbox = bbox.union_bbox(entries[entry_ids[i]].bounds); + + int split_axis = 0; + long double best_extent = + static_cast(bbox.hi[0]) - static_cast(bbox.lo[0]); + + for(int d = 1; d < N; d++) { + long double extent = + static_cast(bbox.hi[d]) - static_cast(bbox.lo[d]); + if(extent > best_extent) { + best_extent = extent; + split_axis = d; + } + } + + return split_axis; + } + + template +bool SparsityMapPublicImpl::bvh_centroid_less(int axis, + uint32_t a, + uint32_t b) const + { + const Rect& ra = entries[a].bounds; + const Rect& rb = entries[b].bounds; + + // comparing (lo + hi) is equivalent to comparing centroids along the axis + const auto sa = ra.lo[axis] + ra.hi[axis]; + const auto sb = rb.lo[axis] + rb.hi[axis]; + if(sa != sb) + return (sa < sb); + + // deterministic tie-break + for(int i = 0; i < N; i++) { + if(ra.lo[i] != rb.lo[i]) return (ra.lo[i] < rb.lo[i]); + if(ra.hi[i] 
!= rb.hi[i]) return (ra.hi[i] < rb.hi[i]); + } + + return (a < b); + } + + template + int SparsityMapPublicImpl::build_bvh_subtree(CPU_BVH& bvh, + std::vector& entry_ids, + size_t lo, + size_t hi) const + { + assert(lo < hi); + + // leaf: exactly one sparsity-map entry + if((hi - lo) == 1) { + const uint32_t entry_idx = entry_ids[lo]; + const uint32_t leaf_slot = static_cast(bvh.leaf_entries.size()); + bvh.leaf_entries.push_back(entry_idx); + + typename CPU_BVH::Node node; + node.bounds = entries[entry_idx].bounds; + node.left = -1; + node.right = -1; + node.begin = leaf_slot; + node.end = leaf_slot + 1; + + const int node_idx = static_cast(bvh.nodes.size()); + bvh.nodes.push_back(node); + return node_idx; + } + + const int split_axis = choose_bvh_split_axis(entry_ids, lo, hi); + const size_t mid = lo + ((hi - lo) >> 1); + + std::nth_element(entry_ids.begin() + lo, + entry_ids.begin() + mid, + entry_ids.begin() + hi, + [this, split_axis](uint32_t a, uint32_t b) { + return bvh_centroid_less(split_axis, a, b); + }); + + const int left_idx = build_bvh_subtree(bvh, entry_ids, lo, mid); + const int right_idx = build_bvh_subtree(bvh, entry_ids, mid, hi); + + typename CPU_BVH::Node node; + node.left = left_idx; + node.right = right_idx; + node.begin = bvh.nodes[left_idx].begin; + node.end = bvh.nodes[right_idx].end; + node.bounds = bvh.nodes[left_idx].bounds.union_bbox(bvh.nodes[right_idx].bounds); + + const int node_idx = static_cast(bvh.nodes.size()); + bvh.nodes.push_back(node); + return node_idx; + } + + template + void SparsityMapPublicImpl::request_bvh(void) + { + // fast path + if(bvh_valid.load_acquire()) + return; + + // the BVH indexes the entry list, so entries must already exist + if(!entries_valid.load_acquire()) + assert(false); + + if (from_gpu) { + auto gpu_entries = get_entries(); + entries = std::vector>(gpu_entries.data(), gpu_entries.data() + gpu_entries.size()); + } + + std::lock_guard lock(bvh_mutex); + + // somebody else may have built it while we 
were waiting + if(bvh_valid.load()) + return; + + CPU_BVH new_bvh; + new_bvh.clear(); + + const size_t count = entries.size(); + + // empty sparsity map: publish an empty-but-valid BVH + if(count == 0) { + entries_bvh = std::move(new_bvh); + bvh_valid.store_release(true); + return; + } + + // one leaf per sparsity-map entry + std::vector entry_ids(count); + for(uint32_t i = 0; i < count; i++) { + assert(!entries[i].sparsity.exists() && (entries[i].bitmap == 0)); + entry_ids[i] = i; + } + + // exact upper bounds for a binary tree with one entry per leaf + new_bvh.nodes.reserve((2 * count) - 1); + new_bvh.leaf_entries.reserve(count); + + new_bvh.root = build_bvh_subtree(new_bvh, entry_ids, 0, count); + + // publish only after construction is complete + entries_bvh = std::move(new_bvh); + bvh_valid.store_release(true); + } + + template bool SparsityMapPublicImpl::has_bvh() const + { + return bvh_valid.load_acquire(); + } + + + template + bool CPU_BVH::contains(const span>& entries, + const Point& p) const + { + if(!valid()) + return false; + + // Root bbox reject. + if(!nodes[root].bounds.contains(p)) + return false; + + std::vector stack; + stack.reserve(64); + stack.push_back(root); + + while(!stack.empty()) { + const int node_idx = stack.back(); + stack.pop_back(); + + const Node& node = nodes[node_idx]; + if(!node.bounds.contains(p)) + continue; + + if(node.is_leaf()) { + // Leaves currently correspond to exactly one entry, but use the range + // to keep the code compatible with future small-bucket leaves. + for(uint32_t i = node.begin; i < node.end; i++) { + const uint32_t entry_idx = leaf_entries[i]; + const SparsityMapEntry& entry = entries[entry_idx]; + + if(!entry.bounds.contains(p)) + continue; + + if(entry.sparsity.exists()) { + assert(0); + } else if(entry.bitmap != 0) { + assert(0); + } else { + return true; + } + } + } else { + // Push children whose bbox might still contain the point. 
+ const int left = node.left; + const int right = node.right; + + if((right >= 0) && nodes[right].bounds.contains(p)) + stack.push_back(right); + if((left >= 0) && nodes[left].bounds.contains(p)) + stack.push_back(left); + } + } + + return false; + } + + template + bool CPU_BVH::contains_any(const span>& entries, + const Rect& r) const + { + if(!valid()) + return false; + + // Root bbox reject. + if(!nodes[root].bounds.overlaps(r)) + return false; + + std::vector stack; + stack.reserve(64); + stack.push_back(root); + + while(!stack.empty()) { + const int node_idx = stack.back(); + stack.pop_back(); + + const Node& node = nodes[node_idx]; + if(!node.bounds.overlaps(r)) + continue; + + if(node.is_leaf()) { + for(uint32_t i = node.begin; i < node.end; i++) { + const uint32_t entry_idx = leaf_entries[i]; + const SparsityMapEntry& entry = entries[entry_idx]; + + if(!entry.bounds.overlaps(r)) + continue; + + if(entry.sparsity.exists()) { + assert(0); + } else if(entry.bitmap != 0) { + assert(0); + } else { + return true; + } + } + } else { + const int left = node.left; + const int right = node.right; + + if((right >= 0) && nodes[right].bounds.overlaps(r)) + stack.push_back(right); + if((left >= 0) && nodes[left].bounds.overlaps(r)) + stack.push_back(left); + } + } + + return false; + } + + template + bool CPU_BVH::contains_all(const span>& entries, + const Rect& r) const + { + if(!valid()) + return false; + + // Root bbox reject. 
+ if(!nodes[root].bounds.contains(r)) + return false; + + size_t total_volume = 0; + + std::vector stack; + stack.reserve(64); + stack.push_back(root); + + while(!stack.empty()) { + const int node_idx = stack.back(); + stack.pop_back(); + + const Node& node = nodes[node_idx]; + if(!node.bounds.overlaps(r)) + continue; + + if(node.is_leaf()) { + for(uint32_t i = node.begin; i < node.end; i++) { + const uint32_t entry_idx = leaf_entries[i]; + const SparsityMapEntry& entry = entries[entry_idx]; + + if(!entry.bounds.overlaps(r)) + continue; + + if(entry.sparsity.exists()) { + assert(0); + } else if(entry.bitmap != 0) { + assert(0); + } else { + Rect isect = entry.bounds.intersection(r); + total_volume += isect.volume(); + + // Early out as soon as we know we've covered enough. + if(total_volume >= r.volume()) + return true; + } + } + } else { + const int left = node.left; + const int right = node.right; + + if((right >= 0) && nodes[right].bounds.overlaps(r)) + stack.push_back(right); + if((left >= 0) && nodes[left].bounds.overlaps(r)) + stack.push_back(left); + } + } + + return (total_volume >= r.volume()); + } + //////////////////////////////////////////////////////////////////////// // // class SparsityMapImpl @@ -2036,7 +2364,8 @@ SparsityMapImpl::~SparsityMapImpl(void) #define DOIT(N, T) \ template class SparsityMapPublicImpl; \ template class SparsityMapImpl; \ - template class SparsityMap; + template class SparsityMap; \ + template struct CPU_BVH; FOREACH_NT(DOIT) }; // namespace Realm diff --git a/src/realm/indexspace.inl b/src/realm/indexspace.inl index d2c41e4c4e..b55e8b1aee 100644 --- a/src/realm/indexspace.inl +++ b/src/realm/indexspace.inl @@ -613,6 +613,11 @@ namespace Realm { } return true; } else { + + if (impl->has_bvh()) { + return impl->entries_bvh.contains(entries, p); + } + for(size_t i = 0; i < entries.size(); i++) { SparsityMapEntry entry = entries[i]; if(!entry.bounds.contains(p)) { @@ -639,30 +644,34 @@ namespace Realm { if(!bounds.contains(r)) 
return false; - if(!dense()) { - // test against sparsity map too - size_t total_volume = 0; - SparsityMapPublicImpl *impl = sparsity.impl(); - span> entries = impl->get_entries(); - for(size_t i = 0; i < entries.size(); i++) { - SparsityMapEntry entry = entries[i]; - if(!entry.bounds.overlaps(r)) continue; - if(entry.sparsity.exists()) { - assert(0); - } else if(entry.bitmap != 0) { - assert(0); - } else { - Rect isect = entry.bounds.intersection(r); - total_volume += isect.volume(); - } - } + if(dense()) { + return true; + } + // test against sparsity map too + size_t total_volume = 0; + SparsityMapPublicImpl *impl = sparsity.impl(); + span> entries = impl->get_entries(); - // did we miss anything? - if(total_volume < r.volume()) - return false; + if(impl->has_bvh()) { + return impl->entries_bvh.contains_all(entries, r); } - return true; + for(size_t i = 0; i < entries.size(); i++) { + SparsityMapEntry entry = entries[i]; + if(!entry.bounds.overlaps(r)) + continue; + if(entry.sparsity.exists()) { + assert(0); + } else if(entry.bitmap != 0) { + assert(0); + } else { + Rect isect = entry.bounds.intersection(r); + total_volume += isect.volume(); + } + } + + // did we miss anything? 
+ return (total_volume == r.volume()); } template @@ -672,26 +681,31 @@ namespace Realm { if(!bounds.overlaps(r)) return false; - if(!dense()) { - // test against sparsity map too - SparsityMapPublicImpl *impl = sparsity.impl(); - span> entries = impl->get_entries(); - for(size_t i = 0; i < entries.size(); i++) { - SparsityMapEntry entry = entries[i]; - if(!entry.bounds.overlaps(r)) continue; - if(entry.sparsity.exists()) { - assert(0); - } else if(entry.bitmap != 0) { - assert(0); - } else { - return true; - } + if(dense()) { + return true; + } + // test against sparsity map too + SparsityMapPublicImpl *impl = sparsity.impl(); + span> entries = impl->get_entries(); + + if(impl->has_bvh()) { + return impl->entries_bvh.contains_any(entries, r); + } + + for(size_t i = 0; i < entries.size(); i++) { + SparsityMapEntry entry = entries[i]; + if(!entry.bounds.overlaps(r)) + continue; + if(entry.sparsity.exists()) { + assert(0); + } else if(entry.bitmap != 0) { + assert(0); + } else { + return true; } - - return false; } - return true; + return false; } template diff --git a/src/realm/sparsity.h b/src/realm/sparsity.h index bf46284c6d..dc9fe74300 100644 --- a/src/realm/sparsity.h +++ b/src/realm/sparsity.h @@ -30,6 +30,7 @@ #include "realm/atomics.h" #include +#include #include /** @@ -153,6 +154,44 @@ namespace Realm { HierarchicalBitMap *bitmap; }; + template + struct CPU_BVH { + struct Node { + Rect bounds; + int left = -1; + int right = -1; + + // range in leaf_entries covered by this subtree + uint32_t begin = 0; + uint32_t end = 0; + + bool is_leaf() const { return left < 0; } + }; + + std::vector nodes; + std::vector leaf_entries; + int root = -1; + + bool valid() const { + return root >= 0; + } + + void clear() { + nodes.clear(); + leaf_entries.clear(); + root = -1; + } + + bool contains(const span>& entries, + const Point& p) const; + + bool contains_any(const span>& entries, + const Rect& r) const; + + bool contains_all(const span>& entries, + const Rect& r) 
const; + }; + template REALM_PUBLIC_API std::ostream &operator<<(std::ostream &os, const SparsityMapEntry &entry); @@ -173,6 +212,12 @@ namespace Realm { // cannot be constructed directly SparsityMapPublicImpl(void); + int choose_bvh_split_axis(const std::vector& entry_ids, + size_t lo, size_t hi) const; + bool bvh_centroid_less(int axis, uint32_t a, uint32_t b) const; + int build_bvh_subtree(CPU_BVH &bvh, std::vector &entry_ids, + size_t lo, size_t hi) const; + public: /** * Make this sparsity map valid. @@ -244,8 +289,27 @@ namespace Realm { bool compute_covering(const Rect &bounds, size_t max_rects, int max_overhead, std::vector> &covering); + /** + * If this sparsity map doesn't already have an acceleration structure, + * build a BVH over the entries. + */ + REALM_PUBLIC_API + void request_bvh(void); + + /** + * Determine whether this sparsity map has an acceleration structure. + * @return true if the sparsity map has a valid bvh, false otherwise + */ + bool has_bvh() const; + + CPU_BVH entries_bvh; + + + protected: - atomic entries_valid{false}, approx_valid{false}; + atomic entries_valid{false}, approx_valid{false}, bvh_valid{false}; + + std::mutex bvh_mutex; //BOTH RegionInstance and vector are returned as a span //only on can be valid (i.e. 
only finalize or gpu_finalize can be called, not both) diff --git a/tests/benchmark.cc b/tests/benchmark.cc index 9615a3bcbc..6b78151d68 100644 --- a/tests/benchmark.cc +++ b/tests/benchmark.cc @@ -42,16 +42,8 @@ enum TOP_LEVEL_TASK = Processor::TASK_ID_FIRST_AVAILABLE + 0, INIT_BYFIELD_DATA_TASK, INIT_IMAGE_DATA_TASK, -}; - -enum TestType { - BYFIELD = 0 -}; - -enum TransformType -{ - AFFINE = 0, - TRANSLATION = 1, + INIT_IMAGE_RANGE_DATA_TASK, + INIT_PREIMAGE_DATA_TASK }; namespace std { @@ -108,8 +100,6 @@ namespace { bool skip_check = false; int dimension1 = 1; int dimension2 = 1; - TestType test_type = BYFIELD; - TestInterface *testcfg = 0; }; // namespace @@ -149,6 +139,31 @@ Event alloc_piece(RegionInstance &result, size_t size, Memory location) { return RegionInstance::create_instance(result, location, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()); } +template +IndexSpace create_sparse_index_space(const Rect &bounds, size_t sparse_factor, + bool randomize, size_t idx) +{ + std::vector> points; + for(PointInRectIterator it(bounds); it.valid; it.step()) { + size_t flattened = idx * bounds.volume(); + size_t stride = 1; + for (int d = 0; d < N; d++) { + flattened += (it.p[d] - bounds.lo[d]) * stride; + stride *= (bounds.hi[d] - bounds.lo[d] + 1); + } + if(randomize) { + if(Philox_2x32<>::rand_int(random_seed, flattened, 0, sparse_factor) == 0) { + points.push_back(it.p); + } + } else { + if(flattened % sparse_factor == 0) { + points.push_back(it.p); + } + } + } + return IndexSpace(points, true); +} + /* * Byfield test - create a graph, partition it by * node subgraph id and then check that the partitioning @@ -438,7 +453,8 @@ class ImageTest : public TestInterface { // graph config parameters int num_nodes = 1000; int num_edges = 1000; - int num_sources = 4; + int sparse_factor = 4; + int num_spaces = 4; int num_pieces = 4; std::string filename; @@ -459,14 +475,18 @@ class ImageTest : public TestInterface { continue; } if(!strcmp(argv[i], 
"-s")) { - num_sources = atoi(argv[++i]); + num_spaces = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-f")) { + sparse_factor = atoi(argv[++i]); continue; } } - if (num_nodes <= 0 || num_pieces <= 0 || num_edges <= 0 || num_sources <= 0) { - log_app.error() << "Invalid config: nodes=" << num_nodes << " colors=" << num_edges << " pieces=" << num_pieces << " sources=" << num_sources << "\n"; + if (num_nodes <= 0 || num_pieces <= 0 || num_edges <= 0 || num_spaces <= 0) { + log_app.error() << "Invalid config: nodes=" << num_nodes << " colors=" << num_edges << " pieces=" << num_pieces << " sources=" << num_spaces << "\n"; exit(1); } } @@ -540,8 +560,8 @@ class ImageTest : public TestInterface { virtual void print_info(void) { - printf("Realm %dD -> %dD Image dependent partitioning test: %d nodes, %d edges, %d pieces ,%d sources\n", (int) N2, (int) N1, - (int)num_nodes, (int) num_edges, (int)num_pieces, (int) num_sources); + printf("Realm %dD -> %dD Image dependent partitioning test: %d nodes, %d edges, %d pieces ,%d sources, %d sparse factor\n", (int) N2, (int) N1, + (int)num_nodes, (int) num_edges, (int)num_pieces, (int) num_spaces, (int) sparse_factor); } virtual Event initialize_data(const std::vector &memories, @@ -615,8 +635,13 @@ class ImageTest : public TestInterface { // Ensure that the results are identical std::vector> sources(num_pieces); - for(int i = 0; i < num_sources; i++) - sources[i] = point_field_data[i % num_pieces].index_space; + for(int i = 0; i < num_spaces; i++) { + if (sparse_factor <= 1) { + sources[i] = point_field_data[i % num_pieces].index_space; + } else { + sources[i] = create_sparse_index_space(is_nodes.bounds, sparse_factor, random_colors, i); + } + } // We need a GPU memory for GPU partitioning Memory gpu_memory; @@ -648,7 +673,7 @@ class ImageTest : public TestInterface { } std::vector> image_inputs(num_pieces); - std::vector> image_subspaces(num_sources); + std::vector> image_subspaces(num_spaces); std::vector 
image_requirements(num_pieces); for (int i = 0; i < num_pieces; i++) { @@ -656,7 +681,7 @@ class ImageTest : public TestInterface { image_inputs[i].space = point_field_data_gpu[i].index_space; } - for (int i = 0; i < num_sources; i++) { + for (int i = 0; i < num_spaces; i++) { image_subspaces[i].space = sources[i]; image_subspaces[i].entries = sources[i].dense() ? 1 : sources[i].sparsity.impl()->get_entries().size(); } @@ -730,187 +755,873 @@ class ImageTest : public TestInterface { } }; -void top_level_task(const void *args, size_t arglen, const void *userdata, size_t userlen, - Processor p) -{ - int errors = 0; - - testcfg->print_info(); - - // find all the system memories - we'll stride our data across them - // for each memory, we'll need one CPU that can do the initialization of the data - std::vector sysmems; - std::vector procs; +template +class ImageRangeTest : public TestInterface { +public: + // graph config parameters + int num_nodes = 1000; + int num_edges = 1000; + int rect_size = 10; + int num_spaces = 4; + int num_pieces = 4; + int sparse_factor = 4; + std::string filename; - Machine machine = Machine::get_machine(); + ImageRangeTest(int argc, const char *argv[]) { - std::set all_memories; - machine.get_all_memories(all_memories); - for(std::set::const_iterator it = all_memories.begin(); - it != all_memories.end(); it++) { - Memory m = *it; + for(int i = 1; i < argc; i++) { - // skip memories with no capacity for creating instances - if(m.capacity() == 0) + if(!strcmp(argv[i], "-p")) { + num_pieces = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-n")) { + num_nodes = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-e")) { + num_edges = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-r")) { + rect_size = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-s")) { + num_spaces = atoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "-f")) { + sparse_factor = atoi(argv[++i]); continue; - - if(m.kind() == 
Memory::SYSTEM_MEM) { - sysmems.push_back(m); - std::set pset; - machine.get_shared_processors(m, pset); - Processor p = Processor::NO_PROC; - for(std::set::const_iterator it2 = pset.begin(); it2 != pset.end(); - it2++) { - if(it2->kind() == Processor::LOC_PROC) { - p = *it2; - break; - } - } - assert(p.exists()); - procs.push_back(p); - log_app.debug() << "System mem #" << (sysmems.size() - 1) << " = " - << *sysmems.rbegin() << " (" << *procs.rbegin() << ")"; } } - } - assert(sysmems.size() > 0); - { - Realm::TimeStamp ts("initialization", true, &log_app); - Event e = testcfg->initialize_data(sysmems, procs); - // wait for all initialization to be done - e.wait(); + if (num_nodes <= 0 || num_pieces <= 0 || num_edges <= 0 || num_spaces <= 0 || rect_size <= 0) { + log_app.error() << "Invalid config: nodes=" << num_nodes << " colors=" << num_edges << " pieces=" << num_pieces << " sources=" << num_spaces << " rect size=" << rect_size << "\n"; + exit(1); + } } - // now actual partitioning work - { - Realm::TimeStamp ts("dependent partitioning work", true, &log_app); - - Event e = testcfg->perform_partitioning(); - - e.wait(); - } + struct InitDataArgs { + int index; + RegionInstance ri_nodes; + }; - // dynamic checks (which would be eliminated by compiler) + enum PRNGStreams { - Realm::TimeStamp ts("dynamic checks", true, &log_app); - errors += testcfg->perform_dynamic_checks(); - } + NODE_SUBGRAPH_STREAM, + }; - if(!skip_check) { - log_app.print() << "checking correctness of partitioning"; - Realm::TimeStamp ts("verification", true, &log_app); - errors += testcfg->check_partitioning(); + // assign subgraph ids to nodes + void chase_rect(int idx, Rect& color) + { + for (int d = 0; d < N1; d++) { + if(random_colors) { + color.lo[d] = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_edges); + color.hi[d] = color.lo[d] + Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, 2 * rect_size); + } else { + color.lo[d] = (idx * num_edges / 
num_nodes) % num_edges; + color.hi[d] = color.lo[d] + rect_size; + } + } } - if(errors > 0) { - printf("Exiting with errors\n"); - exit(1); + static void init_data_task_wrapper(const void *args, size_t arglen, + const void *userdata, size_t userlen, Processor p) + { + ImageRangeTest *me = (ImageRangeTest *)testcfg; + me->init_data_task(args, arglen, p); } - printf("all done!\n"); -} - -// Constructor function-pointer type -using CtorFn = TestInterface* (*)(int, const char** argv); - -// ---- Byfield constructors ---- -template -static TestInterface* make_byfield(int argc, const char** argv) { - return new ByfieldTest(argc, argv); -} + //Each piece has a task to initialize its data + void init_data_task(const void *args, size_t arglen, Processor p) + { + const InitDataArgs &i_args = *(const InitDataArgs *)args; -static constexpr CtorFn BYFIELD_CTORS[3] = { - &make_byfield<1>, - &make_byfield<2>, - &make_byfield<3>, -}; + log_app.info() << "init task #" << i_args.index << " (ri_nodes=" << i_args.ri_nodes + << ")"; -// ---- Image constructors ---- -template -static TestInterface* make_image(int argc, const char** argv) { - return new ImageTest(argc, argv); -} + i_args.ri_nodes.fetch_metadata(p).wait(); -static constexpr CtorFn IMAGE_CTORS[3][3] = { - { &make_image<1,1>, &make_image<1,2>, &make_image<1,3> }, - { &make_image<2,1>, &make_image<2,2>, &make_image<2,3> }, - { &make_image<3,1>, &make_image<3,2>, &make_image<3,3> }, -}; + IndexSpace nodes_space = i_args.ri_nodes.template get_indexspace(); -using TaskWrapperFn = void (*)(const void*, size_t, const void*, size_t, Processor); + log_app.debug() << "N: " << is_nodes; -static constexpr TaskWrapperFn BYFIELD_INIT_TBL[3] = { - &ByfieldTest<1>::init_data_task_wrapper, - &ByfieldTest<2>::init_data_task_wrapper, - &ByfieldTest<3>::init_data_task_wrapper, -}; + //For each node in the graph, mark it with a random (or deterministic) subgraph id + { + AffineAccessor, N2> a_rect(i_args.ri_nodes, 0 /* offset */); -static 
constexpr TaskWrapperFn IMAGE_INIT_TBL[3][3] = { - { &ImageTest<1,1>::init_data_task_wrapper, &ImageTest<1,2>::init_data_task_wrapper, &ImageTest<1,3>::init_data_task_wrapper }, - { &ImageTest<2,1>::init_data_task_wrapper, &ImageTest<2,2>::init_data_task_wrapper, &ImageTest<2,3>::init_data_task_wrapper }, - { &ImageTest<3,1>::init_data_task_wrapper, &ImageTest<3,2>::init_data_task_wrapper, &ImageTest<3,3>::init_data_task_wrapper }, -}; + for (IndexSpaceIterator it(is_nodes); it.valid; it.step()) { + for (PointInRectIterator point(it.rect); point.valid; point.step()) { + int idx = 0; + int stride = 1; + for (int d = 0; d < N2; d++) { + idx += (point.p[d] - is_nodes.bounds.lo[d]) * stride; + stride *= (is_nodes.bounds.hi[d] - is_nodes.bounds.lo[d] + 1); + } + Rect destination; + chase_rect(idx, destination); + a_rect.write(point.p, destination); + } + } + } + } -int main(int argc, char **argv) -{ - Runtime rt; + IndexSpace is_nodes; + IndexSpace is_edges; + std::vector ri_nodes; + std::vector, Rect> > rect_field_data; - rt.init(&argc, &argv); + virtual void print_info(void) + { + printf("Realm %dD -> %dD Image Range dependent partitioning test: %d nodes, %d edges, %d pieces ,%d sources, %d rect size, %d sparse factor\n", (int) N2, (int) N1, + (int)num_nodes, (int) num_edges, (int)num_pieces, (int) num_spaces, (int) rect_size, (int) sparse_factor); + } - // parse global options - for(int i = 1; i < argc; i++) { - if(!strcmp(argv[i], "-seed")) { - random_seed = atoi(argv[++i]); - continue; + virtual Event initialize_data(const std::vector &memories, + const std::vector &procs) + { + // now create index space for nodes + Point node_lo, node_hi; + for (int d = 0; d < N2; d++) { + node_lo[d] = 0; + node_hi[d] = num_nodes - 1; } + is_nodes = Rect(node_lo, node_hi); - if(!strcmp(argv[i], "-random")) { - random_colors = true; - continue; + Point edge_lo, edge_hi; + for (int d = 0; d < N1; d++) { + edge_lo[d] = 0; + edge_hi[d] = num_edges - 1; } + is_edges = Rect(edge_lo, 
edge_hi); - if(!strcmp(argv[i], "-wait")) { - wait_on_events = true; - continue; - } + // equal partition is used to do initial population of edges and nodes + std::vector > ss_nodes_eq; - if(!strcmp(argv[i], "-show")) { - show_graph = true; - continue; - } + log_app.info() << "Creating equal subspaces\n"; - if(!strcmp(argv[i], "-nocheck")) { - skip_check = true; - continue; - } + is_nodes.create_equal_subspaces(num_pieces, 1, ss_nodes_eq, Realm::ProfilingRequestSet()).wait(); - if(!strcmp(argv[i], "-d1")) { - dimension1 = atoi(argv[++i]); - continue; - } + // create instances for each of these subspaces + std::vector node_fields; + node_fields.push_back(sizeof(Rect)); - if(!strcmp(argv[i], "-d2")) { - dimension2 = atoi(argv[++i]); - continue; - } + ri_nodes.resize(num_pieces); + rect_field_data.resize(num_pieces); - if(!strcmp(argv[i], "byfield")) { - if (dimension1 < 1 || dimension1 > 3) - assert(false && "invalid dimension"); + for(size_t i = 0; i < ss_nodes_eq.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, memories[i % memories.size()], ss_nodes_eq[i], + node_fields, 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + ri_nodes[i] = ri; - testcfg = BYFIELD_CTORS[dimension1 - 1](argc - i, const_cast(argv + i)); - break; + rect_field_data[i].index_space = ss_nodes_eq[i]; + rect_field_data[i].inst = ri_nodes[i]; + rect_field_data[i].field_offset = 0; } - if(!strcmp(argv[i], "image")) { + // fire off tasks to initialize data + std::set events; + for(int i = 0; i < num_pieces; i++) { + Processor p = procs[i % procs.size()]; + InitDataArgs args; + args.index = i; + args.ri_nodes = ri_nodes[i]; + Event e = p.spawn(INIT_IMAGE_RANGE_DATA_TASK, &args, sizeof(args)); + events.insert(e); + } + + return Event::merge_events(events); + } + + // the outputs of our partitioning will be: + // p_nodes - nodes partitioned by subgraph id (from GPU) + // p_nodes_cpu - nodes partitioned by subgraph id (from CPU) + + + std::vector > p_edges, p_garbage_edges, 
p_edges_cpu; + + virtual Event perform_partitioning(void) + { + // Partition nodes by subgraph id - do this twice, once on CPU and once on GPU + // Ensure that the results are identical + + std::vector> sources(num_spaces); + for(int i = 0; i < num_spaces; i++) { + if (sparse_factor <= 1) { + sources[i] = rect_field_data[i % num_pieces].index_space; + } else { + sources[i] = create_sparse_index_space(is_nodes.bounds, sparse_factor, random_colors, i); + } + } + + // We need a GPU memory for GPU partitioning + Memory gpu_memory; + bool found_gpu_memory = false; + Machine machine = Machine::get_machine(); + std::set all_memories; + machine.get_all_memories(all_memories); + for(Memory memory : all_memories) { + if(memory.kind() == Memory::GPU_FB_MEM) { + gpu_memory = memory; + found_gpu_memory = true; + break; + } + } + if (!found_gpu_memory) { + log_app.error() << "No GPU memory found for partitioning test\n"; + return Event::NO_EVENT; + } + + + std::vector node_fields; + node_fields.push_back(sizeof(Rect)); + + std::vector, Rect>> rect_field_data_gpu; + rect_field_data_gpu.resize(num_pieces); + + for (int i = 0; i < num_pieces; i++) { + copy_piece(rect_field_data[i], rect_field_data_gpu[i], node_fields, 0, gpu_memory).wait(); + } + + std::vector> image_inputs(num_pieces); + std::vector> image_subspaces(num_spaces); + std::vector image_requirements(num_pieces); + + for (int i = 0; i < num_pieces; i++) { + image_inputs[i].location = rect_field_data_gpu[i].inst.get_location(); + image_inputs[i].space = rect_field_data_gpu[i].index_space; + } + + for (int i = 0; i < num_spaces; i++) { + image_subspaces[i].space = sources[i]; + image_subspaces[i].entries = sources[i].dense() ? 
1 : sources[i].sparsity.impl()->get_entries().size(); + } + + is_edges.by_image_buffer_requirements(image_subspaces, image_inputs, image_requirements); + + for (int i = 0; i < num_pieces; i++) { + alloc_piece(rect_field_data_gpu[i].scratch_buffer, image_requirements[i].upper_bound, gpu_memory).wait(); + } + + wait_on_events = true; + log_app.info() << "warming up" << Clock::current_time_in_microseconds() << "\n"; + Event warmup = is_edges.create_subspaces_by_image(rect_field_data_gpu, + sources, + p_garbage_edges, + Realm::ProfilingRequestSet()); + warmup.wait(); + + Event gpu_call = is_edges.create_subspaces_by_image(rect_field_data_gpu, + sources, + p_edges, + Realm::ProfilingRequestSet()); + + Event cpu_call = is_edges.create_subspaces_by_image(rect_field_data, + sources, + p_edges_cpu, + Realm::ProfilingRequestSet()); + + return Event::merge_events({gpu_call, cpu_call}); + + } + + virtual int perform_dynamic_checks(void) + { + // Nothing to do here + return 0; + } + + virtual int check_partitioning(void) + { + int errors = 0; + + if (!p_edges.size()) { + return p_edges.size() == p_edges_cpu.size(); + } + + log_app.info() << "Checking correctness of partitioning " << "\n"; + + for(int i = 0; i < num_spaces; i++) { + for(IndexSpaceIterator it(p_edges[i]); it.valid; it.step()) { + for(PointInRectIterator point(it.rect); point.valid; point.step()) { + if (!p_edges_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU has extra image point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for(IndexSpaceIterator it(p_edges_cpu[i]); it.valid; it.step()) { + for(PointInRectIterator point(it.rect); point.valid; point.step()) { + if (!p_edges[i].contains(point.p)) { + log_app.error() << "Mismatch! 
GPU is missing image point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + + } + return errors; + } +}; + +template +class PreimageTest : public TestInterface { +public: + // graph config parameters + int num_nodes = 1000; + int num_edges = 1000; + int num_spaces = 4; + int num_pieces = 4; + int sparse_factor = 4; + std::string filename; + + PreimageTest(int argc, const char *argv[]) + { + for(int i = 1; i < argc; i++) { + + if(!strcmp(argv[i], "-p")) { + num_pieces = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-n")) { + num_nodes = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-e")) { + num_edges = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-s")) { + num_spaces = atoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "-f")) { + sparse_factor = atoi(argv[++i]); + continue; + } + } + + + if (num_nodes <= 0 || num_pieces <= 0 || num_edges <= 0 || num_spaces <= 0) { + log_app.error() << "Invalid config: nodes=" << num_nodes << " colors=" << num_edges << " pieces=" << num_pieces << " targets=" << num_spaces << "\n"; + exit(1); + } + } + + struct InitDataArgs { + int index; + RegionInstance ri_nodes; + }; + + enum PRNGStreams + { + NODE_SUBGRAPH_STREAM, + }; + + // assign subgraph ids to nodes + void chase_point(int idx, Point& color) + { + for (int d = 0; d < N2; d++) { + if(random_colors) + color[d] = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_edges); + else + color[d] = (idx * num_edges / num_nodes) % num_edges; + } + } + + static void init_data_task_wrapper(const void *args, size_t arglen, + const void *userdata, size_t userlen, Processor p) + { + PreimageTest *me = (PreimageTest *)testcfg; + me->init_data_task(args, arglen, p); + } + + //Each piece has a task to initialize its data + void init_data_task(const void *args, size_t arglen, Processor p) + { + const InitDataArgs &i_args = *(const InitDataArgs *)args; + + log_app.info() << "init task #" << i_args.index << " (ri_nodes=" << 
i_args.ri_nodes + << ")"; + + i_args.ri_nodes.fetch_metadata(p).wait(); + + IndexSpace nodes_space = i_args.ri_nodes.template get_indexspace(); + + log_app.debug() << "N: " << is_nodes; + + //For each node in the graph, mark it with a random (or deterministic) subgraph id + { + AffineAccessor, N1> a_point(i_args.ri_nodes, 0 /* offset */); + + for (IndexSpaceIterator it(is_nodes); it.valid; it.step()) { + for (PointInRectIterator point(it.rect); point.valid; point.step()) { + int idx = 0; + int stride = 1; + for (int d = 0; d < N1; d++) { + idx += (point.p[d] - is_nodes.bounds.lo[d]) * stride; + stride *= (is_nodes.bounds.hi[d] - is_nodes.bounds.lo[d] + 1); + } + Point destination; + chase_point(idx, destination); + a_point.write(point.p, destination); + } + } + } + } + + IndexSpace is_nodes; + IndexSpace is_edges; + std::vector ri_nodes; + std::vector, Point> > point_field_data; + + virtual void print_info(void) + { + printf("Realm %dD -> %dD Preimage dependent partitioning test: %d nodes, %d edges, %d pieces ,%d targets, %d sparse factor\n", (int) N1, (int) N2, + (int)num_nodes, (int) num_edges, (int)num_pieces, (int) num_spaces, (int) sparse_factor); + } + + virtual Event initialize_data(const std::vector &memories, + const std::vector &procs) + { + // now create index space for nodes + Point node_lo, node_hi; + for (int d = 0; d < N1; d++) { + node_lo[d] = 0; + node_hi[d] = num_nodes - 1; + } + is_nodes = Rect(node_lo, node_hi); + + Point edge_lo, edge_hi; + for (int d = 0; d < N2; d++) { + edge_lo[d] = 0; + edge_hi[d] = num_edges - 1; + } + is_edges = Rect(edge_lo, edge_hi); + + // equal partition is used to do initial population of edges and nodes + std::vector > ss_nodes_eq; + + log_app.info() << "Creating equal subspaces\n"; + + is_nodes.create_equal_subspaces(num_pieces, 1, ss_nodes_eq, Realm::ProfilingRequestSet()).wait(); + + // create instances for each of these subspaces + std::vector node_fields; + node_fields.push_back(sizeof(Point)); + + 
ri_nodes.resize(num_pieces); + point_field_data.resize(num_pieces); + + for(size_t i = 0; i < ss_nodes_eq.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, memories[i % memories.size()], ss_nodes_eq[i], + node_fields, 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + ri_nodes[i] = ri; + + point_field_data[i].index_space = ss_nodes_eq[i]; + point_field_data[i].inst = ri_nodes[i]; + point_field_data[i].field_offset = 0; + } + + // fire off tasks to initialize data + std::set events; + for(int i = 0; i < num_pieces; i++) { + Processor p = procs[i % procs.size()]; + InitDataArgs args; + args.index = i; + args.ri_nodes = ri_nodes[i]; + Event e = p.spawn(INIT_PREIMAGE_DATA_TASK, &args, sizeof(args)); + events.insert(e); + } + + return Event::merge_events(events); + } + + // the outputs of our partitioning will be: + // p_nodes - nodes partitioned by subgraph id (from GPU) + // p_nodes_cpu - nodes partitioned by subgraph id (from CPU) + + + std::vector > p_nodes, p_garbage_nodes, p_nodes_cpu; + + virtual Event perform_partitioning(void) + { + // Partition nodes by subgraph id - do this twice, once on CPU and once on GPU + // Ensure that the results are identical + + std::vector> targets; + if (sparse_factor <= 1) { + is_edges.create_equal_subspaces(num_spaces, 1, targets, Realm::ProfilingRequestSet()).wait(); + } else { + targets.resize(num_spaces); + for (int i = 0; i < num_spaces; i++) { + targets[i] = create_sparse_index_space(is_edges.bounds, sparse_factor, random_colors, i); + } + } + + // We need a GPU memory for GPU partitioning + Memory gpu_memory; + bool found_gpu_memory = false; + Machine machine = Machine::get_machine(); + std::set all_memories; + machine.get_all_memories(all_memories); + for(Memory memory : all_memories) { + if(memory.kind() == Memory::GPU_FB_MEM) { + gpu_memory = memory; + found_gpu_memory = true; + break; + } + } + if (!found_gpu_memory) { + log_app.error() << "No GPU memory found for partitioning test\n"; + return 
Event::NO_EVENT; + } + + + std::vector node_fields; + node_fields.push_back(sizeof(Point)); + + std::vector, Point>> point_field_data_gpu; + point_field_data_gpu.resize(num_pieces); + + for (int i = 0; i < num_pieces; i++) { + copy_piece(point_field_data[i], point_field_data_gpu[i], node_fields, 0, gpu_memory).wait(); + } + + std::vector> preimage_inputs(num_pieces); + std::vector> preimage_subspaces(num_spaces); + std::vector preimage_requirements(num_pieces); + + for (int i = 0; i < num_pieces; i++) { + preimage_inputs[i].location = point_field_data_gpu[i].inst.get_location(); + preimage_inputs[i].space = point_field_data_gpu[i].index_space; + } + + for (int i = 0; i < num_spaces; i++) { + preimage_subspaces[i].space = targets[i]; + preimage_subspaces[i].entries = targets[i].dense() ? 1 : targets[i].sparsity.impl()->get_entries().size(); + } + + is_nodes.by_preimage_buffer_requirements(preimage_subspaces, preimage_inputs, preimage_requirements); + + for (int i = 0; i < num_pieces; i++) { + alloc_piece(point_field_data_gpu[i].scratch_buffer, preimage_requirements[i].upper_bound, gpu_memory).wait(); + } + + wait_on_events = true; + log_app.info() << "warming up" << Clock::current_time_in_microseconds() << "\n"; + Event warmup = is_nodes.create_subspaces_by_preimage(point_field_data_gpu, + targets, + p_garbage_nodes, + Realm::ProfilingRequestSet()); + warmup.wait(); + + Event gpu_call = is_nodes.create_subspaces_by_preimage(point_field_data_gpu, + targets, + p_nodes, + Realm::ProfilingRequestSet()); + + gpu_call.wait(); + + long long start = Clock::current_time_in_microseconds(); + Event cpu_call = is_nodes.create_subspaces_by_preimage(point_field_data, + targets, + p_nodes_cpu, + Realm::ProfilingRequestSet()); + cpu_call.wait(); + std::cout << "CPU TIME: " << (Clock::current_time_in_microseconds() - start) / 1000 << " ms\n"; + + return Event::merge_events({gpu_call, cpu_call}); + + } + + virtual int perform_dynamic_checks(void) + { + // Nothing to do here + return 
0; + } + + virtual int check_partitioning(void) + { + int errors = 0; + + if (!p_nodes.size()) { + return p_nodes.size() != p_nodes_cpu.size(); + } + + log_app.info() << "Checking correctness of partitioning " << "\n"; + + for(int i = 0; i < num_spaces; i++) { + if (!p_nodes[i].dense() && (N1 > 1)) { + p_nodes[i].sparsity.impl()->request_bvh(); + if (!p_nodes_cpu[i].dense()) { + p_nodes_cpu[i].sparsity.impl()->request_bvh(); + } + } + for(IndexSpaceIterator it(p_nodes[i]); it.valid; it.step()) { + for(PointInRectIterator point(it.rect); point.valid; point.step()) { + if (!p_nodes_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU has extra image point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for(IndexSpaceIterator it(p_nodes_cpu[i]); it.valid; it.step()) { + for(PointInRectIterator point(it.rect); point.valid; point.step()) { + if (!p_nodes[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing image point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + + } + return errors; + } +}; + +void top_level_task(const void *args, size_t arglen, const void *userdata, size_t userlen, + Processor p) +{ + int errors = 0; + + testcfg->print_info(); + + // find all the system memories - we'll stride our data across them + // for each memory, we'll need one CPU that can do the initialization of the data + std::vector sysmems; + std::vector procs; + + Machine machine = Machine::get_machine(); + { + std::set all_memories; + machine.get_all_memories(all_memories); + for(std::set::const_iterator it = all_memories.begin(); + it != all_memories.end(); it++) { + Memory m = *it; + + // skip memories with no capacity for creating instances + if(m.capacity() == 0) + continue; + + if(m.kind() == Memory::SYSTEM_MEM) { + sysmems.push_back(m); + std::set pset; + machine.get_shared_processors(m, pset); + Processor p = Processor::NO_PROC; + for(std::set::const_iterator it2 = pset.begin(); it2 != pset.end(); + 
it2++) { + if(it2->kind() == Processor::LOC_PROC) { + p = *it2; + break; + } + } + assert(p.exists()); + procs.push_back(p); + log_app.debug() << "System mem #" << (sysmems.size() - 1) << " = " + << *sysmems.rbegin() << " (" << *procs.rbegin() << ")"; + } + } + } + assert(sysmems.size() > 0); + + { + Realm::TimeStamp ts("initialization", true, &log_app); + + Event e = testcfg->initialize_data(sysmems, procs); + // wait for all initialization to be done + e.wait(); + } + + // now actual partitioning work + { + Realm::TimeStamp ts("dependent partitioning work", true, &log_app); + + Event e = testcfg->perform_partitioning(); + + e.wait(); + } + + // dynamic checks (which would be eliminated by compiler) + { + Realm::TimeStamp ts("dynamic checks", true, &log_app); + errors += testcfg->perform_dynamic_checks(); + } + + if(!skip_check) { + log_app.print() << "checking correctness of partitioning"; + Realm::TimeStamp ts("verification", true, &log_app); + errors += testcfg->check_partitioning(); + } + + if(errors > 0) { + printf("Exiting with errors\n"); + exit(1); + } + + printf("all done!\n"); +} + +// Constructor function-pointer type +using CtorFn = TestInterface* (*)(int, const char** argv); + +// ---- Byfield constructors ---- +template +static TestInterface* make_byfield(int argc, const char** argv) { + return new ByfieldTest(argc, argv); +} + +static constexpr CtorFn BYFIELD_CTORS[3] = { + &make_byfield<1>, + &make_byfield<2>, + &make_byfield<3>, +}; + +// ---- Image constructors ---- +template +static TestInterface* make_image(int argc, const char** argv) { + return new ImageTest(argc, argv); +} + +static constexpr CtorFn IMAGE_CTORS[3][3] = { + { &make_image<1,1>, &make_image<1,2>, &make_image<1,3> }, + { &make_image<2,1>, &make_image<2,2>, &make_image<2,3> }, + { &make_image<3,1>, &make_image<3,2>, &make_image<3,3> }, +}; + +// ---- Image Range constructors ---- +template +static TestInterface* make_image_range(int argc, const char** argv) { + return new 
ImageRangeTest(argc, argv); +} + +static constexpr CtorFn IMAGE_RANGE_CTORS[3][3] = { + { &make_image_range<1,1>, &make_image_range<1,2>, &make_image_range<1,3> }, + { &make_image_range<2,1>, &make_image_range<2,2>, &make_image_range<2,3> }, + { &make_image_range<3,1>, &make_image_range<3,2>, &make_image_range<3,3> }, +}; + +// ---- Preimage constructors ---- +template +static TestInterface* make_preimage(int argc, const char** argv) { + return new PreimageTest(argc, argv); +} + +static constexpr CtorFn PREIMAGE_CTORS[3][3] = { + { &make_preimage<1,1>, &make_preimage<1,2>, &make_preimage<1,3> }, + { &make_preimage<2,1>, &make_preimage<2,2>, &make_preimage<2,3> }, + { &make_preimage<3,1>, &make_preimage<3,2>, &make_preimage<3,3> }, +}; + +using TaskWrapperFn = void (*)(const void*, size_t, const void*, size_t, Processor); + +static constexpr TaskWrapperFn BYFIELD_INIT_TBL[3] = { + &ByfieldTest<1>::init_data_task_wrapper, + &ByfieldTest<2>::init_data_task_wrapper, + &ByfieldTest<3>::init_data_task_wrapper, +}; + +static constexpr TaskWrapperFn IMAGE_INIT_TBL[3][3] = { + { &ImageTest<1,1>::init_data_task_wrapper, &ImageTest<1,2>::init_data_task_wrapper, &ImageTest<1,3>::init_data_task_wrapper }, + { &ImageTest<2,1>::init_data_task_wrapper, &ImageTest<2,2>::init_data_task_wrapper, &ImageTest<2,3>::init_data_task_wrapper }, + { &ImageTest<3,1>::init_data_task_wrapper, &ImageTest<3,2>::init_data_task_wrapper, &ImageTest<3,3>::init_data_task_wrapper }, +}; + +static constexpr TaskWrapperFn IMAGE_RANGE_INIT_TBL[3][3] = { + { &ImageRangeTest<1,1>::init_data_task_wrapper, &ImageRangeTest<1,2>::init_data_task_wrapper, &ImageRangeTest<1,3>::init_data_task_wrapper }, + { &ImageRangeTest<2,1>::init_data_task_wrapper, &ImageRangeTest<2,2>::init_data_task_wrapper, &ImageRangeTest<2,3>::init_data_task_wrapper }, + { &ImageRangeTest<3,1>::init_data_task_wrapper, &ImageRangeTest<3,2>::init_data_task_wrapper, &ImageRangeTest<3,3>::init_data_task_wrapper }, +}; + +static constexpr
TaskWrapperFn PREIMAGE_INIT_TBL[3][3] = { + { &PreimageTest<1,1>::init_data_task_wrapper, &PreimageTest<1,2>::init_data_task_wrapper, &PreimageTest<1,3>::init_data_task_wrapper }, + { &PreimageTest<2,1>::init_data_task_wrapper, &PreimageTest<2,2>::init_data_task_wrapper, &PreimageTest<2,3>::init_data_task_wrapper }, + { &PreimageTest<3,1>::init_data_task_wrapper, &PreimageTest<3,2>::init_data_task_wrapper, &PreimageTest<3,3>::init_data_task_wrapper }, +}; + +int main(int argc, char **argv) +{ + Runtime rt; + + rt.init(&argc, &argv); + + // parse global options + for(int i = 1; i < argc; i++) { + if(!strcmp(argv[i], "-seed")) { + random_seed = atoi(argv[++i]); + continue; + } + + if(!strcmp(argv[i], "-random")) { + random_colors = true; + continue; + } + + if(!strcmp(argv[i], "-wait")) { + wait_on_events = true; + continue; + } + + if(!strcmp(argv[i], "-show")) { + show_graph = true; + continue; + } + + if(!strcmp(argv[i], "-nocheck")) { + skip_check = true; + continue; + } + + if(!strcmp(argv[i], "-d1")) { + dimension1 = atoi(argv[++i]); + continue; + } + + if(!strcmp(argv[i], "-d2")) { + dimension2 = atoi(argv[++i]); + continue; + } + + if(!strcmp(argv[i], "byfield")) { + if (dimension1 < 1 || dimension1 > 3) + assert(false && "invalid dimension"); + + testcfg = BYFIELD_CTORS[dimension1 - 1](argc - i, const_cast(argv + i)); + break; + } + + if(!strcmp(argv[i], "image")) { if (dimension1 < 1 || dimension1 > 3 || dimension2 < 1 || dimension2 > 3) assert(false && "invalid dimension"); testcfg = IMAGE_CTORS[dimension1 - 1][dimension2 - 1](argc - i, const_cast(argv + i)); break; } + if(!strcmp(argv[i], "range")) { + if (dimension1 < 1 || dimension1 > 3 || dimension2 < 1 || dimension2 > 3) + assert(false && "invalid dimension"); + testcfg = IMAGE_RANGE_CTORS[dimension1 - 1][dimension2 - 1](argc - i, const_cast(argv + i)); + break; + } + + if(!strcmp(argv[i], "preimage")) { + if (dimension1 < 1 || dimension1 > 3 || dimension2 < 1 || dimension2 > 3) + assert(false && 
"invalid dimension"); + testcfg = PREIMAGE_CTORS[dimension1 - 1][dimension2 - 1](argc - i, const_cast(argv + i)); + break; + } + // printf("unknown parameter: %s\n", argv[i]); } @@ -926,6 +1637,8 @@ int main(int argc, char **argv) rt.register_task(INIT_BYFIELD_DATA_TASK, BYFIELD_INIT_TBL[dimension1 - 1]); rt.register_task(INIT_IMAGE_DATA_TASK, IMAGE_INIT_TBL[dimension1 - 1][dimension2 - 1]); + rt.register_task(INIT_IMAGE_RANGE_DATA_TASK, IMAGE_RANGE_INIT_TBL[dimension1 - 1][dimension2 - 1]); + rt.register_task(INIT_PREIMAGE_DATA_TASK, PREIMAGE_INIT_TBL[dimension1 - 1][dimension2 - 1]); signal(SIGALRM, sigalrm_handler); diff --git a/tests/unit_tests/sparsity_map_test.cc b/tests/unit_tests/sparsity_map_test.cc index ab673f7b27..a0fafbf834 100644 --- a/tests/unit_tests/sparsity_map_test.cc +++ b/tests/unit_tests/sparsity_map_test.cc @@ -284,7 +284,7 @@ void run_contribute_dense_case(const ContributeDenseRectTestData &test_case) impl->set_contributor_count(1); impl->contribute_dense_rect_list(test_case.rects, test_case.disjoint); - std::vector> entries = public_impl->get_entries(); + span> entries = public_impl->get_entries(); ASSERT_TRUE(public_impl->is_valid()); ASSERT_EQ(entries.size(), test_case.expected.size()); for(size_t i = 0; i < entries.size(); i++) { From 7a0c30c80460b6295f12e75da72c3e8b17ffca65 Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Tue, 10 Mar 2026 17:19:51 -0700 Subject: [PATCH 19/32] preparing to run on perlmutter --- src/realm/cuda/cuda_internal.h | 3 + src/realm/cuda/cuda_module.cc | 7 + src/realm/deppart/byfield.cc | 18 +- src/realm/deppart/byfield_gpu_impl.hpp | 13 +- src/realm/deppart/image.cc | 26 +- src/realm/deppart/image_gpu_impl.hpp | 23 +- src/realm/deppart/partitions.h | 41 +- src/realm/deppart/partitions_gpu_impl.hpp | 15 +- src/realm/deppart/preimage.cc | 19 +- src/realm/deppart/preimage_gpu_impl.hpp | 24 +- tests/benchmark.cc | 510 ++++++++++++++++++++-- 11 files changed, 592 insertions(+), 107 deletions(-) diff --git 
a/src/realm/cuda/cuda_internal.h b/src/realm/cuda/cuda_internal.h index 614710bfe1..13d127c12b 100644 --- a/src/realm/cuda/cuda_internal.h +++ b/src/realm/cuda/cuda_internal.h @@ -412,6 +412,7 @@ namespace Realm { get_null_task_stream(void) const; // needed by librealm_kokkos.so GPUStream *get_next_task_stream(bool create = false); GPUStream *get_next_d2d_stream(); + GPUStream *get_deppart_stream() const; void launch_batch_affine_fill_kernel(void *fill_info, size_t dim, size_t elemSize, size_t volume, GPUStream *stream); @@ -489,6 +490,8 @@ namespace Realm { GPUStream *host_to_device_stream = nullptr; GPUStream *device_to_host_stream = nullptr; GPUStream *device_to_device_stream = nullptr; + GPUStream *deppart_stream = nullptr; + std::vector device_to_device_streams; std::vector peer_to_peer_streams; // indexed by target std::vector task_streams; diff --git a/src/realm/cuda/cuda_module.cc b/src/realm/cuda/cuda_module.cc index 0147bc2b0d..ce84eb5704 100644 --- a/src/realm/cuda/cuda_module.cc +++ b/src/realm/cuda/cuda_module.cc @@ -1058,6 +1058,11 @@ namespace Realm { return device_to_device_streams[d2d_stream_index]; } + GPUStream *GPU::get_deppart_stream() const + { + return deppart_stream; + } + static void launch_kernel(const Realm::Cuda::GPU::GPUFuncInfo &func_info, void *params, size_t num_elems, GPUStream *stream) { @@ -2040,6 +2045,7 @@ namespace Realm { host_to_device_stream = new GPUStream(this, worker); device_to_host_stream = new GPUStream(this, worker); + deppart_stream = new GPUStream(this, worker); CUdevice dev; int numSMs; @@ -2164,6 +2170,7 @@ namespace Realm { // destroy streams delete host_to_device_stream; delete device_to_host_stream; + delete deppart_stream; delete_container_contents(device_to_device_streams); diff --git a/src/realm/deppart/byfield.cc b/src/realm/deppart/byfield.cc index 7c1fe148c1..06c936f0b2 100644 --- a/src/realm/deppart/byfield.cc +++ b/src/realm/deppart/byfield.cc @@ -23,6 +23,7 @@ #include "realm/deppart/rectlist.h" 
#include "realm/deppart/inst_helper.h" #include "realm/logging.h" +#include "realm/cuda/cuda_internal.h" namespace Realm { @@ -45,16 +46,8 @@ namespace Realm { device_size = atoi(val); } size_t optimal_size = is.bounds.volume() * 10 * sizeof(RectDesc); - std::vector affinities; - unsigned best_bandwidth = 0; Processor best_proc = Processor::NO_PROC; - Machine::get_machine().get_proc_mem_affinity(affinities, Processor::NO_PROC, mem); - for (auto affinity : affinities) { - if (affinity.bandwidth > best_bandwidth) { - best_bandwidth = affinity.bandwidth; - best_proc = affinity.p; - } - } + assert(choose_proc(best_proc, mem)); requirements[i].affinity_processor = best_proc; requirements[i].lower_bound = device_size; requirements[i].upper_bound = max(device_size, optimal_size); @@ -332,6 +325,13 @@ namespace Realm { bool _exclusive) : parent_space(_parent), field_data(_field_data) { this->exclusive = _exclusive; + Memory my_mem = field_data[0].inst.get_location(); + Processor best_proc; + assert(choose_proc(best_proc, my_mem)); + Cuda::GPUProcessor* gpu_proc = dynamic_cast(get_runtime()->get_processor_impl(best_proc)); + assert(gpu_proc); + this->gpu = gpu_proc->gpu; + this->stream = gpu_proc->gpu->get_deppart_stream(); } template diff --git a/src/realm/deppart/byfield_gpu_impl.hpp b/src/realm/deppart/byfield_gpu_impl.hpp index 849556a53d..8e1c953730 100644 --- a/src/realm/deppart/byfield_gpu_impl.hpp +++ b/src/realm/deppart/byfield_gpu_impl.hpp @@ -18,15 +18,19 @@ template void GPUByFieldMicroOp::execute() { + Cuda::AutoGPUContext agc(this->gpu); + // For profiling. NVTX_DEPPART(byfield_gpu); - cudaStream_t stream = Cuda::get_task_cuda_stream(); + CUstream stream = this->stream->get_stream(); collapsed_space inst_space; size_t tile_size = field_data[0].scratch_buffer.get_layout()->bytes_used; + //std::cout << "Using tile size of " << tile_size << " bytes." 
<< std::endl; + Arena buffer_arena(field_data[0].scratch_buffer.pointer_untyped(0, tile_size), tile_size); inst_space.offsets = buffer_arena.alloc(field_data.size() + 1); @@ -97,12 +101,13 @@ void GPUByFieldMicroOp::execute() size_t num_completed = 0; size_t curr_tile = tile_size / 2; int count = 0; + if (count) {} bool host_fallback = false; std::vector h_instances(colors.size(), RegionInstance::NO_INST); std::vector entry_counts(colors.size(), 0); while (num_completed < inst_space.num_entries) { try { - std::cout << "Byfield iteration " << count++ << ", completed " << num_completed << " / " << inst_space.num_entries << " entries." << std::endl; + //std::cout << "Byfield iteration " << count++ << ", completed " << num_completed << " / " << inst_space.num_entries << " entries." << std::endl; buffer_arena.start(); if (num_completed + curr_tile > inst_space.num_entries) { curr_tile = inst_space.num_entries - num_completed; @@ -202,11 +207,11 @@ void GPUByFieldMicroOp::execute() CUDA_CHECK(cudaStreamSynchronize(stream), stream); } catch (arena_oom&) { - std::cout << "Caught arena_oom, reducing tile size from " << curr_tile << " to " << curr_tile / 2 << std::endl; + //std::cout << "Caught arena_oom, reducing tile size from " << curr_tile << " to " << curr_tile / 2 << std::endl; curr_tile /= 2; if (curr_tile == 0) { if (host_fallback) { - GPUMicroOp::shatter_rects(inst_space, num_completed); + GPUMicroOp::shatter_rects(inst_space, num_completed, stream); curr_tile = 1; } else { host_fallback = true; diff --git a/src/realm/deppart/image.cc b/src/realm/deppart/image.cc index 8d37d81969..ec8cfb834d 100644 --- a/src/realm/deppart/image.cc +++ b/src/realm/deppart/image.cc @@ -24,6 +24,7 @@ #include "realm/deppart/inst_helper.h" #include "realm/deppart/preimage.h" #include "realm/logging.h" +#include "realm/cuda/cuda_internal.h" namespace Realm { @@ -74,16 +75,8 @@ namespace Realm { } minimal_size = max(minimal_size, device_size); size_t optimal_size = is.bounds.volume() * 
sizeof(Rect) * source_spaces.size() * 10 + minimal_size; - std::vector affinities; - unsigned best_bandwidth = 0; - Processor best_proc = Processor::NO_PROC; - Machine::get_machine().get_proc_mem_affinity(affinities, Processor::NO_PROC, mem); - for (auto affinity : affinities) { - if (affinity.bandwidth > best_bandwidth) { - best_bandwidth = affinity.bandwidth; - best_proc = affinity.p; - } - } + Processor best_proc; + assert(choose_proc(best_proc, mem)); requirements[i].affinity_processor = best_proc; requirements[i].lower_bound = minimal_size; requirements[i].upper_bound = optimal_size; @@ -948,7 +941,14 @@ namespace Realm { bool _exclusive) : parent_space(_parent), domain_transform(_domain_transform) { - this->exclusive = _exclusive; + this->exclusive = _exclusive; + Memory my_mem = domain_transform.ptr_data.empty() ? domain_transform.range_data[0].inst.get_location() : domain_transform.ptr_data[0].inst.get_location(); + Processor best_proc; + assert(choose_proc(best_proc, my_mem)); + Cuda::GPUProcessor* gpu_proc = dynamic_cast(get_runtime()->get_processor_impl(best_proc)); + assert(gpu_proc); + this->gpu = gpu_proc->gpu; + this->stream = gpu_proc->gpu->get_deppart_stream(); } template @@ -995,7 +995,9 @@ namespace Realm { template void GPUImageMicroOp::execute(void) { - TimeStamp ts("StructuredImageMicroOp::execute", true, &log_uop_timing); + TimeStamp ts("GPUImageMicroOp::execute", true, &log_uop_timing); + + Cuda::AutoGPUContext agc(this->gpu); if (domain_transform.ptr_data.size() > 0) { gpu_populate_ptrs(); } else { diff --git a/src/realm/deppart/image_gpu_impl.hpp b/src/realm/deppart/image_gpu_impl.hpp index 83f907d922..43682e06dd 100644 --- a/src/realm/deppart/image_gpu_impl.hpp +++ b/src/realm/deppart/image_gpu_impl.hpp @@ -48,10 +48,10 @@ void GPUImageMicroOp::gpu_populate_rngs() RegionInstance buffer = domain_transform.range_data[0].scratch_buffer; size_t tile_size = buffer.get_layout()->bytes_used; - std::cout << "Using tile size of " << tile_size << " 
bytes." << std::endl; + //std::cout << "Using tile size of " << tile_size << " bytes." << std::endl; Arena buffer_arena(buffer.pointer_untyped(0, tile_size), tile_size); - cudaStream_t stream = Cuda::get_task_cuda_stream(); + CUstream stream = this->stream->get_stream(); collapsed_space src_space; src_space.offsets = buffer_arena.alloc(sources.size()+1); @@ -98,13 +98,13 @@ void GPUImageMicroOp::gpu_populate_rngs() size_t num_completed = 0; size_t curr_tile = tile_size / 2; int count = 0; - + if (count) {} bool host_fallback = false; std::vector h_instances(sources.size(), RegionInstance::NO_INST); std::vector entry_counts(sources.size(), 0); while (num_completed < inst_space.num_entries) { try { - std::cout << "Image Range iteration " << count++ << ", completed " << num_completed << " / " << inst_space.num_entries << " entries." << std::endl; + //std::cout << "Image Range iteration " << count++ << ", completed " << num_completed << " / " << inst_space.num_entries << " entries." << std::endl; buffer_arena.start(); buffer_arena.flip_parity(); if (num_completed + curr_tile > inst_space.num_entries) { @@ -241,11 +241,11 @@ void GPUImageMicroOp::gpu_populate_rngs() CUDA_CHECK(cudaStreamSynchronize(stream), stream); } catch (arena_oom&) { - std::cout << "Caught arena_oom, reducing tile size from " << curr_tile << " to " << curr_tile / 2 << std::endl; + //std::cout << "Caught arena_oom, reducing tile size from " << curr_tile << " to " << curr_tile / 2 << std::endl; curr_tile /= 2; if (curr_tile == 0) { if (host_fallback) { - GPUMicroOp::shatter_rects(inst_space, num_completed); + GPUMicroOp::shatter_rects(inst_space, num_completed, stream); curr_tile = 1; } else { host_fallback = true; @@ -329,10 +329,10 @@ void GPUImageMicroOp::gpu_populate_ptrs() Memory sysmem; find_memory(sysmem, Memory::SYSTEM_MEM); - cudaStream_t stream = Cuda::get_task_cuda_stream(); + CUstream stream = this->stream->get_stream(); size_t tile_size = buffer.get_layout()->bytes_used; - std::cout << 
"Using tile size of " << tile_size << " bytes." << std::endl; + //std::cout << "Using tile size of " << tile_size << " bytes." << std::endl; Arena buffer_arena(buffer.pointer_untyped(0, tile_size), tile_size); collapsed_space src_space; @@ -385,12 +385,13 @@ void GPUImageMicroOp::gpu_populate_ptrs() size_t num_completed = 0; size_t curr_tile = tile_size / 2; int count = 0; + if (count) {} bool host_fallback = false; std::vector h_instances(sources.size(), RegionInstance::NO_INST); std::vector entry_counts(sources.size(), 0); while (num_completed < inst_space.num_entries) { try { - std::cout << "Image iteration " << count++ << ", completed " << num_completed << " / " << inst_space.num_entries << " entries." << std::endl; + //std::cout << "Image iteration " << count++ << ", completed " << num_completed << " / " << inst_space.num_entries << " entries." << std::endl; buffer_arena.start(); if (num_completed + curr_tile > inst_space.num_entries) { curr_tile = inst_space.num_entries - num_completed; @@ -513,11 +514,11 @@ void GPUImageMicroOp::gpu_populate_ptrs() CUDA_CHECK(cudaStreamSynchronize(stream), stream); } catch (arena_oom&) { - std::cout << "Caught arena_oom, reducing tile size from " << curr_tile << " to " << curr_tile / 2 << std::endl; + //std::cout << "Caught arena_oom, reducing tile size from " << curr_tile << " to " << curr_tile / 2 << std::endl; curr_tile /= 2; if (curr_tile == 0) { if (host_fallback) { - GPUMicroOp::shatter_rects(inst_space, num_completed); + GPUMicroOp::shatter_rects(inst_space, num_completed, stream); curr_tile = 1; } else { host_fallback = true; diff --git a/src/realm/deppart/partitions.h b/src/realm/deppart/partitions.h index 68e5b40084..9ccbde6b75 100644 --- a/src/realm/deppart/partitions.h +++ b/src/realm/deppart/partitions.h @@ -34,9 +34,14 @@ #include "realm/deppart/sparsity_impl.h" #include "realm/deppart/inst_helper.h" #include "realm/bgwork.h" +#ifdef REALM_USE_CUDA +#include "realm/cuda/cuda_module.h" struct CUstream_st; 
-typedef CUstream_st* cudaStream_t; +typedef CUstream_st* CUstream; + +#endif + namespace Realm { @@ -45,6 +50,10 @@ namespace Realm { #ifdef REALM_USE_CUDA + namespace Cuda { + class GPUStream; + } + template struct HiFlag { T hi; @@ -349,20 +358,20 @@ namespace Realm { virtual void execute(void) = 0; - static void shatter_rects(collapsed_space & inst_space, size_t &num_completed); + static void shatter_rects(collapsed_space & inst_space, size_t &num_completed, CUstream stream); template - static void collapse_multi_space(const std::vector& field_data, collapsed_space &out_space, Arena &my_arena, cudaStream_t stream); + static void collapse_multi_space(const std::vector& field_data, collapsed_space &out_space, Arena &my_arena, CUstream stream); - static void collapse_parent_space(const IndexSpace& parent_space, collapsed_space &out_space, Arena &my_arena, cudaStream_t stream); + static void collapse_parent_space(const IndexSpace& parent_space, collapsed_space &out_space, Arena &my_arena, CUstream stream); - static void build_bvh(const collapsed_space &space, BVH &bvh, Arena &my_arena, cudaStream_t stream); + static void build_bvh(const collapsed_space &space, BVH &bvh, Arena &my_arena, CUstream stream); template - static void construct_input_rectlist(const collapsed_space &lhs, const collapsed_space &rhs, out_t* &d_valid_rects, size_t& out_size, uint32_t* counters, uint32_t* out_offsets, Arena &my_arena, cudaStream_t stream); + static void construct_input_rectlist(const collapsed_space &lhs, const collapsed_space &rhs, out_t* &d_valid_rects, size_t& out_size, uint32_t* counters, uint32_t* out_offsets, Arena &my_arena, CUstream stream); template - static void volume_prefix_sum(const out_t* d_rects, size_t total_rects, size_t* &d_prefix_rects, size_t& num_pts, Arena &my_arena, cudaStream_t stream); + static void volume_prefix_sum(const out_t* d_rects, size_t total_rects, size_t* &d_prefix_rects, size_t& num_pts, Arena &my_arena, CUstream stream); template void 
complete_pipeline(PointDesc* d_points, size_t total_pts, RectDesc* &d_out_rects, size_t &out_rects, Arena &my_arena, const Container& ctr, IndexFn getIndex, MapFn getMap); @@ -381,6 +390,8 @@ namespace Realm { virtual bool is_image_microop() const { return false; } bool exclusive = false; + Cuda::GPU* gpu; + Cuda::GPUStream* stream; }; #endif @@ -490,6 +501,22 @@ namespace Realm { static ActiveMessageHandlerReg areg; }; + + // Finds the processor with the highest-bandwidth affinity to the given memory. Returns true on success, false otherwise. + inline bool choose_proc(Processor &best_proc, Memory location) + { + std::vector affinities; + unsigned best_bandwidth = 0; + best_proc = Processor::NO_PROC; + Machine::get_machine().get_proc_mem_affinity(affinities, Processor::NO_PROC, location); + for (auto affinity : affinities) { + if (affinity.bandwidth > best_bandwidth) { + best_bandwidth = affinity.bandwidth; + best_proc = affinity.p; + } + } + return best_proc != Processor::NO_PROC; + } + }; diff --git a/src/realm/deppart/partitions_gpu_impl.hpp b/src/realm/deppart/partitions_gpu_impl.hpp index 0827f1844c..d136c2138b 100644 --- a/src/realm/deppart/partitions_gpu_impl.hpp +++ b/src/realm/deppart/partitions_gpu_impl.hpp @@ -107,10 +107,9 @@ namespace Realm { } template - void GPUMicroOp::shatter_rects(collapsed_space & inst_space, size_t &num_completed) { + void GPUMicroOp::shatter_rects(collapsed_space & inst_space, size_t &num_completed, CUstream stream) { NVTX_DEPPART(shatter_rects); - cudaStream_t stream = Cuda::get_task_cuda_stream(); size_t new_size = (inst_space.entries_buffer[num_completed].bounds.volume() + 1) / 2; assert(new_size > 0); size_t num_new_entries = 0; @@ -189,7 +188,7 @@ namespace Realm { //Given a list of spaces, compacts them all into one collapsed_space template template - void GPUMicroOp::collapse_multi_space(const std::vector& field_data, collapsed_space &out_space, Arena &my_arena, cudaStream_t stream) + void GPUMicroOp::collapse_multi_space(const std::vector& spaces, collapsed_space
&out_space, Arena &my_arena, CUstream stream) { NVTX_DEPPART(collapse_multi_space); @@ -609,7 +608,7 @@ namespace Realm { return; } NVTX_DEPPART(complete_rect_pipeline); - cudaStream_t stream = Cuda::get_task_cuda_stream(); + CUstream stream = this->stream->get_stream(); Memory my_mem; assert(find_memory(my_mem, Memory::GPU_FB_MEM)); @@ -1300,7 +1299,7 @@ namespace Realm { { NVTX_DEPPART(complete1d_pipeline); - cudaStream_t stream = Cuda::get_task_cuda_stream(); + CUstream stream = this->stream->get_stream(); RectDesc* d_rects_in = d_rects; @@ -1454,7 +1453,7 @@ namespace Realm { } - cudaStream_t stream = Cuda::get_task_cuda_stream(); + CUstream stream = this->stream->get_stream(); size_t bytes_T = total_pts * sizeof(T); size_t bytes_S = total_pts * sizeof(size_t); @@ -1576,7 +1575,7 @@ namespace Realm { { NVTX_DEPPART(split_output); - cudaStream_t stream = Cuda::get_task_cuda_stream(); + CUstream stream = this->stream->get_stream(); bool use_sysmem = false; RegionInstance sys_instance = RegionInstance::NO_INST; @@ -1680,7 +1679,7 @@ namespace Realm { size_t prev = my_arena.mark(); - cudaStream_t stream = Cuda::get_task_cuda_stream(); + CUstream stream = this->stream->get_stream(); SparsityMapEntry* final_entries = my_arena.alloc>(total_rects); Rect* final_rects = my_arena.alloc>(total_rects); diff --git a/src/realm/deppart/preimage.cc b/src/realm/deppart/preimage.cc index 4feaa585e4..37bfef188a 100644 --- a/src/realm/deppart/preimage.cc +++ b/src/realm/deppart/preimage.cc @@ -26,6 +26,7 @@ #include "../logging.h" #include #include +#include "realm/cuda/cuda_internal.h" namespace Realm { @@ -76,16 +77,8 @@ namespace Realm { } minimal_size = max(minimal_size, device_size); size_t optimal_size = is.bounds.volume() * sizeof(Rect) * target_spaces.size() * 10 + minimal_size; - std::vector affinities; - unsigned best_bandwidth = 0; Processor best_proc = Processor::NO_PROC; - Machine::get_machine().get_proc_mem_affinity(affinities, Processor::NO_PROC, mem); - for (auto 
affinity : affinities) { - if (affinity.bandwidth > best_bandwidth) { - best_bandwidth = affinity.bandwidth; - best_proc = affinity.p; - } - } + assert(choose_proc(best_proc, mem)); requirements[i].affinity_processor = best_proc; requirements[i].lower_bound = minimal_size; requirements[i].upper_bound = optimal_size; @@ -825,6 +818,13 @@ namespace Realm { IndexSpace _parent_space, bool _exclusive) : domain_transform(_domain_transform), parent_space(_parent_space) { this->exclusive = _exclusive; + Memory my_mem = domain_transform.ptr_data.empty() ? domain_transform.range_data[0].inst.get_location() : domain_transform.ptr_data[0].inst.get_location(); + Processor best_proc; + assert(choose_proc(best_proc, my_mem)); + Cuda::GPUProcessor* gpu_proc = dynamic_cast(get_runtime()->get_processor_impl(best_proc)); + assert(gpu_proc); + this->gpu = gpu_proc->gpu; + this->stream = gpu_proc->gpu->get_deppart_stream(); } template @@ -841,6 +841,7 @@ namespace Realm { template void GPUPreimageMicroOp::execute(void) { TimeStamp ts("GPUPreimageMicroOp::execute", true, &log_uop_timing); + Cuda::AutoGPUContext agc(this->gpu); if (domain_transform.ptr_data.size() > 0) { gpu_populate_bitmasks(); } else if (domain_transform.range_data.size() > 0) { diff --git a/src/realm/deppart/preimage_gpu_impl.hpp b/src/realm/deppart/preimage_gpu_impl.hpp index 960e427beb..3a104f2e84 100644 --- a/src/realm/deppart/preimage_gpu_impl.hpp +++ b/src/realm/deppart/preimage_gpu_impl.hpp @@ -18,7 +18,7 @@ namespace Realm { RegionInstance buffer = domain_transform.range_data[0].scratch_buffer; size_t tile_size = buffer.get_layout()->bytes_used; - std::cout << "Using tile size of " << tile_size << " bytes." << std::endl; + //std::cout << "Using tile size of " << tile_size << " bytes." 
<< std::endl; Arena buffer_arena(buffer.pointer_untyped(0, tile_size), tile_size); NVTX_DEPPART(gpu_preimage_range); @@ -26,7 +26,7 @@ namespace Realm { Memory sysmem; find_memory(sysmem, Memory::SYSTEM_MEM); - cudaStream_t stream = Cuda::get_task_cuda_stream(); + CUstream stream = this->stream->get_stream(); collapsed_space inst_space; @@ -76,14 +76,14 @@ namespace Realm { size_t num_completed = 0; size_t curr_tile = tile_size / 2; int count = 0; - + if (count) {} bool host_fallback = false; std::vector h_instances(targets.size(), RegionInstance::NO_INST); std::vector entry_counts(targets.size(), 0); while (num_completed < inst_space.num_entries) { try { - std::cout << "Preimage iteration " << count++ << ", completed " << num_completed << " / " << inst_space.num_entries << " entries." << std::endl; + //std::cout << "Preimage iteration " << count++ << ", completed " << num_completed << " / " << inst_space.num_entries << " entries." << std::endl; buffer_arena.start(); if (num_completed + curr_tile > inst_space.num_entries) { curr_tile = inst_space.num_entries - num_completed; @@ -254,11 +254,11 @@ namespace Realm { CUDA_CHECK(cudaStreamSynchronize(stream), stream); } catch (arena_oom&) { - std::cout << "Caught arena_oom, reducing tile size from " << curr_tile << " to " << curr_tile / 2 << std::endl; + //std::cout << "Caught arena_oom, reducing tile size from " << curr_tile << " to " << curr_tile / 2 << std::endl; curr_tile /= 2; if (curr_tile == 0) { if (host_fallback) { - GPUMicroOp::shatter_rects(inst_space, num_completed); + GPUMicroOp::shatter_rects(inst_space, num_completed, stream); curr_tile = 1; } else { host_fallback = true; @@ -326,13 +326,13 @@ namespace Realm { RegionInstance buffer = domain_transform.ptr_data[0].scratch_buffer; size_t tile_size = buffer.get_layout()->bytes_used; - std::cout << "Using tile size of " << tile_size << " bytes." << std::endl; + //std::cout << "Using tile size of " << tile_size << " bytes." 
<< std::endl; Arena buffer_arena(buffer.pointer_untyped(0, tile_size), tile_size); Memory sysmem; find_memory(sysmem, Memory::SYSTEM_MEM); - cudaStream_t stream = Cuda::get_task_cuda_stream(); + CUstream stream = this->stream->get_stream(); NVTX_DEPPART(gpu_preimage); @@ -384,14 +384,14 @@ namespace Realm { size_t num_completed = 0; size_t curr_tile = tile_size / 2; int count = 0; - + if (count) {} bool host_fallback = false; std::vector h_instances(targets.size(), RegionInstance::NO_INST); std::vector entry_counts(targets.size(), 0); while (num_completed < inst_space.num_entries) { try { - std::cout << "Preimage iteration " << count++ << ", completed " << num_completed << " / " << inst_space.num_entries << " entries." << std::endl; + //std::cout << "Preimage iteration " << count++ << ", completed " << num_completed << " / " << inst_space.num_entries << " entries." << std::endl; buffer_arena.start(); if (num_completed + curr_tile > inst_space.num_entries) { curr_tile = inst_space.num_entries - num_completed; @@ -562,11 +562,11 @@ namespace Realm { CUDA_CHECK(cudaStreamSynchronize(stream), stream); } catch (arena_oom&) { - std::cout << "Caught arena_oom, reducing tile size from " << curr_tile << " to " << curr_tile / 2 << std::endl; + //std::cout << "Caught arena_oom, reducing tile size from " << curr_tile << " to " << curr_tile / 2 << std::endl; curr_tile /= 2; if (curr_tile == 0) { if (host_fallback) { - GPUMicroOp::shatter_rects(inst_space, num_completed); + GPUMicroOp::shatter_rects(inst_space, num_completed, stream); curr_tile = 1; } else { host_fallback = true; diff --git a/tests/benchmark.cc b/tests/benchmark.cc index 6b78151d68..177ceeb558 100644 --- a/tests/benchmark.cc +++ b/tests/benchmark.cc @@ -43,7 +43,8 @@ enum INIT_BYFIELD_DATA_TASK, INIT_IMAGE_DATA_TASK, INIT_IMAGE_RANGE_DATA_TASK, - INIT_PREIMAGE_DATA_TASK + INIT_PREIMAGE_DATA_TASK, + INIT_PREIMAGE_RANGE_DATA_TASK }; namespace std { @@ -100,6 +101,7 @@ namespace { bool skip_check = false; int 
dimension1 = 1; int dimension2 = 1; + std::string op; TestInterface *testcfg = 0; }; // namespace @@ -152,11 +154,11 @@ IndexSpace create_sparse_index_space(const Rect &bounds, size_t spar stride *= (bounds.hi[d] - bounds.lo[d] + 1); } if(randomize) { - if(Philox_2x32<>::rand_int(random_seed, flattened, 0, sparse_factor) == 0) { + if(Philox_2x32<>::rand_int(random_seed, flattened, 0, 100) < sparse_factor) { points.push_back(it.p); } } else { - if(flattened % sparse_factor == 0) { + if( (99 * flattened) % 100 < sparse_factor) { points.push_back(it.p); } } @@ -176,6 +178,7 @@ class ByfieldTest : public TestInterface { int num_nodes = 1000; int num_pieces = 4; int num_colors = 4; + size_t buffer_size = 100; std::string filename; ByfieldTest(int argc, const char *argv[]) @@ -194,11 +197,15 @@ class ByfieldTest : public TestInterface { num_colors = atoi(argv[++i]); continue; } + if(!strcmp(argv[i], "-b")) { + buffer_size = atoi(argv[++i]); + continue; + } } - if (num_nodes <= 0 || num_pieces <= 0 || num_colors <= 0) { - log_app.error() << "Invalid config: nodes=" << num_nodes << " colors=" << num_colors << " pieces=" << num_pieces << "\n"; + if (num_nodes <= 0 || num_pieces <= 0 || num_colors <= 0 || buffer_size <= 0 || buffer_size > 100) { + log_app.error() << "Invalid config: nodes=" << num_nodes << " colors=" << num_colors << " pieces=" << num_pieces << " buffer size=" << buffer_size << "\n"; exit(1); } } @@ -269,8 +276,8 @@ class ByfieldTest : public TestInterface { virtual void print_info(void) { - printf("Realm %dD Byfield dependent partitioning test: %d nodes, %d colors, %d pieces\n", (int) N, - (int)num_nodes, (int) num_colors, (int)num_pieces); + //printf("Realm %dD Byfield dependent partitioning test: %d nodes, %d colors, %d pieces, %lu tile size\n", (int) N, + //(int)num_nodes, (int) num_colors, (int)num_pieces, buffer_size); } virtual Event initialize_data(const std::vector &memories, @@ -380,11 +387,12 @@ class ByfieldTest : public TestInterface { 
is_colors.by_field_buffer_requirements(byfield_inputs, byfield_requirements); + for (int i = 0; i < num_pieces; i++) { - alloc_piece(piece_field_data_gpu[i].scratch_buffer, byfield_requirements[i].upper_bound, gpu_memory).wait(); + size_t alloc_size = byfield_requirements[i].lower_bound + (byfield_requirements[i].upper_bound - byfield_requirements[i].lower_bound) * buffer_size / 100; + alloc_piece(piece_field_data_gpu[i].scratch_buffer, alloc_size, gpu_memory).wait(); } - wait_on_events = true; log_app.info() << "warming up" << Clock::current_time_in_microseconds() << "\n"; Event warmup = is_colors.create_subspaces_by_field(piece_field_data_gpu, colors, @@ -392,16 +400,27 @@ class ByfieldTest : public TestInterface { Realm::ProfilingRequestSet()); warmup.wait(); + long long start_gpu = Clock::current_time_in_microseconds(); Event gpu_call = is_colors.create_subspaces_by_field(piece_field_data_gpu, colors, p_nodes, Realm::ProfilingRequestSet()); + gpu_call.wait(); + long long gpu_time = Clock::current_time_in_microseconds() - start_gpu; + long long start_cpu = Clock::current_time_in_microseconds(); + Event cpu_call = is_colors.create_subspaces_by_field(piece_id_field_data, colors, p_nodes_cpu, Realm::ProfilingRequestSet()); + cpu_call.wait(); + long long cpu_time = Clock::current_time_in_microseconds() - start_cpu; + + printf("RESULT,op=byfield,d1=%d,num_nodes=%d,buffer_size=%zu,gpu_us=%lld,cpu_us=%lld\n", + N, num_nodes, buffer_size, gpu_time, cpu_time); + return Event::merge_events({gpu_call, cpu_call}); } @@ -423,6 +442,12 @@ class ByfieldTest : public TestInterface { log_app.info() << "Checking correctness of partitioning " << "\n"; for(int i = 0; i < num_pieces; i++) { + if (!p_nodes[i].dense() && (N > 1)) { + p_nodes[i].sparsity.impl()->request_bvh(); + if (!p_nodes_cpu[i].dense()) { + p_nodes_cpu[i].sparsity.impl()->request_bvh(); + } + } for(IndexSpaceIterator it(p_nodes[i]); it.valid; it.step()) { for(PointInRectIterator point(it.rect); point.valid; 
point.step()) { if (!p_nodes_cpu[i].contains(point.p)) { @@ -453,9 +478,10 @@ class ImageTest : public TestInterface { // graph config parameters int num_nodes = 1000; int num_edges = 1000; - int sparse_factor = 4; + int sparse_factor = 50; int num_spaces = 4; int num_pieces = 4; + size_t buffer_size = 100; std::string filename; ImageTest(int argc, const char *argv[]) @@ -482,11 +508,15 @@ class ImageTest : public TestInterface { sparse_factor = atoi(argv[++i]); continue; } + if (!strcmp(argv[i], "-b")) { + buffer_size = atoi(argv[++i]); + continue; + } } if (num_nodes <= 0 || num_pieces <= 0 || num_edges <= 0 || num_spaces <= 0) { - log_app.error() << "Invalid config: nodes=" << num_nodes << " colors=" << num_edges << " pieces=" << num_pieces << " sources=" << num_spaces << "\n"; + log_app.error() << "Invalid config: nodes=" << num_nodes << " colors=" << num_edges << " pieces=" << num_pieces << " sources=" << num_spaces << " buffer size=" << buffer_size << "\n"; exit(1); } } @@ -560,8 +590,8 @@ class ImageTest : public TestInterface { virtual void print_info(void) { - printf("Realm %dD -> %dD Image dependent partitioning test: %d nodes, %d edges, %d pieces ,%d sources, %d sparse factor\n", (int) N2, (int) N1, - (int)num_nodes, (int) num_edges, (int)num_pieces, (int) num_spaces, (int) sparse_factor); + //printf("Realm %dD -> %dD Image dependent partitioning test: %d nodes, %d edges, %d pieces ,%d sources, %d sparse factor, %lu tile size\n", (int) N2, (int) N1, + //(int)num_nodes, (int) num_edges, (int)num_pieces, (int) num_spaces, (int) sparse_factor, buffer_size); } virtual Event initialize_data(const std::vector &memories, @@ -689,10 +719,10 @@ class ImageTest : public TestInterface { is_edges.by_image_buffer_requirements(image_subspaces, image_inputs, image_requirements); for (int i = 0; i < num_pieces; i++) { - alloc_piece(point_field_data_gpu[i].scratch_buffer, image_requirements[i].upper_bound, gpu_memory).wait(); + size_t alloc_size = 
image_requirements[i].lower_bound + (image_requirements[i].upper_bound - image_requirements[i].lower_bound) * buffer_size / 100; + alloc_piece(point_field_data_gpu[i].scratch_buffer, alloc_size, gpu_memory).wait(); } - wait_on_events = true; log_app.info() << "warming up" << Clock::current_time_in_microseconds() << "\n"; Event warmup = is_edges.create_subspaces_by_image(point_field_data_gpu, sources, @@ -705,11 +735,19 @@ class ImageTest : public TestInterface { p_edges, Realm::ProfilingRequestSet()); + if ( wait_on_events ) { + gpu_call.wait(); + } + Event cpu_call = is_edges.create_subspaces_by_image(point_field_data, sources, p_edges_cpu, Realm::ProfilingRequestSet()); + if ( wait_on_events ) { + cpu_call.wait(); + } + return Event::merge_events({gpu_call, cpu_call}); } @@ -731,6 +769,14 @@ class ImageTest : public TestInterface { log_app.info() << "Checking correctness of partitioning " << "\n"; for(int i = 0; i < num_pieces; i++) { + if (N1 > 1) { + if (!p_edges[i].dense()) { + p_edges[i].sparsity.impl()->request_bvh(); + } + if (!p_edges_cpu[i].dense()) { + p_edges_cpu[i].sparsity.impl()->request_bvh(); + } + } for(IndexSpaceIterator it(p_edges[i]); it.valid; it.step()) { for(PointInRectIterator point(it.rect); point.valid; point.step()) { if (!p_edges_cpu[i].contains(point.p)) { @@ -764,7 +810,8 @@ class ImageRangeTest : public TestInterface { int rect_size = 10; int num_spaces = 4; int num_pieces = 4; - int sparse_factor = 4; + int sparse_factor = 50; + size_t buffer_size = 100; std::string filename; ImageRangeTest(int argc, const char *argv[]) @@ -795,11 +842,15 @@ class ImageRangeTest : public TestInterface { sparse_factor = atoi(argv[++i]); continue; } + if (!strcmp(argv[i], "-b")) { + buffer_size = atoi(argv[++i]); + continue; + } } - if (num_nodes <= 0 || num_pieces <= 0 || num_edges <= 0 || num_spaces <= 0 || rect_size <= 0) { - log_app.error() << "Invalid config: nodes=" << num_nodes << " colors=" << num_edges << " pieces=" << num_pieces << " 
sources=" << num_spaces << " rect size=" << rect_size << "\n"; + if (num_nodes <= 0 || num_pieces <= 0 || num_edges <= 0 || num_spaces <= 0 || rect_size <= 0 || sparse_factor < 0 || sparse_factor > 100 || buffer_size < 0 || buffer_size > 100) { + log_app.error() << "Invalid config: nodes=" << num_nodes << " colors=" << num_edges << " pieces=" << num_pieces << " sources=" << num_spaces << " rect size=" << rect_size << " sparse factor=" << sparse_factor << " buffer_size=" << buffer_size << "\n"; exit(1); } } @@ -876,8 +927,8 @@ class ImageRangeTest : public TestInterface { virtual void print_info(void) { - printf("Realm %dD -> %dD Image Range dependent partitioning test: %d nodes, %d edges, %d pieces ,%d sources, %d rect size, %d sparse factor\n", (int) N2, (int) N1, - (int)num_nodes, (int) num_edges, (int)num_pieces, (int) num_spaces, (int) rect_size, (int) sparse_factor); + //printf("Realm %dD -> %dD Image Range dependent partitioning test: %d nodes, %d edges, %d pieces ,%d sources, %d rect size, %d sparse factor, %lu tile size\n", (int) N2, (int) N1, + // (int)num_nodes, (int) num_edges, (int)num_pieces, (int) num_spaces, (int) rect_size, (int) sparse_factor, buffer_size); } virtual Event initialize_data(const std::vector &memories, @@ -1005,10 +1056,10 @@ class ImageRangeTest : public TestInterface { is_edges.by_image_buffer_requirements(image_subspaces, image_inputs, image_requirements); for (int i = 0; i < num_pieces; i++) { - alloc_piece(rect_field_data_gpu[i].scratch_buffer, image_requirements[i].upper_bound, gpu_memory).wait(); + size_t alloc_size = image_requirements[i].lower_bound + (image_requirements[i].upper_bound - image_requirements[i].lower_bound) * buffer_size / 100; + alloc_piece(rect_field_data_gpu[i].scratch_buffer, alloc_size, gpu_memory).wait(); } - wait_on_events = true; log_app.info() << "warming up" << Clock::current_time_in_microseconds() << "\n"; Event warmup = is_edges.create_subspaces_by_image(rect_field_data_gpu, sources, @@ -1021,11 
+1072,19 @@ class ImageRangeTest : public TestInterface { p_edges, Realm::ProfilingRequestSet()); + if ( wait_on_events ) { + gpu_call.wait(); + } + Event cpu_call = is_edges.create_subspaces_by_image(rect_field_data, sources, p_edges_cpu, Realm::ProfilingRequestSet()); + if ( wait_on_events ) { + cpu_call.wait(); + } + return Event::merge_events({gpu_call, cpu_call}); } @@ -1047,6 +1106,16 @@ class ImageRangeTest : public TestInterface { log_app.info() << "Checking correctness of partitioning " << "\n"; for(int i = 0; i < num_spaces; i++) { + + if (N1 > 1) { + if (!p_edges[i].dense()) { + p_edges[i].sparsity.impl()->request_bvh(); + } + if (!p_edges_cpu[i].dense()) { + p_edges_cpu[i].sparsity.impl()->request_bvh(); + } + } + for(IndexSpaceIterator it(p_edges[i]); it.valid; it.step()) { for(PointInRectIterator point(it.rect); point.valid; point.step()) { if (!p_edges_cpu[i].contains(point.p)) { @@ -1079,7 +1148,8 @@ class PreimageTest : public TestInterface { int num_edges = 1000; int num_spaces = 4; int num_pieces = 4; - int sparse_factor = 4; + int sparse_factor = 50; + size_t buffer_size = 100; std::string filename; PreimageTest(int argc, const char *argv[]) @@ -1106,11 +1176,15 @@ class PreimageTest : public TestInterface { sparse_factor = atoi(argv[++i]); continue; } + if (!strcmp(argv[i], "-b")) { + buffer_size = atoi(argv[++i]); + continue; + } } - if (num_nodes <= 0 || num_pieces <= 0 || num_edges <= 0 || num_spaces <= 0) { - log_app.error() << "Invalid config: nodes=" << num_nodes << " colors=" << num_edges << " pieces=" << num_pieces << " targets=" << num_spaces << "\n"; + if (num_nodes <= 0 || num_pieces <= 0 || num_edges <= 0 || num_spaces <= 0 || sparse_factor < 0 || sparse_factor > 100 || buffer_size < 0 || buffer_size > 100) { + log_app.error() << "Invalid config: nodes=" << num_nodes << " colors=" << num_edges << " pieces=" << num_pieces << " targets=" << num_spaces << " sparse factor=" << sparse_factor << " buffer size=" << buffer_size << "\n"; 
exit(1); } } @@ -1184,8 +1258,8 @@ class PreimageTest : public TestInterface { virtual void print_info(void) { - printf("Realm %dD -> %dD Preimage dependent partitioning test: %d nodes, %d edges, %d pieces ,%d targets, %d sparse factor\n", (int) N1, (int) N2, - (int)num_nodes, (int) num_edges, (int)num_pieces, (int) num_spaces, (int) sparse_factor); + //printf("Realm %dD -> %dD Preimage dependent partitioning test: %d nodes, %d edges, %d pieces ,%d targets, %d sparse factor, %lu tile size\n", (int) N1, (int) N2, + //(int)num_nodes, (int) num_edges, (int)num_pieces, (int) num_spaces, (int) sparse_factor, buffer_size); } virtual Event initialize_data(const std::vector &memories, @@ -1314,10 +1388,10 @@ class PreimageTest : public TestInterface { is_nodes.by_preimage_buffer_requirements(preimage_subspaces, preimage_inputs, preimage_requirements); for (int i = 0; i < num_pieces; i++) { - alloc_piece(point_field_data_gpu[i].scratch_buffer, preimage_requirements[i].upper_bound, gpu_memory).wait(); + size_t alloc_size = preimage_requirements[i].lower_bound + (preimage_requirements[i].upper_bound - preimage_requirements[i].lower_bound) * buffer_size / 100; + alloc_piece(point_field_data_gpu[i].scratch_buffer, alloc_size, gpu_memory).wait(); } - wait_on_events = true; log_app.info() << "warming up" << Clock::current_time_in_microseconds() << "\n"; Event warmup = is_nodes.create_subspaces_by_preimage(point_field_data_gpu, targets, @@ -1325,21 +1399,24 @@ class PreimageTest : public TestInterface { Realm::ProfilingRequestSet()); warmup.wait(); + long long gpu_start = Clock::current_time_in_microseconds(); Event gpu_call = is_nodes.create_subspaces_by_preimage(point_field_data_gpu, targets, p_nodes, Realm::ProfilingRequestSet()); gpu_call.wait(); - - long long start = Clock::current_time_in_microseconds(); + long long gpu_us = Clock::current_time_in_microseconds() - gpu_start; + long long cpu_start = Clock::current_time_in_microseconds(); Event cpu_call = 
is_nodes.create_subspaces_by_preimage(point_field_data, targets, p_nodes_cpu, Realm::ProfilingRequestSet()); - cpu_call.wait(); - std::cout << "CPU TIME: " << (Clock::current_time_in_microseconds() - start) / 1000 << " ms\n"; + cpu_call.wait(); + long long cpu_us = Clock::current_time_in_microseconds() - cpu_start; + printf("RESULT,op=preimage,d1=%d,d2=%d,num_nodes=%d,num_edges=%d,sparse_factor=%d,buffer_size=%zu,gpu_us=%lld,cpu_us=%lld\n", + N1, N2, num_nodes, num_edges, sparse_factor, buffer_size, gpu_us, cpu_us); return Event::merge_events({gpu_call, cpu_call}); } @@ -1391,6 +1468,339 @@ class PreimageTest : public TestInterface { } }; +template +class PreimageRangeTest : public TestInterface { +public: + // graph config parameters + int num_nodes = 1000; + int num_edges = 1000; + int rect_size = 10; + int num_spaces = 4; + int num_pieces = 4; + int sparse_factor = 50; + size_t buffer_size = 100; + std::string filename; + + PreimageRangeTest(int argc, const char *argv[]) + { + for(int i = 1; i < argc; i++) { + + if(!strcmp(argv[i], "-p")) { + num_pieces = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-n")) { + num_nodes = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-e")) { + num_edges = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-r")) { + rect_size = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-s")) { + num_spaces = atoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "-f")) { + sparse_factor = atoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "-b")) { + buffer_size = atoi(argv[++i]); + continue; + } + } + + + if (num_nodes <= 0 || num_pieces <= 0 || num_edges <= 0 || num_spaces <= 0 || rect_size <= 0 || sparse_factor < 0 || sparse_factor > 100 || buffer_size < 0 || buffer_size > 100) { + log_app.error() << "Invalid config: nodes=" << num_nodes << " colors=" << num_edges << " pieces=" << num_pieces << " targets=" << num_spaces << " rect size=" << rect_size << " sparse factor=" << sparse_factor << " buffer 
size=" << buffer_size << "\n"; + exit(1); + } + } + + struct InitDataArgs { + int index; + RegionInstance ri_nodes; + }; + + enum PRNGStreams + { + NODE_SUBGRAPH_STREAM, + }; + + // assign subgraph ids to nodes + void chase_rect(int idx, Rect& color) + { + for (int d = 0; d < N2; d++) { + if(random_colors) { + color.lo[d] = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_edges); + color.hi[d] = color.lo[d] + Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, 2 * rect_size); + } else { + color.lo[d] = (idx * num_edges / num_nodes) % num_edges; + color.hi[d] = color.lo[d] + rect_size; + } + } + } + + static void init_data_task_wrapper(const void *args, size_t arglen, + const void *userdata, size_t userlen, Processor p) + { + PreimageRangeTest *me = (PreimageRangeTest *)testcfg; + me->init_data_task(args, arglen, p); + } + + //Each piece has a task to initialize its data + void init_data_task(const void *args, size_t arglen, Processor p) + { + const InitDataArgs &i_args = *(const InitDataArgs *)args; + + log_app.info() << "init task #" << i_args.index << " (ri_nodes=" << i_args.ri_nodes + << ")"; + + i_args.ri_nodes.fetch_metadata(p).wait(); + + IndexSpace nodes_space = i_args.ri_nodes.template get_indexspace(); + + log_app.debug() << "N: " << is_nodes; + + //For each node in the graph, mark it with a random (or deterministic) subgraph id + { + AffineAccessor, N1> a_rect(i_args.ri_nodes, 0 /* offset */); + + for (IndexSpaceIterator it(is_nodes); it.valid; it.step()) { + for (PointInRectIterator point(it.rect); point.valid; point.step()) { + int idx = 0; + int stride = 1; + for (int d = 0; d < N1; d++) { + idx += (point.p[d] - is_nodes.bounds.lo[d]) * stride; + stride *= (is_nodes.bounds.hi[d] - is_nodes.bounds.lo[d] + 1); + } + Rect destination; + chase_rect(idx, destination); + a_rect.write(point.p, destination); + } + } + } + } + + IndexSpace is_nodes; + IndexSpace is_edges; + std::vector ri_nodes; + std::vector, Rect> > 
rect_field_data; + + virtual void print_info(void) + { + printf("Realm %dD -> %dD Preimage Range dependent partitioning test: %d nodes, %d edges, %d pieces ,%d targets, %d rect size, %d sparse factor, %lu tile size\n", (int) N1, (int) N2, + (int)num_nodes, (int) num_edges, (int)num_pieces, (int) num_spaces, (int) rect_size, (int) sparse_factor, buffer_size); + } + + virtual Event initialize_data(const std::vector &memories, + const std::vector &procs) + { + // now create index space for nodes + Point node_lo, node_hi; + for (int d = 0; d < N1; d++) { + node_lo[d] = 0; + node_hi[d] = num_nodes - 1; + } + is_nodes = Rect(node_lo, node_hi); + + Point edge_lo, edge_hi; + for (int d = 0; d < N2; d++) { + edge_lo[d] = 0; + edge_hi[d] = num_edges - 1; + } + is_edges = Rect(edge_lo, edge_hi); + + // equal partition is used to do initial population of edges and nodes + std::vector > ss_nodes_eq; + + log_app.info() << "Creating equal subspaces\n"; + + is_nodes.create_equal_subspaces(num_pieces, 1, ss_nodes_eq, Realm::ProfilingRequestSet()).wait(); + + // create instances for each of these subspaces + std::vector node_fields; + node_fields.push_back(sizeof(Rect)); + + ri_nodes.resize(num_pieces); + rect_field_data.resize(num_pieces); + + for(size_t i = 0; i < ss_nodes_eq.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, memories[i % memories.size()], ss_nodes_eq[i], + node_fields, 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + ri_nodes[i] = ri; + + rect_field_data[i].index_space = ss_nodes_eq[i]; + rect_field_data[i].inst = ri_nodes[i]; + rect_field_data[i].field_offset = 0; + } + + // fire off tasks to initialize data + std::set events; + for(int i = 0; i < num_pieces; i++) { + Processor p = procs[i % procs.size()]; + InitDataArgs args; + args.index = i; + args.ri_nodes = ri_nodes[i]; + Event e = p.spawn(INIT_PREIMAGE_RANGE_DATA_TASK, &args, sizeof(args)); + events.insert(e); + } + + return Event::merge_events(events); + } + + // the outputs of 
our partitioning will be: + // p_nodes - nodes partitioned by subgraph id (from GPU) + // p_nodes_cpu - nodes partitioned by subgraph id (from CPU) + + std::vector > p_nodes, p_garbage_nodes, p_nodes_cpu; + + virtual Event perform_partitioning(void) + { + // Partition nodes by subgraph id - do this twice, once on CPU and once on GPU + // Ensure that the results are identical + + std::vector> targets; + if (sparse_factor <= 1) { + is_edges.create_equal_subspaces(num_spaces, 1, targets, Realm::ProfilingRequestSet()).wait(); + } else { + targets.resize(num_spaces); + for (int i = 0; i < num_spaces; i++) { + targets[i] = create_sparse_index_space(is_edges.bounds, sparse_factor, random_colors, i); + } + } + + // We need a GPU memory for GPU partitioning + Memory gpu_memory; + bool found_gpu_memory = false; + Machine machine = Machine::get_machine(); + std::set all_memories; + machine.get_all_memories(all_memories); + for(Memory memory : all_memories) { + if(memory.kind() == Memory::GPU_FB_MEM) { + gpu_memory = memory; + found_gpu_memory = true; + break; + } + } + if (!found_gpu_memory) { + log_app.error() << "No GPU memory found for partitioning test\n"; + return Event::NO_EVENT; + } + + + std::vector node_fields; + node_fields.push_back(sizeof(Rect)); + + std::vector, Rect>> rect_field_data_gpu; + rect_field_data_gpu.resize(num_pieces); + + for (int i = 0; i < num_pieces; i++) { + copy_piece(rect_field_data[i], rect_field_data_gpu[i], node_fields, 0, gpu_memory).wait(); + } + + std::vector> preimage_inputs(num_pieces); + std::vector> preimage_subspaces(num_spaces); + std::vector preimage_requirements(num_pieces); + + for (int i = 0; i < num_pieces; i++) { + preimage_inputs[i].location = rect_field_data_gpu[i].inst.get_location(); + preimage_inputs[i].space = rect_field_data_gpu[i].index_space; + } + + for (int i = 0; i < num_spaces; i++) { + preimage_subspaces[i].space = targets[i]; + preimage_subspaces[i].entries = targets[i].dense() ? 
1 : targets[i].sparsity.impl()->get_entries().size(); + } + + is_nodes.by_preimage_buffer_requirements(preimage_subspaces, preimage_inputs, preimage_requirements); + + for (int i = 0; i < num_pieces; i++) { + size_t alloc_size = preimage_requirements[i].lower_bound + (preimage_requirements[i].upper_bound - preimage_requirements[i].lower_bound) * buffer_size / 100; + alloc_piece(rect_field_data_gpu[i].scratch_buffer, alloc_size, gpu_memory).wait(); + } + + log_app.info() << "warming up" << Clock::current_time_in_microseconds() << "\n"; + Event warmup = is_nodes.create_subspaces_by_preimage(rect_field_data_gpu, + targets, + p_garbage_nodes, + Realm::ProfilingRequestSet()); + warmup.wait(); + + Event gpu_call = is_nodes.create_subspaces_by_preimage(rect_field_data_gpu, + targets, + p_nodes, + Realm::ProfilingRequestSet()); + + if ( wait_on_events ) { + gpu_call.wait(); + } + Event cpu_call = is_nodes.create_subspaces_by_preimage(rect_field_data, + targets, + p_nodes_cpu, + Realm::ProfilingRequestSet()); + + if ( wait_on_events ) { + cpu_call.wait(); + } + + return Event::merge_events({gpu_call, cpu_call}); + } + + virtual int perform_dynamic_checks(void) + { + // Nothing to do here + return 0; + } + + virtual int check_partitioning(void) + { + int errors = 0; + + if (!p_nodes.size()) { + return p_nodes.size() != p_nodes_cpu.size(); + } + + log_app.info() << "Checking correctness of partitioning " << "\n"; + + for(int i = 0; i < num_spaces; i++) { + if (!p_nodes[i].dense() && (N1 > 1)) { + p_nodes[i].sparsity.impl()->request_bvh(); + if (!p_nodes_cpu[i].dense()) { + p_nodes_cpu[i].sparsity.impl()->request_bvh(); + } + } + for(IndexSpaceIterator it(p_nodes[i]); it.valid; it.step()) { + for(PointInRectIterator point(it.rect); point.valid; point.step()) { + if (!p_nodes_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! 
GPU has extra image point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for(IndexSpaceIterator it(p_nodes_cpu[i]); it.valid; it.step()) { + for(PointInRectIterator point(it.rect); point.valid; point.step()) { + if (!p_nodes[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing image point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + + } + return errors; + } +}; + void top_level_task(const void *args, size_t arglen, const void *userdata, size_t userlen, Processor p) { @@ -1470,7 +1880,6 @@ void top_level_task(const void *args, size_t arglen, const void *userdata, size_ exit(1); } - printf("all done!\n"); } // Constructor function-pointer type @@ -1524,6 +1933,18 @@ static constexpr CtorFn PREIMAGE_CTORS[3][3] = { { &make_preimage<3,1>, &make_preimage<3,2>, &make_preimage<3,3> }, }; +// ---- Image constructors ---- +template +static TestInterface* make_preimage_range(int argc, const char** argv) { + return new PreimageRangeTest(argc, argv); +} + +static constexpr CtorFn PREIMAGE_RANGE_CTORS[3][3] = { + { &make_preimage_range<1,1>, &make_preimage_range<1,2>, &make_preimage_range<1,3> }, + { &make_preimage_range<2,1>, &make_preimage_range<2,2>, &make_preimage_range<2,3> }, + { &make_preimage_range<3,1>, &make_preimage_range<3,2>, &make_preimage_range<3,3> }, +}; + using TaskWrapperFn = void (*)(const void*, size_t, const void*, size_t, Processor); static constexpr TaskWrapperFn BYFIELD_INIT_TBL[3] = { @@ -1550,6 +1971,12 @@ static constexpr TaskWrapperFn PREIMAGE_INIT_TBL[3][3] = { { &PreimageTest<3,1>::init_data_task_wrapper, &PreimageTest<3,2>::init_data_task_wrapper, &PreimageTest<3,3>::init_data_task_wrapper }, }; +static constexpr TaskWrapperFn PREIMAGE_RANGE_INIT_TBL[3][3] = { + { &PreimageRangeTest<1,1>::init_data_task_wrapper, &PreimageRangeTest<1,2>::init_data_task_wrapper, &PreimageRangeTest<1,3>::init_data_task_wrapper }, + { &PreimageRangeTest<2,1>::init_data_task_wrapper, 
&PreimageRangeTest<2,2>::init_data_task_wrapper, &PreimageRangeTest<2,3>::init_data_task_wrapper }, + { &PreimageRangeTest<3,1>::init_data_task_wrapper, &PreimageRangeTest<3,2>::init_data_task_wrapper, &PreimageRangeTest<3,3>::init_data_task_wrapper }, +}; + int main(int argc, char **argv) { Runtime rt; @@ -1597,6 +2024,7 @@ int main(int argc, char **argv) if (dimension1 < 1 || dimension1 > 3) assert(false && "invalid dimension"); + op = "byfield"; testcfg = BYFIELD_CTORS[dimension1 - 1](argc - i, const_cast(argv + i)); break; } @@ -1604,13 +2032,15 @@ int main(int argc, char **argv) if(!strcmp(argv[i], "image")) { if (dimension1 < 1 || dimension1 > 3 || dimension2 < 1 || dimension2 > 3) assert(false && "invalid dimension"); + op = "image"; testcfg = IMAGE_CTORS[dimension1 - 1][dimension2 - 1](argc - i, const_cast(argv + i)); break; } - if(!strcmp(argv[i], "range")) { + if(!strcmp(argv[i], "irange")) { if (dimension1 < 1 || dimension1 > 3 || dimension2 < 1 || dimension2 > 3) assert(false && "invalid dimension"); + op = "irange"; testcfg = IMAGE_RANGE_CTORS[dimension1 - 1][dimension2 - 1](argc - i, const_cast(argv + i)); break; } @@ -1618,10 +2048,19 @@ int main(int argc, char **argv) if(!strcmp(argv[i], "preimage")) { if (dimension1 < 1 || dimension1 > 3 || dimension2 < 1 || dimension2 > 3) assert(false && "invalid dimension"); + op = "preimage"; testcfg = PREIMAGE_CTORS[dimension1 - 1][dimension2 - 1](argc - i, const_cast(argv + i)); break; } + if(!strcmp(argv[i], "prange")) { + if (dimension1 < 1 || dimension1 > 3 || dimension2 < 1 || dimension2 > 3) + assert(false && "invalid dimension"); + op = "prange"; + testcfg = PREIMAGE_RANGE_CTORS[dimension1 - 1][dimension2 - 1](argc - i, const_cast(argv + i)); + break; + } + // printf("unknown parameter: %s\n", argv[i]); } @@ -1639,6 +2078,7 @@ int main(int argc, char **argv) rt.register_task(INIT_IMAGE_DATA_TASK, IMAGE_INIT_TBL[dimension1 - 1][dimension2 - 1]); rt.register_task(INIT_IMAGE_RANGE_DATA_TASK, 
IMAGE_RANGE_INIT_TBL[dimension1 - 1][dimension2 - 1]); rt.register_task(INIT_PREIMAGE_DATA_TASK, PREIMAGE_INIT_TBL[dimension1 - 1][dimension2 - 1]); + rt.register_task(INIT_PREIMAGE_RANGE_DATA_TASK, PREIMAGE_RANGE_INIT_TBL[dimension1 - 1][dimension2 - 1]); signal(SIGALRM, sigalrm_handler); From 83cb1d67c98ec27cffd9f3300c21972e1255e788 Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Tue, 10 Mar 2026 21:47:53 -0700 Subject: [PATCH 20/32] trying full benchmark --- src/realm/deppart/byfield.cc | 2 +- tests/benchmark.cc | 34 +++++++++++++++++++--------------- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/src/realm/deppart/byfield.cc b/src/realm/deppart/byfield.cc index 06c936f0b2..78aceb2f92 100644 --- a/src/realm/deppart/byfield.cc +++ b/src/realm/deppart/byfield.cc @@ -45,7 +45,7 @@ namespace Realm { if (val) { device_size = atoi(val); } - size_t optimal_size = is.bounds.volume() * 10 * sizeof(RectDesc); + size_t optimal_size = is.bounds.volume() * 20 * sizeof(RectDesc); Processor best_proc = Processor::NO_PROC; assert(choose_proc(best_proc, mem)); requirements[i].affinity_processor = best_proc; diff --git a/tests/benchmark.cc b/tests/benchmark.cc index 177ceeb558..3259644270 100644 --- a/tests/benchmark.cc +++ b/tests/benchmark.cc @@ -113,6 +113,7 @@ Event copy_piece(FieldDataDescriptor src_data, FieldDataDescriptor src_data, FieldDataDescriptor src_fields = {src_field}; std::vector dst_fields = {dst_field}; @@ -664,7 +664,7 @@ class ImageTest : public TestInterface { // Partition nodes by subgraph id - do this twice, once on CPU and once on GPU // Ensure that the results are identical - std::vector> sources(num_pieces); + std::vector> sources(num_spaces); for(int i = 0; i < num_spaces; i++) { if (sparse_factor <= 1) { sources[i] = point_field_data[i % num_pieces].index_space; @@ -730,23 +730,24 @@ class ImageTest : public TestInterface { Realm::ProfilingRequestSet()); warmup.wait(); + long long start_gpu = Clock::current_time_in_microseconds(); 
Event gpu_call = is_edges.create_subspaces_by_image(point_field_data_gpu, sources, p_edges, Realm::ProfilingRequestSet()); - if ( wait_on_events ) { - gpu_call.wait(); - } - + gpu_call.wait(); + long long gpu_us = Clock::current_time_in_microseconds() - start_gpu; + long long start_cpu = Clock::current_time_in_microseconds(); Event cpu_call = is_edges.create_subspaces_by_image(point_field_data, sources, p_edges_cpu, Realm::ProfilingRequestSet()); - if ( wait_on_events ) { - cpu_call.wait(); - } + cpu_call.wait(); + long long cpu_us = Clock::current_time_in_microseconds() - start_cpu; + printf("RESULT,op=image,d1=%d,d2=%d,num_nodes=%d,num_edges=%d,num_spaces=%d,sparse_factor=%d,buffer_size=%zu,gpu_us=%lld,cpu_us=%lld\n", + N1, N2, num_nodes, num_edges, num_spaces, sparse_factor, buffer_size, gpu_us, cpu_us); return Event::merge_events({gpu_call, cpu_call}); @@ -1067,23 +1068,26 @@ class ImageRangeTest : public TestInterface { Realm::ProfilingRequestSet()); warmup.wait(); + long long start_gpu = Clock::current_time_in_microseconds(); Event gpu_call = is_edges.create_subspaces_by_image(rect_field_data_gpu, sources, p_edges, Realm::ProfilingRequestSet()); - if ( wait_on_events ) { - gpu_call.wait(); - } + gpu_call.wait(); + long long gpu_us = Clock::current_time_in_microseconds() - start_gpu; + long long start_cpu = Clock::current_time_in_microseconds(); Event cpu_call = is_edges.create_subspaces_by_image(rect_field_data, sources, p_edges_cpu, Realm::ProfilingRequestSet()); - if ( wait_on_events ) { - cpu_call.wait(); - } + cpu_call.wait(); + long long cpu_us = Clock::current_time_in_microseconds() - start_cpu; + + printf("RESULT,op=image,d1=%d,d2=%d,num_nodes=%d,num_edges=%d,num_spaces=%d,sparse_factor=%d,buffer_size=%zu,gpu_us=%lld,cpu_us=%lld\n", + N1, N2, num_nodes, num_edges, num_spaces, sparse_factor, buffer_size, gpu_us, cpu_us); return Event::merge_events({gpu_call, cpu_call}); From a55e5c69cf36fc20995cdf9691ac71d5718b4284 Mon Sep 17 00:00:00 2001 From: Rohan 
Chanani Date: Tue, 10 Mar 2026 23:09:49 -0700 Subject: [PATCH 21/32] bumped upper bounds --- src/realm/deppart/image.cc | 2 +- src/realm/deppart/preimage.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/realm/deppart/image.cc b/src/realm/deppart/image.cc index ec8cfb834d..c0656d4b59 100644 --- a/src/realm/deppart/image.cc +++ b/src/realm/deppart/image.cc @@ -74,7 +74,7 @@ namespace Realm { device_size = atoi(val); } minimal_size = max(minimal_size, device_size); - size_t optimal_size = is.bounds.volume() * sizeof(Rect) * source_spaces.size() * 10 + minimal_size; + size_t optimal_size = is.bounds.volume() * sizeof(Rect) * source_spaces.size() * 20 + minimal_size; Processor best_proc; assert(choose_proc(best_proc, mem)); requirements[i].affinity_processor = best_proc; diff --git a/src/realm/deppart/preimage.cc b/src/realm/deppart/preimage.cc index 37bfef188a..9ac7d85606 100644 --- a/src/realm/deppart/preimage.cc +++ b/src/realm/deppart/preimage.cc @@ -76,7 +76,7 @@ namespace Realm { device_size = atoi(val); } minimal_size = max(minimal_size, device_size); - size_t optimal_size = is.bounds.volume() * sizeof(Rect) * target_spaces.size() * 10 + minimal_size; + size_t optimal_size = is.bounds.volume() * sizeof(Rect) * target_spaces.size() * 20 + minimal_size; Processor best_proc = Processor::NO_PROC; assert(choose_proc(best_proc, mem)); requirements[i].affinity_processor = best_proc; From 669b69a303a0293308342606b68de48751240ba5 Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Wed, 11 Mar 2026 00:38:25 -0700 Subject: [PATCH 22/32] fixed construct input rectlist --- src/realm/deppart/partitions_gpu_impl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/realm/deppart/partitions_gpu_impl.hpp b/src/realm/deppart/partitions_gpu_impl.hpp index d136c2138b..90f3a9056d 100644 --- a/src/realm/deppart/partitions_gpu_impl.hpp +++ b/src/realm/deppart/partitions_gpu_impl.hpp @@ -426,7 +426,7 @@ namespace Realm { 
CUDA_CHECK(cudaMemsetAsync(counters, 0, (lhs.num_children) * sizeof(uint32_t), stream), stream); BVH my_bvh; - bool bvh_valid = rhs.num_children < rhs.num_entries; + bool bvh_valid = rhs.num_children < rhs.num_entries && lhs.num_children < lhs.num_entries; if (bvh_valid) { build_bvh(rhs, my_bvh, my_arena, stream); } From 17003b1bb2fa70faea534919d833392b7a766ead Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Wed, 11 Mar 2026 12:32:55 -0700 Subject: [PATCH 23/32] fixed overflow --- src/realm/deppart/image.cc | 4 +++- src/realm/deppart/image_gpu_impl.hpp | 6 +++++- src/realm/deppart/partitions_gpu_impl.hpp | 12 +++++++++++- tests/benchmark.cc | 4 +++- 4 files changed, 22 insertions(+), 4 deletions(-) diff --git a/src/realm/deppart/image.cc b/src/realm/deppart/image.cc index c0656d4b59..edc8ffc010 100644 --- a/src/realm/deppart/image.cc +++ b/src/realm/deppart/image.cc @@ -74,13 +74,15 @@ namespace Realm { device_size = atoi(val); } minimal_size = max(minimal_size, device_size); - size_t optimal_size = is.bounds.volume() * sizeof(Rect) * source_spaces.size() * 20 + minimal_size; + size_t optimal_size = is.bounds.volume() * sizeof(RectDesc) * source_spaces.size() * 20 + minimal_size; + optimal_size += 2 * (is.dense() ? 
1 : is.sparsity.impl()->get_entries().size()) * sizeof(Rect) * source_entries; Processor best_proc; assert(choose_proc(best_proc, mem)); requirements[i].affinity_processor = best_proc; requirements[i].lower_bound = minimal_size; requirements[i].upper_bound = optimal_size; requirements[i].minimum_alignment = 128; + std::cout << "UPPER BOUND IS " << optimal_size << std::endl; } else { requirements[i].affinity_processor = Processor::NO_PROC; requirements[i].lower_bound = 0; diff --git a/src/realm/deppart/image_gpu_impl.hpp b/src/realm/deppart/image_gpu_impl.hpp index 43682e06dd..48faad0585 100644 --- a/src/realm/deppart/image_gpu_impl.hpp +++ b/src/realm/deppart/image_gpu_impl.hpp @@ -332,7 +332,7 @@ void GPUImageMicroOp::gpu_populate_ptrs() CUstream stream = this->stream->get_stream(); size_t tile_size = buffer.get_layout()->bytes_used; - //std::cout << "Using tile size of " << tile_size << " bytes." << std::endl; + std::cout << "Using tile size of " << tile_size << " bytes." << std::endl; Arena buffer_arena(buffer.pointer_untyped(0, tile_size), tile_size); collapsed_space src_space; @@ -449,6 +449,10 @@ void GPUImageMicroOp::gpu_populate_ptrs() buffer_arena.flip_parity(); PointDesc* d_valid_points = buffer_arena.alloc>(num_valid_points); + buffer_arena.start(); + d_valid_points = buffer_arena.alloc>(num_valid_points); + + std::cout << "Tile has " << num_valid_rects << " valid rects and " << num_valid_points << " valid points." 
<< std::endl; CUDA_CHECK(cudaMemsetAsync(d_inst_counters, 0, (domain_transform.ptr_data.size()) * sizeof(uint32_t), stream), stream); diff --git a/src/realm/deppart/partitions_gpu_impl.hpp b/src/realm/deppart/partitions_gpu_impl.hpp index 90f3a9056d..722a4113df 100644 --- a/src/realm/deppart/partitions_gpu_impl.hpp +++ b/src/realm/deppart/partitions_gpu_impl.hpp @@ -426,7 +426,7 @@ namespace Realm { CUDA_CHECK(cudaMemsetAsync(counters, 0, (lhs.num_children) * sizeof(uint32_t), stream), stream); BVH my_bvh; - bool bvh_valid = rhs.num_children < rhs.num_entries && lhs.num_children < lhs.num_entries; + bool bvh_valid = rhs.num_children < rhs.num_entries && lhs.num_children < lhs.num_entries && lhs.num_entries > 1000; if (bvh_valid) { build_bvh(rhs, my_bvh, my_arena, stream); } @@ -1462,10 +1462,16 @@ namespace Realm { size_t max_aux_bytes = std::max({bytes_T, bytes_S, bytes_R}); size_t max_pg_bytes = std::max({bytes_p, bytes_S}); + std::cout << "COMPLETE PIPELINE HAS USED " << my_arena.used() << " bytes" << " out of " << my_arena.capacity() << std::endl; + std::cout << "TOTAL POINTS IS " << total_pts << std::endl; + + std::cout << "AUX BYTES: " << max_aux_bytes << std::endl; // Instance shared by coordinate keys, source keys, and rectangle outputs char* aux_ptr = my_arena.alloc(2 * max_aux_bytes); + std::cout << "PG BYTES: " << max_pg_bytes << std::endl; + //Instance shared by group ids (RLE) and intermediate points in sorting char* pg_ptr = my_arena.alloc(max_pg_bytes); @@ -1492,8 +1498,12 @@ namespace Realm { //Temporary storage instance shared by CUB operations. 
size_t temp_bytes = std::max({t1, t2, t3}); + + std::cout << "TEMP BYTES: " << temp_bytes << std::endl; void *temp_storage = my_arena.alloc(temp_bytes); + std::cout << "TOTAL BYTES: " << my_arena.used() + temp_bytes << std::endl; + //Sort along each dimension from LSB to MSB (0 to N-1) size_t use_bytes = temp_bytes; diff --git a/tests/benchmark.cc b/tests/benchmark.cc index 3259644270..cc3b17a634 100644 --- a/tests/benchmark.cc +++ b/tests/benchmark.cc @@ -137,7 +137,7 @@ Event alloc_piece(RegionInstance &result, size_t size, Memory location) { assert(location != Memory::NO_MEMORY); assert(size > 0); std::vector byte_fields = {sizeof(char)}; - IndexSpace<1> instance_index_space(Rect<1>(0, size-1)); + IndexSpace<1, long long> instance_index_space(Rect<1, long long>(0, size-1)); return RegionInstance::create_instance(result, location, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()); } @@ -720,6 +720,7 @@ class ImageTest : public TestInterface { for (int i = 0; i < num_pieces; i++) { size_t alloc_size = image_requirements[i].lower_bound + (image_requirements[i].upper_bound - image_requirements[i].lower_bound) * buffer_size / 100; + std::cout << "Allocating scratch buffer with size " << alloc_size << " for piece " << i << "\n"; alloc_piece(point_field_data_gpu[i].scratch_buffer, alloc_size, gpu_memory).wait(); } @@ -1058,6 +1059,7 @@ class ImageRangeTest : public TestInterface { for (int i = 0; i < num_pieces; i++) { size_t alloc_size = image_requirements[i].lower_bound + (image_requirements[i].upper_bound - image_requirements[i].lower_bound) * buffer_size / 100; + std::cout << "allocating buffer of size " << alloc_size << " for piece " << i << "\n"; alloc_piece(rect_field_data_gpu[i].scratch_buffer, alloc_size, gpu_memory).wait(); } From dc8d5743838ea06b342775527683b78ea242a7c6 Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Wed, 11 Mar 2026 12:34:37 -0700 Subject: [PATCH 24/32] fixed overflow --- src/realm/deppart/image.cc | 1 - 
src/realm/deppart/partitions_gpu_impl.hpp | 11 ----------- tests/benchmark.cc | 2 -- 3 files changed, 14 deletions(-) diff --git a/src/realm/deppart/image.cc b/src/realm/deppart/image.cc index edc8ffc010..9eaf7b8197 100644 --- a/src/realm/deppart/image.cc +++ b/src/realm/deppart/image.cc @@ -82,7 +82,6 @@ namespace Realm { requirements[i].lower_bound = minimal_size; requirements[i].upper_bound = optimal_size; requirements[i].minimum_alignment = 128; - std::cout << "UPPER BOUND IS " << optimal_size << std::endl; } else { requirements[i].affinity_processor = Processor::NO_PROC; requirements[i].lower_bound = 0; diff --git a/src/realm/deppart/partitions_gpu_impl.hpp b/src/realm/deppart/partitions_gpu_impl.hpp index 722a4113df..93cfc5582b 100644 --- a/src/realm/deppart/partitions_gpu_impl.hpp +++ b/src/realm/deppart/partitions_gpu_impl.hpp @@ -1462,16 +1462,9 @@ namespace Realm { size_t max_aux_bytes = std::max({bytes_T, bytes_S, bytes_R}); size_t max_pg_bytes = std::max({bytes_p, bytes_S}); - std::cout << "COMPLETE PIPELINE HAS USED " << my_arena.used() << " bytes" << " out of " << my_arena.capacity() << std::endl; - std::cout << "TOTAL POINTS IS " << total_pts << std::endl; - - std::cout << "AUX BYTES: " << max_aux_bytes << std::endl; - // Instance shared by coordinate keys, source keys, and rectangle outputs char* aux_ptr = my_arena.alloc(2 * max_aux_bytes); - std::cout << "PG BYTES: " << max_pg_bytes << std::endl; - //Instance shared by group ids (RLE) and intermediate points in sorting char* pg_ptr = my_arena.alloc(max_pg_bytes); @@ -1499,12 +1492,8 @@ namespace Realm { //Temporary storage instance shared by CUB operations. 
size_t temp_bytes = std::max({t1, t2, t3}); - std::cout << "TEMP BYTES: " << temp_bytes << std::endl; void *temp_storage = my_arena.alloc(temp_bytes); - std::cout << "TOTAL BYTES: " << my_arena.used() + temp_bytes << std::endl; - - //Sort along each dimension from LSB to MSB (0 to N-1) size_t use_bytes = temp_bytes; diff --git a/tests/benchmark.cc b/tests/benchmark.cc index cc3b17a634..b0bed444e1 100644 --- a/tests/benchmark.cc +++ b/tests/benchmark.cc @@ -720,7 +720,6 @@ class ImageTest : public TestInterface { for (int i = 0; i < num_pieces; i++) { size_t alloc_size = image_requirements[i].lower_bound + (image_requirements[i].upper_bound - image_requirements[i].lower_bound) * buffer_size / 100; - std::cout << "Allocating scratch buffer with size " << alloc_size << " for piece " << i << "\n"; alloc_piece(point_field_data_gpu[i].scratch_buffer, alloc_size, gpu_memory).wait(); } @@ -1059,7 +1058,6 @@ class ImageRangeTest : public TestInterface { for (int i = 0; i < num_pieces; i++) { size_t alloc_size = image_requirements[i].lower_bound + (image_requirements[i].upper_bound - image_requirements[i].lower_bound) * buffer_size / 100; - std::cout << "allocating buffer of size " << alloc_size << " for piece " << i << "\n"; alloc_piece(rect_field_data_gpu[i].scratch_buffer, alloc_size, gpu_memory).wait(); } From 0e836f0d98ad61e32ecc1f230a54ff1e9c102184 Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Wed, 11 Mar 2026 13:02:37 -0700 Subject: [PATCH 25/32] removed prints --- src/realm/deppart/image_gpu_impl.hpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/realm/deppart/image_gpu_impl.hpp b/src/realm/deppart/image_gpu_impl.hpp index 48faad0585..fa25ab5632 100644 --- a/src/realm/deppart/image_gpu_impl.hpp +++ b/src/realm/deppart/image_gpu_impl.hpp @@ -332,7 +332,7 @@ void GPUImageMicroOp::gpu_populate_ptrs() CUstream stream = this->stream->get_stream(); size_t tile_size = buffer.get_layout()->bytes_used; - std::cout << "Using tile size of " << 
tile_size << " bytes." << std::endl; + //std::cout << "Using tile size of " << tile_size << " bytes." << std::endl; Arena buffer_arena(buffer.pointer_untyped(0, tile_size), tile_size); collapsed_space src_space; @@ -452,8 +452,6 @@ void GPUImageMicroOp::gpu_populate_ptrs() buffer_arena.start(); d_valid_points = buffer_arena.alloc>(num_valid_points); - std::cout << "Tile has " << num_valid_rects << " valid rects and " << num_valid_points << " valid points." << std::endl; - CUDA_CHECK(cudaMemsetAsync(d_inst_counters, 0, (domain_transform.ptr_data.size()) * sizeof(uint32_t), stream), stream); image_gpuPopulateBitmasksPtrsKernel<<>>(d_accessors, d_valid_rects, collapsed_parent.entries_buffer, d_prefix_rects, d_inst_prefix, d_prefix_points, total_pts, num_valid_rects, domain_transform.ptr_data.size(), collapsed_parent.num_entries, d_inst_counters, d_valid_points); From 27771caa56b5ebc8d62364084cb6e250dbb67d1d Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Wed, 11 Mar 2026 17:18:48 -0700 Subject: [PATCH 26/32] picked better host memories --- src/realm/deppart/byfield_gpu_impl.hpp | 6 ++--- src/realm/deppart/image_gpu_impl.hpp | 14 +++++----- src/realm/deppart/partitions.h | 10 ++++--- src/realm/deppart/partitions_gpu_impl.hpp | 33 +++++++++++------------ src/realm/deppart/preimage_gpu_impl.hpp | 12 ++++----- 5 files changed, 39 insertions(+), 36 deletions(-) diff --git a/src/realm/deppart/byfield_gpu_impl.hpp b/src/realm/deppart/byfield_gpu_impl.hpp index 8e1c953730..4d59d30b54 100644 --- a/src/realm/deppart/byfield_gpu_impl.hpp +++ b/src/realm/deppart/byfield_gpu_impl.hpp @@ -31,7 +31,7 @@ void GPUByFieldMicroOp::execute() //std::cout << "Using tile size of " << tile_size << " bytes." 
<< std::endl; - Arena buffer_arena(field_data[0].scratch_buffer.pointer_untyped(0, tile_size), tile_size); + Arena buffer_arena(field_data[0].scratch_buffer); inst_space.offsets = buffer_arena.alloc(field_data.size() + 1); inst_space.num_children = field_data.size(); @@ -76,7 +76,7 @@ void GPUByFieldMicroOp::execute() Memory zcpy_mem; - assert(find_memory(zcpy_mem, Memory::Z_COPY_MEM)); + assert(find_memory(zcpy_mem, Memory::Z_COPY_MEM, buffer_arena.location)); // We need to pass the accessors to the GPU so it can read field values. RegionInstance accessors_instance = this->realm_malloc(field_data.size() * sizeof(AffineAccessor), zcpy_mem); @@ -94,7 +94,7 @@ void GPUByFieldMicroOp::execute() } Memory sysmem; - assert(find_memory(sysmem, Memory::SYSTEM_MEM)); + assert(find_memory(sysmem, Memory::SYSTEM_MEM, buffer_arena.location)); size_t num_output = 0; RectDesc* output_start = nullptr; diff --git a/src/realm/deppart/image_gpu_impl.hpp b/src/realm/deppart/image_gpu_impl.hpp index fa25ab5632..7bac9f9054 100644 --- a/src/realm/deppart/image_gpu_impl.hpp +++ b/src/realm/deppart/image_gpu_impl.hpp @@ -49,7 +49,7 @@ void GPUImageMicroOp::gpu_populate_rngs() RegionInstance buffer = domain_transform.range_data[0].scratch_buffer; size_t tile_size = buffer.get_layout()->bytes_used; //std::cout << "Using tile size of " << tile_size << " bytes." 
<< std::endl; - Arena buffer_arena(buffer.pointer_untyped(0, tile_size), tile_size); + Arena buffer_arena(buffer); CUstream stream = this->stream->get_stream(); @@ -81,7 +81,7 @@ void GPUImageMicroOp::gpu_populate_rngs() GPUMicroOp::collapse_parent_space(parent_space, collapsed_parent, buffer_arena, stream); Memory zcpy_mem; - assert(find_memory(zcpy_mem, Memory::Z_COPY_MEM)); + assert(find_memory(zcpy_mem, Memory::Z_COPY_MEM, buffer_arena.location)); RegionInstance accessors_instance = this->realm_malloc(domain_transform.range_data.size() * sizeof(AffineAccessor,N2,T2>), zcpy_mem); AffineAccessor,N2,T2>* d_accessors = reinterpret_cast,N2,T2>*>(AffineAccessor(accessors_instance, 0).base); for (size_t i = 0; i < domain_transform.range_data.size(); ++i) { @@ -326,14 +326,14 @@ void GPUImageMicroOp::gpu_populate_ptrs() NVTX_DEPPART(gpu_image); - Memory sysmem; - find_memory(sysmem, Memory::SYSTEM_MEM); - CUstream stream = this->stream->get_stream(); size_t tile_size = buffer.get_layout()->bytes_used; //std::cout << "Using tile size of " << tile_size << " bytes." 
<< std::endl; - Arena buffer_arena(buffer.pointer_untyped(0, tile_size), tile_size); + Arena buffer_arena(buffer); + + Memory sysmem; + assert(find_memory(sysmem, Memory::SYSTEM_MEM, buffer_arena.location)); collapsed_space src_space; src_space.offsets = buffer_arena.alloc(sources.size()+1); @@ -366,7 +366,7 @@ void GPUImageMicroOp::gpu_populate_ptrs() GPUMicroOp::collapse_parent_space(parent_space, collapsed_parent, buffer_arena, stream); Memory zcpy_mem; - assert(find_memory(zcpy_mem, Memory::Z_COPY_MEM)); + assert(find_memory(zcpy_mem, Memory::Z_COPY_MEM, buffer_arena.location)); RegionInstance accessors_instance = this->realm_malloc(domain_transform.ptr_data.size() * sizeof(AffineAccessor,N2,T2>), zcpy_mem); AffineAccessor,N2,T2>* d_accessors = reinterpret_cast,N2,T2>*>(AffineAccessor(accessors_instance, 0).base); for (size_t i = 0; i < domain_transform.ptr_data.size(); ++i) { diff --git a/src/realm/deppart/partitions.h b/src/realm/deppart/partitions.h index 9ccbde6b75..a3d0d3feb8 100644 --- a/src/realm/deppart/partitions.h +++ b/src/realm/deppart/partitions.h @@ -52,6 +52,7 @@ namespace Realm { namespace Cuda { class GPUStream; + class GPUProcessor; } template @@ -114,9 +115,10 @@ namespace Realm { public: using byte = std::byte; - Arena() noexcept : base_(nullptr), cap_(0), parity_(false), left_(0), right_(0), base_left_(0), base_right_(0) {} - Arena(void* buffer, size_t bytes) noexcept - : base_(reinterpret_cast(buffer)), cap_(bytes), parity_(false), left_(0), right_(0), base_left_(0), base_right_(0) {} + Arena() noexcept : location(Memory::NO_MEMORY), base_(nullptr), cap_(0), parity_(false), left_(0), right_(0), base_left_(0), base_right_(0) {} + Arena(void* buffer, size_t bytes, Memory location) noexcept + : location(location), base_(reinterpret_cast(buffer)), cap_(bytes), parity_(false), left_(0), right_(0), base_left_(0), base_right_(0) {} + Arena(RegionInstance buffer) : Arena(buffer.pointer_untyped(0, buffer.get_layout()->bytes_used), 
buffer.get_layout()->bytes_used, buffer.get_location()) {} size_t capacity() const noexcept { return cap_; } size_t used() const noexcept { return left_ + right_; } @@ -194,6 +196,8 @@ namespace Realm { parity_ = false; } + Memory location; + private: void* alloc_left_bytes(size_t bytes, size_t align = alignof(std::max_align_t)) { diff --git a/src/realm/deppart/partitions_gpu_impl.hpp b/src/realm/deppart/partitions_gpu_impl.hpp index 93cfc5582b..015a1b7726 100644 --- a/src/realm/deppart/partitions_gpu_impl.hpp +++ b/src/realm/deppart/partitions_gpu_impl.hpp @@ -90,20 +90,22 @@ namespace Realm { }; // Finds a memory of the specified kind. Returns true on success, false otherwise. - inline bool find_memory(Memory &output, Memory::Kind kind) + inline bool find_memory(Memory &output, Memory::Kind kind, Memory input = Memory::NO_MEMORY) { - bool found = false; - Machine machine = Machine::get_machine(); - std::set all_memories; - machine.get_all_memories(all_memories); - for(auto& memory : all_memories) { - if(memory.kind() == kind) { - output = memory; - found = true; - break; + std::vector affinities; + unsigned best_bandwidth = 0; + output = Memory::NO_MEMORY; + Machine::get_machine().get_mem_mem_affinity(affinities, input, Memory::NO_MEMORY); + for (auto affinity : affinities) { + if (affinity.m2.kind() != kind) { + continue; + } + if (affinity.bandwidth > best_bandwidth) { + best_bandwidth = affinity.bandwidth; + output = affinity.m2; } } - return found; + return output != Memory::NO_MEMORY; } template @@ -228,7 +230,7 @@ namespace Realm { //We copy into one contiguous host buffer, then copy to device Memory sysmem; - assert(find_memory(sysmem, Memory::SYSTEM_MEM)); + assert(find_memory(sysmem, Memory::SYSTEM_MEM, my_arena.location)); RegionInstance h_instance = realm_malloc(out_space.num_entries * sizeof(SparsityMapEntry), sysmem); @@ -610,9 +612,6 @@ namespace Realm { NVTX_DEPPART(complete_rect_pipeline); CUstream stream = this->stream->get_stream(); - Memory 
my_mem; - assert(find_memory(my_mem, Memory::GPU_FB_MEM)); - assert(!my_arena.get_parity()); size_t beginning = my_arena.mark(); @@ -1579,7 +1578,7 @@ namespace Realm { RegionInstance sys_instance = RegionInstance::NO_INST; Memory sysmem; - assert(find_memory(sysmem, Memory::SYSTEM_MEM)); + assert(find_memory(sysmem, Memory::SYSTEM_MEM, my_arena.location)); Rect* final_rects; std::vector d_starts_host(output_instances.size()), d_ends_host(output_instances.size()); @@ -1707,7 +1706,7 @@ namespace Realm { } Memory sysmem; - assert(find_memory(sysmem, Memory::SYSTEM_MEM)); + assert(find_memory(sysmem, Memory::SYSTEM_MEM, my_arena.location)); if (!this->exclusive) { for (auto const& elem : ctr) { size_t idx = getIndex(elem); diff --git a/src/realm/deppart/preimage_gpu_impl.hpp b/src/realm/deppart/preimage_gpu_impl.hpp index 3a104f2e84..2a93136921 100644 --- a/src/realm/deppart/preimage_gpu_impl.hpp +++ b/src/realm/deppart/preimage_gpu_impl.hpp @@ -19,12 +19,12 @@ namespace Realm { size_t tile_size = buffer.get_layout()->bytes_used; //std::cout << "Using tile size of " << tile_size << " bytes." 
<< std::endl; - Arena buffer_arena(buffer.pointer_untyped(0, tile_size), tile_size); + Arena buffer_arena(buffer); NVTX_DEPPART(gpu_preimage_range); Memory sysmem; - find_memory(sysmem, Memory::SYSTEM_MEM); + assert(find_memory(sysmem, Memory::SYSTEM_MEM, buffer_arena.location)); CUstream stream = this->stream->get_stream(); @@ -58,7 +58,7 @@ namespace Realm { GPUMicroOp::collapse_multi_space(targets, target_space, buffer_arena, stream); Memory zcpy_mem; - assert(find_memory(zcpy_mem, Memory::Z_COPY_MEM)); + assert(find_memory(zcpy_mem, Memory::Z_COPY_MEM, buffer_arena.location)); RegionInstance accessors_instance = this->realm_malloc(domain_transform.range_data.size() * sizeof(AffineAccessor,N,T>), zcpy_mem); AffineAccessor,N,T>* d_accessors = reinterpret_cast,N,T>*>(AffineAccessor(accessors_instance, 0).base); for (size_t i = 0; i < domain_transform.range_data.size(); ++i) { @@ -327,10 +327,10 @@ namespace Realm { size_t tile_size = buffer.get_layout()->bytes_used; //std::cout << "Using tile size of " << tile_size << " bytes." 
<< std::endl; - Arena buffer_arena(buffer.pointer_untyped(0, tile_size), tile_size); + Arena buffer_arena(buffer); Memory sysmem; - find_memory(sysmem, Memory::SYSTEM_MEM); + assert(find_memory(sysmem, Memory::SYSTEM_MEM, buffer_arena.location)); CUstream stream = this->stream->get_stream(); @@ -366,7 +366,7 @@ namespace Realm { GPUMicroOp::collapse_multi_space(targets, target_space, buffer_arena, stream); Memory zcpy_mem; - assert(find_memory(zcpy_mem, Memory::Z_COPY_MEM)); + assert(find_memory(zcpy_mem, Memory::Z_COPY_MEM, buffer_arena.location)); RegionInstance accessors_instance = this->realm_malloc(domain_transform.ptr_data.size() * sizeof(AffineAccessor,N,T>), zcpy_mem); AffineAccessor,N,T>* d_accessors = reinterpret_cast,N,T>*>(AffineAccessor(accessors_instance, 0).base); for (size_t i = 0; i < domain_transform.ptr_data.size(); ++i) { From 07e354acec9d013f91d10a7429115e6c96bfccef Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Thu, 19 Mar 2026 15:35:21 -0700 Subject: [PATCH 27/32] for flecsii --- src/realm/deppart/byfield.cc | 100 +++++- src/realm/deppart/byfield.h | 20 +- src/realm/deppart/byfield_gpu_impl.hpp | 40 ++- src/realm/deppart/image.cc | 205 ++++++++++-- src/realm/deppart/image.h | 19 ++ src/realm/deppart/image_gpu_impl.hpp | 58 ++-- src/realm/deppart/partitions.cc | 11 - src/realm/deppart/partitions.h | 9 +- src/realm/deppart/partitions_gpu_impl.hpp | 138 +++++--- src/realm/deppart/preimage.cc | 176 ++++++++++- src/realm/deppart/preimage.h | 14 +- src/realm/deppart/preimage_gpu_impl.hpp | 54 ++-- src/realm/deppart/sparsity_impl.cc | 364 +++++++++++++++++++++- src/realm/deppart/sparsity_impl.h | 17 +- src/realm/sparsity.h | 6 +- src/realm/sparsity.inl | 10 +- tests/benchmark.cc | 2 +- 17 files changed, 1047 insertions(+), 196 deletions(-) diff --git a/src/realm/deppart/byfield.cc b/src/realm/deppart/byfield.cc index 78aceb2f92..ed65533555 100644 --- a/src/realm/deppart/byfield.cc +++ b/src/realm/deppart/byfield.cc @@ -325,13 +325,61 @@ 
namespace Realm { bool _exclusive) : parent_space(_parent), field_data(_field_data) { this->exclusive = _exclusive; - Memory my_mem = field_data[0].inst.get_location(); - Processor best_proc; - assert(choose_proc(best_proc, my_mem)); - Cuda::GPUProcessor* gpu_proc = dynamic_cast(get_runtime()->get_processor_impl(best_proc)); - assert(gpu_proc); - this->gpu = gpu_proc->gpu; - this->stream = gpu_proc->gpu->get_deppart_stream(); + areg.force_instantiation(); + // GPU setup (this->gpu, this->stream) deferred to execute(), which runs on the + // correct node after dispatch() has forwarded to the instance owner if needed. + } + + template + template + GPUByFieldMicroOp::GPUByFieldMicroOp( + NodeID _requestor, AsyncMicroOp *_async_microop, S& s) + : GPUMicroOp(_requestor, _async_microop) + , parent_space() { + bool ok = true; + size_t n = 0; + ok = ok && (s >> parent_space); + ok = ok && (s >> this->exclusive); + ok = ok && (s >> n); + field_data.resize(n); + for(size_t i = 0; i < n && ok; i++) + ok = ok && (s >> field_data[i].index_space) && + (s >> field_data[i].inst) && + (s >> field_data[i].field_offset) && + (s >> field_data[i].scratch_buffer); + // Deserialize colors manually to avoid std::vector proxy issues + size_t nc = 0; + ok = ok && (s >> nc); + for(size_t i = 0; i < nc && ok; i++) { + FT c; + ok = ok && (s >> c); + if(ok) colors.push_back(c); + } + ok = ok && (s >> sparsity_outputs); + assert(ok); + (void)ok; + } + + template + template + bool GPUByFieldMicroOp::serialize_params(S& s) const { + bool ok = true; + ok = ok && (s << parent_space); + ok = ok && (s << this->exclusive); + ok = ok && (s << field_data.size()); + for(size_t i = 0; i < field_data.size() && ok; i++) + ok = ok && (s << field_data[i].index_space) && + (s << field_data[i].inst) && + (s << field_data[i].field_offset) && + (s << field_data[i].scratch_buffer); + // Serialize colors manually to avoid std::vector proxy issues + ok = ok && (s << colors.size()); + for(size_t i = 0; i < 
colors.size() && ok; i++) { + FT c = colors[i]; + ok = ok && (s << c); + } + ok = ok && (s << sparsity_outputs); + return ok; } template @@ -342,6 +390,17 @@ namespace Realm { void GPUByFieldMicroOp::dispatch( PartitioningOperation *op, bool inline_ok) { + // GPU by-field must execute on the node that owns the GPU memory + NodeID exec_node = ID(field_data[0].inst).instance_owner_node(); + if(this->exclusive) { + for(const auto& it : sparsity_outputs) + assert(NodeID(ID(it.second).sparsity_creator_node()) == exec_node); + } + if(exec_node != Network::my_node_id) { + PartitioningMicroOp::template forward_microop >(exec_node, op, this); + return; + } + // We have to register ourselves as a waiter on sparse inputs before dispatching. for (size_t i = 0; i < field_data.size(); i++) { @@ -367,6 +426,10 @@ namespace Realm { sparsity_outputs[_val] = _sparsity; } + template + ActiveMessageHandlerReg > > + GPUByFieldMicroOp::areg; + #endif @@ -383,12 +446,26 @@ namespace Realm { : PartitioningOperation(reqs, _finish_event, _finish_gen) , parent(_parent) , field_data(_field_data) + , exclusive_gpu_owner(exclusive_gpu_exec_node()) {} template ByFieldOperation::~ByFieldOperation(void) {} + template + NodeID ByFieldOperation::exclusive_gpu_exec_node(void) const + { + if(field_data.size() != 1) + return -1; + + Memory::Kind kind = field_data[0].inst.get_location().kind(); + if((kind != Memory::GPU_FB_MEM) && (kind != Memory::Z_COPY_MEM)) + return -1; + + return ID(field_data[0].inst).instance_owner_node(); + } + template IndexSpace ByFieldOperation::add_color(FT color) { @@ -401,8 +478,13 @@ namespace Realm { subspace.bounds = parent.bounds; // get a sparsity ID by round-robin'ing across the nodes that have field data - int target_node = ID(field_data[colors.size() % field_data.size()].inst).instance_owner_node(); - SparsityMap sparsity = get_runtime()->get_available_sparsity_impl(target_node)->me.convert >(); + int target_node = (exclusive_gpu_owner >= 0) ? 
+ exclusive_gpu_owner : + ID(field_data[colors.size() % field_data.size()].inst).instance_owner_node(); + if(exclusive_gpu_owner >= 0) + assert(target_node == exclusive_gpu_exec_node()); + SparsityMap sparsity = + create_deppart_output_sparsity(target_node).convert>(); subspace.sparsity = sparsity; colors.push_back(color); diff --git a/src/realm/deppart/byfield.h b/src/realm/deppart/byfield.h index cc21234f32..35b823552f 100644 --- a/src/realm/deppart/byfield.h +++ b/src/realm/deppart/byfield.h @@ -73,6 +73,10 @@ namespace Realm { template class GPUByFieldMicroOp : public GPUMicroOp { public: + static const int DIM = N; + typedef T IDXTYPE; + typedef FT FIELDTYPE; + GPUByFieldMicroOp( const IndexSpace &_parent, std::vector,FT> > _field_data, @@ -87,7 +91,18 @@ namespace Realm { void add_sparsity_output(FT _val, SparsityMap _sparsity); protected: - const IndexSpace parent_space; + friend struct RemoteMicroOpMessage >; + static ActiveMessageHandlerReg > > areg; + + friend class PartitioningMicroOp; + template + REALM_ATTR_WARN_UNUSED(bool serialize_params(S& s) const); + + // construct from received packet + template + GPUByFieldMicroOp(NodeID _requestor, AsyncMicroOp *_async_microop, S& s); + + IndexSpace parent_space; std::vector,FT> > field_data; std::vector colors; std::map > sparsity_outputs; @@ -112,10 +127,13 @@ namespace Realm { virtual void print(std::ostream& os) const; protected: + NodeID exclusive_gpu_exec_node(void) const; + IndexSpace parent; std::vector,FT> > field_data; std::vector colors; std::vector > subspaces; + int exclusive_gpu_owner; }; }; diff --git a/src/realm/deppart/byfield_gpu_impl.hpp b/src/realm/deppart/byfield_gpu_impl.hpp index 4d59d30b54..bf25f81f03 100644 --- a/src/realm/deppart/byfield_gpu_impl.hpp +++ b/src/realm/deppart/byfield_gpu_impl.hpp @@ -17,6 +17,20 @@ namespace Realm { template void GPUByFieldMicroOp::execute() { + // Resolve the local GPU processor now that we are guaranteed to be on the + // correct node (dispatch() 
forwarded us here if the instance was remote). + { + Memory my_mem = field_data[0].inst.get_location(); + Processor best_proc; + assert(choose_proc(best_proc, my_mem)); + Cuda::GPUProcessor *gpu_proc = + dynamic_cast(get_runtime()->get_processor_impl(best_proc)); + assert(gpu_proc); + this->gpu = gpu_proc->gpu; + this->stream = gpu_proc->gpu->get_deppart_stream(); + } + + Cuda::AutoGPUContext agc(this->gpu); @@ -75,15 +89,14 @@ void GPUByFieldMicroOp::execute() } - Memory zcpy_mem; - assert(find_memory(zcpy_mem, Memory::Z_COPY_MEM, buffer_arena.location)); - - // We need to pass the accessors to the GPU so it can read field values. - RegionInstance accessors_instance = this->realm_malloc(field_data.size() * sizeof(AffineAccessor), zcpy_mem); - AffineAccessor* d_accessors = reinterpret_cast*>(AffineAccessor(accessors_instance, 0).base); + std::vector> h_accessors(field_data.size()); for (size_t i = 0; i < field_data.size(); ++i) { - d_accessors[i] = AffineAccessor(field_data[i].inst, field_data[i].field_offset); + h_accessors[i] = AffineAccessor(field_data[i].inst, field_data[i].field_offset); } + AffineAccessor* d_accessors = buffer_arena.alloc>(field_data.size()); + CUDA_CHECK(cudaMemcpyAsync(d_accessors, h_accessors.data(), + field_data.size() * sizeof(AffineAccessor), + cudaMemcpyHostToDevice, stream), stream); buffer_arena.commit(false); @@ -103,7 +116,7 @@ void GPUByFieldMicroOp::execute() int count = 0; if (count) {} bool host_fallback = false; - std::vector h_instances(colors.size(), RegionInstance::NO_INST); + std::vector*> host_rect_buffers(colors.size(), nullptr); std::vector entry_counts(colors.size(), 0); while (num_completed < inst_space.num_entries) { try { @@ -167,7 +180,7 @@ void GPUByFieldMicroOp::execute() }); if (host_fallback) { - this->split_output(d_new_rects, num_new_rects, h_instances, entry_counts, buffer_arena); + this->split_output(d_new_rects, num_new_rects, host_rect_buffers, entry_counts, buffer_arena); } if (num_output==0 || 
host_fallback) { @@ -216,7 +229,7 @@ void GPUByFieldMicroOp::execute() } else { host_fallback = true; if (num_output > 0) { - this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); + this->split_output(output_start, num_output, host_rect_buffers, entry_counts, buffer_arena); } curr_tile = tile_size / 2; } @@ -248,7 +261,7 @@ void GPUByFieldMicroOp::execute() return kv.second; }); } catch (arena_oom&) { - this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); + this->split_output(output_start, num_output, host_rect_buffers, entry_counts, buffer_arena); host_fallback = true; } } @@ -261,10 +274,9 @@ void GPUByFieldMicroOp::execute() } size_t idx = color_indices.at(it.first); if (entry_counts[idx] > 0) { - Rect* h_rects = reinterpret_cast *>(AffineAccessor(h_instances[idx], 0).base); - span> h_rects_span(h_rects, entry_counts[idx]); + span> h_rects_span(host_rect_buffers[idx], entry_counts[idx]); impl->contribute_dense_rect_list(h_rects_span, true); - h_instances[idx].destroy(); + deppart_host_free(host_rect_buffers[idx]); } else { impl->contribute_nothing(); } diff --git a/src/realm/deppart/image.cc b/src/realm/deppart/image.cc index 9eaf7b8197..217543d147 100644 --- a/src/realm/deppart/image.cc +++ b/src/realm/deppart/image.cc @@ -25,6 +25,7 @@ #include "realm/deppart/preimage.h" #include "realm/logging.h" #include "realm/cuda/cuda_internal.h" +#include namespace Realm { @@ -500,12 +501,43 @@ namespace Realm { EventImpl::gen_t _finish_gen) : PartitioningOperation(reqs, _finish_event, _finish_gen), parent(_parent), - domain_transform(_domain_transform) {} + domain_transform(_domain_transform), + is_intersection(false), + exclusive_gpu_owner(exclusive_gpu_exec_node()) + {} template ImageOperation::~ImageOperation(void) {} + template + NodeID ImageOperation::exclusive_gpu_exec_node(void) const + { + size_t gpu_ptrs = 0, gpu_rects = 0, cpu_ptrs = 0, cpu_rects = 0; + for(size_t i = 0; i < 
domain_transform.ptr_data.size(); i++) { + Memory::Kind kind = domain_transform.ptr_data[i].inst.get_location().kind(); + if((kind == Memory::GPU_FB_MEM) || (kind == Memory::Z_COPY_MEM)) + gpu_ptrs++; + else + cpu_ptrs++; + } + for(size_t i = 0; i < domain_transform.range_data.size(); i++) { + Memory::Kind kind = domain_transform.range_data[i].inst.get_location().kind(); + if((kind == Memory::GPU_FB_MEM) || (kind == Memory::Z_COPY_MEM)) + gpu_rects++; + else + cpu_rects++; + } + size_t opcount = gpu_ptrs + gpu_rects + cpu_ptrs + cpu_rects; + if((gpu_ptrs + gpu_rects) == 0 || (opcount != 1)) + return -1; + if(gpu_ptrs == 1) + return ID(domain_transform.ptr_data[0].inst).instance_owner_node(); + if(gpu_rects == 1) + return ID(domain_transform.range_data[0].inst).instance_owner_node(); + return -1; + } + template IndexSpace ImageOperation::add_source(const IndexSpace& source) { @@ -520,17 +552,22 @@ namespace Realm { // if the source has a sparsity map, use the same node - otherwise // get a sparsity ID by round-robin'ing across the nodes that have field data int target_node = 0; - if(!source.dense()) + if(exclusive_gpu_owner >= 0) + target_node = exclusive_gpu_owner; + else if(!source.dense()) target_node = ID(source.sparsity).sparsity_creator_node(); else if(!domain_transform.ptr_data.empty()) target_node = ID(domain_transform.ptr_data[sources.size() % domain_transform.ptr_data.size()].inst).instance_owner_node(); else target_node = ID(domain_transform.range_data[sources.size() % domain_transform.range_data.size()].inst).instance_owner_node(); + if(exclusive_gpu_owner >= 0) { + assert(target_node == exclusive_gpu_exec_node()); + } - SparsityMap sparsity = get_runtime()->get_available_sparsity_impl(target_node)->me.convert >(); + SparsityMap sparsity = + create_deppart_output_sparsity(target_node).convert>(); image.sparsity = sparsity; - sources.push_back(source); images.push_back(sparsity); @@ -552,17 +589,22 @@ namespace Realm { // if the source has a sparsity map, 
use the same node - otherwise // get a sparsity ID by round-robin'ing across the nodes that have field data int target_node; - if(!source.dense()) + if(exclusive_gpu_owner >= 0) + target_node = exclusive_gpu_owner; + else if(!source.dense()) target_node = ID(source.sparsity).sparsity_creator_node(); else if(!domain_transform.ptr_data.empty()) target_node = ID(domain_transform.ptr_data[sources.size() % domain_transform.ptr_data.size()].inst).instance_owner_node(); else target_node = ID(domain_transform.range_data[sources.size() % domain_transform.range_data.size()].inst).instance_owner_node(); + if(exclusive_gpu_owner >= 0) { + assert(target_node == exclusive_gpu_exec_node()); + } - SparsityMap sparsity = get_runtime()->get_available_sparsity_impl(target_node)->me.convert >(); + SparsityMap sparsity = + create_deppart_output_sparsity(target_node).convert>(); image.sparsity = sparsity; - sources.push_back(source); diff_rhss.push_back(diff_rhs); images.push_back(sparsity); @@ -586,17 +628,22 @@ namespace Realm { // if the source has a sparsity map, use the same node - otherwise // get a sparsity ID by round-robin'ing across the nodes that have field data int target_node; - if(!source.dense()) + if(exclusive_gpu_owner >= 0) + target_node = exclusive_gpu_owner; + else if(!source.dense()) target_node = ID(source.sparsity).sparsity_creator_node(); else if(!domain_transform.ptr_data.empty()) target_node = ID(domain_transform.ptr_data[sources.size() % domain_transform.ptr_data.size()].inst).instance_owner_node(); else target_node = ID(domain_transform.range_data[sources.size() % domain_transform.range_data.size()].inst).instance_owner_node(); + if(exclusive_gpu_owner >= 0) { + assert(target_node == exclusive_gpu_exec_node()); + } - SparsityMap sparsity = get_runtime()->get_available_sparsity_impl(target_node)->me.convert >(); + SparsityMap sparsity = + create_deppart_output_sparsity(target_node).convert>(); image.sparsity = sparsity; - sources.push_back(source); 
diff_rhss.push_back(diff_rhs); images.push_back(sparsity); @@ -666,6 +713,15 @@ namespace Realm { } else { size_t opcount = cpu_ptr_data.size() + cpu_rect_data.size() + gpu_ptr_data.size() + gpu_rect_data.size(); bool exclusive = gpu_data && (opcount == 1); + if(exclusive) { + NodeID expected_owner = exclusive_gpu_exec_node(); + assert(exclusive_gpu_owner >= 0); + assert(NodeID(exclusive_gpu_owner) == expected_owner); + for(size_t i = 0; i < images.size(); i++) { + NodeID output_owner = NodeID(ID(images[i]).sparsity_creator_node()); + assert(output_owner == NodeID(exclusive_gpu_owner)); + } + } if (!exclusive) { // launch full cross-product of image micro ops right away for (size_t i = 0; i < sources.size(); i++) { @@ -702,6 +758,10 @@ namespace Realm { for (auto ptr_fdd : gpu_ptr_data) { // launch full cross-product of image micro ops right away assert(ptr_fdd.scratch_buffer != RegionInstance::NO_INST); + if(exclusive) { + NodeID microop_exec_node = ID(ptr_fdd.inst).instance_owner_node(); + assert(NodeID(exclusive_gpu_owner) == microop_exec_node); + } DomainTransform domain_transform_copy = domain_transform; domain_transform_copy.ptr_data = {ptr_fdd}; GPUImageMicroOp *micro_op = @@ -715,6 +775,10 @@ namespace Realm { for (auto rect_fdd : gpu_rect_data) { // launch full cross-product of image micro ops right away assert(rect_fdd.scratch_buffer != RegionInstance::NO_INST); + if(exclusive) { + NodeID microop_exec_node = ID(rect_fdd.inst).instance_owner_node(); + assert(NodeID(exclusive_gpu_owner) == microop_exec_node); + } DomainTransform domain_transform_copy = domain_transform; domain_transform_copy.range_data = {rect_fdd}; GPUImageMicroOp *micro_op = @@ -942,14 +1006,76 @@ namespace Realm { bool _exclusive) : parent_space(_parent), domain_transform(_domain_transform) { - this->exclusive = _exclusive; - Memory my_mem = domain_transform.ptr_data.empty() ? 
domain_transform.range_data[0].inst.get_location() : domain_transform.ptr_data[0].inst.get_location(); - Processor best_proc; - assert(choose_proc(best_proc, my_mem)); - Cuda::GPUProcessor* gpu_proc = dynamic_cast(get_runtime()->get_processor_impl(best_proc)); - assert(gpu_proc); - this->gpu = gpu_proc->gpu; - this->stream = gpu_proc->gpu->get_deppart_stream(); + this->exclusive = _exclusive; + areg.force_instantiation(); + // GPU setup (this->gpu, this->stream) deferred to execute(), which runs on the + // correct node after dispatch() has forwarded to the instance owner if needed. + } + + template + template + GPUImageMicroOp::GPUImageMicroOp( + NodeID _requestor, AsyncMicroOp *_async_microop, S& s) + : GPUMicroOp(_requestor, _async_microop) + { + bool ok = true; + bool use_ptr_data = false; + ok = ok && (s >> parent_space); + ok = ok && (s >> this->exclusive); + ok = ok && (s >> use_ptr_data); + if(use_ptr_data) { + domain_transform.type = DomainTransform::DomainTransformType::UNSTRUCTURED_PTR; + size_t n = 0; + ok = ok && (s >> n); + domain_transform.ptr_data.resize(n); + for(size_t i = 0; i < n && ok; i++) + ok = ok && (s >> domain_transform.ptr_data[i].index_space) && + (s >> domain_transform.ptr_data[i].inst) && + (s >> domain_transform.ptr_data[i].field_offset) && + (s >> domain_transform.ptr_data[i].scratch_buffer); + } else { + domain_transform.type = DomainTransform::DomainTransformType::UNSTRUCTURED_RANGE; + size_t n = 0; + ok = ok && (s >> n); + domain_transform.range_data.resize(n); + for(size_t i = 0; i < n && ok; i++) + ok = ok && (s >> domain_transform.range_data[i].index_space) && + (s >> domain_transform.range_data[i].inst) && + (s >> domain_transform.range_data[i].field_offset) && + (s >> domain_transform.range_data[i].scratch_buffer); + } + ok = ok && (s >> sources); + ok = ok && (s >> sparsity_outputs); + assert(ok); + (void)ok; + } + + template + template + bool GPUImageMicroOp::serialize_params(S& s) const { + bool ok = true; + bool 
use_ptr_data = !domain_transform.ptr_data.empty(); + ok = ok && (s << parent_space); + ok = ok && (s << this->exclusive); + ok = ok && (s << use_ptr_data); + if(use_ptr_data) { + ok = ok && (s << domain_transform.ptr_data.size()); + for(size_t i = 0; i < domain_transform.ptr_data.size() && ok; i++) + ok = ok && (s << domain_transform.ptr_data[i].index_space) && + (s << domain_transform.ptr_data[i].inst) && + (s << domain_transform.ptr_data[i].field_offset) && + (s << domain_transform.ptr_data[i].scratch_buffer); + } else { + ok = ok && (s << domain_transform.range_data.size()); + for(size_t i = 0; i < domain_transform.range_data.size() && ok; i++) + ok = ok && (s << domain_transform.range_data[i].index_space) && + (s << domain_transform.range_data[i].inst) && + (s << domain_transform.range_data[i].field_offset) && + (s << domain_transform.range_data[i].scratch_buffer); + } + ok = ok && (s << sources); + ok = ok && (s << sparsity_outputs); + return ok; } template @@ -959,6 +1085,20 @@ namespace Realm { void GPUImageMicroOp::dispatch( PartitioningOperation *op, bool inline_ok) { + // GPU image must execute on the node that owns the GPU memory + NodeID exec_node = domain_transform.ptr_data.empty() ? 
+ ID(domain_transform.range_data[0].inst).instance_owner_node() : + ID(domain_transform.ptr_data[0].inst).instance_owner_node(); + if(this->exclusive) { + for(size_t i = 0; i < sparsity_outputs.size(); i++) { + assert(NodeID(ID(sparsity_outputs[i]).sparsity_creator_node()) == exec_node); + } + } + if(exec_node != Network::my_node_id) { + PartitioningMicroOp::template forward_microop >(exec_node, op, this); + return; + } + for (size_t i = 0; i < domain_transform.ptr_data.size(); i++) { IndexSpace inst_space = domain_transform.ptr_data[i].index_space; if (!inst_space.dense()) { @@ -969,6 +1109,16 @@ namespace Realm { this->wait_count.fetch_add(1); } } + for (size_t i = 0; i < domain_transform.range_data.size(); i++) { + IndexSpace inst_space = domain_transform.range_data[i].index_space; + if (!inst_space.dense()) { + // it's safe to add the count after the registration only because we initialized + // the count to 2 instead of 1 + bool registered = SparsityMapImpl::lookup(inst_space.sparsity)->add_waiter(this, true /*precise*/); + if(registered) + this->wait_count.fetch_add(1); + } + } for (size_t i = 0; i < sources.size(); i++) { if (!sources[i].dense()) { @@ -994,10 +1144,29 @@ namespace Realm { sparsity_outputs.push_back(_sparsity); } + template + ActiveMessageHandlerReg > > + GPUImageMicroOp::areg; + template void GPUImageMicroOp::execute(void) { TimeStamp ts("GPUImageMicroOp::execute", true, &log_uop_timing); + // Resolve the local GPU processor now that we are guaranteed to be on the + // correct node (dispatch() forwarded us here if the instance was remote). + { + Memory my_mem = domain_transform.ptr_data.empty() ? 
+ domain_transform.range_data[0].inst.get_location() : + domain_transform.ptr_data[0].inst.get_location(); + Processor best_proc; + assert(choose_proc(best_proc, my_mem)); + Cuda::GPUProcessor *gpu_proc = + dynamic_cast(get_runtime()->get_processor_impl(best_proc)); + assert(gpu_proc); + this->gpu = gpu_proc->gpu; + this->stream = gpu_proc->gpu->get_deppart_stream(); + } + Cuda::AutoGPUContext agc(this->gpu); if (domain_transform.ptr_data.size() > 0) { gpu_populate_ptrs(); diff --git a/src/realm/deppart/image.h b/src/realm/deppart/image.h index 4eed6da566..fec4dc7651 100644 --- a/src/realm/deppart/image.h +++ b/src/realm/deppart/image.h @@ -116,12 +116,15 @@ namespace Realm { virtual void set_overlap_tester(void *tester); protected: + NodeID exclusive_gpu_exec_node(void) const; + IndexSpace parent; DomainTransform domain_transform; std::vector > sources; std::vector > diff_rhss; std::vector > images; bool is_intersection; + int exclusive_gpu_owner; }; template @@ -153,6 +156,11 @@ namespace Realm { template class GPUImageMicroOp : public GPUMicroOp { public: + static const int DIM = N; + typedef T IDXTYPE; + static const int DIM2 = N2; + typedef T2 IDXTYPE2; + GPUImageMicroOp( const IndexSpace &_parent, const DomainTransform &_domain_transform, @@ -174,6 +182,17 @@ namespace Realm { bool is_image_microop() const override { return true; } protected: + friend struct RemoteMicroOpMessage >; + static ActiveMessageHandlerReg > > areg; + + friend class PartitioningMicroOp; + template + REALM_ATTR_WARN_UNUSED(bool serialize_params(S& s) const); + + // construct from received packet + template + GPUImageMicroOp(NodeID _requestor, AsyncMicroOp *_async_microop, S& s); + IndexSpace parent_space; DomainTransform domain_transform; std::vector > sources; diff --git a/src/realm/deppart/image_gpu_impl.hpp b/src/realm/deppart/image_gpu_impl.hpp index 7bac9f9054..75401be42c 100644 --- a/src/realm/deppart/image_gpu_impl.hpp +++ b/src/realm/deppart/image_gpu_impl.hpp @@ -41,6 +41,7 @@ 
void GPUImageMicroOp::gpu_populate_rngs() { if (sources.size() == 0) { + assert(sparsity_outputs.empty()); return; } @@ -80,13 +81,15 @@ void GPUImageMicroOp::gpu_populate_rngs() // We collapse the parent space to undifferentiate between dense and sparse and match downstream APIs. GPUMicroOp::collapse_parent_space(parent_space, collapsed_parent, buffer_arena, stream); - Memory zcpy_mem; - assert(find_memory(zcpy_mem, Memory::Z_COPY_MEM, buffer_arena.location)); - RegionInstance accessors_instance = this->realm_malloc(domain_transform.range_data.size() * sizeof(AffineAccessor,N2,T2>), zcpy_mem); - AffineAccessor,N2,T2>* d_accessors = reinterpret_cast,N2,T2>*>(AffineAccessor(accessors_instance, 0).base); + std::vector,N2,T2>> h_accessors(domain_transform.range_data.size()); for (size_t i = 0; i < domain_transform.range_data.size(); ++i) { - d_accessors[i] = AffineAccessor,N2,T2>(domain_transform.range_data[i].inst, domain_transform.range_data[i].field_offset); + h_accessors[i] = AffineAccessor,N2,T2>(domain_transform.range_data[i].inst, domain_transform.range_data[i].field_offset); } + AffineAccessor,N2,T2>* d_accessors = + buffer_arena.alloc,N2,T2>>(domain_transform.range_data.size()); + CUDA_CHECK(cudaMemcpyAsync(d_accessors, h_accessors.data(), + domain_transform.range_data.size() * sizeof(AffineAccessor,N2,T2>), + cudaMemcpyHostToDevice, stream), stream); uint32_t* d_src_counters = buffer_arena.alloc(2 * sources.size() + 1); uint32_t* d_src_prefix = d_src_counters + sources.size(); @@ -100,7 +103,7 @@ void GPUImageMicroOp::gpu_populate_rngs() int count = 0; if (count) {} bool host_fallback = false; - std::vector h_instances(sources.size(), RegionInstance::NO_INST); + std::vector*> host_rect_buffers(sources.size(), nullptr); std::vector entry_counts(sources.size(), 0); while (num_completed < inst_space.num_entries) { try { @@ -197,7 +200,7 @@ void GPUImageMicroOp::gpu_populate_rngs() }); if (host_fallback) { - this->split_output(d_new_rects, num_new_rects, 
h_instances, entry_counts, buffer_arena); + this->split_output(d_new_rects, num_new_rects, host_rect_buffers, entry_counts, buffer_arena); } //Set our first set of output rectangles @@ -250,7 +253,7 @@ void GPUImageMicroOp::gpu_populate_rngs() } else { host_fallback = true; if (num_output > 0) { - this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); + this->split_output(output_start, num_output, host_rect_buffers, entry_counts, buffer_arena); } curr_tile = tile_size / 2; } @@ -282,7 +285,7 @@ void GPUImageMicroOp::gpu_populate_rngs() return elem; }); } catch (arena_oom&) { - this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); + this->split_output(output_start, num_output, host_rect_buffers, entry_counts, buffer_arena); host_fallback = true; } } @@ -294,10 +297,9 @@ void GPUImageMicroOp::gpu_populate_rngs() impl->set_contributor_count(1); } if (entry_counts[idx] > 0) { - Rect* h_rects = reinterpret_cast *>(AffineAccessor(h_instances[idx], 0).base); - span> h_rects_span(h_rects, entry_counts[idx]); + span> h_rects_span(host_rect_buffers[idx], entry_counts[idx]); impl->contribute_dense_rect_list(h_rects_span, false); - h_instances[idx].destroy(); + deppart_host_free(host_rect_buffers[idx]); } else { impl->contribute_nothing(); } @@ -319,6 +321,7 @@ template void GPUImageMicroOp::gpu_populate_ptrs() { if (sources.size() == 0) { + assert(sparsity_outputs.empty()); return; } @@ -365,13 +368,15 @@ void GPUImageMicroOp::gpu_populate_ptrs() // We collapse the parent space to undifferentiate between dense and sparse and match downstream APIs. 
GPUMicroOp::collapse_parent_space(parent_space, collapsed_parent, buffer_arena, stream); - Memory zcpy_mem; - assert(find_memory(zcpy_mem, Memory::Z_COPY_MEM, buffer_arena.location)); - RegionInstance accessors_instance = this->realm_malloc(domain_transform.ptr_data.size() * sizeof(AffineAccessor,N2,T2>), zcpy_mem); - AffineAccessor,N2,T2>* d_accessors = reinterpret_cast,N2,T2>*>(AffineAccessor(accessors_instance, 0).base); + std::vector,N2,T2>> h_accessors(domain_transform.ptr_data.size()); for (size_t i = 0; i < domain_transform.ptr_data.size(); ++i) { - d_accessors[i] = AffineAccessor,N2,T2>(domain_transform.ptr_data[i].inst, domain_transform.ptr_data[i].field_offset); + h_accessors[i] = AffineAccessor,N2,T2>(domain_transform.ptr_data[i].inst, domain_transform.ptr_data[i].field_offset); } + AffineAccessor,N2,T2>* d_accessors = + buffer_arena.alloc,N2,T2>>(domain_transform.ptr_data.size()); + CUDA_CHECK(cudaMemcpyAsync(d_accessors, h_accessors.data(), + domain_transform.ptr_data.size() * sizeof(AffineAccessor,N2,T2>), + cudaMemcpyHostToDevice, stream), stream); uint32_t* d_prefix_points = buffer_arena.alloc(domain_transform.ptr_data.size()+1); @@ -387,7 +392,7 @@ void GPUImageMicroOp::gpu_populate_ptrs() int count = 0; if (count) {} bool host_fallback = false; - std::vector h_instances(sources.size(), RegionInstance::NO_INST); + std::vector*> host_rect_buffers(sources.size(), nullptr); std::vector entry_counts(sources.size(), 0); while (num_completed < inst_space.num_entries) { try { @@ -475,9 +480,9 @@ void GPUImageMicroOp::gpu_populate_ptrs() return elem; }); - if (host_fallback) { - this->split_output(d_new_rects, num_new_rects, h_instances, entry_counts, buffer_arena); - } + if (host_fallback) { + this->split_output(d_new_rects, num_new_rects, host_rect_buffers, entry_counts, buffer_arena); + } if (num_output==0 || host_fallback) { num_output = num_new_rects; @@ -525,7 +530,7 @@ void GPUImageMicroOp::gpu_populate_ptrs() } else { host_fallback = true; if 
(num_output > 0) { - this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); + this->split_output(output_start, num_output, host_rect_buffers, entry_counts, buffer_arena); } curr_tile = tile_size / 2; } @@ -557,7 +562,7 @@ void GPUImageMicroOp::gpu_populate_ptrs() return elem; }); } catch (arena_oom&) { - this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); + this->split_output(output_start, num_output, host_rect_buffers, entry_counts, buffer_arena); host_fallback = true; } } @@ -569,14 +574,13 @@ void GPUImageMicroOp::gpu_populate_ptrs() impl->set_contributor_count(1); } if (entry_counts[idx] > 0) { - Rect* h_rects = reinterpret_cast *>(AffineAccessor(h_instances[idx], 0).base); - span> h_rects_span(h_rects, entry_counts[idx]); + span> h_rects_span(host_rect_buffers[idx], entry_counts[idx]); impl->contribute_dense_rect_list(h_rects_span, false); - h_instances[idx].destroy(); + deppart_host_free(host_rect_buffers[idx]); } else { impl->contribute_nothing(); } } } } -} \ No newline at end of file +} diff --git a/src/realm/deppart/partitions.cc b/src/realm/deppart/partitions.cc index 1c16670c47..b25ddd17a5 100644 --- a/src/realm/deppart/partitions.cc +++ b/src/realm/deppart/partitions.cc @@ -662,16 +662,6 @@ namespace Realm { } } - RegionInstance PartitioningMicroOp::realm_malloc(size_t size, Memory location) { - assert(location != Memory::NO_MEMORY); - assert(size > 0); - std::vector byte_fields = {sizeof(char)}; - IndexSpace<1> instance_index_space(Rect<1>(0, size-1)); - RegionInstance result; - RegionInstance::create_instance(result, location, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); - return result; - } - //////////////////////////////////////////////////////////////////////// // // class ComputeOverlapMicroOp @@ -1067,4 +1057,3 @@ namespace Realm { FOREACH_NTNT(DOIT2) }; - diff --git a/src/realm/deppart/partitions.h b/src/realm/deppart/partitions.h index 
a3d0d3feb8..0af8ec0673 100644 --- a/src/realm/deppart/partitions.h +++ b/src/realm/deppart/partitions.h @@ -91,7 +91,7 @@ namespace Realm { size_t* offsets; size_t num_children; Rect bounds; - RegionInstance h_instance = RegionInstance::NO_INST; + SparsityMapEntry* host_entries_owner = nullptr; }; // Stores everything necessary to query a BVH @@ -310,8 +310,6 @@ namespace Realm { template void sparsity_map_ready(SparsityMapImpl *sparsity, bool precise); - static RegionInstance realm_malloc(size_t size, Memory location = Memory::NO_MEMORY); - IntrusiveListLink uop_link; REALM_PMTA_DEFN(PartitioningMicroOp,IntrusiveListLink,uop_link); typedef IntrusiveList MicroOpList; @@ -358,6 +356,8 @@ namespace Realm { class GPUMicroOp : public PartitioningMicroOp { public: GPUMicroOp(void) = default; + GPUMicroOp(NodeID _requestor, AsyncMicroOp *_async_microop) + : PartitioningMicroOp(_requestor, _async_microop) {} virtual ~GPUMicroOp(void) = default; virtual void execute(void) = 0; @@ -386,7 +386,7 @@ namespace Realm { template void complete1d_pipeline(RectDesc* d_rects, size_t total_rects, RectDesc* &d_out_rects, size_t &out_rects, Arena &my_arena, const Container& ctr, IndexFn getIndex, MapFn getMap); - void split_output(RectDesc* d_rects, size_t total_rects, std::vector &output_instances, std::vector &output_counts, Arena &my_arena); + void split_output(RectDesc* d_rects, size_t total_rects, std::vector *> &output_instances, std::vector &output_counts, Arena &my_arena); template void send_output(RectDesc* d_rects, size_t total_rects, Arena &my_arena, const Container& ctr, IndexFn getIndex, MapFn getMap); @@ -527,4 +527,3 @@ namespace Realm { #include "realm/deppart/partitions.inl" #endif // REALM_PARTITIONS_H - diff --git a/src/realm/deppart/partitions_gpu_impl.hpp b/src/realm/deppart/partitions_gpu_impl.hpp index 015a1b7726..e293419b9a 100644 --- a/src/realm/deppart/partitions_gpu_impl.hpp +++ b/src/realm/deppart/partitions_gpu_impl.hpp @@ -36,6 +36,17 @@ #define 
COMPUTE_GRID(num_items) \ (((num_items) + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK) +#define CUDA_HOST_CHECK(call) \ + do { \ + cudaError_t err = (call); \ + if (err != cudaSuccess) { \ + std::cerr << "CUDA host error at " << __FILE__ << ":" << __LINE__ \ + << " '" #call "' failed with " \ + << cudaGetErrorString(err) << " (" << err << ")\n"; \ + assert(false); \ + } \ + } while (0) + //NVTX macros to only add ranges if defined. #ifdef REALM_USE_NVTX @@ -61,6 +72,21 @@ inline int32_t next_nvtx_payload() { namespace Realm { + template + inline T *deppart_host_alloc(size_t count, unsigned flags = cudaHostAllocPortable) + { + if(count == 0) return nullptr; + void *ptr = nullptr; + CUDA_HOST_CHECK(cudaHostAlloc(&ptr, count * sizeof(T), flags)); + return reinterpret_cast(ptr); + } + + inline void deppart_host_free(void *ptr) + { + if(ptr != nullptr) + CUDA_HOST_CHECK(cudaFreeHost(ptr)); + } + // Used by cub::DeviceReduce to compute bad GPU approximation. template struct UnionRectOp { @@ -105,7 +131,18 @@ namespace Realm { output = affinity.m2; } } - return output != Memory::NO_MEMORY; + if (output == Memory::NO_MEMORY) { + std::set memories; + Machine::get_machine().get_all_memories(memories); + for (auto mem : memories) { + if (mem.kind() == kind) { + output = mem; + return true; + } + } + return false; + } + return true; } template @@ -132,8 +169,7 @@ namespace Realm { } new_offsets[inst_space.num_children] = num_new_entries; CUDA_CHECK(cudaMemcpyAsync(inst_space.offsets, new_offsets.data(), (inst_space.num_children + 1) * sizeof(size_t), cudaMemcpyHostToDevice, stream), stream); - RegionInstance new_entries_buffer = realm_malloc(num_new_entries * sizeof(SparsityMapEntry), inst_space.h_instance.get_location()); - SparsityMapEntry *new_entries_ptr = reinterpret_cast *>(new_entries_buffer.pointer_untyped(0, num_new_entries * sizeof(SparsityMapEntry))); + SparsityMapEntry *new_entries_ptr = deppart_host_alloc>(num_new_entries); size_t write_loc = 0; for (size_t i = 
num_completed; i < inst_space.num_entries; i++) { @@ -181,8 +217,8 @@ namespace Realm { num_completed = 0; inst_space.entries_buffer = new_entries_ptr; inst_space.num_entries = num_new_entries; - inst_space.h_instance.destroy(); - inst_space.h_instance = new_entries_buffer; + deppart_host_free(inst_space.host_entries_owner); + inst_space.host_entries_owner = new_entries_ptr; CUDA_CHECK(cudaStreamSynchronize(stream), stream); } @@ -229,15 +265,11 @@ namespace Realm { space_offsets[spaces.size()] = out_space.num_entries; //We copy into one contiguous host buffer, then copy to device - Memory sysmem; - assert(find_memory(sysmem, Memory::SYSTEM_MEM, my_arena.location)); - - - RegionInstance h_instance = realm_malloc(out_space.num_entries * sizeof(SparsityMapEntry), sysmem); - SparsityMapEntry* h_entries = reinterpret_cast*>(AffineAccessor(h_instance, 0).base); + SparsityMapEntry* h_entries = deppart_host_alloc>(out_space.num_entries); if (my_arena.capacity()==0) { - out_space.entries_buffer = reinterpret_cast*>(AffineAccessor(h_instance, 0).base); + out_space.entries_buffer = h_entries; + out_space.host_entries_owner = h_entries; } else { out_space.entries_buffer = my_arena.alloc >(out_space.num_entries); } @@ -286,9 +318,7 @@ namespace Realm { if (my_arena.capacity() != 0) { CUDA_CHECK(cudaMemcpyAsync(out_space.entries_buffer, h_entries, out_space.num_entries * sizeof(SparsityMapEntry), cudaMemcpyHostToDevice, stream), stream); CUDA_CHECK(cudaStreamSynchronize(stream), stream); - h_instance.destroy(); - } else { - out_space.h_instance = h_instance; + deppart_host_free(h_entries); } CUDA_CHECK(cudaStreamSynchronize(stream), stream); @@ -1569,16 +1599,12 @@ namespace Realm { } template - void GPUMicroOp::split_output(RectDesc* d_rects, size_t total_rects, std::vector &output_instances, std::vector &output_counts, Arena &my_arena) + void GPUMicroOp::split_output(RectDesc* d_rects, size_t total_rects, std::vector *> &output_instances, std::vector &output_counts, Arena 
&my_arena) { NVTX_DEPPART(split_output); CUstream stream = this->stream->get_stream(); bool use_sysmem = false; - RegionInstance sys_instance = RegionInstance::NO_INST; - - Memory sysmem; - assert(find_memory(sysmem, Memory::SYSTEM_MEM, my_arena.location)); Rect* final_rects; std::vector d_starts_host(output_instances.size()), d_ends_host(output_instances.size()); @@ -1605,10 +1631,8 @@ namespace Realm { CUDA_CHECK(cudaStreamSynchronize(stream), stream); } catch (arena_oom&) { use_sysmem = true; - RegionInstance tmp_instance = this->realm_malloc(total_rects * sizeof(RectDesc), sysmem); - sys_instance = this->realm_malloc(total_rects * sizeof(Rect), sysmem); - RectDesc* h_tmp_rects = reinterpret_cast*>(tmp_instance.pointer_untyped(0, total_rects * sizeof(RectDesc))); - final_rects = reinterpret_cast*>(sys_instance.pointer_untyped(0, total_rects * sizeof(Rect))); + RectDesc* h_tmp_rects = deppart_host_alloc>(total_rects); + final_rects = deppart_host_alloc>(total_rects); CUDA_CHECK(cudaMemcpyAsync(h_tmp_rects, d_rects, total_rects * sizeof(RectDesc), cudaMemcpyDeviceToHost, stream), stream); CUDA_CHECK(cudaStreamSynchronize(stream), stream); for (size_t idx = 0; idx < total_rects; idx++ ) { @@ -1624,7 +1648,7 @@ namespace Realm { d_ends_host[h_tmp_rects[idx].src_idx] = idx+1; } } - tmp_instance.destroy(); + deppart_host_free(h_tmp_rects); } for (size_t i = 1; i < output_instances.size(); i++) { @@ -1639,12 +1663,10 @@ namespace Realm { size_t end = d_ends_host[i]; size_t start = d_starts_host[i]; if (end - start > 0) { - RegionInstance new_instance = this->realm_malloc(((end - start) + output_counts[i]) * sizeof(Rect), sysmem); - Rect* h_new_rects = reinterpret_cast*>(new_instance.pointer_untyped(0, ((end - start) + output_counts[i]) * sizeof(Rect))); + Rect* h_new_rects = deppart_host_alloc>((end - start) + output_counts[i]); if (output_counts[i] > 0) { - Rect* h_old_rects = reinterpret_cast*>(output_instances[i].pointer_untyped(0, output_counts[i] * sizeof(Rect))); 
- std::memcpy(h_new_rects, h_old_rects, output_counts[i] * sizeof(Rect)); - output_instances[i].destroy(); + std::memcpy(h_new_rects, output_instances[i], output_counts[i] * sizeof(Rect)); + deppart_host_free(output_instances[i]); } if (use_sysmem) { std::memcpy(h_new_rects + output_counts[i], final_rects + start, (end - start) * sizeof(Rect)); @@ -1652,13 +1674,13 @@ namespace Realm { CUDA_CHECK(cudaMemcpyAsync(h_new_rects + output_counts[i], final_rects + start, (end - start) * sizeof(Rect), cudaMemcpyDeviceToHost, stream), stream); CUDA_CHECK(cudaStreamSynchronize(stream), stream); } - output_instances[i] = new_instance; + output_instances[i] = h_new_rects; output_counts[i] += end - start; } } } if (use_sysmem) { - sys_instance.destroy(); + deppart_host_free(final_rects); } } @@ -1715,30 +1737,31 @@ namespace Realm { if (d_ends_host[idx] > d_starts_host[idx]) { size_t end = d_ends_host[idx]; size_t start = d_starts_host[idx]; - RegionInstance h_rects_instance = this->realm_malloc((end - start) * sizeof(Rect), sysmem); - Rect *h_rects = reinterpret_cast *>(AffineAccessor(h_rects_instance, 0).base); + Rect *h_rects = deppart_host_alloc>(end - start); CUDA_CHECK(cudaMemcpyAsync(h_rects, final_rects + start, (end - start) * sizeof(Rect), cudaMemcpyDeviceToHost, stream), stream); CUDA_CHECK(cudaStreamSynchronize(stream), stream); span> h_rects_span(h_rects, end - start); bool disjoint = !this->is_image_microop(); impl->contribute_dense_rect_list(h_rects_span, disjoint); - h_rects_instance.destroy(); + deppart_host_free(h_rects); } else { impl->contribute_nothing(); } } } else { + std::vector *> local_finalizations; //Use provided lambdas to iterate over sparsity output container (map or vector) for (auto const& elem : ctr) { size_t idx = getIndex(elem); auto mapOpj = getMap(elem); SparsityMapImpl *impl = SparsityMapImpl::lookup(mapOpj); + NodeID owner = ID(mapOpj).sparsity_creator_node(); + assert(owner == Network::my_node_id); if (d_ends_host[idx] > 
d_starts_host[idx]) { size_t end = d_ends_host[idx]; size_t start = d_starts_host[idx]; - RegionInstance entries = this->realm_malloc((end - start) * sizeof(SparsityMapEntry), sysmem); - SparsityMapEntry *h_entries = reinterpret_cast *>(AffineAccessor(entries, 0).base); + SparsityMapEntry *h_entries = deppart_host_alloc>(end - start); CUDA_CHECK(cudaMemcpyAsync(h_entries, final_entries + start, (end - start) * sizeof(SparsityMapEntry), cudaMemcpyDeviceToHost, stream), stream); Rect *approx_rects; @@ -1779,17 +1802,44 @@ namespace Realm { ); CUDA_CHECK(cudaStreamSynchronize(stream), stream); } - RegionInstance approx_entries = this->realm_malloc(num_approx * sizeof(Rect), sysmem); - SparsityMapEntry *h_approx_entries = reinterpret_cast *>(AffineAccessor(approx_entries, 0).base); + Rect *h_approx_entries = deppart_host_alloc>(num_approx); CUDA_CHECK(cudaMemcpyAsync(h_approx_entries, approx_rects, num_approx * sizeof(Rect), cudaMemcpyDeviceToHost, stream), stream); CUDA_CHECK(cudaStreamSynchronize(stream), stream); - impl->set_instance(entries, end - start); - impl->set_approx_instance(approx_entries, num_approx); + if(owner == Network::my_node_id) { + impl->set_gpu_entries(h_entries, end - start); + impl->set_gpu_approx_rects(h_approx_entries, num_approx); + local_finalizations.push_back(impl); + } else { + size_t payload_bytes = ((end - start) * sizeof(SparsityMapEntry)) + + (num_approx * sizeof(Rect)); + ActiveMessage::RemoteGpuFinalizeMessage> + amsg(owner, payload_bytes); + amsg->sparsity = mapOpj; + amsg->num_entries = end - start; + amsg->num_approx = num_approx; + amsg.add_payload(h_entries, (end - start) * sizeof(SparsityMapEntry), + PAYLOAD_COPY); + amsg.add_payload(h_approx_entries, num_approx * sizeof(Rect), + PAYLOAD_COPY); + amsg.commit(); + deppart_host_free(h_entries); + deppart_host_free(h_approx_entries); + } + } else { + if(owner == Network::my_node_id) { + local_finalizations.push_back(impl); + } else { + ActiveMessage::RemoteGpuFinalizeMessage> + 
amsg(owner); + amsg->sparsity = mapOpj; + amsg->num_entries = 0; + amsg->num_approx = 0; + amsg.commit(); + } } } CUDA_CHECK(cudaStreamSynchronize(stream), stream); - for (auto const& elem : ctr) { - SparsityMapImpl *impl = SparsityMapImpl::lookup(getMap(elem)); + for (SparsityMapImpl *impl : local_finalizations) { impl->gpu_finalize(); } } @@ -1797,4 +1847,4 @@ namespace Realm { } -} \ No newline at end of file +} diff --git a/src/realm/deppart/preimage.cc b/src/realm/deppart/preimage.cc index 9ac7d85606..e283a3ec47 100644 --- a/src/realm/deppart/preimage.cc +++ b/src/realm/deppart/preimage.cc @@ -341,7 +341,8 @@ namespace Realm { parent(_parent), domain_transform(_domain_transform), overlap_tester(0), - dummy_overlap_uop(0) { + dummy_overlap_uop(0), + exclusive_gpu_owner(exclusive_gpu_exec_node()) { areg.force_instantiation(); } @@ -351,6 +352,33 @@ namespace Realm { delete overlap_tester; } + template + NodeID PreimageOperation::exclusive_gpu_exec_node(void) const { + size_t gpu_ptrs = 0, gpu_rects = 0, cpu_ptrs = 0, cpu_rects = 0; + for(size_t i = 0; i < domain_transform.ptr_data.size(); i++) { + Memory::Kind kind = domain_transform.ptr_data[i].inst.get_location().kind(); + if((kind == Memory::GPU_FB_MEM) || (kind == Memory::Z_COPY_MEM)) + gpu_ptrs++; + else + cpu_ptrs++; + } + for(size_t i = 0; i < domain_transform.range_data.size(); i++) { + Memory::Kind kind = domain_transform.range_data[i].inst.get_location().kind(); + if((kind == Memory::GPU_FB_MEM) || (kind == Memory::Z_COPY_MEM)) + gpu_rects++; + else + cpu_rects++; + } + size_t opcount = gpu_ptrs + gpu_rects + cpu_ptrs + cpu_rects; + if((gpu_ptrs + gpu_rects) == 0 || (opcount != 1)) + return -1; + if(gpu_ptrs == 1) + return ID(domain_transform.ptr_data[0].inst).instance_owner_node(); + if(gpu_rects == 1) + return ID(domain_transform.range_data[0].inst).instance_owner_node(); + return -1; + } + template IndexSpace PreimageOperation::add_target(const IndexSpace &target) { // try to filter out obviously 
empty targets @@ -364,7 +392,9 @@ namespace Realm { // if the target has a sparsity map, use the same node - otherwise // get a sparsity ID by round-robin'ing across the nodes that have field data int target_node; - if (!target.dense()) + if (exclusive_gpu_owner >= 0) + target_node = exclusive_gpu_owner; + else if (!target.dense()) target_node = ID(target.sparsity).sparsity_creator_node(); else if (!domain_transform.ptr_data.empty()) target_node = @@ -378,8 +408,10 @@ namespace Realm { .range_data[targets.size() % domain_transform.range_data.size()] .inst) .instance_owner_node(); - SparsityMap sparsity = get_runtime()->get_available_sparsity_impl(target_node)->me.convert >(); + if (exclusive_gpu_owner >= 0) + assert(target_node == exclusive_gpu_exec_node()); + SparsityMap sparsity = + create_deppart_output_sparsity(target_node).convert>(); preimage.sparsity = sparsity; targets.push_back(target); @@ -818,13 +850,80 @@ namespace Realm { IndexSpace _parent_space, bool _exclusive) : domain_transform(_domain_transform), parent_space(_parent_space) { this->exclusive = _exclusive; - Memory my_mem = domain_transform.ptr_data.empty() ? domain_transform.range_data[0].inst.get_location() : domain_transform.ptr_data[0].inst.get_location(); - Processor best_proc; - assert(choose_proc(best_proc, my_mem)); - Cuda::GPUProcessor* gpu_proc = dynamic_cast(get_runtime()->get_processor_impl(best_proc)); - assert(gpu_proc); - this->gpu = gpu_proc->gpu; - this->stream = gpu_proc->gpu->get_deppart_stream(); + areg.force_instantiation(); + // GPU setup (this->gpu, this->stream) deferred to execute(), which runs on the + // correct node after dispatch() has forwarded to the instance owner if needed. 
+ } + + template + template + GPUPreimageMicroOp::GPUPreimageMicroOp( + NodeID _requestor, AsyncMicroOp *_async_microop, S& s) + : GPUMicroOp(_requestor, _async_microop) { + bool ok = true; + // domain_transform is always UNSTRUCTURED; only one of ptr_data/range_data + // is populated — a single bool distinguishes the two cases. + bool use_ptr_data = false; + ok = ok && (s >> use_ptr_data); + if(use_ptr_data) { + domain_transform.type = + DomainTransform::DomainTransformType::UNSTRUCTURED_PTR; + size_t np = 0; + ok = ok && (s >> np); + domain_transform.ptr_data.resize(np); + for(size_t i = 0; i < np && ok; i++) + ok = ok && (s >> domain_transform.ptr_data[i].index_space) && + (s >> domain_transform.ptr_data[i].inst) && + (s >> domain_transform.ptr_data[i].field_offset) && + (s >> domain_transform.ptr_data[i].scratch_buffer); + } else { + domain_transform.type = + DomainTransform::DomainTransformType::UNSTRUCTURED_RANGE; + size_t nr = 0; + ok = ok && (s >> nr); + domain_transform.range_data.resize(nr); + for(size_t i = 0; i < nr && ok; i++) + ok = ok && (s >> domain_transform.range_data[i].index_space) && + (s >> domain_transform.range_data[i].inst) && + (s >> domain_transform.range_data[i].field_offset) && + (s >> domain_transform.range_data[i].scratch_buffer); + } + ok = ok && (s >> parent_space); + ok = ok && (s >> this->exclusive); + ok = ok && (s >> targets); + ok = ok && (s >> sparsity_outputs); + assert(ok); + (void)ok; + } + + template + template + bool GPUPreimageMicroOp::serialize_params(S& s) const { + bool ok = true; + // domain_transform is always UNSTRUCTURED; only one of ptr_data/range_data + // is populated — a single bool distinguishes the two cases. 
+ bool use_ptr_data = !domain_transform.ptr_data.empty(); + ok = ok && (s << use_ptr_data); + if(use_ptr_data) { + ok = ok && (s << domain_transform.ptr_data.size()); + for(size_t i = 0; i < domain_transform.ptr_data.size() && ok; i++) + ok = ok && (s << domain_transform.ptr_data[i].index_space) && + (s << domain_transform.ptr_data[i].inst) && + (s << domain_transform.ptr_data[i].field_offset) && + (s << domain_transform.ptr_data[i].scratch_buffer); + } else { + ok = ok && (s << domain_transform.range_data.size()); + for(size_t i = 0; i < domain_transform.range_data.size() && ok; i++) + ok = ok && (s << domain_transform.range_data[i].index_space) && + (s << domain_transform.range_data[i].inst) && + (s << domain_transform.range_data[i].field_offset) && + (s << domain_transform.range_data[i].scratch_buffer); + } + ok = ok && (s << parent_space); + ok = ok && (s << this->exclusive); + ok = ok && (s << targets); + ok = ok && (s << sparsity_outputs); + return ok; } template @@ -841,6 +940,20 @@ namespace Realm { template void GPUPreimageMicroOp::execute(void) { TimeStamp ts("GPUPreimageMicroOp::execute", true, &log_uop_timing); + // Resolve the local GPU processor now that we are guaranteed to be on the + // correct node (dispatch() forwarded us here if the instance was remote). + { + Memory my_mem = domain_transform.ptr_data.empty() ? 
+ domain_transform.range_data[0].inst.get_location() : + domain_transform.ptr_data[0].inst.get_location(); + Processor best_proc; + assert(choose_proc(best_proc, my_mem)); + Cuda::GPUProcessor *gpu_proc = + dynamic_cast(get_runtime()->get_processor_impl(best_proc)); + assert(gpu_proc); + this->gpu = gpu_proc->gpu; + this->stream = gpu_proc->gpu->get_deppart_stream(); + } Cuda::AutoGPUContext agc(this->gpu); if (domain_transform.ptr_data.size() > 0) { gpu_populate_bitmasks(); @@ -852,6 +965,40 @@ namespace Realm { template void GPUPreimageMicroOp::dispatch( PartitioningOperation *op, bool inline_ok) { + // GPU preimage must execute on the node that owns the GPU memory + NodeID exec_node = domain_transform.ptr_data.empty() ? + ID(domain_transform.range_data[0].inst).instance_owner_node() : + ID(domain_transform.ptr_data[0].inst).instance_owner_node(); + if(this->exclusive) { + for(size_t i = 0; i < sparsity_outputs.size(); i++) + assert(NodeID(ID(sparsity_outputs[i]).sparsity_creator_node()) == exec_node); + } + if(exec_node != Network::my_node_id) { + PartitioningMicroOp::template forward_microop >(exec_node, op, this); + return; + } + + for (size_t i = 0; i < domain_transform.ptr_data.size(); i++) { + IndexSpace inst_space = domain_transform.ptr_data[i].index_space; + if (!inst_space.dense()) { + // it's safe to add the count after the registration only because we initialized + // the count to 2 instead of 1 + bool registered = SparsityMapImpl::lookup(inst_space.sparsity)->add_waiter(this, true /*precise*/); + if(registered) + this->wait_count.fetch_add(1); + } + } + for (size_t i = 0; i < domain_transform.range_data.size(); i++) { + IndexSpace inst_space = domain_transform.range_data[i].index_space; + if (!inst_space.dense()) { + // it's safe to add the count after the registration only because we initialized + // the count to 2 instead of 1 + bool registered = SparsityMapImpl::lookup(inst_space.sparsity)->add_waiter(this, true /*precise*/); + if(registered) + 
this->wait_count.fetch_add(1); + } + } + // need valid data for each target for (size_t i = 0; i < targets.size(); i++) { if (!targets[i].dense()) { @@ -874,8 +1021,13 @@ namespace Realm { this->finish_dispatch(op, inline_ok); } + + template + ActiveMessageHandlerReg > > + GPUPreimageMicroOp::areg; + #endif // instantiations of templates handled in preimage_tmpl.cc -}; // namespace Realm \ No newline at end of file +}; // namespace Realm diff --git a/src/realm/deppart/preimage.h b/src/realm/deppart/preimage.h index ed301ad51e..01032d2517 100644 --- a/src/realm/deppart/preimage.h +++ b/src/realm/deppart/preimage.h @@ -100,6 +100,7 @@ namespace Realm { protected: static ActiveMessageHandlerReg > > areg; + NodeID exclusive_gpu_exec_node(void) const; IndexSpace parent; DomainTransform domain_transform; @@ -111,6 +112,7 @@ namespace Realm { atomic remaining_sparse_images; std::vector > contrib_counts; AsyncMicroOp *dummy_overlap_uop; + int exclusive_gpu_owner; }; template @@ -175,6 +177,16 @@ namespace Realm { void dispatch(PartitioningOperation *op, bool inline_ok); protected: + friend struct RemoteMicroOpMessage >; + static ActiveMessageHandlerReg > > areg; + + friend class PartitioningMicroOp; + template + REALM_ATTR_WARN_UNUSED(bool serialize_params(S& s) const); + + // construct from received packet + template + GPUPreimageMicroOp(NodeID _requestor, AsyncMicroOp *_async_microop, S& s); void gpu_populate_ranges(); void gpu_populate_bitmasks(); @@ -189,4 +201,4 @@ namespace Realm { }; // namespace Realm -#endif // REALM_DEPPART_PREIMAGE_H \ No newline at end of file +#endif // REALM_DEPPART_PREIMAGE_H diff --git a/src/realm/deppart/preimage_gpu_impl.hpp b/src/realm/deppart/preimage_gpu_impl.hpp index 2a93136921..6934772fe4 100644 --- a/src/realm/deppart/preimage_gpu_impl.hpp +++ b/src/realm/deppart/preimage_gpu_impl.hpp @@ -12,6 +12,7 @@ namespace Realm { template void GPUPreimageMicroOp::gpu_populate_ranges() { if (targets.size() == 0) { + 
assert(sparsity_outputs.empty()); return; } @@ -57,13 +58,15 @@ namespace Realm { GPUMicroOp::collapse_multi_space(targets, target_space, buffer_arena, stream); - Memory zcpy_mem; - assert(find_memory(zcpy_mem, Memory::Z_COPY_MEM, buffer_arena.location)); - RegionInstance accessors_instance = this->realm_malloc(domain_transform.range_data.size() * sizeof(AffineAccessor,N,T>), zcpy_mem); - AffineAccessor,N,T>* d_accessors = reinterpret_cast,N,T>*>(AffineAccessor(accessors_instance, 0).base); + std::vector,N,T>> h_accessors(domain_transform.range_data.size()); for (size_t i = 0; i < domain_transform.range_data.size(); ++i) { - d_accessors[i] = AffineAccessor,N,T>(domain_transform.range_data[i].inst, domain_transform.range_data[i].field_offset); + h_accessors[i] = AffineAccessor,N,T>(domain_transform.range_data[i].inst, domain_transform.range_data[i].field_offset); } + AffineAccessor,N,T>* d_accessors = + buffer_arena.alloc,N,T>>(domain_transform.range_data.size()); + CUDA_CHECK(cudaMemcpyAsync(d_accessors, h_accessors.data(), + domain_transform.range_data.size() * sizeof(AffineAccessor,N,T>), + cudaMemcpyHostToDevice, stream), stream); uint32_t* d_target_counters = buffer_arena.alloc(2*targets.size() + 1); uint32_t* d_targets_prefix = d_target_counters + targets.size(); @@ -78,7 +81,7 @@ namespace Realm { int count = 0; if (count) {} bool host_fallback = false; - std::vector h_instances(targets.size(), RegionInstance::NO_INST); + std::vector*> host_rect_buffers(targets.size(), nullptr); std::vector entry_counts(targets.size(), 0); while (num_completed < inst_space.num_entries) { try { @@ -214,7 +217,7 @@ namespace Realm { }); if (host_fallback) { - this->split_output(d_new_rects, num_new_rects, h_instances, entry_counts, buffer_arena); + this->split_output(d_new_rects, num_new_rects, host_rect_buffers, entry_counts, buffer_arena); } if (num_output==0 || host_fallback) { @@ -263,7 +266,7 @@ namespace Realm { } else { host_fallback = true; if (num_output > 0) { - 
this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); + this->split_output(output_start, num_output, host_rect_buffers, entry_counts, buffer_arena); } curr_tile = tile_size / 2; } @@ -294,7 +297,7 @@ namespace Realm { return elem; }); } catch (arena_oom&) { - this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); + this->split_output(output_start, num_output, host_rect_buffers, entry_counts, buffer_arena); host_fallback = true; } } @@ -306,10 +309,9 @@ namespace Realm { impl->set_contributor_count(1); } if (entry_counts[idx] > 0) { - Rect* h_rects = reinterpret_cast *>(AffineAccessor(h_instances[idx], 0).base); - span> h_rects_span(h_rects, entry_counts[idx]); + span> h_rects_span(host_rect_buffers[idx], entry_counts[idx]); impl->contribute_dense_rect_list(h_rects_span, true); - h_instances[idx].destroy(); + deppart_host_free(host_rect_buffers[idx]); } else { impl->contribute_nothing(); } @@ -320,6 +322,7 @@ namespace Realm { template void GPUPreimageMicroOp::gpu_populate_bitmasks() { if (targets.size() == 0) { + assert(sparsity_outputs.empty()); return; } @@ -365,13 +368,15 @@ namespace Realm { GPUMicroOp::collapse_multi_space(targets, target_space, buffer_arena, stream); - Memory zcpy_mem; - assert(find_memory(zcpy_mem, Memory::Z_COPY_MEM, buffer_arena.location)); - RegionInstance accessors_instance = this->realm_malloc(domain_transform.ptr_data.size() * sizeof(AffineAccessor,N,T>), zcpy_mem); - AffineAccessor,N,T>* d_accessors = reinterpret_cast,N,T>*>(AffineAccessor(accessors_instance, 0).base); + std::vector,N,T>> h_accessors(domain_transform.ptr_data.size()); for (size_t i = 0; i < domain_transform.ptr_data.size(); ++i) { - d_accessors[i] = AffineAccessor,N,T>(domain_transform.ptr_data[i].inst, domain_transform.ptr_data[i].field_offset); + h_accessors[i] = AffineAccessor,N,T>(domain_transform.ptr_data[i].inst, domain_transform.ptr_data[i].field_offset); } + AffineAccessor,N,T>* d_accessors = + 
buffer_arena.alloc,N,T>>(domain_transform.ptr_data.size()); + CUDA_CHECK(cudaMemcpyAsync(d_accessors, h_accessors.data(), + domain_transform.ptr_data.size() * sizeof(AffineAccessor,N,T>), + cudaMemcpyHostToDevice, stream), stream); uint32_t* d_target_counters = buffer_arena.alloc(2*targets.size() + 1); uint32_t* d_targets_prefix = d_target_counters + targets.size(); @@ -386,7 +391,7 @@ namespace Realm { int count = 0; if (count) {} bool host_fallback = false; - std::vector h_instances(targets.size(), RegionInstance::NO_INST); + std::vector*> host_rect_buffers(targets.size(), nullptr); std::vector entry_counts(targets.size(), 0); while (num_completed < inst_space.num_entries) { try { @@ -522,7 +527,7 @@ namespace Realm { }); if (host_fallback) { - this->split_output(d_new_rects, num_new_rects, h_instances, entry_counts, buffer_arena); + this->split_output(d_new_rects, num_new_rects, host_rect_buffers, entry_counts, buffer_arena); } if (num_output==0 || host_fallback) { @@ -571,7 +576,7 @@ namespace Realm { } else { host_fallback = true; if (num_output > 0) { - this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); + this->split_output(output_start, num_output, host_rect_buffers, entry_counts, buffer_arena); } curr_tile = tile_size / 2; } @@ -602,7 +607,7 @@ namespace Realm { return elem; }); } catch (arena_oom&) { - this->split_output(output_start, num_output, h_instances, entry_counts, buffer_arena); + this->split_output(output_start, num_output, host_rect_buffers, entry_counts, buffer_arena); host_fallback = true; } } @@ -614,14 +619,13 @@ namespace Realm { impl->set_contributor_count(1); } if (entry_counts[idx] > 0) { - Rect* h_rects = reinterpret_cast *>(AffineAccessor(h_instances[idx], 0).base); - span> h_rects_span(h_rects, entry_counts[idx]); + span> h_rects_span(host_rect_buffers[idx], entry_counts[idx]); impl->contribute_dense_rect_list(h_rects_span, true); - h_instances[idx].destroy(); + 
deppart_host_free(host_rect_buffers[idx]); } else { impl->contribute_nothing(); } } } } -} \ No newline at end of file +} diff --git a/src/realm/deppart/sparsity_impl.cc b/src/realm/deppart/sparsity_impl.cc index a1a511b744..20c655a62c 100644 --- a/src/realm/deppart/sparsity_impl.cc +++ b/src/realm/deppart/sparsity_impl.cc @@ -25,9 +25,181 @@ #include "realm/deppart/rectlist.h" #include "realm/deppart/inst_helper.h" #include "realm/logging.h" +#include "realm/machine.h" +#ifdef REALM_USE_CUDA +#include +#endif +#include +#include +#include +#include +#include +#include namespace Realm { + namespace { + struct PendingOutputSparsityAllocation { + std::mutex mutex; + std::condition_variable cv; + ID result{ID::ID_NULL}; + bool ready{false}; + }; + + atomic next_output_sparsity_request{1}; + std::mutex pending_output_sparsity_mutex; + std::unordered_map + pending_output_sparsity_allocations; + + struct OutputSparsityAllocationRequest { + uint64_t request_id; + + static void handle_message(NodeID sender, + const OutputSparsityAllocationRequest &msg, + const void *data, + size_t datalen); + }; + + struct OutputSparsityAllocationResponse { + uint64_t request_id; + ID sparsity; + + static void handle_message(NodeID sender, + const OutputSparsityAllocationResponse &msg, + const void *data, + size_t datalen); + }; + + ActiveMessageHandlerReg + output_sparsity_allocation_request_reg; + ActiveMessageHandlerReg + output_sparsity_allocation_response_reg; + + template + inline T *deppart_gpu_host_alloc(size_t count) + { + if(count == 0) return nullptr; +#ifdef REALM_USE_CUDA + void *ptr = nullptr; + cudaError_t err = cudaHostAlloc(&ptr, count * sizeof(T), cudaHostAllocPortable); + assert(err == cudaSuccess); + return reinterpret_cast(ptr); +#else + return static_cast(std::malloc(count * sizeof(T))); +#endif + } + + inline void deppart_gpu_host_free(void *ptr) + { + if(ptr == nullptr) return; +#ifdef REALM_USE_CUDA + cudaError_t err = cudaFreeHost(ptr); + assert(err == 
cudaSuccess); +#else + std::free(ptr); +#endif + } + + inline bool deppart_sparsity_trace_enabled(void) + { + static int enabled = -1; + if(enabled < 0) + enabled = (std::getenv("REALM_DEPPART_SPARSITY_TRACE") != nullptr) ? 1 : 0; + return (enabled == 1); + } + + inline void deppart_sparsity_trace(const char *tag, + ::realm_id_t sparsity, + NodeID owner, + NodeID node, + int remaining_contrib, + int total_pieces, + int remaining_pieces, + size_t extra0 = 0, + size_t extra1 = 0) + { + if(!deppart_sparsity_trace_enabled()) + return; + std::fprintf(stderr, + "[deppart-trace] %s map=%llx owner=%d node=%d rem_contrib=%d " + "total_pieces=%d rem_pieces=%d extra0=%zu extra1=%zu\n", + tag, + static_cast(sparsity), + owner, + node, + remaining_contrib, + total_pieces, + remaining_pieces, + extra0, + extra1); + std::fflush(stderr); + } + } + + ID create_deppart_output_sparsity(NodeID target_node) + { + if(target_node == Network::my_node_id) { + SparsityMapImplWrapper *wrap = + get_runtime()->get_available_sparsity_impl(target_node); + wrap->add_references(1); + return ID(wrap->me); + } + + PendingOutputSparsityAllocation pending; + uint64_t request_id = next_output_sparsity_request.fetch_add(1); + { + std::lock_guard lock(pending_output_sparsity_mutex); + pending_output_sparsity_allocations.emplace(request_id, &pending); + } + + ActiveMessage amsg(target_node); + amsg->request_id = request_id; + amsg.commit(); + + std::unique_lock lock(pending.mutex); + pending.cv.wait(lock, [&pending]() { return pending.ready; }); + return pending.result; + } + + void OutputSparsityAllocationRequest::handle_message( + NodeID sender, + const OutputSparsityAllocationRequest &msg, + const void *data, + size_t datalen) + { + SparsityMapImplWrapper *wrap = + get_runtime()->get_available_sparsity_impl(Network::my_node_id); + wrap->add_references(1); + + ActiveMessage amsg(sender); + amsg->request_id = msg.request_id; + amsg->sparsity = wrap->me; + amsg.commit(); + } + + void 
OutputSparsityAllocationResponse::handle_message( + NodeID sender, + const OutputSparsityAllocationResponse &msg, + const void *data, + size_t datalen) + { + PendingOutputSparsityAllocation *pending = nullptr; + { + std::lock_guard lock(pending_output_sparsity_mutex); + auto it = pending_output_sparsity_allocations.find(msg.request_id); + assert(it != pending_output_sparsity_allocations.end()); + pending = it->second; + pending_output_sparsity_allocations.erase(it); + } + + { + std::lock_guard lock(pending->mutex); + pending->result = msg.sparsity; + pending->ready = true; + } + pending->cv.notify_one(); + } + extern Logger log_part; //////////////////////////////////////////////////////////////////////// @@ -1233,13 +1405,8 @@ bool SparsityMapPublicImpl::bvh_centroid_less(int axis, template SparsityMapImpl::~SparsityMapImpl(void) { - //We are responsible for our instances - //if (this->entries_instance.exists()) { - // this->entries_instance.destroy(); - //} - //if (this->approx_instance.exists()) { - // this->approx_instance.destroy(); - //} + deppart_gpu_host_free(this->gpu_entries); + deppart_gpu_host_free(this->gpu_approx_rects); } template @@ -1324,6 +1491,14 @@ SparsityMapImpl::~SparsityMapImpl(void) template void SparsityMapImpl::set_contributor_count(int count) { + deppart_sparsity_trace("set_contributor_count.enter", + me.id, + ID(me).sparsity_creator_node(), + Network::my_node_id, + remaining_contributor_count.load(), + total_piece_count.load(), + remaining_piece_count.load(), + count); if(NodeID(ID(me).sparsity_creator_node()) == Network::my_node_id) { // increment the count atomically - if it brings the total up to 0 // (which covers count == 0), immediately propagate the total piece @@ -1341,8 +1516,23 @@ SparsityMapImpl::~SparsityMapImpl(void) } } else { // send the contributor count to the owner node - sparsity_comm->send_contribute(me, count, 0, false); + // NOTE: must use SetContribCountMessage, not send_contribute! 
+ // send_contribute arrives as contribute_raw_rects which DECREMENTS + // remaining_contributor_count by 1 (treating it as one contributor's piece), + // but set_contributor_count should INCREMENT by count. + ActiveMessage amsg(ID(me).sparsity_creator_node()); + amsg->sparsity = me; + amsg->count = count; + amsg.commit(); } + deppart_sparsity_trace("set_contributor_count.exit", + me.id, + ID(me).sparsity_creator_node(), + Network::my_node_id, + remaining_contributor_count.load(), + total_piece_count.load(), + remaining_piece_count.load(), + count); } template @@ -1410,6 +1600,13 @@ SparsityMapImpl::~SparsityMapImpl(void) template void SparsityMapImpl::contribute_nothing(void) { + deppart_sparsity_trace("contribute_nothing.enter", + me.id, + ID(me).sparsity_creator_node(), + Network::my_node_id, + remaining_contributor_count.load(), + total_piece_count.load(), + remaining_piece_count.load()); NodeID owner = ID(me).sparsity_creator_node(); if(owner != Network::my_node_id) { @@ -1432,6 +1629,13 @@ SparsityMapImpl::~SparsityMapImpl(void) if(have_all_pieces) finalize(); } + deppart_sparsity_trace("contribute_nothing.exit", + me.id, + ID(me).sparsity_creator_node(), + Network::my_node_id, + remaining_contributor_count.load(), + total_piece_count.load(), + remaining_piece_count.load()); } template @@ -1490,6 +1694,15 @@ SparsityMapImpl::~SparsityMapImpl(void) size_t piece_count, bool disjoint, size_t total_count) { + deppart_sparsity_trace("contribute_raw_rects.enter", + me.id, + ID(me).sparsity_creator_node(), + Network::my_node_id, + remaining_contributor_count.load(), + total_piece_count.load(), + remaining_piece_count.load(), + count, + piece_count); if(count > 0) { AutoLock<> al(mutex); @@ -1727,6 +1940,15 @@ SparsityMapImpl::~SparsityMapImpl(void) finalize(); } + deppart_sparsity_trace("contribute_raw_rects.exit", + me.id, + ID(me).sparsity_creator_node(), + Network::my_node_id, + remaining_contributor_count.load(), + total_piece_count.load(), + 
remaining_piece_count.load(), + count, + piece_count); } // adds a microop as a waiter for valid sparsity map data - returns true @@ -1735,6 +1957,14 @@ SparsityMapImpl::~SparsityMapImpl(void) template bool SparsityMapImpl::add_waiter(PartitioningMicroOp *uop, bool precise) { + deppart_sparsity_trace("add_waiter.enter", + me.id, + ID(me).sparsity_creator_node(), + Network::my_node_id, + remaining_contributor_count.load(), + total_piece_count.load(), + remaining_piece_count.load(), + precise ? 1 : 0); // early out if(precise ? this->entries_valid.load_acquire() : this->approx_valid.load_acquire()) @@ -1784,6 +2014,15 @@ SparsityMapImpl::~SparsityMapImpl(void) sparsity_comm->send_request(me, request_precise, request_approx); } + deppart_sparsity_trace("add_waiter.exit", + me.id, + ID(me).sparsity_creator_node(), + Network::my_node_id, + remaining_contributor_count.load(), + total_piece_count.load(), + remaining_piece_count.load(), + precise ? 1 : 0, + registered ? 1 : 0); return registered; } @@ -1827,6 +2066,15 @@ SparsityMapImpl::~SparsityMapImpl(void) void SparsityMapImpl::remote_data_reply(NodeID requestor, bool reply_precise, bool reply_approx) { + deppart_sparsity_trace("remote_data_reply.enter", + me.id, + ID(me).sparsity_creator_node(), + Network::my_node_id, + remaining_contributor_count.load(), + total_piece_count.load(), + remaining_piece_count.load(), + reply_precise ? 1 : 0, + reply_approx ? 1 : 0); if(reply_approx) { // TODO if(!this->approx_valid.load_acquire()) @@ -1879,6 +2127,15 @@ SparsityMapImpl::~SparsityMapImpl(void) sparsity_comm->send_contribute(requestor, me, num_pieces + 1, total_count, /*disjoint=*/true, rdata, bytes); } + deppart_sparsity_trace("remote_data_reply.exit", + me.id, + ID(me).sparsity_creator_node(), + Network::my_node_id, + remaining_contributor_count.load(), + total_piece_count.load(), + remaining_piece_count.load(), + reply_precise ? 1 : 0, + reply_approx ? 
1 : 0); } template @@ -2039,6 +2296,14 @@ SparsityMapImpl::~SparsityMapImpl(void) template void SparsityMapImpl::finalize(void) { + deppart_sparsity_trace("finalize.enter", + me.id, + ID(me).sparsity_creator_node(), + Network::my_node_id, + remaining_contributor_count.load(), + total_piece_count.load(), + remaining_piece_count.load(), + this->entries.size()); this->from_gpu = false; @@ -2180,6 +2445,15 @@ SparsityMapImpl::~SparsityMapImpl(void) if(trigger_precise.exists()) GenEventImpl::trigger(trigger_precise, false /*!poisoned*/); + deppart_sparsity_trace("finalize.exit", + me.id, + ID(me).sparsity_creator_node(), + Network::my_node_id, + remaining_contributor_count.load(), + total_piece_count.load(), + remaining_piece_count.load(), + this->entries.size()); + } @@ -2189,7 +2463,16 @@ SparsityMapImpl::~SparsityMapImpl(void) template void SparsityMapImpl::gpu_finalize(void) { - this->from_gpu = true; + deppart_sparsity_trace("gpu_finalize.enter", + me.id, + ID(me).sparsity_creator_node(), + Network::my_node_id, + remaining_contributor_count.load(), + total_piece_count.load(), + remaining_piece_count.load(), + this->num_entries, + this->num_approx); + this->from_gpu = ((this->gpu_entries != nullptr) || (this->gpu_approx_rects != nullptr)); if(true /*ID(me).sparsity_creator_node() == Network::my_node_id*/) { assert(!this->approx_valid.load()); @@ -2273,22 +2556,33 @@ SparsityMapImpl::~SparsityMapImpl(void) if(trigger_precise.exists()) GenEventImpl::trigger(trigger_precise, false /*!poisoned*/); + deppart_sparsity_trace("gpu_finalize.exit", + me.id, + ID(me).sparsity_creator_node(), + Network::my_node_id, + remaining_contributor_count.load(), + total_piece_count.load(), + remaining_piece_count.load(), + this->num_entries, + this->num_approx); } - //Allows a GPU deppart client to set the entries directly with a host region instance template - void SparsityMapImpl::set_instance(RegionInstance _entries_instance, size_t size) + void 
SparsityMapImpl::set_gpu_entries(SparsityMapEntry *entries, size_t size) { - this->entries_instance = _entries_instance; + deppart_gpu_host_free(this->gpu_entries); + this->gpu_entries = entries; + this->entries.clear(); this->num_entries = size; } - //Allows a GPU deppart client to set the approx rects directly with a host region instance template - void SparsityMapImpl::set_approx_instance(RegionInstance _approx_instance, size_t size) + void SparsityMapImpl::set_gpu_approx_rects(Rect *approx_rects, size_t size) { - this->approx_instance = _approx_instance; + deppart_gpu_host_free(this->gpu_approx_rects); + this->gpu_approx_rects = approx_rects; + this->approx_rects.clear(); this->num_approx = size; } @@ -2304,6 +2598,10 @@ SparsityMapImpl::~SparsityMapImpl(void) /*static*/ ActiveMessageHandlerReg< typename SparsityMapImpl::SetContribCountMessage> SparsityMapImpl::set_contrib_count_msg_reg; + template + /*static*/ ActiveMessageHandlerReg< + typename SparsityMapImpl::RemoteGpuFinalizeMessage> + SparsityMapImpl::remote_gpu_finalize_msg_reg; /*static*/ ActiveMessageHandlerReg< typename SparsityMapRefCounter::SparsityMapAddReferenceMessage> @@ -2361,6 +2659,42 @@ SparsityMapImpl::~SparsityMapImpl(void) SparsityMapImpl::lookup(msg.sparsity)->set_contributor_count(msg.count); } + //////////////////////////////////////////////////////////////////////// + // + // class SparsityMapImpl::RemoteGpuFinalizeMessage + + template + inline /*static*/ void SparsityMapImpl::RemoteGpuFinalizeMessage::handle_message( + NodeID sender, const SparsityMapImpl::RemoteGpuFinalizeMessage &msg, + const void *data, size_t datalen) + { + size_t expected = (msg.num_entries * sizeof(SparsityMapEntry)) + + (msg.num_approx * sizeof(Rect)); + assert(datalen == expected); + (void)sender; + + const char *payload = static_cast(data); + SparsityMapImpl *impl = SparsityMapImpl::lookup(msg.sparsity); + + if(msg.num_entries > 0) { + SparsityMapEntry *entries = deppart_gpu_host_alloc>(msg.num_entries); + 
std::memcpy(entries, payload, msg.num_entries * sizeof(SparsityMapEntry)); + impl->set_gpu_entries(entries, msg.num_entries); + payload += msg.num_entries * sizeof(SparsityMapEntry); + } else { + impl->set_gpu_entries(nullptr, 0); + } + + if(msg.num_approx > 0) { + Rect *approx = deppart_gpu_host_alloc>(msg.num_approx); + std::memcpy(approx, payload, msg.num_approx * sizeof(Rect)); + impl->set_gpu_approx_rects(approx, msg.num_approx); + } else { + impl->set_gpu_approx_rects(nullptr, 0); + } + impl->gpu_finalize(); + } + #define DOIT(N, T) \ template class SparsityMapPublicImpl; \ template class SparsityMapImpl; \ diff --git a/src/realm/deppart/sparsity_impl.h b/src/realm/deppart/sparsity_impl.h index f9656e65b6..aa94d7200f 100644 --- a/src/realm/deppart/sparsity_impl.h +++ b/src/realm/deppart/sparsity_impl.h @@ -33,6 +33,9 @@ namespace Realm { + REALM_INTERNAL_API_EXTERNAL_LINKAGE + ID create_deppart_output_sparsity(NodeID target_node); + class PartitioningMicroOp; /** @@ -139,8 +142,8 @@ namespace Realm { void remote_data_request(NodeID requestor, bool send_precise, bool send_approx); void remote_data_reply(NodeID requestor, bool send_precise, bool send_approx); - void set_instance(RegionInstance _entries_instance, size_t size); - void set_approx_instance(RegionInstance _approx_instance, size_t size); + void set_gpu_entries(SparsityMapEntry *entries, size_t size); + void set_gpu_approx_rects(Rect *approx_rects, size_t size); void gpu_finalize(void); SparsityMap me; @@ -174,12 +177,22 @@ namespace Realm { const void *data, size_t datalen); }; + struct RemoteGpuFinalizeMessage { + SparsityMap sparsity; + size_t num_entries; + size_t num_approx; + + static void handle_message(NodeID sender, const RemoteGpuFinalizeMessage &msg, + const void *data, size_t datalen); + }; + protected: void finalize(void); static ActiveMessageHandlerReg remote_sparsity_request_reg; static ActiveMessageHandlerReg remote_sparsity_contrib_reg; static ActiveMessageHandlerReg 
set_contrib_count_msg_reg; + static ActiveMessageHandlerReg remote_gpu_finalize_msg_reg; atomic remaining_contributor_count{0}; atomic total_piece_count{0}, remaining_piece_count{0}; diff --git a/src/realm/sparsity.h b/src/realm/sparsity.h index dc9fe74300..616f86f499 100644 --- a/src/realm/sparsity.h +++ b/src/realm/sparsity.h @@ -318,11 +318,11 @@ namespace Realm { std::vector > entries; std::vector > approx_rects; - //Stores rectangles for GPU deppart (allows fast copy after merged on GPU) - RegionInstance entries_instance = RegionInstance::NO_INST; + // Stores rectangles for GPU deppart in host buffers owned by the sparsity map. + SparsityMapEntry *gpu_entries = nullptr; size_t num_entries = 0; - RegionInstance approx_instance = RegionInstance::NO_INST; + Rect *gpu_approx_rects = nullptr; size_t num_approx = 0; //Tracks whether to use instance or vector diff --git a/src/realm/sparsity.inl b/src/realm/sparsity.inl index 7ff00ef552..60ffa41a70 100644 --- a/src/realm/sparsity.inl +++ b/src/realm/sparsity.inl @@ -91,10 +91,7 @@ namespace Realm { if (num_entries == 0) { return span>(); } - return span>( - reinterpret_cast *>(entries_instance.pointer_untyped( - 0, num_entries * sizeof(SparsityMapEntry))), - num_entries); + return span>(gpu_entries, num_entries); } else { return span>(entries.data(), entries.size()); } @@ -108,10 +105,7 @@ namespace Realm { if (num_approx == 0) { return span>(); } - return span>( - reinterpret_cast *>( - approx_instance.pointer_untyped(0, num_approx * sizeof(Rect))), - num_approx); + return span>(gpu_approx_rects, num_approx); } else { return span>(approx_rects.data(), approx_rects.size()); } diff --git a/tests/benchmark.cc b/tests/benchmark.cc index b0bed444e1..9277436a9f 100644 --- a/tests/benchmark.cc +++ b/tests/benchmark.cc @@ -1202,7 +1202,7 @@ class PreimageTest : public TestInterface { { NODE_SUBGRAPH_STREAM, }; - +ci // assign subgraph ids to nodes void chase_point(int idx, Point& color) { From 
15628d98ea7d460d6dd9650df2212f6f133c46be Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Tue, 24 Mar 2026 09:30:02 -0700 Subject: [PATCH 28/32] Export CPU_BVH for shared builds --- src/realm/sparsity.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/realm/sparsity.h b/src/realm/sparsity.h index 616f86f499..b16fce5ed7 100644 --- a/src/realm/sparsity.h +++ b/src/realm/sparsity.h @@ -155,7 +155,7 @@ namespace Realm { }; template - struct CPU_BVH { + struct REALM_INTERNAL_API_EXTERNAL_LINKAGE CPU_BVH { struct Node { Rect bounds; int left = -1; From 2301eb276dd7fe982f8cded734d3404b56f43388 Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Tue, 24 Mar 2026 09:45:47 -0700 Subject: [PATCH 29/32] Restore feature-gated source selection --- src/CMakeLists.txt | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c277a1b74d..df1132bbdb 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -69,7 +69,7 @@ set(REALM_SOURCES procset/procset_module.cc ) -if(TARGET CUDA::cuda_driver AND REALM_USE_CUDA) +if(REALM_USE_CUDA) list(APPEND REALM_SOURCES cuda/cuda_module.cc cuda/cuda_internal.cc cuda/cuda_access.cc) if(REALM_USE_NVTX) list(APPEND REALM_SOURCES nvtx.cc) @@ -77,15 +77,15 @@ if(TARGET CUDA::cuda_driver AND REALM_USE_CUDA) list(APPEND REALM_CUDA_SOURCES cuda/cuda_memcpy.cu) endif() -if(TARGET hip::host) +if(REALM_USE_HIP) list(APPEND REALM_SOURCES hip/hip_module.cc hip/hip_internal.cc hip/hip_access.cc) endif() -if(TARGET LLVM::LLVM) +if(REALM_USE_LLVM) list(APPEND REALM_SOURCES llvmjit/llvmjit_internal.cc llvmjit/llvmjit_module.cc) endif() -if(TARGET hdf5::hdf5) +if(REALM_USE_HDF5) list(APPEND REALM_SOURCES hdf5/hdf5_module.cc hdf5/hdf5_internal.cc hdf5/hdf5_access.cc) endif() @@ -100,11 +100,11 @@ if(REALM_USE_PREALM) list(APPEND REALM_SOURCES prealm/prealm.cc) endif() -if(TARGET Python3::Python) +if(REALM_USE_PYTHON) list(APPEND REALM_SOURCES 
python/python_module.cc python/python_source.cc) endif() -if(TARGET ucx::ucp) +if(REALM_USE_UCX) list( APPEND REALM_SOURCES @@ -119,12 +119,14 @@ if(TARGET ucx::ucp) ) endif() -if(TARGET GASNet::GASNet) - list(APPEND REALM_SOURCES gasnet1/gasnet1_module.cc gasnet1/gasnetmsg.cc) +if(REALM_USE_GASNETEX) + if(NOT REALM_ENABLE_GASNETEX_WRAPPER) + list(APPEND REALM_SOURCES gasnet1/gasnet1_module.cc gasnet1/gasnetmsg.cc) + endif() list(APPEND REALM_SOURCES gasnetex/gasnetex_module.cc gasnetex/gasnetex_internal.cc) endif() -if(TARGET MPI::MPI_CXX) +if(REALM_USE_MPI) list(APPEND REALM_SOURCES mpi/mpi_module.cc mpi/am_mpi.cc) endif() From 04df5861dd1131f969a70396772f0905329ae3b6 Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Wed, 25 Mar 2026 13:10:47 -0700 Subject: [PATCH 30/32] deppart: add pinned host pool and NVTX tracing --- src/realm/deppart/partitions.h | 25 +++++ src/realm/deppart/partitions_gpu_impl.hpp | 51 +++------ src/realm/deppart/sparsity_impl.cc | 129 ++++++++++++++++------ src/realm/deppart/sparsity_impl.h | 6 + 4 files changed, 146 insertions(+), 65 deletions(-) diff --git a/src/realm/deppart/partitions.h b/src/realm/deppart/partitions.h index 0af8ec0673..a6b3fe371f 100644 --- a/src/realm/deppart/partitions.h +++ b/src/realm/deppart/partitions.h @@ -42,6 +42,31 @@ typedef CUstream_st* CUstream; #endif +#ifdef REALM_USE_NVTX +#include "realm/nvtx.h" +#endif + +//NVTX macros to only add ranges if defined. 
+#ifdef REALM_USE_NVTX + +#include + +inline int32_t next_nvtx_payload() { + static std::atomic counter{0}; + return counter.fetch_add(1, std::memory_order_relaxed); +} + +#define NVTX_CAT2(a, b) a##b +#define NVTX_CAT(a, b) NVTX_CAT2(a, b) + +#define NVTX_DEPPART(message) \ + nvtxScopedRange NVTX_CAT(nvtx_, __LINE__)("cuda", #message, next_nvtx_payload()) + +#else + + #define NVTX_DEPPART(message) do { } while (0) + +#endif namespace Realm { diff --git a/src/realm/deppart/partitions_gpu_impl.hpp b/src/realm/deppart/partitions_gpu_impl.hpp index e293419b9a..e28195d550 100644 --- a/src/realm/deppart/partitions_gpu_impl.hpp +++ b/src/realm/deppart/partitions_gpu_impl.hpp @@ -1,9 +1,7 @@ #pragma once #include "deppart_config.h" #include "partitions.h" -#ifdef REALM_USE_NVTX -#include "realm/nvtx.h" -#endif + #include "realm/cuda/cuda_internal.h" #include "realm/deppart/partitions_gpu_kernels.hpp" #include @@ -47,44 +45,18 @@ } \ } while (0) - -//NVTX macros to only add ranges if defined. -#ifdef REALM_USE_NVTX - -#include - -inline int32_t next_nvtx_payload() { - static std::atomic counter{0}; - return counter.fetch_add(1, std::memory_order_relaxed); -} - -#define NVTX_CAT2(a, b) a##b -#define NVTX_CAT(a, b) NVTX_CAT2(a, b) - -#define NVTX_DEPPART(message) \ - nvtxScopedRange NVTX_CAT(nvtx_, __LINE__)("cuda", #message, next_nvtx_payload()) - -#else - - #define NVTX_DEPPART(message) do { } while (0) - -#endif - namespace Realm { template inline T *deppart_host_alloc(size_t count, unsigned flags = cudaHostAllocPortable) { - if(count == 0) return nullptr; - void *ptr = nullptr; - CUDA_HOST_CHECK(cudaHostAlloc(&ptr, count * sizeof(T), flags)); - return reinterpret_cast(ptr); + (void)flags; + return static_cast(deppart_pinned_host_alloc_bytes(count * sizeof(T))); } inline void deppart_host_free(void *ptr) { - if(ptr != nullptr) - CUDA_HOST_CHECK(cudaFreeHost(ptr)); + deppart_pinned_host_free(ptr); } // Used by cub::DeviceReduce to compute bad GPU approximation. 
@@ -1731,13 +1703,18 @@ namespace Realm { assert(find_memory(sysmem, Memory::SYSTEM_MEM, my_arena.location)); if (!this->exclusive) { for (auto const& elem : ctr) { + NVTX_DEPPART(cpu_finalize); size_t idx = getIndex(elem); auto mapOpj = getMap(elem); SparsityMapImpl *impl = SparsityMapImpl::lookup(mapOpj); if (d_ends_host[idx] > d_starts_host[idx]) { size_t end = d_ends_host[idx]; size_t start = d_starts_host[idx]; - Rect *h_rects = deppart_host_alloc>(end - start); + Rect * h_rects; + { + NVTX_DEPPART(rects_alloc); + h_rects = deppart_host_alloc>(end - start); + } CUDA_CHECK(cudaMemcpyAsync(h_rects, final_rects + start, (end - start) * sizeof(Rect), cudaMemcpyDeviceToHost, stream), stream); CUDA_CHECK(cudaStreamSynchronize(stream), stream); span> h_rects_span(h_rects, end - start); @@ -1753,6 +1730,7 @@ namespace Realm { //Use provided lambdas to iterate over sparsity output container (map or vector) for (auto const& elem : ctr) { + NVTX_DEPPART(gpu_finalize); size_t idx = getIndex(elem); auto mapOpj = getMap(elem); SparsityMapImpl *impl = SparsityMapImpl::lookup(mapOpj); @@ -1761,7 +1739,11 @@ namespace Realm { if (d_ends_host[idx] > d_starts_host[idx]) { size_t end = d_ends_host[idx]; size_t start = d_starts_host[idx]; - SparsityMapEntry *h_entries = deppart_host_alloc>(end - start); + SparsityMapEntry *h_entries; + { + NVTX_DEPPART(alloc_entries); + h_entries = deppart_host_alloc>(end - start); + } CUDA_CHECK(cudaMemcpyAsync(h_entries, final_entries + start, (end - start) * sizeof(SparsityMapEntry), cudaMemcpyDeviceToHost, stream), stream); Rect *approx_rects; @@ -1838,6 +1820,7 @@ namespace Realm { } } } + NVTX_DEPPART(cleanup); CUDA_CHECK(cudaStreamSynchronize(stream), stream); for (SparsityMapImpl *impl : local_finalizations) { impl->gpu_finalize(); diff --git a/src/realm/deppart/sparsity_impl.cc b/src/realm/deppart/sparsity_impl.cc index 20c655a62c..1314126d64 100644 --- a/src/realm/deppart/sparsity_impl.cc +++ b/src/realm/deppart/sparsity_impl.cc @@ -29,6 
+29,8 @@ #ifdef REALM_USE_CUDA #include #endif + + #include #include #include @@ -39,6 +41,77 @@ namespace Realm { namespace { + class DeppartPinnedHostPool { + public: + void *alloc(size_t bytes) + { + if(bytes == 0) + return nullptr; + + const size_t bucket_size = round_up(bytes); + void *ptr = nullptr; + { + std::lock_guard lock(mutex); + std::vector &bucket = free_blocks[bucket_size]; + if(!bucket.empty()) { + ptr = bucket.back(); + bucket.pop_back(); + } + } + + if(ptr == nullptr) { +#ifdef REALM_USE_CUDA + cudaError_t err = cudaHostAlloc(&ptr, bucket_size, cudaHostAllocPortable); + assert(err == cudaSuccess); +#else + ptr = std::malloc(bucket_size); + assert(ptr != nullptr); +#endif + } + + { + std::lock_guard lock(mutex); + live_blocks[ptr] = bucket_size; + } + return ptr; + } + + void release(void *ptr) + { + if(ptr == nullptr) + return; + + std::lock_guard lock(mutex); + auto it = live_blocks.find(ptr); + assert(it != live_blocks.end()); + free_blocks[it->second].push_back(ptr); + live_blocks.erase(it); + } + + private: + static size_t round_up(size_t bytes) + { + size_t rounded = 4096; + while((rounded < bytes) && (rounded < (size_t(1) << 30))) + rounded <<= 1; + if(rounded >= bytes) + return rounded; + + const size_t granularity = size_t(1) << 20; + return ((bytes + granularity - 1) / granularity) * granularity; + } + + std::mutex mutex; + std::unordered_map> free_blocks; + std::unordered_map live_blocks; + }; + + DeppartPinnedHostPool &get_deppart_pinned_host_pool(void) + { + static DeppartPinnedHostPool *pool = new DeppartPinnedHostPool(); + return *pool; + } + struct PendingOutputSparsityAllocation { std::mutex mutex; std::condition_variable cv; @@ -75,31 +148,6 @@ namespace Realm { ActiveMessageHandlerReg output_sparsity_allocation_response_reg; - template - inline T *deppart_gpu_host_alloc(size_t count) - { - if(count == 0) return nullptr; -#ifdef REALM_USE_CUDA - void *ptr = nullptr; - cudaError_t err = cudaHostAlloc(&ptr, count * sizeof(T), 
cudaHostAllocPortable); - assert(err == cudaSuccess); - return reinterpret_cast(ptr); -#else - return static_cast(std::malloc(count * sizeof(T))); -#endif - } - - inline void deppart_gpu_host_free(void *ptr) - { - if(ptr == nullptr) return; -#ifdef REALM_USE_CUDA - cudaError_t err = cudaFreeHost(ptr); - assert(err == cudaSuccess); -#else - std::free(ptr); -#endif - } - inline bool deppart_sparsity_trace_enabled(void) { static int enabled = -1; @@ -200,6 +248,16 @@ namespace Realm { pending->cv.notify_one(); } + void *deppart_pinned_host_alloc_bytes(size_t bytes) + { + return get_deppart_pinned_host_pool().alloc(bytes); + } + + void deppart_pinned_host_free(void *ptr) + { + get_deppart_pinned_host_pool().release(ptr); + } + extern Logger log_part; //////////////////////////////////////////////////////////////////////// @@ -1405,8 +1463,8 @@ bool SparsityMapPublicImpl::bvh_centroid_less(int axis, template SparsityMapImpl::~SparsityMapImpl(void) { - deppart_gpu_host_free(this->gpu_entries); - deppart_gpu_host_free(this->gpu_approx_rects); + deppart_pinned_host_free(this->gpu_entries); + deppart_pinned_host_free(this->gpu_approx_rects); } template @@ -1694,6 +1752,8 @@ SparsityMapImpl::~SparsityMapImpl(void) size_t piece_count, bool disjoint, size_t total_count) { + NVTX_DEPPART(contribute_raw_rects); + deppart_sparsity_trace("contribute_raw_rects.enter", me.id, ID(me).sparsity_creator_node(), @@ -2296,6 +2356,7 @@ SparsityMapImpl::~SparsityMapImpl(void) template void SparsityMapImpl::finalize(void) { + NVTX_DEPPART(finalize); deppart_sparsity_trace("finalize.enter", me.id, ID(me).sparsity_creator_node(), @@ -2504,6 +2565,7 @@ SparsityMapImpl::~SparsityMapImpl(void) Event trigger_approx = Event::NO_EVENT; std::vector precise_waiters_copy, approx_waiters_copy; { + NVTX_DEPPART(synchronization); AutoLock<> al(mutex); assert(!this->entries_valid.load()); @@ -2533,6 +2595,7 @@ SparsityMapImpl::~SparsityMapImpl(void) (*it)->sparsity_map_ready(this, false); 
if(!sendto_approx.empty()) { + NVTX_DEPPART(send_to_approx); for(NodeID i = 0; (i <= Network::max_node_id) && !sendto_approx.empty(); i++) if(sendto_approx.contains(i)) { bool also_precise = sendto_precise.contains(i); @@ -2544,6 +2607,7 @@ SparsityMapImpl::~SparsityMapImpl(void) } if(!sendto_precise.empty()) { + NVTX_DEPPART(sendto_precise); for(NodeID i = 0; (i <= Network::max_node_id) && !sendto_precise.empty(); i++) if(sendto_precise.contains(i)) { remote_data_reply(i, true, false); @@ -2571,7 +2635,7 @@ SparsityMapImpl::~SparsityMapImpl(void) template void SparsityMapImpl::set_gpu_entries(SparsityMapEntry *entries, size_t size) { - deppart_gpu_host_free(this->gpu_entries); + deppart_pinned_host_free(this->gpu_entries); this->gpu_entries = entries; this->entries.clear(); this->num_entries = size; @@ -2580,7 +2644,7 @@ SparsityMapImpl::~SparsityMapImpl(void) template void SparsityMapImpl::set_gpu_approx_rects(Rect *approx_rects, size_t size) { - deppart_gpu_host_free(this->gpu_approx_rects); + deppart_pinned_host_free(this->gpu_approx_rects); this->gpu_approx_rects = approx_rects; this->approx_rects.clear(); this->num_approx = size; @@ -2677,7 +2741,9 @@ SparsityMapImpl::~SparsityMapImpl(void) SparsityMapImpl *impl = SparsityMapImpl::lookup(msg.sparsity); if(msg.num_entries > 0) { - SparsityMapEntry *entries = deppart_gpu_host_alloc>(msg.num_entries); + SparsityMapEntry *entries = + static_cast *>(deppart_pinned_host_alloc_bytes( + msg.num_entries * sizeof(SparsityMapEntry))); std::memcpy(entries, payload, msg.num_entries * sizeof(SparsityMapEntry)); impl->set_gpu_entries(entries, msg.num_entries); payload += msg.num_entries * sizeof(SparsityMapEntry); @@ -2686,7 +2752,8 @@ SparsityMapImpl::~SparsityMapImpl(void) } if(msg.num_approx > 0) { - Rect *approx = deppart_gpu_host_alloc>(msg.num_approx); + Rect *approx = static_cast *>(deppart_pinned_host_alloc_bytes( + msg.num_approx * sizeof(Rect))); std::memcpy(approx, payload, msg.num_approx * sizeof(Rect)); 
impl->set_gpu_approx_rects(approx, msg.num_approx); } else { diff --git a/src/realm/deppart/sparsity_impl.h b/src/realm/deppart/sparsity_impl.h index aa94d7200f..f1a6a3756f 100644 --- a/src/realm/deppart/sparsity_impl.h +++ b/src/realm/deppart/sparsity_impl.h @@ -36,6 +36,12 @@ namespace Realm { REALM_INTERNAL_API_EXTERNAL_LINKAGE ID create_deppart_output_sparsity(NodeID target_node); + REALM_INTERNAL_API_EXTERNAL_LINKAGE + void *deppart_pinned_host_alloc_bytes(size_t bytes); + + REALM_INTERNAL_API_EXTERNAL_LINKAGE + void deppart_pinned_host_free(void *ptr); + class PartitioningMicroOp; /** From de04613843191192c26daec38093b905ea9e86e1 Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Wed, 8 Apr 2026 09:14:52 -0700 Subject: [PATCH 31/32] Revert "deppart: add pinned host pool and NVTX tracing" This reverts commit 04df5861dd1131f969a70396772f0905329ae3b6. --- src/realm/deppart/partitions.h | 25 ----- src/realm/deppart/partitions_gpu_impl.hpp | 51 ++++++--- src/realm/deppart/sparsity_impl.cc | 129 ++++++---------------- src/realm/deppart/sparsity_impl.h | 6 - 4 files changed, 65 insertions(+), 146 deletions(-) diff --git a/src/realm/deppart/partitions.h b/src/realm/deppart/partitions.h index a6b3fe371f..0af8ec0673 100644 --- a/src/realm/deppart/partitions.h +++ b/src/realm/deppart/partitions.h @@ -42,31 +42,6 @@ typedef CUstream_st* CUstream; #endif -#ifdef REALM_USE_NVTX -#include "realm/nvtx.h" -#endif - -//NVTX macros to only add ranges if defined. 
-#ifdef REALM_USE_NVTX - -#include - -inline int32_t next_nvtx_payload() { - static std::atomic counter{0}; - return counter.fetch_add(1, std::memory_order_relaxed); -} - -#define NVTX_CAT2(a, b) a##b -#define NVTX_CAT(a, b) NVTX_CAT2(a, b) - -#define NVTX_DEPPART(message) \ - nvtxScopedRange NVTX_CAT(nvtx_, __LINE__)("cuda", #message, next_nvtx_payload()) - -#else - - #define NVTX_DEPPART(message) do { } while (0) - -#endif namespace Realm { diff --git a/src/realm/deppart/partitions_gpu_impl.hpp b/src/realm/deppart/partitions_gpu_impl.hpp index e28195d550..e293419b9a 100644 --- a/src/realm/deppart/partitions_gpu_impl.hpp +++ b/src/realm/deppart/partitions_gpu_impl.hpp @@ -1,7 +1,9 @@ #pragma once #include "deppart_config.h" #include "partitions.h" - +#ifdef REALM_USE_NVTX +#include "realm/nvtx.h" +#endif #include "realm/cuda/cuda_internal.h" #include "realm/deppart/partitions_gpu_kernels.hpp" #include @@ -45,18 +47,44 @@ } \ } while (0) + +//NVTX macros to only add ranges if defined. +#ifdef REALM_USE_NVTX + +#include + +inline int32_t next_nvtx_payload() { + static std::atomic counter{0}; + return counter.fetch_add(1, std::memory_order_relaxed); +} + +#define NVTX_CAT2(a, b) a##b +#define NVTX_CAT(a, b) NVTX_CAT2(a, b) + +#define NVTX_DEPPART(message) \ + nvtxScopedRange NVTX_CAT(nvtx_, __LINE__)("cuda", #message, next_nvtx_payload()) + +#else + + #define NVTX_DEPPART(message) do { } while (0) + +#endif + namespace Realm { template inline T *deppart_host_alloc(size_t count, unsigned flags = cudaHostAllocPortable) { - (void)flags; - return static_cast(deppart_pinned_host_alloc_bytes(count * sizeof(T))); + if(count == 0) return nullptr; + void *ptr = nullptr; + CUDA_HOST_CHECK(cudaHostAlloc(&ptr, count * sizeof(T), flags)); + return reinterpret_cast(ptr); } inline void deppart_host_free(void *ptr) { - deppart_pinned_host_free(ptr); + if(ptr != nullptr) + CUDA_HOST_CHECK(cudaFreeHost(ptr)); } // Used by cub::DeviceReduce to compute bad GPU approximation. 
@@ -1703,18 +1731,13 @@ namespace Realm { assert(find_memory(sysmem, Memory::SYSTEM_MEM, my_arena.location)); if (!this->exclusive) { for (auto const& elem : ctr) { - NVTX_DEPPART(cpu_finalize); size_t idx = getIndex(elem); auto mapOpj = getMap(elem); SparsityMapImpl *impl = SparsityMapImpl::lookup(mapOpj); if (d_ends_host[idx] > d_starts_host[idx]) { size_t end = d_ends_host[idx]; size_t start = d_starts_host[idx]; - Rect * h_rects; - { - NVTX_DEPPART(rects_alloc); - h_rects = deppart_host_alloc>(end - start); - } + Rect *h_rects = deppart_host_alloc>(end - start); CUDA_CHECK(cudaMemcpyAsync(h_rects, final_rects + start, (end - start) * sizeof(Rect), cudaMemcpyDeviceToHost, stream), stream); CUDA_CHECK(cudaStreamSynchronize(stream), stream); span> h_rects_span(h_rects, end - start); @@ -1730,7 +1753,6 @@ namespace Realm { //Use provided lambdas to iterate over sparsity output container (map or vector) for (auto const& elem : ctr) { - NVTX_DEPPART(gpu_finalize); size_t idx = getIndex(elem); auto mapOpj = getMap(elem); SparsityMapImpl *impl = SparsityMapImpl::lookup(mapOpj); @@ -1739,11 +1761,7 @@ namespace Realm { if (d_ends_host[idx] > d_starts_host[idx]) { size_t end = d_ends_host[idx]; size_t start = d_starts_host[idx]; - SparsityMapEntry *h_entries; - { - NVTX_DEPPART(alloc_entries); - h_entries = deppart_host_alloc>(end - start); - } + SparsityMapEntry *h_entries = deppart_host_alloc>(end - start); CUDA_CHECK(cudaMemcpyAsync(h_entries, final_entries + start, (end - start) * sizeof(SparsityMapEntry), cudaMemcpyDeviceToHost, stream), stream); Rect *approx_rects; @@ -1820,7 +1838,6 @@ namespace Realm { } } } - NVTX_DEPPART(cleanup); CUDA_CHECK(cudaStreamSynchronize(stream), stream); for (SparsityMapImpl *impl : local_finalizations) { impl->gpu_finalize(); diff --git a/src/realm/deppart/sparsity_impl.cc b/src/realm/deppart/sparsity_impl.cc index 1314126d64..20c655a62c 100644 --- a/src/realm/deppart/sparsity_impl.cc +++ b/src/realm/deppart/sparsity_impl.cc @@ -29,8 
+29,6 @@ #ifdef REALM_USE_CUDA #include #endif - - #include #include #include @@ -41,77 +39,6 @@ namespace Realm { namespace { - class DeppartPinnedHostPool { - public: - void *alloc(size_t bytes) - { - if(bytes == 0) - return nullptr; - - const size_t bucket_size = round_up(bytes); - void *ptr = nullptr; - { - std::lock_guard lock(mutex); - std::vector &bucket = free_blocks[bucket_size]; - if(!bucket.empty()) { - ptr = bucket.back(); - bucket.pop_back(); - } - } - - if(ptr == nullptr) { -#ifdef REALM_USE_CUDA - cudaError_t err = cudaHostAlloc(&ptr, bucket_size, cudaHostAllocPortable); - assert(err == cudaSuccess); -#else - ptr = std::malloc(bucket_size); - assert(ptr != nullptr); -#endif - } - - { - std::lock_guard lock(mutex); - live_blocks[ptr] = bucket_size; - } - return ptr; - } - - void release(void *ptr) - { - if(ptr == nullptr) - return; - - std::lock_guard lock(mutex); - auto it = live_blocks.find(ptr); - assert(it != live_blocks.end()); - free_blocks[it->second].push_back(ptr); - live_blocks.erase(it); - } - - private: - static size_t round_up(size_t bytes) - { - size_t rounded = 4096; - while((rounded < bytes) && (rounded < (size_t(1) << 30))) - rounded <<= 1; - if(rounded >= bytes) - return rounded; - - const size_t granularity = size_t(1) << 20; - return ((bytes + granularity - 1) / granularity) * granularity; - } - - std::mutex mutex; - std::unordered_map> free_blocks; - std::unordered_map live_blocks; - }; - - DeppartPinnedHostPool &get_deppart_pinned_host_pool(void) - { - static DeppartPinnedHostPool *pool = new DeppartPinnedHostPool(); - return *pool; - } - struct PendingOutputSparsityAllocation { std::mutex mutex; std::condition_variable cv; @@ -148,6 +75,31 @@ namespace Realm { ActiveMessageHandlerReg output_sparsity_allocation_response_reg; + template + inline T *deppart_gpu_host_alloc(size_t count) + { + if(count == 0) return nullptr; +#ifdef REALM_USE_CUDA + void *ptr = nullptr; + cudaError_t err = cudaHostAlloc(&ptr, count * sizeof(T), 
cudaHostAllocPortable); + assert(err == cudaSuccess); + return reinterpret_cast(ptr); +#else + return static_cast(std::malloc(count * sizeof(T))); +#endif + } + + inline void deppart_gpu_host_free(void *ptr) + { + if(ptr == nullptr) return; +#ifdef REALM_USE_CUDA + cudaError_t err = cudaFreeHost(ptr); + assert(err == cudaSuccess); +#else + std::free(ptr); +#endif + } + inline bool deppart_sparsity_trace_enabled(void) { static int enabled = -1; @@ -248,16 +200,6 @@ namespace Realm { pending->cv.notify_one(); } - void *deppart_pinned_host_alloc_bytes(size_t bytes) - { - return get_deppart_pinned_host_pool().alloc(bytes); - } - - void deppart_pinned_host_free(void *ptr) - { - get_deppart_pinned_host_pool().release(ptr); - } - extern Logger log_part; //////////////////////////////////////////////////////////////////////// @@ -1463,8 +1405,8 @@ bool SparsityMapPublicImpl::bvh_centroid_less(int axis, template SparsityMapImpl::~SparsityMapImpl(void) { - deppart_pinned_host_free(this->gpu_entries); - deppart_pinned_host_free(this->gpu_approx_rects); + deppart_gpu_host_free(this->gpu_entries); + deppart_gpu_host_free(this->gpu_approx_rects); } template @@ -1752,8 +1694,6 @@ SparsityMapImpl::~SparsityMapImpl(void) size_t piece_count, bool disjoint, size_t total_count) { - NVTX_DEPPART(contribute_raw_rects); - deppart_sparsity_trace("contribute_raw_rects.enter", me.id, ID(me).sparsity_creator_node(), @@ -2356,7 +2296,6 @@ SparsityMapImpl::~SparsityMapImpl(void) template void SparsityMapImpl::finalize(void) { - NVTX_DEPPART(finalize); deppart_sparsity_trace("finalize.enter", me.id, ID(me).sparsity_creator_node(), @@ -2565,7 +2504,6 @@ SparsityMapImpl::~SparsityMapImpl(void) Event trigger_approx = Event::NO_EVENT; std::vector precise_waiters_copy, approx_waiters_copy; { - NVTX_DEPPART(synchronization); AutoLock<> al(mutex); assert(!this->entries_valid.load()); @@ -2595,7 +2533,6 @@ SparsityMapImpl::~SparsityMapImpl(void) (*it)->sparsity_map_ready(this, false); 
if(!sendto_approx.empty()) { - NVTX_DEPPART(send_to_approx); for(NodeID i = 0; (i <= Network::max_node_id) && !sendto_approx.empty(); i++) if(sendto_approx.contains(i)) { bool also_precise = sendto_precise.contains(i); @@ -2607,7 +2544,6 @@ SparsityMapImpl::~SparsityMapImpl(void) } if(!sendto_precise.empty()) { - NVTX_DEPPART(sendto_precise); for(NodeID i = 0; (i <= Network::max_node_id) && !sendto_precise.empty(); i++) if(sendto_precise.contains(i)) { remote_data_reply(i, true, false); @@ -2635,7 +2571,7 @@ SparsityMapImpl::~SparsityMapImpl(void) template void SparsityMapImpl::set_gpu_entries(SparsityMapEntry *entries, size_t size) { - deppart_pinned_host_free(this->gpu_entries); + deppart_gpu_host_free(this->gpu_entries); this->gpu_entries = entries; this->entries.clear(); this->num_entries = size; @@ -2644,7 +2580,7 @@ SparsityMapImpl::~SparsityMapImpl(void) template void SparsityMapImpl::set_gpu_approx_rects(Rect *approx_rects, size_t size) { - deppart_pinned_host_free(this->gpu_approx_rects); + deppart_gpu_host_free(this->gpu_approx_rects); this->gpu_approx_rects = approx_rects; this->approx_rects.clear(); this->num_approx = size; @@ -2741,9 +2677,7 @@ SparsityMapImpl::~SparsityMapImpl(void) SparsityMapImpl *impl = SparsityMapImpl::lookup(msg.sparsity); if(msg.num_entries > 0) { - SparsityMapEntry *entries = - static_cast *>(deppart_pinned_host_alloc_bytes( - msg.num_entries * sizeof(SparsityMapEntry))); + SparsityMapEntry *entries = deppart_gpu_host_alloc>(msg.num_entries); std::memcpy(entries, payload, msg.num_entries * sizeof(SparsityMapEntry)); impl->set_gpu_entries(entries, msg.num_entries); payload += msg.num_entries * sizeof(SparsityMapEntry); @@ -2752,8 +2686,7 @@ SparsityMapImpl::~SparsityMapImpl(void) } if(msg.num_approx > 0) { - Rect *approx = static_cast *>(deppart_pinned_host_alloc_bytes( - msg.num_approx * sizeof(Rect))); + Rect *approx = deppart_gpu_host_alloc>(msg.num_approx); std::memcpy(approx, payload, msg.num_approx * sizeof(Rect)); 
impl->set_gpu_approx_rects(approx, msg.num_approx); } else { diff --git a/src/realm/deppart/sparsity_impl.h b/src/realm/deppart/sparsity_impl.h index f1a6a3756f..aa94d7200f 100644 --- a/src/realm/deppart/sparsity_impl.h +++ b/src/realm/deppart/sparsity_impl.h @@ -36,12 +36,6 @@ namespace Realm { REALM_INTERNAL_API_EXTERNAL_LINKAGE ID create_deppart_output_sparsity(NodeID target_node); - REALM_INTERNAL_API_EXTERNAL_LINKAGE - void *deppart_pinned_host_alloc_bytes(size_t bytes); - - REALM_INTERNAL_API_EXTERNAL_LINKAGE - void deppart_pinned_host_free(void *ptr); - class PartitioningMicroOp; /** From be8544f845b9f025460dac4ec514433e29ec636d Mon Sep 17 00:00:00 2001 From: Rohan Chanani Date: Wed, 8 Apr 2026 09:18:44 -0700 Subject: [PATCH 32/32] added .codex to gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index ae19316444..78cce53e0c 100644 --- a/.gitignore +++ b/.gitignore @@ -21,6 +21,7 @@ install/ .idea/ .vscode/ .cursor/ +.codex # clangd LSP cache .cache/