diff --git a/.gitignore b/.gitignore index ae19316444..78cce53e0c 100644 --- a/.gitignore +++ b/.gitignore @@ -21,6 +21,7 @@ install/ .idea/ .vscode/ .cursor/ +.codex # clangd LSP cache .cache/ diff --git a/cmake/deppart_tmpl.cu.in b/cmake/deppart_tmpl.cu.in new file mode 100644 index 0000000000..01978e21ac --- /dev/null +++ b/cmake/deppart_tmpl.cu.in @@ -0,0 +1,20 @@ +/* + * Copyright 2025 Stanford University, NVIDIA Corporation + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#cmakedefine INST_N1 @INST_N1@ +#cmakedefine INST_N2 @INST_N2@ +#include "@SRCFILE@_gpu_tmpl.cu" \ No newline at end of file diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 7054eb2e94..df1132bbdb 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -38,7 +38,6 @@ set(REALM_SOURCES nodeset.cc operation.cc proc_impl.cc - realm_assert.cc repl_heap.cc rsrv_impl.cc runtime_impl.cc @@ -64,6 +63,7 @@ set(REALM_SOURCES deppart/partitions.cc deppart/setops.cc deppart/sparsity_impl.cc + deppart/untemplated_gpu_kernels.cu numa/numa_module.cc numa/numasysif.cc procset/procset_module.cc @@ -120,7 +120,7 @@ if(REALM_USE_UCX) endif() if(REALM_USE_GASNETEX) - if (NOT REALM_ENABLE_GASNETEX_WRAPPER) + if(NOT REALM_ENABLE_GASNETEX_WRAPPER) list(APPEND REALM_SOURCES gasnet1/gasnet1_module.cc gasnet1/gasnetmsg.cc) endif() list(APPEND REALM_SOURCES gasnetex/gasnetex_module.cc gasnetex/gasnetex_internal.cc) @@ -145,7 +145,7 @@ configure_file( @ONLY ) -# generate per-dimension object files for deppart stuff +# Generate per-dimension object files for CPU deppart. foreach(INST_N1 RANGE 1 ${REALM_MAX_DIM}) foreach(INST_N2 RANGE 1 ${REALM_MAX_DIM}) foreach(SRCFILE realm/deppart/image realm/deppart/preimage realm/deppart/byfield) @@ -157,6 +157,18 @@ foreach(INST_N1 RANGE 1 ${REALM_MAX_DIM}) endforeach() endforeach() +# Generate per-dimension object files for GPU deppart. 
+foreach(INST_N1 RANGE 1 ${REALM_MAX_DIM}) + foreach(INST_N2 RANGE 1 ${REALM_MAX_DIM}) + foreach(SRCFILE realm/deppart/byfield realm/deppart/image realm/deppart/preimage) + set(_result_file "${CMAKE_CURRENT_BINARY_DIR}/${SRCFILE}_gpu_${INST_N1}_${INST_N2}.cu") + # use cmake's configure_file for a portable way of creating wrapper source files + configure_file("${PROJECT_SOURCE_DIR}/cmake/deppart_tmpl.cu.in" "${_result_file}") + list(APPEND REALM_SOURCES "${_result_file}") + endforeach() + endforeach() +endforeach() + set(REALM_SOURCES ${REALM_SOURCES} PARENT_SCOPE diff --git a/src/realm/cuda/cuda_internal.h b/src/realm/cuda/cuda_internal.h index 614710bfe1..13d127c12b 100644 --- a/src/realm/cuda/cuda_internal.h +++ b/src/realm/cuda/cuda_internal.h @@ -412,6 +412,7 @@ namespace Realm { get_null_task_stream(void) const; // needed by librealm_kokkos.so GPUStream *get_next_task_stream(bool create = false); GPUStream *get_next_d2d_stream(); + GPUStream *get_deppart_stream() const; void launch_batch_affine_fill_kernel(void *fill_info, size_t dim, size_t elemSize, size_t volume, GPUStream *stream); @@ -489,6 +490,8 @@ namespace Realm { GPUStream *host_to_device_stream = nullptr; GPUStream *device_to_host_stream = nullptr; GPUStream *device_to_device_stream = nullptr; + GPUStream *deppart_stream = nullptr; + std::vector device_to_device_streams; std::vector peer_to_peer_streams; // indexed by target std::vector task_streams; diff --git a/src/realm/cuda/cuda_module.cc b/src/realm/cuda/cuda_module.cc index 0147bc2b0d..ce84eb5704 100644 --- a/src/realm/cuda/cuda_module.cc +++ b/src/realm/cuda/cuda_module.cc @@ -1058,6 +1058,11 @@ namespace Realm { return device_to_device_streams[d2d_stream_index]; } + GPUStream *GPU::get_deppart_stream() const + { + return deppart_stream; + } + static void launch_kernel(const Realm::Cuda::GPU::GPUFuncInfo &func_info, void *params, size_t num_elems, GPUStream *stream) { @@ -2040,6 +2045,7 @@ namespace Realm { host_to_device_stream = new 
GPUStream(this, worker); device_to_host_stream = new GPUStream(this, worker); + deppart_stream = new GPUStream(this, worker); CUdevice dev; int numSMs; @@ -2164,6 +2170,7 @@ // destroy streams delete host_to_device_stream; delete device_to_host_stream; + delete deppart_stream; delete_container_contents(device_to_device_streams); diff --git a/src/realm/deppart/byfield.cc b/src/realm/deppart/byfield.cc index cc6a0d6cc4..ed65533555 100644 --- a/src/realm/deppart/byfield.cc +++ b/src/realm/deppart/byfield.cc @@ -23,12 +23,44 @@ #include "realm/deppart/rectlist.h" #include "realm/deppart/inst_helper.h" #include "realm/logging.h" +#include "realm/cuda/cuda_internal.h" namespace Realm { extern Logger log_part; extern Logger log_uop_timing; + template + void IndexSpace::by_field_buffer_requirements( + const std::vector>& inputs, + std::vector& requirements) const { + requirements = std::vector(inputs.size()); + for (size_t i = 0; i < inputs.size(); i++) { + IndexSpace is = inputs[i].space; + Memory mem = inputs[i].location; + if (mem.kind() == Memory::GPU_FB_MEM || + mem.kind() == Memory::Z_COPY_MEM) { + const char* val = std::getenv("MIN_SIZE"); // optional override of the device scratch-buffer floor + size_t device_size = 2000000; // default floor: ~2 MB + if (val) { + device_size = atoi(val); + } + size_t optimal_size = is.bounds.volume() * 20 * sizeof(RectDesc); + Processor best_proc = Processor::NO_PROC; + bool found_proc = choose_proc(best_proc, mem); assert(found_proc); (void)found_proc; + requirements[i].affinity_processor = best_proc; + requirements[i].lower_bound = device_size; + requirements[i].upper_bound = std::max(device_size, optimal_size); + requirements[i].minimum_alignment = 128; + } else { + requirements[i].affinity_processor = Processor::NO_PROC; + requirements[i].lower_bound = 0; + requirements[i].upper_bound = 0; + requirements[i].minimum_alignment = 0; + } + } + } + + template + template @@ -277,8 +309,128 @@ (void)ok; } + template + ActiveMessageHandlerReg > > ByFieldMicroOp::areg; + + +#ifdef REALM_USE_CUDA + 
//////////////////////////////////////////////////////////////////////// + // + // class GPUByFieldMicroOp + + template + GPUByFieldMicroOp::GPUByFieldMicroOp( + const IndexSpace &_parent, + std::vector, FT> > _field_data, + bool _exclusive) + : parent_space(_parent), field_data(_field_data) { + this->exclusive = _exclusive; + areg.force_instantiation(); + // GPU setup (this->gpu, this->stream) deferred to execute(), which runs on the + // correct node after dispatch() has forwarded to the instance owner if needed. + } + + template + template + GPUByFieldMicroOp::GPUByFieldMicroOp( + NodeID _requestor, AsyncMicroOp *_async_microop, S& s) + : GPUMicroOp(_requestor, _async_microop) + , parent_space() { + bool ok = true; + size_t n = 0; + ok = ok && (s >> parent_space); + ok = ok && (s >> this->exclusive); + ok = ok && (s >> n); + field_data.resize(n); + for(size_t i = 0; i < n && ok; i++) + ok = ok && (s >> field_data[i].index_space) && + (s >> field_data[i].inst) && + (s >> field_data[i].field_offset) && + (s >> field_data[i].scratch_buffer); + // Deserialize colors manually to avoid std::vector proxy issues + size_t nc = 0; + ok = ok && (s >> nc); + for(size_t i = 0; i < nc && ok; i++) { + FT c; + ok = ok && (s >> c); + if(ok) colors.push_back(c); + } + ok = ok && (s >> sparsity_outputs); + assert(ok); + (void)ok; + } + + template + template + bool GPUByFieldMicroOp::serialize_params(S& s) const { + bool ok = true; + ok = ok && (s << parent_space); + ok = ok && (s << this->exclusive); + ok = ok && (s << field_data.size()); + for(size_t i = 0; i < field_data.size() && ok; i++) + ok = ok && (s << field_data[i].index_space) && + (s << field_data[i].inst) && + (s << field_data[i].field_offset) && + (s << field_data[i].scratch_buffer); + // Serialize colors manually to avoid std::vector proxy issues + ok = ok && (s << colors.size()); + for(size_t i = 0; i < colors.size() && ok; i++) { + FT c = colors[i]; + ok = ok && (s << c); + } + ok = ok && (s << sparsity_outputs); + 
return ok; + } + + template + GPUByFieldMicroOp::~GPUByFieldMicroOp() { + } + + template + void GPUByFieldMicroOp::dispatch( + PartitioningOperation *op, bool inline_ok) { + + // GPU by-field must execute on the node that owns the GPU memory + NodeID exec_node = ID(field_data[0].inst).instance_owner_node(); + if(this->exclusive) { + for(const auto& it : sparsity_outputs) + assert(NodeID(ID(it.second).sparsity_creator_node()) == exec_node); + } + if(exec_node != Network::my_node_id) { + PartitioningMicroOp::template forward_microop >(exec_node, op, this); + return; + } + + // We have to register ourselves as a waiter on sparse inputs before dispatching. + + for (size_t i = 0; i < field_data.size(); i++) { + IndexSpace inst_space = field_data[i].index_space; + if (!inst_space.dense()) { + bool registered = SparsityMapImpl::lookup(inst_space.sparsity)->add_waiter(this, true /*precise*/); + if (registered) + this->wait_count.fetch_add(1); + } + } + + if (!parent_space.dense()) { + bool registered = SparsityMapImpl::lookup(parent_space.sparsity)->add_waiter(this, true /*precise*/); + if (registered) this->wait_count.fetch_add(1); + } + this->finish_dispatch(op, inline_ok); + } + + template + void GPUByFieldMicroOp::add_sparsity_output( + FT _val, SparsityMap _sparsity) { + colors.push_back(_val); + sparsity_outputs[_val] = _sparsity; + } + template - ActiveMessageHandlerReg > > ByFieldMicroOp::areg; + ActiveMessageHandlerReg > > + GPUByFieldMicroOp::areg; + +#endif //////////////////////////////////////////////////////////////////////// @@ -294,12 +446,26 @@ namespace Realm { : PartitioningOperation(reqs, _finish_event, _finish_gen) , parent(_parent) , field_data(_field_data) + , exclusive_gpu_owner(exclusive_gpu_exec_node()) {} template ByFieldOperation::~ByFieldOperation(void) {} + template + NodeID ByFieldOperation::exclusive_gpu_exec_node(void) const + { + if(field_data.size() != 1) + return -1; + + Memory::Kind kind = field_data[0].inst.get_location().kind(); + 
if((kind != Memory::GPU_FB_MEM) && (kind != Memory::Z_COPY_MEM)) + return -1; + + return ID(field_data[0].inst).instance_owner_node(); + } + template IndexSpace ByFieldOperation::add_color(FT color) { @@ -312,8 +478,13 @@ namespace Realm { subspace.bounds = parent.bounds; // get a sparsity ID by round-robin'ing across the nodes that have field data - int target_node = ID(field_data[colors.size() % field_data.size()].inst).instance_owner_node(); - SparsityMap sparsity = get_runtime()->get_available_sparsity_impl(target_node)->me.convert >(); + int target_node = (exclusive_gpu_owner >= 0) ? + exclusive_gpu_owner : + ID(field_data[colors.size() % field_data.size()].inst).instance_owner_node(); + if(exclusive_gpu_owner >= 0) + assert(target_node == exclusive_gpu_exec_node()); + SparsityMap sparsity = + create_deppart_output_sparsity(target_node).convert>(); subspace.sparsity = sparsity; colors.push_back(color); @@ -322,22 +493,52 @@ namespace Realm { return subspace; } - template - void ByFieldOperation::execute(void) - { - for(size_t i = 0; i < subspaces.size(); i++) - SparsityMapImpl::lookup(subspaces[i])->set_contributor_count(field_data.size()); - - for(size_t i = 0; i < field_data.size(); i++) { - ByFieldMicroOp *uop = new ByFieldMicroOp(parent, - field_data[i].index_space, - field_data[i].inst, - field_data[i].field_offset); - for(size_t j = 0; j < colors.size(); j++) - uop->add_sparsity_output(colors[j], subspaces[j]); - //uop.set_value_set(colors); + template + void ByFieldOperation::execute(void) { + + + // If the field data is on the GPU, we need to launch a GPUByFieldMicroOp. + // Rather than one micro-op per field, we can do them all in one micro-op. + // Launching multiple GPU micro-ops just adds overhead, and + // there isn't enough work to need multiple GPUs. 
+ std::vector,FT> > gpu_field_data; + std::vector,FT> > cpu_field_data; + for (size_t i = 0; i < field_data.size(); i++) { + if (field_data[i].inst.get_location().kind() == Memory::GPU_FB_MEM + || field_data[i].inst.get_location().kind() == Memory::Z_COPY_MEM) { + gpu_field_data.push_back(field_data[i]); + } else { + cpu_field_data.push_back(field_data[i]); + } + } + bool exclusive = (gpu_field_data.size() == 1) && cpu_field_data.empty(); + if (!exclusive) { + for (size_t i = 0; i < subspaces.size(); i++) + SparsityMapImpl::lookup(subspaces[i])->set_contributor_count(cpu_field_data.size() + gpu_field_data.size()); + } + for (size_t i = 0; i < cpu_field_data.size(); i++) { + ByFieldMicroOp *uop = new ByFieldMicroOp(parent, + cpu_field_data[i].index_space, + cpu_field_data[i].inst, + cpu_field_data[i].field_offset); + for (size_t j = 0; j < colors.size(); j++) + uop->add_sparsity_output(colors[j], subspaces[j]); + uop->dispatch(this, true /* ok to run in this thread */); } +#ifdef REALM_USE_CUDA + for (auto fdd : gpu_field_data) { + assert(fdd.scratch_buffer != RegionInstance::NO_INST); + std::vector,FT> > single_gpu_field_data = {fdd}; + GPUByFieldMicroOp *uop = new GPUByFieldMicroOp(parent, single_gpu_field_data, exclusive); + for (size_t i = 0; i < colors.size(); i++) { + uop->add_sparsity_output(colors[i], subspaces[i]); + } + uop->dispatch(this, false); + } +#else + assert(gpu_field_data.empty()); +#endif } template @@ -345,20 +546,4 @@ namespace Realm { { os << "ByFieldOperation(" << parent << ")"; } - -#define DOIT(N,T,F) \ - template class ByFieldMicroOp; \ - template class ByFieldOperation; \ - template ByFieldMicroOp::ByFieldMicroOp(NodeID, AsyncMicroOp *, Serialization::FixedBufferDeserializer&); \ - template Event IndexSpace::create_subspaces_by_field(const std::vector,F> >&, \ - const std::vector&, \ - std::vector >&, \ - const ProfilingRequestSet &, \ - Event) const; -#ifndef REALM_TEMPLATES_ONLY - FOREACH_NTF(DOIT) -#endif - - // instantiations of 
point/rect-field templates handled in byfield_tmpl.cc - }; diff --git a/src/realm/deppart/byfield.h b/src/realm/deppart/byfield.h index 1ff62b415e..35b823552f 100644 --- a/src/realm/deppart/byfield.h +++ b/src/realm/deppart/byfield.h @@ -21,6 +21,7 @@ #define REALM_DEPPART_BYFIELD_H #include "realm/deppart/partitions.h" +#include "realm/deppart/rectlist.h" namespace Realm { @@ -67,6 +68,48 @@ namespace Realm { std::map > sparsity_outputs; }; +#ifdef REALM_USE_CUDA + + template + class GPUByFieldMicroOp : public GPUMicroOp { + public: + static const int DIM = N; + typedef T IDXTYPE; + typedef FT FIELDTYPE; + + GPUByFieldMicroOp( + const IndexSpace &_parent, + std::vector,FT> > _field_data, + bool _exclusive); + + virtual ~GPUByFieldMicroOp(void); + + virtual void execute(void); + + void dispatch(PartitioningOperation *op, bool inline_ok); + + void add_sparsity_output(FT _val, SparsityMap _sparsity); + + protected: + friend struct RemoteMicroOpMessage >; + static ActiveMessageHandlerReg > > areg; + + friend class PartitioningMicroOp; + template + REALM_ATTR_WARN_UNUSED(bool serialize_params(S& s) const); + + // construct from received packet + template + GPUByFieldMicroOp(NodeID _requestor, AsyncMicroOp *_async_microop, S& s); + + IndexSpace parent_space; + std::vector,FT> > field_data; + std::vector colors; + std::map > sparsity_outputs; + }; + +#endif + template class ByFieldOperation : public PartitioningOperation { public: @@ -84,10 +127,13 @@ namespace Realm { virtual void print(std::ostream& os) const; protected: + NodeID exclusive_gpu_exec_node(void) const; + IndexSpace parent; std::vector,FT> > field_data; std::vector colors; std::vector > subspaces; + int exclusive_gpu_owner; }; }; diff --git a/src/realm/deppart/byfield_gpu_impl.hpp b/src/realm/deppart/byfield_gpu_impl.hpp new file mode 100644 index 0000000000..bf25f81f03 --- /dev/null +++ b/src/realm/deppart/byfield_gpu_impl.hpp @@ -0,0 +1,287 @@ +#pragma once +#include "realm/deppart/byfield.h" +#include 
"realm/deppart/byfield_gpu_kernels.hpp" +#include "realm/deppart/partitions_gpu_impl.hpp" +#include +#include "realm/nvtx.h" + +namespace Realm { + +/* + * Input (stored in MicroOp): Array of field instances, a parent index space, and a list of colors + * Output: A list of (potentially overlapping) points in original instances ∩ parent index space marked with their color, + * which it then sends off to complete_pipeline. + * Approach: Intersect all instance rectangles with parent rectangles in parallel. For surviving rectangles, use + * prefix sum + binary search to iterate over these in parallel and mark each point with its color. + */ +template +void GPUByFieldMicroOp::execute() +{ + // Resolve the local GPU processor now that we are guaranteed to be on the + // correct node (dispatch() forwarded us here if the instance was remote). + { + Memory my_mem = field_data[0].inst.get_location(); + Processor best_proc = Processor::NO_PROC; + bool found_proc = choose_proc(best_proc, my_mem); assert(found_proc); (void)found_proc; + Cuda::GPUProcessor *gpu_proc = + dynamic_cast(get_runtime()->get_processor_impl(best_proc)); + assert(gpu_proc); + this->gpu = gpu_proc->gpu; + this->stream = gpu_proc->gpu->get_deppart_stream(); + } + + + + Cuda::AutoGPUContext agc(this->gpu); + + // For profiling. + NVTX_DEPPART(byfield_gpu); + + CUstream stream = this->stream->get_stream(); + + collapsed_space inst_space; + + size_t tile_size = field_data[0].scratch_buffer.get_layout()->bytes_used; + + //std::cout << "Using tile size of " << tile_size << " bytes."
<< std::endl; + + Arena buffer_arena(field_data[0].scratch_buffer); + + inst_space.offsets = buffer_arena.alloc(field_data.size() + 1); + inst_space.num_children = field_data.size(); + + Arena sys_arena; + GPUMicroOp::collapse_multi_space(field_data, inst_space, sys_arena, stream); + + collapsed_space collapsed_parent; + collapsed_parent.offsets = buffer_arena.alloc(2); + collapsed_parent.num_children = 1; + std::vector> parent_spaces = {parent_space}; + + // We collapse the parent space to undifferentiate between dense and sparse and match downstream APIs. + GPUMicroOp::collapse_multi_space(parent_spaces, collapsed_parent, buffer_arena, stream); + + // This is used for count + emit: first pass counts how many rectangles survive intersection, second pass uses the counter + // to figure out where to write each rectangle. + uint32_t* d_inst_counters = buffer_arena.alloc(2*field_data.size() + 1); + + // This will be a prefix sum over the counters, used first to figure out where to write in the emit phase, and second + // to track which instance each rectangle came from in the populate phase. + uint32_t* d_inst_prefix = d_inst_counters + field_data.size(); + size_t num_valid_rects = 0; + Rect* d_valid_rects; + + FT* d_colors; + + + // Memcpying a boolean vector breaks things for some reason so we have this disgusting workaround. + if constexpr(std::is_same_v) { + std::vector flat_colors(colors.size()); + for (size_t i = 0; i < colors.size(); i++) { + flat_colors[i] = colors[i] ? 
1 : 0; + } + uint8_t* d_flat_colors = buffer_arena.alloc(colors.size()); + CUDA_CHECK(cudaMemcpyAsync(d_flat_colors, flat_colors.data(), colors.size() * sizeof(uint8_t), cudaMemcpyHostToDevice, stream), stream); + d_colors = reinterpret_cast(d_flat_colors); + } else { + d_colors = buffer_arena.alloc(colors.size()); + CUDA_CHECK(cudaMemcpyAsync(d_colors, colors.data(), colors.size() * sizeof(FT), cudaMemcpyHostToDevice, stream), stream); + } + + + std::vector> h_accessors(field_data.size()); + for (size_t i = 0; i < field_data.size(); ++i) { + h_accessors[i] = AffineAccessor(field_data[i].inst, field_data[i].field_offset); + } + AffineAccessor* d_accessors = buffer_arena.alloc>(field_data.size()); + CUDA_CHECK(cudaMemcpyAsync(d_accessors, h_accessors.data(), + field_data.size() * sizeof(AffineAccessor), + cudaMemcpyHostToDevice, stream), stream); + + buffer_arena.commit(false); + + // Map colors to their output index to match send output iterator. + std::map color_indices; + for (size_t i = 0; i < colors.size(); i++) { + color_indices[colors[i]] = i; + } + + Memory sysmem; + assert(find_memory(sysmem, Memory::SYSTEM_MEM, buffer_arena.location)); + + size_t num_output = 0; + RectDesc* output_start = nullptr; + size_t num_completed = 0; + size_t curr_tile = tile_size / 2; + int count = 0; + if (count) {} + bool host_fallback = false; + std::vector*> host_rect_buffers(colors.size(), nullptr); + std::vector entry_counts(colors.size(), 0); + while (num_completed < inst_space.num_entries) { + try { + //std::cout << "Byfield iteration " << count++ << ", completed " << num_completed << " / " << inst_space.num_entries << " entries." 
<< std::endl; + buffer_arena.start(); + if (num_completed + curr_tile > inst_space.num_entries) { + curr_tile = inst_space.num_entries - num_completed; + } + + collapsed_space inst_space_tile = inst_space; + inst_space_tile.num_entries = curr_tile; + inst_space_tile.entries_buffer = buffer_arena.alloc>(curr_tile); + CUDA_CHECK(cudaMemcpyAsync(inst_space_tile.entries_buffer, inst_space.entries_buffer + num_completed, curr_tile * sizeof(SparsityMapEntry), cudaMemcpyHostToDevice, stream), stream); + + // Here we intersect the instance spaces with the parent space, and make sure we know which instance each resulting rectangle came from. + GPUMicroOp::template construct_input_rectlist>(inst_space_tile, collapsed_parent, d_valid_rects, num_valid_rects, d_inst_counters, d_inst_prefix, buffer_arena, stream); + + + // Early out if we don't have any rectangles. + if (num_valid_rects == 0) { + num_completed += curr_tile; + curr_tile = tile_size / 2; + subtract_const<<>>(inst_space.offsets, field_data.size()+1, curr_tile); + KERNEL_CHECK(stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + continue; + } + + + // Prefix sum the valid rectangles by volume. + size_t total_pts; + size_t* d_prefix_rects; + + GPUMicroOp::volume_prefix_sum(d_valid_rects, num_valid_rects, d_prefix_rects, total_pts, buffer_arena, stream); + + // Now we have everything we need to actually populate our outputs. + buffer_arena.flip_parity(); + assert(!buffer_arena.get_parity()); + + PointDesc* d_points = buffer_arena.alloc>(total_pts); + + // This is where the work is actually done - each thread figures out which points to read, reads it, marks a PointDesc with its color, and writes it out. + byfield_gpuPopulateBitmasksKernel<<>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, d_colors, total_pts, colors.size(), num_valid_rects, field_data.size(), d_points); + KERNEL_CHECK(stream); + + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + // Ship off the points for final processing. 
+ size_t num_new_rects = (num_output == 0) ? 1 : 2; + assert(!buffer_arena.get_parity()); + RectDesc* d_new_rects; + this->complete_pipeline(d_points, total_pts, d_new_rects, num_new_rects, buffer_arena, + /* the Container: */ sparsity_outputs, + /* getIndex: */ [&](auto const& kv){ + // elem is a SparsityMap from the vector + return color_indices.at(kv.first); + }, + /* getMap: */ [&](auto const& kv){ + // return the SparsityMap key itself + return kv.second; + }); + + if (host_fallback) { + this->split_output(d_new_rects, num_new_rects, host_rect_buffers, entry_counts, buffer_arena); + } + + if (num_output==0 || host_fallback) { + num_output = num_new_rects; + output_start = d_new_rects; + num_completed += curr_tile; + subtract_const<<>>(inst_space.offsets, field_data.size()+1, curr_tile); + KERNEL_CHECK(stream); + curr_tile = tile_size / 2; + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + continue; + } + + //Otherwise we merge with existing rectangles + RectDesc* d_old_rects = buffer_arena.alloc>(num_output); + assert(d_old_rects == d_new_rects + num_new_rects); + CUDA_CHECK(cudaMemcpyAsync(d_old_rects, output_start, num_output * sizeof(RectDesc), cudaMemcpyDeviceToDevice, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + size_t num_final_rects = 1; + //Send it off for processing + this->complete_rect_pipeline(d_new_rects, num_output + num_new_rects, output_start, num_final_rects, buffer_arena, + /* the Container: */ sparsity_outputs, + /* getIndex: */ [&](auto const& kv){ + // elem is a SparsityMap from the vector + return color_indices.at(kv.first); + }, + /* getMap: */ [&](auto const& kv){ + // return the SparsityMap key itself + return kv.second; + }); + num_completed += curr_tile; + num_output = num_final_rects; + subtract_const<<>>(inst_space.offsets, field_data.size()+1, curr_tile); + KERNEL_CHECK(stream); + curr_tile = tile_size / 2; + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + } catch (arena_oom&) { + 
//std::cout << "Caught arena_oom, reducing tile size from " << curr_tile << " to " << curr_tile / 2 << std::endl; + curr_tile /= 2; + if (curr_tile == 0) { + if (host_fallback) { + GPUMicroOp::shatter_rects(inst_space, num_completed, stream); + curr_tile = 1; + } else { + host_fallback = true; + if (num_output > 0) { + this->split_output(output_start, num_output, host_rect_buffers, entry_counts, buffer_arena); + } + curr_tile = tile_size / 2; + } + } + } + } + + if (num_output == 0) { + for (std::pair> it : sparsity_outputs) { + SparsityMapImpl *impl = SparsityMapImpl::lookup(it.second); + if (this->exclusive) { + impl->gpu_finalize(); + } else { + impl->contribute_nothing(); + } + } + return; + } + + if (!host_fallback) { + try { + this->send_output(output_start, num_output, buffer_arena, sparsity_outputs, + /* getIndex: */ [&](auto const& kv){ + // elem is a SparsityMap from the vector + return color_indices.at(kv.first); + }, + /* getMap: */ [&](auto const& kv){ + // return the SparsityMap key itself + return kv.second; + }); + } catch (arena_oom&) { + this->split_output(output_start, num_output, host_rect_buffers, entry_counts, buffer_arena); + host_fallback = true; + } + } + + if (host_fallback) { + for (std::pair> it : sparsity_outputs) { + SparsityMapImpl *impl = SparsityMapImpl::lookup(it.second); + if (this->exclusive) { + impl->set_contributor_count(1); + } + size_t idx = color_indices.at(it.first); + if (entry_counts[idx] > 0) { + span> h_rects_span(host_rect_buffers[idx], entry_counts[idx]); + impl->contribute_dense_rect_list(h_rects_span, true); + deppart_host_free(host_rect_buffers[idx]); + } else { + impl->contribute_nothing(); + } + } + } + +} +} diff --git a/src/realm/deppart/byfield_gpu_kernels.hpp b/src/realm/deppart/byfield_gpu_kernels.hpp new file mode 100644 index 0000000000..f1ec217f9b --- /dev/null +++ b/src/realm/deppart/byfield_gpu_kernels.hpp @@ -0,0 +1,57 @@ +#pragma once +#include "realm/deppart/byfield.h" +#include 
"realm/deppart/partitions_gpu_kernels.hpp" + +namespace Realm { + + +template < + int N, typename T, typename FT +> +__global__ +void byfield_gpuPopulateBitmasksKernel( + AffineAccessor* accessors, + Rect* rects, + size_t* prefix, + uint32_t* inst_prefix, + FT* d_colors, + size_t numPoints, + size_t numColors, + size_t numRects, + size_t num_insts, + PointDesc *d_points +) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= numPoints) return; + + // Binary search to find which rectangle this point belongs to. + uint32_t r = bsearch(prefix, numRects, idx); + + // Binary search to find which instance this rectangle belongs to. + size_t inst_idx = bsearch(inst_prefix, num_insts, r); + + // Now we know which rectangle we're in, figure out the point coordinates. + size_t offset = idx - prefix[r]; + Point p; + for (int k = N-1; k >= 0; --k) { + size_t dim = rects[r].hi[k] + 1 - rects[r].lo[k]; + p[k] = rects[r].lo[k] + (offset % dim); + offset /= dim; + } + + // Read the field value at that point. + FT ptr = accessors[inst_idx].read(p); + + // Find our color's idx and write output. + PointDesc point_desc; + point_desc.point = p; + for (size_t i = 0; i < numColors; ++i) { + if (ptr == d_colors[i]) { + point_desc.src_idx = i; + break; + } + } + d_points[idx] = point_desc; +} + +} // namespace Realm \ No newline at end of file diff --git a/src/realm/deppart/byfield_gpu_tmpl.cu b/src/realm/deppart/byfield_gpu_tmpl.cu new file mode 100644 index 0000000000..807fc1ad0b --- /dev/null +++ b/src/realm/deppart/byfield_gpu_tmpl.cu @@ -0,0 +1,64 @@ +/* Copyright 2024 Stanford University, NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// per‐dimension instantiator for the GPU By Field Operation +// Mirrors CPU Approach (byfield_tmpl.cc) + +#define REALM_TEMPLATES_ONLY +#include "realm/deppart/byfield_gpu_impl.hpp" +#include "realm/deppart/inst_helper.h" + + +#ifndef INST_N1 + #error "INST_N1 must be defined before including byfield_gpu_tmpl.cu" +#endif +#ifndef INST_N2 + #error "INST_N2 must be defined before including byfield_gpu_tmpl.cu" +#endif + +#define FOREACH_TT(__func__) \ + __func__(int, int) \ + __func__(int, unsigned) \ + __func__(int, long long) \ + __func__(unsigned,int) \ + __func__(unsigned,unsigned) \ + __func__(unsigned,long long) \ + __func__(long long, int) \ + __func__(long long, unsigned) \ + __func__(long long, long long) + +#define FOREACH_T(__func__) \ + __func__(int) \ + __func__(unsigned) \ + __func__(long long) + +namespace Realm { + #define N1 INST_N1 + #define N2 INST_N2 + + #define ZP(N,T) Point + #define ZR(N,T) Rect + + #define DO_WITH_FT(N, T, FT) \ + template class ByFieldMicroOp; \ + template class GPUByFieldMicroOp; + + #define DOIT(T1,T2) \ + DO_WITH_FT(N1,T1,ZP(N2,T2)) + + FOREACH_TT(DOIT) + + FOREACH_NTF(DO_WITH_FT) +} // namespace Realm \ No newline at end of file diff --git a/src/realm/deppart/byfield_tmpl.cc b/src/realm/deppart/byfield_tmpl.cc index 7c58bc725b..3da5121f04 100644 --- a/src/realm/deppart/byfield_tmpl.cc +++ b/src/realm/deppart/byfield_tmpl.cc @@ -17,7 +17,7 @@ // per-dimension instantiator for byfield.cc -#define REALM_TEMPLATES_ONLY +#undef REALM_TEMPLATES_ONLY #include "./byfield.cc" #ifndef INST_N1 @@ -43,6 
+43,33 @@ namespace Realm { #define N1 INST_N1 #define N2 INST_N2 +#ifdef REALM_USE_CUDA + #define GPU_BYFIELD_LINE(N, T, ...) template class GPUByFieldMicroOp; + +#else + #define GPU_BYFIELD_LINE(N, T, ...) /* no CUDA */ +#endif + +#define DOIT(N,T,F) \ + template class ByFieldMicroOp; \ + GPU_BYFIELD_LINE(N, T, F) \ + template class ByFieldOperation; \ + template ByFieldMicroOp::ByFieldMicroOp(NodeID, AsyncMicroOp *, Serialization::FixedBufferDeserializer&); \ + template Event IndexSpace::create_subspaces_by_field(const std::vector,F> >&, \ + const std::vector&, \ + std::vector >&, \ + const ProfilingRequestSet &, \ + Event) const; + +#define DOIT_NT(N, T) \ + template void IndexSpace::by_field_buffer_requirements( \ + const std::vector>&, \ + std::vector&) const; + + +FOREACH_NT(DOIT_NT) +FOREACH_NTF(DOIT) + #define ZP(N,T) Point #define ZR(N,T) Rect #define DOIT2(T1,T2) \ diff --git a/src/realm/deppart/image.cc b/src/realm/deppart/image.cc index e598c22033..217543d147 100644 --- a/src/realm/deppart/image.cc +++ b/src/realm/deppart/image.cc @@ -24,12 +24,73 @@ #include "realm/deppart/inst_helper.h" #include "realm/deppart/preimage.h" #include "realm/logging.h" +#include "realm/cuda/cuda_internal.h" +#include namespace Realm { extern Logger log_part; extern Logger log_uop_timing; + template + template + void IndexSpace::by_image_buffer_requirements( + const std::vector>& source_spaces, + const std::vector>& inputs, + std::vector& requirements) const { + size_t minimal_size = 0; + size_t source_entries = 0; + bool bvh = false; + for (auto subspace : source_spaces) { + source_entries += subspace.entries == 0 ? 
1 : subspace.entries; + if (subspace.entries > 1) { + bvh = true; + } + } + minimal_size += sizeof(Rect) * source_entries; + if (this->dense()) { + minimal_size += sizeof(Rect); + } else { + minimal_size += sizeof(Rect) * this->sparsity.impl()->get_entries().size(); + } + if (bvh) { + minimal_size += + (source_entries * sizeof(uint64_t)) + + (source_entries * sizeof(size_t)) + + ((2*source_entries - 1) * sizeof(Rect)) + + (2 * (2*source_entries - 1) * sizeof(int)) + + sizeof(Rect) + + (2 * source_entries * sizeof(uint64_t)) + + (source_entries * sizeof(uint64_t)); + } + requirements = std::vector(inputs.size()); + for (size_t i = 0; i < inputs.size(); i++) { + IndexSpace is = inputs[i].space; + Memory mem = inputs[i].location; + if (mem.kind() == Memory::GPU_FB_MEM || + mem.kind() == Memory::Z_COPY_MEM) { + const char* val = std::getenv("MIN_SIZE"); // or any env var + size_t device_size = 2000000; //default + if (val) { + device_size = atoi(val); + } + minimal_size = max(minimal_size, device_size); + size_t optimal_size = is.bounds.volume() * sizeof(RectDesc) * source_spaces.size() * 20 + minimal_size; + optimal_size += 2 * (is.dense() ? 
1 : is.sparsity.impl()->get_entries().size()) * sizeof(Rect) * source_entries; + Processor best_proc; + assert(choose_proc(best_proc, mem)); + requirements[i].affinity_processor = best_proc; + requirements[i].lower_bound = minimal_size; + requirements[i].upper_bound = optimal_size; + requirements[i].minimum_alignment = 128; + } else { + requirements[i].affinity_processor = Processor::NO_PROC; + requirements[i].lower_bound = 0; + requirements[i].upper_bound = 0; + } + } + } + template template Event IndexSpace::create_subspaces_by_image( @@ -222,6 +283,7 @@ namespace Realm { if(!bmpp) bmpp = &bitmasks[i]; if(!*bmpp) *bmpp = new BM; (*bmpp)->add_rect(it3.rect); + } } } @@ -439,12 +501,43 @@ namespace Realm { EventImpl::gen_t _finish_gen) : PartitioningOperation(reqs, _finish_event, _finish_gen), parent(_parent), - domain_transform(_domain_transform) {} + domain_transform(_domain_transform), + is_intersection(false), + exclusive_gpu_owner(exclusive_gpu_exec_node()) + {} template ImageOperation::~ImageOperation(void) {} + template + NodeID ImageOperation::exclusive_gpu_exec_node(void) const + { + size_t gpu_ptrs = 0, gpu_rects = 0, cpu_ptrs = 0, cpu_rects = 0; + for(size_t i = 0; i < domain_transform.ptr_data.size(); i++) { + Memory::Kind kind = domain_transform.ptr_data[i].inst.get_location().kind(); + if((kind == Memory::GPU_FB_MEM) || (kind == Memory::Z_COPY_MEM)) + gpu_ptrs++; + else + cpu_ptrs++; + } + for(size_t i = 0; i < domain_transform.range_data.size(); i++) { + Memory::Kind kind = domain_transform.range_data[i].inst.get_location().kind(); + if((kind == Memory::GPU_FB_MEM) || (kind == Memory::Z_COPY_MEM)) + gpu_rects++; + else + cpu_rects++; + } + size_t opcount = gpu_ptrs + gpu_rects + cpu_ptrs + cpu_rects; + if((gpu_ptrs + gpu_rects) == 0 || (opcount != 1)) + return -1; + if(gpu_ptrs == 1) + return ID(domain_transform.ptr_data[0].inst).instance_owner_node(); + if(gpu_rects == 1) + return ID(domain_transform.range_data[0].inst).instance_owner_node(); + 
return -1; + } + template IndexSpace ImageOperation::add_source(const IndexSpace& source) { @@ -459,17 +552,22 @@ namespace Realm { // if the source has a sparsity map, use the same node - otherwise // get a sparsity ID by round-robin'ing across the nodes that have field data int target_node = 0; - if(!source.dense()) + if(exclusive_gpu_owner >= 0) + target_node = exclusive_gpu_owner; + else if(!source.dense()) target_node = ID(source.sparsity).sparsity_creator_node(); else if(!domain_transform.ptr_data.empty()) target_node = ID(domain_transform.ptr_data[sources.size() % domain_transform.ptr_data.size()].inst).instance_owner_node(); else target_node = ID(domain_transform.range_data[sources.size() % domain_transform.range_data.size()].inst).instance_owner_node(); + if(exclusive_gpu_owner >= 0) { + assert(target_node == exclusive_gpu_exec_node()); + } - SparsityMap sparsity = get_runtime()->get_available_sparsity_impl(target_node)->me.convert >(); + SparsityMap sparsity = + create_deppart_output_sparsity(target_node).convert>(); image.sparsity = sparsity; - sources.push_back(source); images.push_back(sparsity); @@ -491,27 +589,96 @@ namespace Realm { // if the source has a sparsity map, use the same node - otherwise // get a sparsity ID by round-robin'ing across the nodes that have field data int target_node; - if(!source.dense()) + if(exclusive_gpu_owner >= 0) + target_node = exclusive_gpu_owner; + else if(!source.dense()) target_node = ID(source.sparsity).sparsity_creator_node(); else if(!domain_transform.ptr_data.empty()) - target_node = ID(domain_transform.ptr_data[sources.size() % domain_transform.ptr_data.size()].inst).instance_owner_node(); + target_node = ID(domain_transform.ptr_data[sources.size() % domain_transform.ptr_data.size()].inst).instance_owner_node(); else - target_node = ID(domain_transform.range_data[sources.size() % domain_transform.range_data.size()].inst).instance_owner_node(); - SparsityMap sparsity = 
get_runtime()->get_available_sparsity_impl(target_node)->me.convert >(); + target_node = ID(domain_transform.range_data[sources.size() % domain_transform.range_data.size()].inst).instance_owner_node(); + if(exclusive_gpu_owner >= 0) { + assert(target_node == exclusive_gpu_exec_node()); + } + + SparsityMap sparsity = + create_deppart_output_sparsity(target_node).convert>(); image.sparsity = sparsity; + sources.push_back(source); + diff_rhss.push_back(diff_rhs); + images.push_back(sparsity); + is_intersection = false; + + return image; + } + + template + IndexSpace ImageOperation::add_source_with_intersection(const IndexSpace& source, + const IndexSpace& diff_rhs) + { + // try to filter out obviously empty sources + if(parent.empty() || source.empty()) + return IndexSpace::make_empty(); + + // otherwise it'll be something smaller than the current parent + IndexSpace image; + image.bounds = parent.bounds; + + // if the source has a sparsity map, use the same node - otherwise + // get a sparsity ID by round-robin'ing across the nodes that have field data + int target_node; + if(exclusive_gpu_owner >= 0) + target_node = exclusive_gpu_owner; + else if(!source.dense()) + target_node = ID(source.sparsity).sparsity_creator_node(); + else + if(!domain_transform.ptr_data.empty()) + target_node = ID(domain_transform.ptr_data[sources.size() % domain_transform.ptr_data.size()].inst).instance_owner_node(); + else + target_node = ID(domain_transform.range_data[sources.size() % domain_transform.range_data.size()].inst).instance_owner_node(); + if(exclusive_gpu_owner >= 0) { + assert(target_node == exclusive_gpu_exec_node()); + } + SparsityMap sparsity = + create_deppart_output_sparsity(target_node).convert>(); + image.sparsity = sparsity; sources.push_back(source); diff_rhss.push_back(diff_rhs); images.push_back(sparsity); + is_intersection = true; return image; } template void ImageOperation::execute(void) { - if (domain_transform.type == - 
DomainTransform::DomainTransformType::STRUCTURED) { + + std::vector,Point> > gpu_ptr_data; + std::vector,Point> > cpu_ptr_data; + std::vector,Rect> > gpu_rect_data; + std::vector,Rect> > cpu_rect_data; + for (size_t i = 0; i < domain_transform.ptr_data.size(); i++) { + if (domain_transform.ptr_data[i].inst.get_location().kind() == + Memory::GPU_FB_MEM || domain_transform.ptr_data[i].inst.get_location().kind() == Memory::Z_COPY_MEM) { + gpu_ptr_data.push_back(domain_transform.ptr_data[i]); + } else { + cpu_ptr_data.push_back(domain_transform.ptr_data[i]); + } + } + for (size_t i = 0; i < domain_transform.range_data.size(); i++) { + if (domain_transform.range_data[i].inst.get_location().kind() == + Memory::GPU_FB_MEM || domain_transform.range_data[i].inst.get_location().kind() == Memory::Z_COPY_MEM) { + gpu_rect_data.push_back(domain_transform.range_data[i]); + } else { + cpu_rect_data.push_back(domain_transform.range_data[i]); + } + } + bool gpu_data = !gpu_ptr_data.empty() || !gpu_rect_data.empty(); + if (domain_transform.type == + DomainTransform::DomainTransformType::STRUCTURED && !gpu_data) { + for (size_t i = 0; i < sources.size(); i++) { SparsityMapImpl::lookup(images[i])->set_contributor_count(1); } @@ -523,65 +690,108 @@ namespace Realm { for (size_t j = 0; j < sources.size(); j++) { micro_op->add_sparsity_output(sources[j], images[j]); } - micro_op->dispatch(this, /*inline_ok=*/true); - } else { - if (!DeppartConfig::cfg_disable_intersection_optimization) { - // build the overlap tester based on the field index spaces - they're more - // likely to be known and - // denser - ComputeOverlapMicroOp *uop = - new ComputeOverlapMicroOp(this); + } else if (!DeppartConfig::cfg_disable_intersection_optimization && !gpu_data) { + // build the overlap tester based on the field index spaces - they're more + // likely to be known and + // denser + ComputeOverlapMicroOp *uop = + new ComputeOverlapMicroOp(this); - for (size_t i = 0; i < domain_transform.ptr_data.size(); 
i++) - uop->add_input_space(domain_transform.ptr_data[i].index_space); + for (size_t i = 0; i < domain_transform.ptr_data.size(); i++) + uop->add_input_space(domain_transform.ptr_data[i].index_space); - for (size_t i = 0; i < domain_transform.range_data.size(); i++) - uop->add_input_space(domain_transform.range_data[i].index_space); + for (size_t i = 0; i < domain_transform.range_data.size(); i++) + uop->add_input_space(domain_transform.range_data[i].index_space); - // we will ask this uop to also prefetch the sources we will intersect test - // against it - for (size_t i = 0; i < sources.size(); i++) - uop->add_extra_dependency(sources[i]); + // we will ask this uop to also prefetch the sources we will intersect test + // against it + for (size_t i = 0; i < sources.size(); i++) + uop->add_extra_dependency(sources[i]); - uop->dispatch(this, true /* ok to run in this thread */); + uop->dispatch(this, true /* ok to run in this thread */); } else { - // launch full cross-product of image micro ops right away - for (size_t i = 0; i < sources.size(); i++) - SparsityMapImpl::lookup(images[i])->set_contributor_count( - domain_transform.ptr_data.size() + - domain_transform.range_data.size()); - - for (size_t i = 0; i < domain_transform.ptr_data.size(); i++) { - ImageMicroOp *uop = new ImageMicroOp( - parent, domain_transform.ptr_data[i].index_space, - domain_transform.ptr_data[i].inst, - domain_transform.ptr_data[i].field_offset, false /*ptrs*/); - for (size_t j = 0; j < sources.size(); j++) - if (diff_rhss.empty()) - uop->add_sparsity_output(sources[j], images[j]); - else - uop->add_sparsity_output_with_difference(sources[j], diff_rhss[j], - images[j]); - - uop->dispatch(this, true /* ok to run in this thread */); - } - - for (size_t i = 0; i < domain_transform.range_data.size(); i++) { - ImageMicroOp *uop = new ImageMicroOp( - parent, domain_transform.range_data[i].index_space, - domain_transform.range_data[i].inst, - domain_transform.range_data[i].field_offset, true 
/*ranges*/); - for (size_t j = 0; j < sources.size(); j++) - if (diff_rhss.empty()) - uop->add_sparsity_output(sources[j], images[j]); - else - uop->add_sparsity_output_with_difference(sources[j], diff_rhss[j], - images[j]); - - uop->dispatch(this, true /* ok to run in this thread */); - } - } + size_t opcount = cpu_ptr_data.size() + cpu_rect_data.size() + gpu_ptr_data.size() + gpu_rect_data.size(); + bool exclusive = gpu_data && (opcount == 1); + if(exclusive) { + NodeID expected_owner = exclusive_gpu_exec_node(); + assert(exclusive_gpu_owner >= 0); + assert(NodeID(exclusive_gpu_owner) == expected_owner); + for(size_t i = 0; i < images.size(); i++) { + NodeID output_owner = NodeID(ID(images[i]).sparsity_creator_node()); + assert(output_owner == NodeID(exclusive_gpu_owner)); + } + } + if (!exclusive) { + // launch full cross-product of image micro ops right away + for (size_t i = 0; i < sources.size(); i++) { + SparsityMapImpl::lookup(images[i])->set_contributor_count(opcount); + } + } + for (size_t i = 0; i < cpu_ptr_data.size(); i++) { + ImageMicroOp *uop = new ImageMicroOp( + parent, cpu_ptr_data[i].index_space, + cpu_ptr_data[i].inst, + cpu_ptr_data[i].field_offset, false /*ptrs*/); + for (size_t j = 0; j < sources.size(); j++) + if (diff_rhss.empty()) + uop->add_sparsity_output(sources[j], images[j]); + else + uop->add_sparsity_output_with_difference(sources[j], diff_rhss[j], + images[j]); + uop->dispatch(this, true /* ok to run in this thread */); + } + for (size_t i = 0; i < cpu_rect_data.size(); i++) { + ImageMicroOp *uop = new ImageMicroOp( + parent, cpu_rect_data[i].index_space, + cpu_rect_data[i].inst, + cpu_rect_data[i].field_offset, true /*ranges*/); + for (size_t j = 0; j < sources.size(); j++) + if (diff_rhss.empty()) + uop->add_sparsity_output(sources[j], images[j]); + else + uop->add_sparsity_output_with_difference(sources[j], diff_rhss[j], + images[j]); + uop->dispatch(this, true /* ok to run in this thread */); + } +#ifdef REALM_USE_CUDA + for 
(auto ptr_fdd : gpu_ptr_data) { + // launch full cross-product of image micro ops right away + assert(ptr_fdd.scratch_buffer != RegionInstance::NO_INST); + if(exclusive) { + NodeID microop_exec_node = ID(ptr_fdd.inst).instance_owner_node(); + assert(NodeID(exclusive_gpu_owner) == microop_exec_node); + } + DomainTransform domain_transform_copy = domain_transform; + domain_transform_copy.ptr_data = {ptr_fdd}; + GPUImageMicroOp *micro_op = + new GPUImageMicroOp( + parent, domain_transform_copy, exclusive); + for (size_t j = 0; j < sources.size(); j++) { + micro_op->add_sparsity_output(sources[j], images[j]); + } + micro_op->dispatch(this, true); + } + for (auto rect_fdd : gpu_rect_data) { + // launch full cross-product of image micro ops right away + assert(rect_fdd.scratch_buffer != RegionInstance::NO_INST); + if(exclusive) { + NodeID microop_exec_node = ID(rect_fdd.inst).instance_owner_node(); + assert(NodeID(exclusive_gpu_owner) == microop_exec_node); + } + DomainTransform domain_transform_copy = domain_transform; + domain_transform_copy.range_data = {rect_fdd}; + GPUImageMicroOp *micro_op = + new GPUImageMicroOp( + parent, domain_transform_copy, exclusive); + for (size_t j = 0; j < sources.size(); j++) { + micro_op->add_sparsity_output(sources[j], images[j]); + } + micro_op->dispatch(this, true); + } +#else + assert(!gpu_data); +#endif } } @@ -783,6 +993,190 @@ namespace Realm { } } + //////////////////////////////////////////////////////////////////////// + // + // class GPUImageMicroOp + +#ifdef REALM_USE_CUDA + + template + GPUImageMicroOp::GPUImageMicroOp( + const IndexSpace &_parent, + const DomainTransform &_domain_transform, + bool _exclusive) + : parent_space(_parent), domain_transform(_domain_transform) + { + this->exclusive = _exclusive; + areg.force_instantiation(); + // GPU setup (this->gpu, this->stream) deferred to execute(), which runs on the + // correct node after dispatch() has forwarded to the instance owner if needed. 
+ } + + template + template + GPUImageMicroOp::GPUImageMicroOp( + NodeID _requestor, AsyncMicroOp *_async_microop, S& s) + : GPUMicroOp(_requestor, _async_microop) + { + bool ok = true; + bool use_ptr_data = false; + ok = ok && (s >> parent_space); + ok = ok && (s >> this->exclusive); + ok = ok && (s >> use_ptr_data); + if(use_ptr_data) { + domain_transform.type = DomainTransform::DomainTransformType::UNSTRUCTURED_PTR; + size_t n = 0; + ok = ok && (s >> n); + domain_transform.ptr_data.resize(n); + for(size_t i = 0; i < n && ok; i++) + ok = ok && (s >> domain_transform.ptr_data[i].index_space) && + (s >> domain_transform.ptr_data[i].inst) && + (s >> domain_transform.ptr_data[i].field_offset) && + (s >> domain_transform.ptr_data[i].scratch_buffer); + } else { + domain_transform.type = DomainTransform::DomainTransformType::UNSTRUCTURED_RANGE; + size_t n = 0; + ok = ok && (s >> n); + domain_transform.range_data.resize(n); + for(size_t i = 0; i < n && ok; i++) + ok = ok && (s >> domain_transform.range_data[i].index_space) && + (s >> domain_transform.range_data[i].inst) && + (s >> domain_transform.range_data[i].field_offset) && + (s >> domain_transform.range_data[i].scratch_buffer); + } + ok = ok && (s >> sources); + ok = ok && (s >> sparsity_outputs); + assert(ok); + (void)ok; + } + + template + template + bool GPUImageMicroOp::serialize_params(S& s) const { + bool ok = true; + bool use_ptr_data = !domain_transform.ptr_data.empty(); + ok = ok && (s << parent_space); + ok = ok && (s << this->exclusive); + ok = ok && (s << use_ptr_data); + if(use_ptr_data) { + ok = ok && (s << domain_transform.ptr_data.size()); + for(size_t i = 0; i < domain_transform.ptr_data.size() && ok; i++) + ok = ok && (s << domain_transform.ptr_data[i].index_space) && + (s << domain_transform.ptr_data[i].inst) && + (s << domain_transform.ptr_data[i].field_offset) && + (s << domain_transform.ptr_data[i].scratch_buffer); + } else { + ok = ok && (s << domain_transform.range_data.size()); + for(size_t 
i = 0; i < domain_transform.range_data.size() && ok; i++) + ok = ok && (s << domain_transform.range_data[i].index_space) && + (s << domain_transform.range_data[i].inst) && + (s << domain_transform.range_data[i].field_offset) && + (s << domain_transform.range_data[i].scratch_buffer); + } + ok = ok && (s << sources); + ok = ok && (s << sparsity_outputs); + return ok; + } + + template + GPUImageMicroOp::~GPUImageMicroOp() {} + + template + void GPUImageMicroOp::dispatch( + PartitioningOperation *op, bool inline_ok) { + + // GPU image must execute on the node that owns the GPU memory + NodeID exec_node = domain_transform.ptr_data.empty() ? + ID(domain_transform.range_data[0].inst).instance_owner_node() : + ID(domain_transform.ptr_data[0].inst).instance_owner_node(); + if(this->exclusive) { + for(size_t i = 0; i < sparsity_outputs.size(); i++) { + assert(NodeID(ID(sparsity_outputs[i]).sparsity_creator_node()) == exec_node); + } + } + if(exec_node != Network::my_node_id) { + PartitioningMicroOp::template forward_microop >(exec_node, op, this); + return; + } + + for (size_t i = 0; i < domain_transform.ptr_data.size(); i++) { + IndexSpace inst_space = domain_transform.ptr_data[i].index_space; + if (!inst_space.dense()) { + // it's safe to add the count after the registration only because we initialized + // the count to 2 instead of 1 + bool registered = SparsityMapImpl::lookup(inst_space.sparsity)->add_waiter(this, true /*precise*/); + if(registered) + this->wait_count.fetch_add(1); + } + } + for (size_t i = 0; i < domain_transform.range_data.size(); i++) { + IndexSpace inst_space = domain_transform.range_data[i].index_space; + if (!inst_space.dense()) { + // it's safe to add the count after the registration only because we initialized + // the count to 2 instead of 1 + bool registered = SparsityMapImpl::lookup(inst_space.sparsity)->add_waiter(this, true /*precise*/); + if(registered) + this->wait_count.fetch_add(1); + } + } + + for (size_t i = 0; i < sources.size(); i++) 
{ + if (!sources[i].dense()) { + bool registered = SparsityMapImpl::lookup(sources[i].sparsity) + ->add_waiter(this, true /*precise*/); + if (registered) this->wait_count.fetch_add(1); + } + } + + if (!parent_space.dense()) { + bool registered = SparsityMapImpl::lookup(parent_space.sparsity) + ->add_waiter(this, true /*precise*/); + if (registered) this->wait_count.fetch_add(1); + } + this->finish_dispatch(op, inline_ok); + } + + template + void GPUImageMicroOp::add_sparsity_output( + IndexSpace _source, SparsityMap _sparsity) { + sources.push_back(_source); + // TODO(apryakhin): Handle and test this sparsity ref-count path. + sparsity_outputs.push_back(_sparsity); + } + + template + ActiveMessageHandlerReg > > + GPUImageMicroOp::areg; + + template + void GPUImageMicroOp::execute(void) { + TimeStamp ts("GPUImageMicroOp::execute", true, &log_uop_timing); + + // Resolve the local GPU processor now that we are guaranteed to be on the + // correct node (dispatch() forwarded us here if the instance was remote). + { + Memory my_mem = domain_transform.ptr_data.empty() ? 
+ domain_transform.range_data[0].inst.get_location() : + domain_transform.ptr_data[0].inst.get_location(); + Processor best_proc; + assert(choose_proc(best_proc, my_mem)); + Cuda::GPUProcessor *gpu_proc = + dynamic_cast(get_runtime()->get_processor_impl(best_proc)); + assert(gpu_proc); + this->gpu = gpu_proc->gpu; + this->stream = gpu_proc->gpu->get_deppart_stream(); + } + + Cuda::AutoGPUContext agc(this->gpu); + if (domain_transform.ptr_data.size() > 0) { + gpu_populate_ptrs(); + } else { + gpu_populate_rngs(); + } + } +#endif + + //////////////////////////////////////////////////////////////////////// // instantiations of templates handled in image_tmpl.cc diff --git a/src/realm/deppart/image.h b/src/realm/deppart/image.h index 0e0fbfe03f..fec4dc7651 100644 --- a/src/realm/deppart/image.h +++ b/src/realm/deppart/image.h @@ -24,117 +24,181 @@ #include "realm/deppart/rectlist.h" namespace Realm { + template + class ImageMicroOp : public PartitioningMicroOp { + public: + static const int DIM = N; + typedef T IDXTYPE; + static const int DIM2 = N2; + typedef T2 IDXTYPE2; - template - class ImageMicroOp : public PartitioningMicroOp { - public: - static const int DIM = N; - typedef T IDXTYPE; - static const int DIM2 = N2; - typedef T2 IDXTYPE2; - - ImageMicroOp(IndexSpace _parent_space, IndexSpace _inst_space, - RegionInstance _inst, size_t _field_offset, bool _is_ranged); - - virtual ~ImageMicroOp(void); - - void add_sparsity_output(IndexSpace _source, SparsityMap _sparsity); - void add_sparsity_output_with_difference(IndexSpace _source, - IndexSpace _diff_rhs, - SparsityMap _sparsity); - void add_approx_output(int index, PartitioningOperation *op); - - virtual void execute(void); - - void dispatch(PartitioningOperation *op, bool inline_ok); - - protected: - friend struct RemoteMicroOpMessage >; - static ActiveMessageHandlerReg > > areg; - - friend class PartitioningMicroOp; - template - REALM_ATTR_WARN_UNUSED(bool serialize_params(S& s) const); - - // construct from 
received packet - template - ImageMicroOp(NodeID _requestor, AsyncMicroOp *_async_microop, S& s); - - template - void populate_bitmasks_ptrs(std::map& bitmasks); - - template - void populate_bitmasks_ranges(std::map& bitmasks); - - template - void populate_approx_bitmask_ptrs(BM& bitmask); - - template - void populate_approx_bitmask_ranges(BM& bitmask); - - IndexSpace parent_space; - IndexSpace inst_space; - RegionInstance inst; - size_t field_offset; - bool is_ranged; - std::vector > sources; - std::vector > diff_rhss; - std::vector > sparsity_outputs; - int approx_output_index; - intptr_t approx_output_op; - }; - - template - class ImageOperation : public PartitioningOperation { - public: - ImageOperation(const IndexSpace& _parent, - const DomainTransform& _domain_transform, - const ProfilingRequestSet& reqs, GenEventImpl* _finish_event, - EventImpl::gen_t _finish_gen); - - virtual ~ImageOperation(void); - - IndexSpace add_source(const IndexSpace& source); - IndexSpace add_source_with_difference( - const IndexSpace& source, const IndexSpace& diff_rhs); - - virtual void execute(void); - - virtual void print(std::ostream& os) const; - - virtual void set_overlap_tester(void* tester); - - protected: - IndexSpace parent; - DomainTransform domain_transform; - std::vector> sources; - std::vector> diff_rhss; - std::vector> images; - }; - - template - class StructuredImageMicroOp : public PartitioningMicroOp { - public: - StructuredImageMicroOp( - const IndexSpace& _parent, - const StructuredTransform& _transform); - - virtual ~StructuredImageMicroOp(void); - virtual void execute(void); - - virtual void populate(std::map*>& bitmasks); - - void dispatch(PartitioningOperation* op, bool inline_ok); - void add_sparsity_output(IndexSpace _source, - SparsityMap _sparsity); - - protected: - IndexSpace parent_space; - StructuredTransform transform; - std::vector> sources; - std::vector> sparsity_outputs; - }; + ImageMicroOp(IndexSpace _parent_space, IndexSpace _inst_space, + 
RegionInstance _inst, size_t _field_offset, bool _is_ranged); - }; // namespace Realm + virtual ~ImageMicroOp(void); + + void add_sparsity_output(IndexSpace _source, SparsityMap _sparsity); + + void add_sparsity_output_with_difference(IndexSpace _source, + IndexSpace _diff_rhs, + SparsityMap _sparsity); + + void add_sparsity_output_with_intersection(IndexSpace _source, + IndexSpace _diff_rhs, + SparsityMap _sparsity); + + void add_approx_output(int index, PartitioningOperation *op); + + virtual void execute(void); + + void dispatch(PartitioningOperation *op, bool inline_ok); + + protected: + friend struct RemoteMicroOpMessage >; + static ActiveMessageHandlerReg > > areg; + + friend class PartitioningMicroOp; + + template + REALM_ATTR_WARN_UNUSED(bool serialize_params(S& s) const); + + // construct from received packet + template + ImageMicroOp(NodeID _requestor, AsyncMicroOp *_async_microop, S &s); + + template + void populate_bitmasks_ptrs(std::map &bitmasks); + + template + void populate_bitmasks_ranges(std::map &bitmasks); + + template + void populate_approx_bitmask_ptrs(BM &bitmask); + + template + void populate_approx_bitmask_ranges(BM &bitmask); + + IndexSpace parent_space; + IndexSpace inst_space; + RegionInstance inst; + size_t field_offset; + bool is_ranged; + bool is_intersection; + std::vector > sources; + std::vector > diff_rhss; + std::vector > sparsity_outputs; + int approx_output_index; + intptr_t approx_output_op; + }; + + template + class ImageOperation : public PartitioningOperation { + public: + ImageOperation(const IndexSpace &_parent, + const DomainTransform &_domain_transform, + const ProfilingRequestSet &reqs, GenEventImpl *_finish_event, + EventImpl::gen_t _finish_gen); + + virtual ~ImageOperation(void); + + IndexSpace add_source(const IndexSpace &source); + + IndexSpace add_source_with_difference( + const IndexSpace &source, const IndexSpace &diff_rhs); + + IndexSpace add_source_with_intersection( + const IndexSpace &source, const 
IndexSpace &diff_rhs); + + virtual void execute(void); + + virtual void print(std::ostream &os) const; + + virtual void set_overlap_tester(void *tester); + + protected: + NodeID exclusive_gpu_exec_node(void) const; + + IndexSpace parent; + DomainTransform domain_transform; + std::vector > sources; + std::vector > diff_rhss; + std::vector > images; + bool is_intersection; + int exclusive_gpu_owner; + }; + + template + class StructuredImageMicroOp : public PartitioningMicroOp { + public: + StructuredImageMicroOp( + const IndexSpace &_parent, + const StructuredTransform &_transform); + + virtual ~StructuredImageMicroOp(void); + + virtual void execute(void); + + virtual void populate(std::map *> &bitmasks); + + void dispatch(PartitioningOperation *op, bool inline_ok); + + void add_sparsity_output(IndexSpace _source, + SparsityMap _sparsity); + + protected: + IndexSpace parent_space; + StructuredTransform transform; + std::vector > sources; + std::vector > sparsity_outputs; + }; +#ifdef REALM_USE_CUDA + + template + class GPUImageMicroOp : public GPUMicroOp { + public: + static const int DIM = N; + typedef T IDXTYPE; + static const int DIM2 = N2; + typedef T2 IDXTYPE2; + + GPUImageMicroOp( + const IndexSpace &_parent, + const DomainTransform &_domain_transform, + bool _exclusive); + + virtual ~GPUImageMicroOp(void); + + virtual void execute(void); + + virtual void gpu_populate_ptrs(); + + virtual void gpu_populate_rngs(); + + void dispatch(PartitioningOperation *op, bool inline_ok); + + void add_sparsity_output(IndexSpace _source, + SparsityMap _sparsity); + + bool is_image_microop() const override { return true; } + + protected: + friend struct RemoteMicroOpMessage >; + static ActiveMessageHandlerReg > > areg; + + friend class PartitioningMicroOp; + template + REALM_ATTR_WARN_UNUSED(bool serialize_params(S& s) const); + + // construct from received packet + template + GPUImageMicroOp(NodeID _requestor, AsyncMicroOp *_async_microop, S& s); + + IndexSpace parent_space; + 
DomainTransform domain_transform; + std::vector > sources; + std::vector > sparsity_outputs; + }; +#endif +}; // namespace Realm #endif // REALM_DEPPART_IMAGE_H diff --git a/src/realm/deppart/image_gpu_impl.hpp b/src/realm/deppart/image_gpu_impl.hpp new file mode 100644 index 0000000000..75401be42c --- /dev/null +++ b/src/realm/deppart/image_gpu_impl.hpp @@ -0,0 +1,586 @@ +#pragma once +#include "realm/deppart/image.h" +#include "realm/deppart/image_gpu_kernels.hpp" +#include "realm/deppart/partitions_gpu_impl.hpp" +#include +#include +#include +#include "realm/nvtx.h" + +namespace Realm { + +//TODO: INTERSECTING INPUT/OUTPUT RECTS CAN BE DONE WITH BVH IF BECOME EXPENSIVE + +template +struct RectDescVolumeOp { + __device__ __forceinline__ + size_t operator()(const RectDesc& rd) const { + return rd.rect.volume(); + } +}; + +template +struct SparsityMapEntryVolumeOp { + __device__ __forceinline__ + size_t operator()(const SparsityMapEntry& entry) const { + return entry.bounds.volume(); + } +}; + + /* + * Input (stored in MicroOp): Array of field instances, a parent index space, and a list of source index spaces + * Output: A list of (potentially overlapping) rectangles that result from chasing all the pointers in the source index spaces + * through the provided instances and emitting only those that intersect the parent index space labeled by which source they came from, + * which are then sent off to complete_rect_pipeline. + * Approach: Intersect all instance rectangles with source rectangles in parallel. Prefix sum + binary search to iterate over these in + * parallel and chase all the pointers in the source rectangles to their corresponding rectangle. Finally, intersect the output rectangles + * with the parent rectangles in parallel. 
+ */ +template +void GPUImageMicroOp::gpu_populate_rngs() +{ + + if (sources.size() == 0) { + assert(sparsity_outputs.empty()); + return; + } + + NVTX_DEPPART(gpu_image_range); + + RegionInstance buffer = domain_transform.range_data[0].scratch_buffer; + size_t tile_size = buffer.get_layout()->bytes_used; + //std::cout << "Using tile size of " << tile_size << " bytes." << std::endl; + Arena buffer_arena(buffer); + + CUstream stream = this->stream->get_stream(); + + collapsed_space src_space; + src_space.offsets = buffer_arena.alloc(sources.size()+1); + src_space.num_children = sources.size(); + GPUMicroOp::collapse_multi_space(sources, src_space, buffer_arena, stream); + + collapsed_space inst_space; + + // We combine all of our instances into one to batch work, tracking the offsets between instances. + inst_space.offsets = buffer_arena.alloc(domain_transform.range_data.size() + 1); + inst_space.num_children = domain_transform.range_data.size(); + + Arena sys_arena; + GPUMicroOp::collapse_multi_space(domain_transform.range_data, inst_space, sys_arena, stream); + + // This is used for count + emit: first pass counts how many rectangles survive intersection, second pass uses the counter + // to figure out where to write each rectangle. + uint32_t* d_inst_counters = buffer_arena.alloc(2 * domain_transform.range_data.size()+1); + + // This will be a prefix sum over the counters, used first to figure out where to write in the emit phase, and second + // to track which instance each rectangle came from in the populate phase. + uint32_t* d_inst_prefix = d_inst_counters + domain_transform.range_data.size(); + + collapsed_space collapsed_parent; + + // We collapse the parent space to undifferentiate between dense and sparse and match downstream APIs. 
+ GPUMicroOp::collapse_parent_space(parent_space, collapsed_parent, buffer_arena, stream); + + std::vector,N2,T2>> h_accessors(domain_transform.range_data.size()); + for (size_t i = 0; i < domain_transform.range_data.size(); ++i) { + h_accessors[i] = AffineAccessor,N2,T2>(domain_transform.range_data[i].inst, domain_transform.range_data[i].field_offset); + } + AffineAccessor,N2,T2>* d_accessors = + buffer_arena.alloc,N2,T2>>(domain_transform.range_data.size()); + CUDA_CHECK(cudaMemcpyAsync(d_accessors, h_accessors.data(), + domain_transform.range_data.size() * sizeof(AffineAccessor,N2,T2>), + cudaMemcpyHostToDevice, stream), stream); + + uint32_t* d_src_counters = buffer_arena.alloc(2 * sources.size() + 1); + uint32_t* d_src_prefix = d_src_counters + sources.size(); + + buffer_arena.commit(false); + + size_t num_output = 0; + RectDesc* output_start = nullptr; + size_t num_completed = 0; + size_t curr_tile = tile_size / 2; + int count = 0; + if (count) {} + bool host_fallback = false; + std::vector*> host_rect_buffers(sources.size(), nullptr); + std::vector entry_counts(sources.size(), 0); + while (num_completed < inst_space.num_entries) { + try { + //std::cout << "Image Range iteration " << count++ << ", completed " << num_completed << " / " << inst_space.num_entries << " entries." 
<< std::endl; + buffer_arena.start(); + buffer_arena.flip_parity(); + if (num_completed + curr_tile > inst_space.num_entries) { + curr_tile = inst_space.num_entries - num_completed; + } + collapsed_space inst_space_tile = inst_space; + inst_space_tile.num_entries = curr_tile; + inst_space_tile.entries_buffer = buffer_arena.alloc>(curr_tile); + CUDA_CHECK(cudaMemcpyAsync(inst_space_tile.entries_buffer, inst_space.entries_buffer + num_completed, curr_tile * sizeof(SparsityMapEntry), cudaMemcpyHostToDevice, stream), stream); + + size_t num_valid_rects; + RectDesc* d_valid_rects; + // Here we intersect the instance spaces with the parent space, and make sure we know which instance each resulting rectangle came from. + GPUMicroOp::template construct_input_rectlist>(inst_space_tile, src_space, d_valid_rects, num_valid_rects, d_inst_counters, d_inst_prefix, buffer_arena, stream); + + if (num_valid_rects == 0) { + num_completed += curr_tile; + subtract_const<<>>(inst_space.offsets, domain_transform.range_data.size()+1, curr_tile); + KERNEL_CHECK(stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + curr_tile = tile_size / 2; + continue; + } + + // Prefix sum the valid rectangles by volume. 
+ size_t* d_prefix_rects; + size_t total_pts; + + GPUMicroOp::volume_prefix_sum(d_valid_rects, num_valid_rects, d_prefix_rects, total_pts, buffer_arena, stream); + + buffer_arena.flip_parity(); + RectDesc* d_rngs = buffer_arena.alloc>(total_pts); + + image_gpuPopulateBitmasksRngsKernel<<>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, total_pts, num_valid_rects, domain_transform.range_data.size(), d_rngs); + KERNEL_CHECK(stream); + + + CUDA_CHECK(cudaMemsetAsync(d_src_counters, 0, sources.size() * sizeof(uint32_t), stream), stream); + + + //Finally, we do another two pass count + emit to intersect with the parent rectangles + image_intersect_output<<>>(collapsed_parent.entries_buffer, d_rngs, nullptr, collapsed_parent.num_entries, total_pts, d_src_counters, nullptr); + KERNEL_CHECK(stream); + + std::vector h_src_counters(sources.size()+1); + h_src_counters[0] = 0; // prefix sum starts at 0 + CUDA_CHECK(cudaMemcpyAsync(h_src_counters.data()+1, d_src_counters, sources.size() * sizeof(uint32_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + for (size_t i = 0; i < sources.size(); ++i) { + h_src_counters[i+1] += h_src_counters[i]; + } + + size_t num_valid_output = h_src_counters[sources.size()]; + + if (num_valid_output == 0) { + num_completed += curr_tile; + subtract_const<<>>(inst_space.offsets, domain_transform.range_data.size()+1, curr_tile); + KERNEL_CHECK(stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + curr_tile = tile_size / 2; + continue; + } + + buffer_arena.flip_parity(); + RectDesc* d_valid_intersect = buffer_arena.alloc>(num_valid_output); + + CUDA_CHECK(cudaMemcpyAsync(d_src_prefix, h_src_counters.data(), (sources.size() + 1) * sizeof(uint32_t), cudaMemcpyHostToDevice, stream), stream); + CUDA_CHECK(cudaMemsetAsync(d_src_counters, 0, sources.size() * sizeof(uint32_t), stream), stream); + + image_intersect_output<<>>(collapsed_parent.entries_buffer, d_rngs, d_src_prefix, 
collapsed_parent.num_entries, total_pts, d_src_counters, d_valid_intersect); + KERNEL_CHECK(stream); + + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + size_t num_new_rects = (num_output == 0) ? 1 : 2; + assert(!buffer_arena.get_parity()); + RectDesc* d_new_rects; + + //Send it off for processing + this->complete_rect_pipeline(d_valid_intersect, num_valid_output, d_new_rects, num_new_rects, buffer_arena, + /* the Container: */ sparsity_outputs, + /* getIndex: */ [&](auto const& elem){ + // elem is a SparsityMap from the vector + return size_t(&elem - sparsity_outputs.data()); + }, + /* getMap: */ [&](auto const& elem){ + // return the SparsityMap key itself + return elem; + }); + + if (host_fallback) { + this->split_output(d_new_rects, num_new_rects, host_rect_buffers, entry_counts, buffer_arena); + } + + //Set our first set of output rectangles + if (num_output==0 || host_fallback) { + + //We need to place the new output at the rightmost end of the buffer + num_output = num_new_rects; + num_completed += curr_tile; + output_start = d_new_rects; + subtract_const<<>>(inst_space.offsets, domain_transform.range_data.size()+1, curr_tile); + KERNEL_CHECK(stream); + curr_tile = tile_size / 2; + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + continue; + } + + //Otherwise we merge with existing rectangles + RectDesc* d_old_rects = buffer_arena.alloc>(num_output); + assert(d_old_rects == d_new_rects + num_new_rects); + CUDA_CHECK(cudaMemcpyAsync(d_old_rects, output_start, num_output * sizeof(RectDesc), cudaMemcpyDeviceToDevice, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + size_t num_final_rects = 1; + + //Send it off for processing + this->complete_rect_pipeline(d_new_rects, num_output + num_new_rects, output_start, num_final_rects, buffer_arena, + /* the Container: */ sparsity_outputs, + /* getIndex: */ [&](auto const& elem){ + // elem is a SparsityMap from the vector + return size_t(&elem - sparsity_outputs.data()); + }, + /* 
getMap: */ [&](auto const& elem){ + // return the SparsityMap key itself + return elem; + }); + num_completed += curr_tile; + num_output = num_final_rects; + subtract_const<<>>(inst_space.offsets, domain_transform.range_data.size()+1, curr_tile); + KERNEL_CHECK(stream); + curr_tile = tile_size / 2; + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + } + catch (arena_oom&) { + //std::cout << "Caught arena_oom, reducing tile size from " << curr_tile << " to " << curr_tile / 2 << std::endl; + curr_tile /= 2; + if (curr_tile == 0) { + if (host_fallback) { + GPUMicroOp::shatter_rects(inst_space, num_completed, stream); + curr_tile = 1; + } else { + host_fallback = true; + if (num_output > 0) { + this->split_output(output_start, num_output, host_rect_buffers, entry_counts, buffer_arena); + } + curr_tile = tile_size / 2; + } + } + } + } + + if (num_output == 0) { + for (SparsityMap it : sparsity_outputs) { + SparsityMapImpl *impl = SparsityMapImpl::lookup(it); + if (this->exclusive) { + impl->gpu_finalize(); + } else { + impl->contribute_nothing(); + } + } + return; + } + + if (!host_fallback) { + try { + this->send_output(output_start, num_output, buffer_arena, sparsity_outputs, + /* getIndex: */ [&](auto const& elem){ + // elem is a SparsityMap from the vector + return size_t(&elem - sparsity_outputs.data()); + }, + /* getMap: */ [&](auto const& elem){ + // return the SparsityMap key itself + return elem; + }); + } catch (arena_oom&) { + this->split_output(output_start, num_output, host_rect_buffers, entry_counts, buffer_arena); + host_fallback = true; + } + } + + if (host_fallback) { + for (size_t idx = 0; idx < sparsity_outputs.size(); ++idx) { + SparsityMapImpl *impl = SparsityMapImpl::lookup(sparsity_outputs[idx]); + if (this->exclusive) { + impl->set_contributor_count(1); + } + if (entry_counts[idx] > 0) { + span> h_rects_span(host_rect_buffers[idx], entry_counts[idx]); + impl->contribute_dense_rect_list(h_rects_span, false); + 
deppart_host_free(host_rect_buffers[idx]); + } else { + impl->contribute_nothing(); + } + } + } + +} + + /* + * Input (stored in MicroOp): Array of field instances, a parent index space, and a list of source index spaces + * Output: A list of (potentially overlapping) points that result from chasing all the pointers in the source index spaces + * through the provided instances and emitting only points in the parent index space labeled by which source they came from, + * which are then sent off to complete_pipeline. + * Approach: Intersect all instance rectangles with source rectangles in parallel. Prefix sum + binary search to iterate over these in + * parallel and chase all the pointers in the source rectangles to their corresponding point. Here, the pointer chasing is also a count + emit, + * where only points that are in the parent index space are counted/emitted. + */ +template +void GPUImageMicroOp::gpu_populate_ptrs() +{ + if (sources.size() == 0) { + assert(sparsity_outputs.empty()); + return; + } + + RegionInstance buffer = domain_transform.ptr_data[0].scratch_buffer; + + NVTX_DEPPART(gpu_image); + + CUstream stream = this->stream->get_stream(); + + size_t tile_size = buffer.get_layout()->bytes_used; + //std::cout << "Using tile size of " << tile_size << " bytes." << std::endl; + Arena buffer_arena(buffer); + + Memory sysmem; + assert(find_memory(sysmem, Memory::SYSTEM_MEM, buffer_arena.location)); + + collapsed_space src_space; + src_space.offsets = buffer_arena.alloc(sources.size()+1); + src_space.num_children = sources.size(); + + GPUMicroOp::collapse_multi_space(sources, src_space, buffer_arena, stream); + + collapsed_space inst_space; + + // We combine all of our instances into one to batch work, tracking the offsets between instances. 
+ inst_space.offsets = buffer_arena.alloc(domain_transform.ptr_data.size()+1); + inst_space.num_children = domain_transform.ptr_data.size(); + + Arena sys_arena; + GPUMicroOp::collapse_multi_space(domain_transform.ptr_data, inst_space, sys_arena, stream); + + // This is used for count + emit: first pass counts how many rectangles survive intersection, second pass uses the counter + // to figure out where to write each rectangle. + uint32_t* d_inst_counters = buffer_arena.alloc(2*domain_transform.ptr_data.size()+1); + + + // This will be a prefix sum over the counters, used first to figure out where to write in the emit phase, and second + // to track which instance each rectangle came from in the populate phase. + uint32_t* d_inst_prefix = d_inst_counters + domain_transform.ptr_data.size(); + + //Uniform for all tiles + collapsed_space collapsed_parent; + + // We collapse the parent space to undifferentiate between dense and sparse and match downstream APIs. + GPUMicroOp::collapse_parent_space(parent_space, collapsed_parent, buffer_arena, stream); + + std::vector,N2,T2>> h_accessors(domain_transform.ptr_data.size()); + for (size_t i = 0; i < domain_transform.ptr_data.size(); ++i) { + h_accessors[i] = AffineAccessor,N2,T2>(domain_transform.ptr_data[i].inst, domain_transform.ptr_data[i].field_offset); + } + AffineAccessor,N2,T2>* d_accessors = + buffer_arena.alloc,N2,T2>>(domain_transform.ptr_data.size()); + CUDA_CHECK(cudaMemcpyAsync(d_accessors, h_accessors.data(), + domain_transform.ptr_data.size() * sizeof(AffineAccessor,N2,T2>), + cudaMemcpyHostToDevice, stream), stream); + + uint32_t* d_prefix_points = buffer_arena.alloc(domain_transform.ptr_data.size()+1); + + buffer_arena.commit(false); + + size_t left = buffer_arena.used(); + + //Here we iterate over the tiles + size_t num_output = 0; + RectDesc* output_start = nullptr; + size_t num_completed = 0; + size_t curr_tile = tile_size / 2; + int count = 0; + if (count) {} + bool host_fallback = false; + 
std::vector*> host_rect_buffers(sources.size(), nullptr); + std::vector entry_counts(sources.size(), 0); + while (num_completed < inst_space.num_entries) { + try { + //std::cout << "Image iteration " << count++ << ", completed " << num_completed << " / " << inst_space.num_entries << " entries." << std::endl; + buffer_arena.start(); + if (num_completed + curr_tile > inst_space.num_entries) { + curr_tile = inst_space.num_entries - num_completed; + } + collapsed_space inst_space_tile = inst_space; + inst_space_tile.num_entries = curr_tile; + inst_space_tile.entries_buffer = buffer_arena.alloc>(curr_tile); + CUDA_CHECK(cudaMemcpyAsync(inst_space_tile.entries_buffer, inst_space.entries_buffer + num_completed, curr_tile * sizeof(SparsityMapEntry), cudaMemcpyHostToDevice, stream), stream); + + size_t num_valid_rects; + RectDesc* d_valid_rects; + GPUMicroOp::template construct_input_rectlist>(inst_space_tile, src_space, d_valid_rects, num_valid_rects, d_inst_counters, d_inst_prefix, buffer_arena, stream); + + if (num_valid_rects == 0) { + num_completed += curr_tile; + subtract_const<<>>(inst_space.offsets, domain_transform.ptr_data.size()+1, curr_tile); + KERNEL_CHECK(stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + curr_tile = tile_size / 2; + continue; + } + + // Prefix sum the valid rectangles by volume. 
+ size_t* d_prefix_rects; + size_t total_pts; + + GPUMicroOp::volume_prefix_sum(d_valid_rects, num_valid_rects, d_prefix_rects, total_pts, buffer_arena, stream); + + CUDA_CHECK(cudaMemsetAsync(d_inst_counters, 0, (domain_transform.ptr_data.size()) * sizeof(uint32_t), stream), stream); + + //We do a two pass count + emit to chase all the pointers in parallel and check for membership in the parent index space + image_gpuPopulateBitmasksPtrsKernel<<>>(d_accessors, d_valid_rects, collapsed_parent.entries_buffer, d_prefix_rects, d_inst_prefix, nullptr, total_pts, num_valid_rects, domain_transform.ptr_data.size(), collapsed_parent.num_entries, d_inst_counters, nullptr); + KERNEL_CHECK(stream); + + std::vector h_inst_counters(domain_transform.ptr_data.size()+1); + h_inst_counters[0] = 0; // prefix sum starts at 0 + CUDA_CHECK(cudaMemcpyAsync(h_inst_counters.data()+1, d_inst_counters, domain_transform.ptr_data.size() * sizeof(uint32_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + for (size_t i = 0; i < domain_transform.ptr_data.size(); ++i) { + h_inst_counters[i+1] += h_inst_counters[i]; + } + + size_t num_valid_points = h_inst_counters[domain_transform.ptr_data.size()]; + + if (num_valid_points == 0) { + num_completed += curr_tile; + subtract_const<<>>(inst_space.offsets, domain_transform.ptr_data.size()+1, curr_tile); + KERNEL_CHECK(stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + curr_tile = tile_size / 2; + continue; + } + + CUDA_CHECK(cudaMemcpyAsync(d_prefix_points, h_inst_counters.data(), (domain_transform.ptr_data.size() + 1) * sizeof(uint32_t), cudaMemcpyHostToDevice, stream), stream); + + buffer_arena.flip_parity(); + PointDesc* d_valid_points = buffer_arena.alloc>(num_valid_points); + buffer_arena.start(); + d_valid_points = buffer_arena.alloc>(num_valid_points); + + CUDA_CHECK(cudaMemsetAsync(d_inst_counters, 0, (domain_transform.ptr_data.size()) * sizeof(uint32_t), stream), stream); + + 
image_gpuPopulateBitmasksPtrsKernel<<>>(d_accessors, d_valid_rects, collapsed_parent.entries_buffer, d_prefix_rects, d_inst_prefix, d_prefix_points, total_pts, num_valid_rects, domain_transform.ptr_data.size(), collapsed_parent.num_entries, d_inst_counters, d_valid_points); + KERNEL_CHECK(stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + + size_t num_new_rects = num_output == 0 ? 1 : 2; + assert(!buffer_arena.get_parity()); + RectDesc* d_new_rects; + + //Send it off for processing + this->complete_pipeline(d_valid_points, num_valid_points, d_new_rects, num_new_rects, buffer_arena, + /* the Container: */ sparsity_outputs, + /* getIndex: */ [&](auto const& elem){ + // elem is a SparsityMap from the vector + return size_t(&elem - sparsity_outputs.data()); + }, + /* getMap: */ [&](auto const& elem){ + // return the SparsityMap key itself + return elem; + }); + + if (host_fallback) { + this->split_output(d_new_rects, num_new_rects, host_rect_buffers, entry_counts, buffer_arena); + } + + if (num_output==0 || host_fallback) { + num_output = num_new_rects; + num_completed += curr_tile; + output_start = d_new_rects; + subtract_const<<>>(inst_space.offsets, domain_transform.ptr_data.size()+1, curr_tile); + KERNEL_CHECK(stream); + curr_tile = tile_size / 2; + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + continue; + } + + RectDesc* d_old_rects = buffer_arena.alloc>(num_output); + assert(d_old_rects == d_new_rects + num_new_rects); + CUDA_CHECK(cudaMemcpyAsync(d_old_rects, output_start, num_output * sizeof(RectDesc), cudaMemcpyDeviceToDevice, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + size_t num_final_rects = 1; + + //Send it off for processing + this->complete_rect_pipeline(d_new_rects, num_output + num_new_rects, output_start, num_final_rects, buffer_arena, + /* the Container: */ sparsity_outputs, + /* getIndex: */ [&](auto const& elem){ + // elem is a SparsityMap from the vector + return size_t(&elem - 
sparsity_outputs.data()); + }, + /* getMap: */ [&](auto const& elem){ + // return the SparsityMap key itself + return elem; + }); + num_completed += curr_tile; + num_output = num_final_rects; + subtract_const<<>>(inst_space.offsets, domain_transform.ptr_data.size()+1, curr_tile); + KERNEL_CHECK(stream); + curr_tile = tile_size / 2; + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + } + catch (arena_oom&) { + //std::cout << "Caught arena_oom, reducing tile size from " << curr_tile << " to " << curr_tile / 2 << std::endl; + curr_tile /= 2; + if (curr_tile == 0) { + if (host_fallback) { + GPUMicroOp::shatter_rects(inst_space, num_completed, stream); + curr_tile = 1; + } else { + host_fallback = true; + if (num_output > 0) { + this->split_output(output_start, num_output, host_rect_buffers, entry_counts, buffer_arena); + } + curr_tile = tile_size / 2; + } + } + } + } + + if (num_output == 0) { + for (SparsityMap it : sparsity_outputs) { + SparsityMapImpl *impl = SparsityMapImpl::lookup(it); + if (this->exclusive) { + impl->gpu_finalize(); + } else { + impl->contribute_nothing(); + } + } + return; + } + + if (!host_fallback) { + try { + this->send_output(output_start, num_output, buffer_arena, sparsity_outputs, + /* getIndex: */ [&](auto const& elem){ + // elem is a SparsityMap from the vector + return size_t(&elem - sparsity_outputs.data()); + }, + /* getMap: */ [&](auto const& elem){ + // return the SparsityMap key itself + return elem; + }); + } catch (arena_oom&) { + this->split_output(output_start, num_output, host_rect_buffers, entry_counts, buffer_arena); + host_fallback = true; + } + } + + if (host_fallback) { + for (size_t idx = 0; idx < sparsity_outputs.size(); ++idx) { + SparsityMapImpl *impl = SparsityMapImpl::lookup(sparsity_outputs[idx]); + if (this->exclusive) { + impl->set_contributor_count(1); + } + if (entry_counts[idx] > 0) { + span> h_rects_span(host_rect_buffers[idx], entry_counts[idx]); + impl->contribute_dense_rect_list(h_rects_span, false); + 
deppart_host_free(host_rect_buffers[idx]); + } else { + impl->contribute_nothing(); + } + } + } +} +} diff --git a/src/realm/deppart/image_gpu_kernels.hpp b/src/realm/deppart/image_gpu_kernels.hpp new file mode 100644 index 0000000000..146d4e781f --- /dev/null +++ b/src/realm/deppart/image_gpu_kernels.hpp @@ -0,0 +1,167 @@ +#pragma once +#include "realm/deppart/image.h" + +namespace Realm { + +//Device helper to check parent space for membership +//TODO: if expensive, may benefit from BVH +template +__device__ bool image_isInIndexSpace( + const Point& p, + const SparsityMapEntry* parent_entries, + size_t numRects) +{ + // for each rectangle, check all dims… + for(size_t i = 0; i < numRects; ++i) { + const auto &r = parent_entries[i].bounds; + bool inside = true; + #pragma unroll + for(int d = 0; d < N; ++d) { + if(p[d] < r.lo[d] || p[d] > r.hi[d]) { + inside = false; + break; + } + } + if(inside) return true; + } + return false; +} + +//Count + emit to chase pointers and check for membership in parent space +template < + int N, typename T, + int N2, typename T2 +> +__global__ +void image_gpuPopulateBitmasksPtrsKernel( + AffineAccessor,N2,T2> *accessors, + RectDesc* rects, + SparsityMapEntry* parent_entries, + size_t* prefix, + uint32_t *inst_offsets, + uint32_t *d_inst_prefix, + size_t numPoints, + size_t numRects, + size_t num_insts, + size_t numParentRects, + uint32_t* d_inst_counters, + PointDesc *d_points +) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= numPoints) return; + size_t low = 0, high = numRects; + while (low < high) { + size_t mid = (low + high) >> 1; + if (prefix[mid+1] <= idx) low = mid + 1; + else high = mid; + } + size_t r = low; + bool found = false; + size_t inst_idx; + for (inst_idx = 0; inst_idx < num_insts; ++inst_idx) { + if (inst_offsets[inst_idx] <= r && inst_offsets[inst_idx+1] > r) { + found = true; + break; + } + } + assert(found); + size_t offset = idx - prefix[r]; + Point p; + for (int k = N2-1; k >= 0; --k) { 
+ size_t dim = rects[r].rect.hi[k] + 1 - rects[r].rect.lo[k]; + p[k] = rects[r].rect.lo[k] + (offset % dim); + offset /= dim; + } + Point ptr = accessors[inst_idx].read(p); + if (image_isInIndexSpace(ptr, parent_entries, numParentRects)) { + uint32_t local = atomicAdd(&d_inst_counters[inst_idx], 1); + if (d_points != nullptr) { + uint32_t out_idx = d_inst_prefix[inst_idx] + local; + PointDesc point_desc; + point_desc.src_idx = rects[r].src_idx; + point_desc.point = ptr; + d_points[out_idx] = point_desc; + } + } + +} + +//Same as image_intersect_input, but for output rectangles and parent entries +//rather than input rectangles and parent rectangles + template +__global__ void image_intersect_output( + const SparsityMapEntry* d_parent_entries, + const RectDesc* d_output_rngs, + const uint32_t* d_src_prefix, + size_t numParentRects, + size_t numOutputRects, + uint32_t* d_src_counters, + RectDesc* d_rects +) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= numParentRects * numOutputRects) return; + size_t idx_x = idx % numParentRects; + size_t idx_y = idx / numParentRects; + const auto parent_entry = d_parent_entries[idx_x]; + const auto output_entry = d_output_rngs[idx_y]; + RectDesc rect_output; + rect_output.rect = parent_entry.bounds.intersection(output_entry.rect); + if (!rect_output.rect.empty()) { + uint32_t local = atomicAdd(&d_src_counters[output_entry.src_idx], 1); + if (d_rects != nullptr) { + rect_output.src_idx = output_entry.src_idx; + size_t out_idx = d_src_prefix[output_entry.src_idx] + local; + d_rects[out_idx] = rect_output; + } + } +} + +//Single pass function to chase pointers to rectangles. 
+ template < + int N, typename T, + int N2, typename T2 +> +__global__ +void image_gpuPopulateBitmasksRngsKernel( + AffineAccessor,N2,T2> *accessors, + RectDesc* rects, + size_t* prefix, + uint32_t *inst_offsets, + size_t numPoints, + size_t numRects, + size_t num_insts, + RectDesc *d_rects +) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= numPoints) return; + size_t low = 0, high = numRects; + while (low < high) { + size_t mid = (low + high) >> 1; + if (prefix[mid+1] <= idx) low = mid + 1; + else high = mid; + } + size_t r = low; + bool found = false; + size_t inst_idx; + for (inst_idx = 0; inst_idx < num_insts; ++inst_idx) { + if (inst_offsets[inst_idx] <= r && inst_offsets[inst_idx+1] > r) { + found = true; + break; + } + } + assert(found); + size_t offset = idx - prefix[r]; + Point p; + for (int k = N2-1; k >= 0; --k) { + size_t dim = rects[r].rect.hi[k] + 1 - rects[r].rect.lo[k]; + p[k] = rects[r].rect.lo[k] + (offset % dim); + offset /= dim; + } + Rect rng = accessors[inst_idx].read(p); + RectDesc rect_desc; + rect_desc.src_idx = rects[r].src_idx; + rect_desc.rect = rng; + d_rects[idx] = rect_desc; +} + +} // namespace Realm \ No newline at end of file diff --git a/src/realm/deppart/image_gpu_tmpl.cu b/src/realm/deppart/image_gpu_tmpl.cu new file mode 100644 index 0000000000..6af4dcde61 --- /dev/null +++ b/src/realm/deppart/image_gpu_tmpl.cu @@ -0,0 +1,62 @@ +/* Copyright 2024 Stanford University, NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// per‐dimension instantiator for the GPU Image Operation +// Mirrors CPU Approach (image_tmpl.cc) + + +#include "realm/deppart/image_gpu_kernels.hpp" +#include "realm/deppart/image_gpu_impl.hpp" +#include "realm/deppart/inst_helper.h" + +#ifndef INST_N1 + #error "INST_N1 must be defined before including image_gpu_tmpl.cu" +#endif +#ifndef INST_N2 + #error "INST_N2 must be defined before including image_gpu_tmpl.cu" +#endif + +#define FOREACH_TT(__func__) \ + __func__(int, int) \ + __func__(int, unsigned) \ + __func__(int, long long) \ + __func__(unsigned,int) \ + __func__(unsigned,unsigned) \ + __func__(unsigned,long long) \ + __func__(long long, int) \ + __func__(long long, unsigned) \ + __func__(long long, long long) + +#define FOREACH_T(__func__) \ + __func__(int) \ + __func__(unsigned) \ + __func__(long long) + +namespace Realm { + #define N1 INST_N1 + #define N2 INST_N2 + + + #define DO_DOUBLE(T1,T2) \ + template class ImageMicroOp; \ + template class GPUImageMicroOp; + + FOREACH_TT(DO_DOUBLE) + + #undef DO_DOUBLE + #undef N1 + #undef N2 + +} // namespace Realm \ No newline at end of file diff --git a/src/realm/deppart/image_tmpl.cc b/src/realm/deppart/image_tmpl.cc index 578a78226b..a0d3d7319a 100644 --- a/src/realm/deppart/image_tmpl.cc +++ b/src/realm/deppart/image_tmpl.cc @@ -44,15 +44,25 @@ namespace Realm { #define N1 INST_N1 #define N2 INST_N2 +#ifdef REALM_USE_CUDA + #define GPU_IMAGE_LINE(N1,T1,N2,T2) template class GPUImageMicroOp; +#else + #define GPU_IMAGE_LINE(N1,T1,N2,T2) /* no CUDA */ +#endif + #define DOIT(T1,T2) \ template class StructuredImageMicroOp; \ - template class ImageMicroOp; \ + template class ImageMicroOp; \ + GPU_IMAGE_LINE(N1, T1, N2, T2) \ template class ImageOperation; \ template ImageMicroOp::ImageMicroOp(NodeID, AsyncMicroOp *, Serialization::FixedBufferDeserializer&); \ + template void 
IndexSpace::by_image_buffer_requirements( \ + const std::vector>&, \ + const std::vector>&, \ + std::vector&) const; \ template Event IndexSpace::create_subspaces_by_image( \ const DomainTransform &, const std::vector > &, \ - std::vector > &, const ProfilingRequestSet &, Event) \ - const; \ + std::vector > &, const ProfilingRequestSet &, Event) const; \ template Event IndexSpace::create_subspaces_by_image_with_difference( \ const DomainTransform &, \ const std::vector >&, \ diff --git a/src/realm/deppart/partitions.cc b/src/realm/deppart/partitions.cc index b023f468fc..b25ddd17a5 100644 --- a/src/realm/deppart/partitions.cc +++ b/src/realm/deppart/partitions.cc @@ -18,7 +18,6 @@ // index space partitioning for Realm #include "realm/deppart/partitions.h" - #include "realm/profiling.h" #include "realm/runtime_impl.h" @@ -71,7 +70,7 @@ namespace Realm { size_t start, size_t count, size_t volume, IndexSpace *results, size_t first_result, size_t last_result, - const std::vector >& entries) + const span >& entries) { // should never be here with empty bounds assert(!bounds.empty()); @@ -111,13 +110,11 @@ namespace Realm { size_t lo_volume[N]; for(int i = 0; i < N; i++) lo_volume[i] = 0; - for(typename std::vector >::const_iterator it = entries.begin(); - it != entries.end(); - it++) { + for(size_t j = 0; j < entries.size(); j++) { for(int i = 0; i < N; i++) - lo_volume[i] += it->bounds.intersection(lo_half[i]).volume(); + lo_volume[i] += entries[j].bounds.intersection(lo_half[i]).volume(); } - // now compute how many subspaces would fall in each half and the + // now compute how many subspaces would fall in each half and the // inefficiency of the split size_t lo_count[N], inefficiency[N]; for(int i = 0; i < N; i++) { @@ -233,7 +230,7 @@ namespace Realm { // TODO: sparse case where we have to wait SparsityMapPublicImpl *impl = sparsity.impl(); assert(impl->is_valid()); - const std::vector >& entries = impl->get_entries(); + const span >& entries = impl->get_entries(); 
// initially every subspace will be a copy of this one, and then // we'll decompose the bounds subspace = *this; @@ -307,7 +304,7 @@ namespace Realm { // TODO: sparse case where we have to wait SparsityMapPublicImpl *impl = sparsity.impl(); assert(impl->is_valid()); - const std::vector >& entries = impl->get_entries(); + span> entries = impl->get_entries(); // initially every subspace will be a copy of this one, and then // we'll decompose the bounds subspaces.resize(count, *this); @@ -498,7 +495,7 @@ namespace Realm { template class RectListAdapter { public: - RectListAdapter(const std::vector >& _rects) + RectListAdapter(const span >& _rects) : rects(_rects.empty() ? 0 : &_rects[0]), count(_rects.size()) {} RectListAdapter(const Rect<1,T> *_rects, size_t _count) : rects(_rects), count(_count) {} @@ -583,7 +580,6 @@ namespace Realm { os << "AsyncMicroOp(" << (void *)uop << ")"; } - //////////////////////////////////////////////////////////////////////// // // class PartitioningMicroOp @@ -1061,4 +1057,3 @@ namespace Realm { FOREACH_NTNT(DOIT2) }; - diff --git a/src/realm/deppart/partitions.h b/src/realm/deppart/partitions.h index 7bb68c3630..0af8ec0673 100644 --- a/src/realm/deppart/partitions.h +++ b/src/realm/deppart/partitions.h @@ -34,12 +34,214 @@ #include "realm/deppart/sparsity_impl.h" #include "realm/deppart/inst_helper.h" #include "realm/bgwork.h" +#ifdef REALM_USE_CUDA +#include "realm/cuda/cuda_module.h" + +struct CUstream_st; +typedef CUstream_st* CUstream; + +#endif + namespace Realm { class PartitioningMicroOp; class PartitioningOperation; +#ifdef REALM_USE_CUDA + + namespace Cuda { + class GPUStream; + class GPUProcessor; + } + + template + struct HiFlag { + T hi; + uint8_t head; + }; + + struct DeltaFlag { + int32_t delta; + uint8_t head; + }; + + // Data representations for GPU micro-ops + // src idx tracks which subspace each rect/point + // belongs to and allows multiple subspaces to be + // computed together in a micro-op + template + struct 
RectDesc { + Rect rect; + size_t src_idx; + }; + + template + struct PointDesc { + Point point; + size_t src_idx; + }; + + // Combines one or multiple index spaces into a single struct + // If multiple, offsets tracks transitions between spaces + template + struct collapsed_space { + SparsityMapEntry* entries_buffer; + size_t num_entries; + size_t* offsets; + size_t num_children; + Rect bounds; + SparsityMapEntry* host_entries_owner = nullptr; + }; + + // Stores everything necessary to query a BVH + // Used with GPUMicroOp::build_bvh + template + struct BVH { + int root; + size_t num_leaves; + Rect* boxes; + uint64_t* indices; + size_t* labels; + int* childLeft; + int* childRight; + }; + + struct arena_oom : std::bad_alloc { + const char* what() const noexcept override { return "arena_oom"; } + }; + + class Arena { + public: + using byte = std::byte; + + Arena() noexcept : location(Memory::NO_MEMORY), base_(nullptr), cap_(0), parity_(false), left_(0), right_(0), base_left_(0), base_right_(0) {} + Arena(void* buffer, size_t bytes, Memory location) noexcept + : location(location), base_(reinterpret_cast(buffer)), cap_(bytes), parity_(false), left_(0), right_(0), base_left_(0), base_right_(0) {} + Arena(RegionInstance buffer) : Arena(buffer.pointer_untyped(0, buffer.get_layout()->bytes_used), buffer.get_layout()->bytes_used, buffer.get_location()) {} + + size_t capacity() const noexcept { return cap_; } + size_t used() const noexcept { return left_ + right_; } + + size_t mark() const noexcept { + return parity_ ? right_ : left_; + } + + void rollback(size_t mark) noexcept { + if (parity_) { + right_ = mark; + } else { + left_ = mark; + } + } + + size_t mark(bool dir) const noexcept { + return dir ? 
right_ : left_; + } + + void rollback(size_t mark, bool dir) noexcept { + if (dir) { + right_ = mark; + } else { + left_ = mark; + } + } + + template + T* alloc(size_t count = 1) { + static_assert(!std::is_void_v, "alloc is invalid"); + return reinterpret_cast(alloc_bytes(count * sizeof(T), alignof(T))); + } + + void* alloc_bytes(size_t bytes, size_t align = alignof(std::max_align_t)) { + return parity_ ? alloc_right_bytes(bytes, align) : alloc_left_bytes(bytes, align); + } + + void flip_parity(void) noexcept { + if (parity_) { + // switching from right to left + left_ = base_left_; + } else { + // switching from left to right + right_ = base_right_; + } + parity_ = !parity_; + } + + void commit(bool parity) noexcept { + if (parity) { + base_right_ = right_; + } else { + base_left_ = left_; + } + } + + void reset(bool parity) noexcept { + if (parity) { + base_right_ = 0; + right_ = 0; + } else { + base_left_ = 0; + left_ = 0; + } + } + + bool get_parity(void) const noexcept { + return parity_; + } + + void start(void) noexcept { + left_ = base_left_; + right_ = base_right_; + parity_ = false; + } + + Memory location; + + private: + + void* alloc_left_bytes(size_t bytes, size_t align = alignof(std::max_align_t)) { + const size_t aligned = align_up(left_, align); + if (aligned + bytes + right_ > cap_) { + throw arena_oom{}; + } + void* p = base_ + aligned; + left_ = aligned + bytes; + return p; + } + + void* alloc_right_bytes(size_t bytes, size_t align = alignof(std::max_align_t)) { + if (bytes + right_ > cap_) { + throw arena_oom{}; + } + const size_t aligned = align_down(cap_ - right_ - bytes, align); + if (aligned < left_) { + throw arena_oom{}; + } + void *p = base_ + aligned; + right_ = cap_ - aligned; + return p; + } + + static size_t align_up(size_t x, size_t a) noexcept { + return (x + (a - 1)) & ~(a - 1); + } + + static size_t align_down(size_t x, size_t a) noexcept { + return x & ~(a - 1); + } + + byte* base_; + size_t cap_; + bool parity_; + size_t left_; 
+ size_t right_; + size_t base_left_; + size_t base_right_; + }; + + +#endif template class OverlapTester { @@ -147,6 +349,57 @@ namespace Realm { std::vector *> extra_deps; }; +#ifdef REALM_USE_CUDA + //The parent class for all GPU partitioning micro-ops. Provides output utility functions + + template + class GPUMicroOp : public PartitioningMicroOp { + public: + GPUMicroOp(void) = default; + GPUMicroOp(NodeID _requestor, AsyncMicroOp *_async_microop) + : PartitioningMicroOp(_requestor, _async_microop) {} + virtual ~GPUMicroOp(void) = default; + + virtual void execute(void) = 0; + + static void shatter_rects(collapsed_space & inst_space, size_t &num_completed, CUstream stream); + + template + static void collapse_multi_space(const std::vector& field_data, collapsed_space &out_space, Arena &my_arena, CUstream stream); + + static void collapse_parent_space(const IndexSpace& parent_space, collapsed_space &out_space, Arena &my_arena, CUstream stream); + + static void build_bvh(const collapsed_space &space, BVH &bvh, Arena &my_arena, CUstream stream); + + template + static void construct_input_rectlist(const collapsed_space &lhs, const collapsed_space &rhs, out_t* &d_valid_rects, size_t& out_size, uint32_t* counters, uint32_t* out_offsets, Arena &my_arena, CUstream stream); + + template + static void volume_prefix_sum(const out_t* d_rects, size_t total_rects, size_t* &d_prefix_rects, size_t& num_pts, Arena &my_arena, CUstream stream); + + template + void complete_pipeline(PointDesc* d_points, size_t total_pts, RectDesc* &d_out_rects, size_t &out_rects, Arena &my_arena, const Container& ctr, IndexFn getIndex, MapFn getMap); + + template + void complete_rect_pipeline(RectDesc* d_rects, size_t total_rects, RectDesc* &d_out_rects, size_t &out_rects, Arena &my_arena, const Container& ctr, IndexFn getIndex, MapFn getMap); + + template + void complete1d_pipeline(RectDesc* d_rects, size_t total_rects, RectDesc* &d_out_rects, size_t &out_rects, Arena &my_arena, const Container& 
ctr, IndexFn getIndex, MapFn getMap); + + void split_output(RectDesc* d_rects, size_t total_rects, std::vector *> &output_instances, std::vector &output_counts, Arena &my_arena); + + template + void send_output(RectDesc* d_rects, size_t total_rects, Arena &my_arena, const Container& ctr, IndexFn getIndex, MapFn getMap); + + virtual bool is_image_microop() const { return false; } + + bool exclusive = false; + Cuda::GPU* gpu; + Cuda::GPUStream* stream; + + }; +#endif + //////////////////////////////////////// // @@ -252,10 +505,25 @@ namespace Realm { static ActiveMessageHandlerReg areg; }; + // Finds a memory of the specified kind. Returns true on success, false otherwise. + inline bool choose_proc(Processor &best_proc, Memory location) + { + std::vector affinities; + unsigned best_bandwidth = 0; + best_proc = Processor::NO_PROC; + Machine::get_machine().get_proc_mem_affinity(affinities, Processor::NO_PROC, location); + for (auto affinity : affinities) { + if (affinity.bandwidth > best_bandwidth) { + best_bandwidth = affinity.bandwidth; + best_proc = affinity.p; + } + } + return best_proc != Processor::NO_PROC; + } + }; #include "realm/deppart/partitions.inl" #endif // REALM_PARTITIONS_H - diff --git a/src/realm/deppart/partitions_gpu.cu b/src/realm/deppart/partitions_gpu.cu new file mode 100644 index 0000000000..b842e93f58 --- /dev/null +++ b/src/realm/deppart/partitions_gpu.cu @@ -0,0 +1,29 @@ +/* Copyright 2024 Stanford University, NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// per‐dimension instantiator for the GPU version of +// ImageMicroOp<…>::gpu_populate_bitmasks_ptrs + + +#include "realm/deppart/partitions_gpu_impl.hpp" +#include "realm/deppart/inst_helper.h" + +namespace Realm { + #define DOIT(N,T) \ + template class GPUMicroOp; + + FOREACH_NT(DOIT) + +} // namespace Realm \ No newline at end of file diff --git a/src/realm/deppart/partitions_gpu_impl.hpp b/src/realm/deppart/partitions_gpu_impl.hpp new file mode 100644 index 0000000000..e293419b9a --- /dev/null +++ b/src/realm/deppart/partitions_gpu_impl.hpp @@ -0,0 +1,1850 @@ +#pragma once +#include "deppart_config.h" +#include "partitions.h" +#ifdef REALM_USE_NVTX +#include "realm/nvtx.h" +#endif +#include "realm/cuda/cuda_internal.h" +#include "realm/deppart/partitions_gpu_kernels.hpp" +#include + +//CUDA ERROR CHECKING MACROS + +#define CUDA_CHECK(call, stream) \ + do { \ + cudaError_t err = (call); \ + if (err != cudaSuccess) { \ + std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__ \ + << " '" #call "' failed with " \ + << cudaGetErrorString(err) << " (" << err << ")\n"; \ + assert(false); \ + } \ + } while (0) + +#define KERNEL_CHECK(stream) \ + do { \ + cudaError_t err = cudaGetLastError(); \ + if (err != cudaSuccess) { \ + std::cerr << "Kernel launch failed at " << __FILE__ << ":" << __LINE__ \ + << ": " << cudaGetErrorString(err) << "\n"; \ + assert(false); \ + } \ + } while (0) + +#define THREADS_PER_BLOCK 256 + +#define COMPUTE_GRID(num_items) \ + (((num_items) + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK) + +#define CUDA_HOST_CHECK(call) \ + do { \ + cudaError_t err = (call); \ + if (err != cudaSuccess) { \ + std::cerr << "CUDA host error at " << __FILE__ << ":" << __LINE__ \ + << " '" #call "' failed with " \ + << cudaGetErrorString(err) << " (" << err << ")\n"; \ + assert(false); \ + } \ + } while (0) + + +//NVTX macros to only add ranges if 
defined. +#ifdef REALM_USE_NVTX + +#include + +inline int32_t next_nvtx_payload() { + static std::atomic counter{0}; + return counter.fetch_add(1, std::memory_order_relaxed); +} + +#define NVTX_CAT2(a, b) a##b +#define NVTX_CAT(a, b) NVTX_CAT2(a, b) + +#define NVTX_DEPPART(message) \ + nvtxScopedRange NVTX_CAT(nvtx_, __LINE__)("cuda", #message, next_nvtx_payload()) + +#else + + #define NVTX_DEPPART(message) do { } while (0) + +#endif + +namespace Realm { + + template + inline T *deppart_host_alloc(size_t count, unsigned flags = cudaHostAllocPortable) + { + if(count == 0) return nullptr; + void *ptr = nullptr; + CUDA_HOST_CHECK(cudaHostAlloc(&ptr, count * sizeof(T), flags)); + return reinterpret_cast(ptr); + } + + inline void deppart_host_free(void *ptr) + { + if(ptr != nullptr) + CUDA_HOST_CHECK(cudaFreeHost(ptr)); + } + + // Used by cub::DeviceReduce to compute bad GPU approximation. + template + struct UnionRectOp { + __host__ __device__ + Rect operator()(const Rect& a, + const Rect& b) const { + Rect r; + for(int d=0; d b.hi[d] ? a.hi[d] : b.hi[d]; + } + return r; + } + }; + + // Used to compute prefix sum by volume for an array of Rects or RectDescs. + template + struct RectVolumeOp { + __device__ __forceinline__ + size_t operator()(const out_t& r) const { + if constexpr (std::is_same_v, out_t>) { + return r.volume(); + } else { + return r.rect.volume(); + } + } + }; + + // Finds a memory of the specified kind. Returns true on success, false otherwise. 
+ inline bool find_memory(Memory &output, Memory::Kind kind, Memory input = Memory::NO_MEMORY) + { + std::vector affinities; + unsigned best_bandwidth = 0; + output = Memory::NO_MEMORY; + Machine::get_machine().get_mem_mem_affinity(affinities, input, Memory::NO_MEMORY); + for (auto affinity : affinities) { + if (affinity.m2.kind() != kind) { + continue; + } + if (affinity.bandwidth > best_bandwidth) { + best_bandwidth = affinity.bandwidth; + output = affinity.m2; + } + } + if (output == Memory::NO_MEMORY) { + std::set memories; + Machine::get_machine().get_all_memories(memories); + for (auto mem : memories) { + if (mem.kind() == kind) { + output = mem; + return true; + } + } + return false; + } + return true; + } + + template + void GPUMicroOp::shatter_rects(collapsed_space & inst_space, size_t &num_completed, CUstream stream) { + + NVTX_DEPPART(shatter_rects); + size_t new_size = (inst_space.entries_buffer[num_completed].bounds.volume() + 1) / 2; + assert(new_size > 0); + size_t num_new_entries = 0; + std::vector offsets(inst_space.num_children + 1); + std::vector new_offsets(inst_space.num_children + 1); + CUDA_CHECK(cudaMemcpyAsync(offsets.data(), inst_space.offsets, (inst_space.num_children + 1) * sizeof(size_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + for (size_t i = 0; i < inst_space.num_children; ++i) { + new_offsets[i] = num_new_entries; + if (offsets[i+1] <= num_completed) { + continue; + } + for (size_t j = offsets[i]; j < offsets[i+1]; ++j) { + if (j >= num_completed) { + num_new_entries += (inst_space.entries_buffer[j].bounds.volume() + new_size - 1) / new_size; + } + } + } + new_offsets[inst_space.num_children] = num_new_entries; + CUDA_CHECK(cudaMemcpyAsync(inst_space.offsets, new_offsets.data(), (inst_space.num_children + 1) * sizeof(size_t), cudaMemcpyHostToDevice, stream), stream); + SparsityMapEntry *new_entries_ptr = deppart_host_alloc>(num_new_entries); + + size_t write_loc = 0; + for 
(size_t i = num_completed; i < inst_space.num_entries; i++) { + Rect bounds = inst_space.entries_buffer[i].bounds; + if (bounds.volume() <= new_size) { + new_entries_ptr[write_loc] = inst_space.entries_buffer[i]; + write_loc++; + continue; + } + size_t count = (bounds.volume() + new_size - 1) / new_size; + // split in the largest dimension available + int split_dim = 0; + T total = std::max(bounds.hi[0] - bounds.lo[0] + 1, T(0)); + if(N > 1) { + for(int d = 1; d < N; d++) { + T extent = std::max(bounds.hi[d] - bounds.lo[d] + 1, T(0)); + if(extent > total) { + total = extent; + split_dim = d; + } + } + } + T px = bounds.lo[split_dim]; + // have to divide before multiplying to avoid overflow + T base_span_size = total / count; + T base_span_rem = total - (base_span_size * count); + T leftover = 0; + for(size_t j = 0; j < count; j++) { + new_entries_ptr[write_loc] = inst_space.entries_buffer[i]; + T nx = px + (base_span_size - 1); + if(base_span_rem != 0) { + leftover += base_span_rem; + if(leftover >= T(count)) { + nx += 1; + leftover -= count; + } + } + new_entries_ptr[write_loc].bounds.lo[split_dim] = px; + new_entries_ptr[write_loc].bounds.hi[split_dim] = nx; + px = nx + 1; + write_loc++; + } + } + + num_completed = 0; + inst_space.entries_buffer = new_entries_ptr; + inst_space.num_entries = num_new_entries; + deppart_host_free(inst_space.host_entries_owner); + inst_space.host_entries_owner = new_entries_ptr; + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + } + + //Given a list of spaces, compacts them all into one collapsed_space + template + template + void GPUMicroOp::collapse_multi_space(const std::vector& spaces, collapsed_space &out_space, Arena &my_arena, CUstream stream) + { + + NVTX_DEPPART(collapse_multi_space); + out_space.bounds = Rect::make_empty(); + + char *val = std::getenv("SHATTER_SIZE"); // or any env var + int shatter_size = 1; //default + if (val) { + shatter_size = atoi(val); + } + // We need space_offsets to preserve which space each 
rectangle came from + std::vector space_offsets(spaces.size() + 1); + + // Determine size of allocation for combined rects. + out_space.num_entries = 0; + + for (size_t i = 0; i < spaces.size(); ++i) { + space_offsets[i] = out_space.num_entries; + IndexSpace my_space; + if constexpr (std::is_same_v>) { + my_space = spaces[i]; + } else { + my_space = spaces[i].index_space; + } + out_space.bounds = out_space.bounds.union_bbox(my_space.bounds); + if (my_space.dense()) { + if constexpr (std::is_same_v>) { + out_space.num_entries += 1; + } else { + out_space.num_entries += shatter_size; + } + } else { + out_space.num_entries += my_space.sparsity.impl()->get_entries().size(); + } + } + space_offsets[spaces.size()] = out_space.num_entries; + + //We copy into one contiguous host buffer, then copy to device + SparsityMapEntry* h_entries = deppart_host_alloc>(out_space.num_entries); + + if (my_arena.capacity()==0) { + out_space.entries_buffer = h_entries; + out_space.host_entries_owner = h_entries; + } else { + out_space.entries_buffer = my_arena.alloc >(out_space.num_entries); + } + + + //Now we fill the host array with all rectangles + size_t pos = 0; + for (size_t i = 0; i < spaces.size(); ++i) { + IndexSpace my_space; + if constexpr (std::is_same_v>) { + my_space = spaces[i]; + } else { + my_space = spaces[i].index_space; + } + if (my_space.dense()) { + if constexpr (std::is_same_v>) { + SparsityMapEntry entry; + entry.bounds = my_space.bounds; + memcpy(h_entries + pos, &entry, sizeof(SparsityMapEntry)); + ++pos; + } else { + std::vector > tmp(shatter_size); + int ppt = (my_space.bounds.hi[0] - my_space.bounds.lo[0]+1) / shatter_size; + for (int i = 0; i < shatter_size; ++i) { + Rect new_rect = my_space.bounds; + new_rect.lo[0] = my_space.bounds.lo[0] + i * ppt; + new_rect.hi[0] = (i == shatter_size - 1) ? 
my_space.bounds.hi[0] : (new_rect.lo[0] + ppt - 1); + SparsityMapEntry entry; + entry.bounds = new_rect; + entry.sparsity.id = 0; + entry.bitmap = 0; + tmp[i] = entry; + } + memcpy(h_entries + pos, tmp.data(), tmp.size() * sizeof(SparsityMapEntry)); + pos += shatter_size; + } + } else { + span> tmp = my_space.sparsity.impl()->get_entries(); + memcpy(h_entries + pos, tmp.data(), tmp.size() * sizeof(SparsityMapEntry)); + pos += tmp.size(); + } + } + + //Now we copy our entries and offsets to the device + CUDA_CHECK(cudaMemcpyAsync(out_space.offsets, space_offsets.data(), (spaces.size() + 1) * sizeof(size_t), cudaMemcpyHostToDevice, stream), stream); + if (my_arena.capacity() != 0) { + CUDA_CHECK(cudaMemcpyAsync(out_space.entries_buffer, h_entries, out_space.num_entries * sizeof(SparsityMapEntry), cudaMemcpyHostToDevice, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + deppart_host_free(h_entries); + } + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + } + + // Only real work here is getting dense/sparse into a single collapsed_space. 
+ template + void GPUMicroOp::collapse_parent_space(const IndexSpace& parent_space, collapsed_space &out_space, Arena &my_arena, cudaStream_t stream) + { + + NVTX_DEPPART(collapse_parent_space); + if (parent_space.dense()) { + SparsityMapEntry entry; + entry.bounds = parent_space.bounds; + out_space.entries_buffer = my_arena.alloc>(1); + out_space.num_entries = 1; + out_space.bounds = parent_space.bounds; + CUDA_CHECK(cudaMemcpyAsync(out_space.entries_buffer, &entry, sizeof(SparsityMapEntry), cudaMemcpyHostToDevice, stream), stream); + } else { + span> tmp = parent_space.sparsity.impl()->get_entries(); + out_space.num_entries = tmp.size(); + out_space.entries_buffer = my_arena.alloc>(tmp.size()); + out_space.bounds = parent_space.bounds; + CUDA_CHECK(cudaMemcpyAsync(out_space.entries_buffer, tmp.data(), tmp.size() * sizeof(SparsityMapEntry), cudaMemcpyHostToDevice, stream), stream); + } + out_space.offsets = nullptr; + out_space.num_children = 1; + } + + // Given a collapsed space, builds a (potentially marked) bvh over that space. + // Based on Tero Karras' Maximizing Parallelism in the Construction of BVHs, Octrees, and k-d Trees + template + void GPUMicroOp::build_bvh(const collapsed_space &space, BVH &result, Arena &my_arena, cudaStream_t stream) + { + NVTX_DEPPART(build_bvh); + //We want to keep the entire BVH that we return in one instance for convenience. + size_t indices_instance_size = space.num_entries * sizeof(uint64_t); + size_t labels_instance_size = space.offsets == nullptr ? 
0 : space.num_entries * sizeof(size_t); + size_t boxes_instance_size = (2*space.num_entries - 1) * sizeof(Rect); + size_t child_instance_size = (2*space.num_entries - 1) * sizeof(int); + + size_t total_instance_size = indices_instance_size + labels_instance_size + boxes_instance_size + 2 * child_instance_size; + char* bvh_ptr = my_arena.alloc(total_instance_size); + + result.num_leaves = space.num_entries; + + size_t curr_idx = 0; + result.indices = reinterpret_cast(bvh_ptr + curr_idx); + curr_idx += indices_instance_size; + result.labels = space.offsets == nullptr ? nullptr : reinterpret_cast(bvh_ptr + curr_idx); + curr_idx += labels_instance_size; + result.boxes = reinterpret_cast*>(bvh_ptr + curr_idx); + curr_idx += boxes_instance_size; + result.childLeft = reinterpret_cast(bvh_ptr + curr_idx); + curr_idx += child_instance_size; + result.childRight = reinterpret_cast(bvh_ptr + curr_idx); + + size_t prev = my_arena.mark(); + + // Bounds used for morton code computation. + Rect* d_global_bounds = my_arena.alloc>(1); + CUDA_CHECK(cudaMemcpyAsync(d_global_bounds, &space.bounds, sizeof(Rect), cudaMemcpyHostToDevice, stream), stream); + + // These are intermediate instances we'll destroy before returning. + char* d_morton_visit = my_arena.alloc(2 * space.num_entries * max(sizeof(uint64_t), sizeof(int))); + uint64_t* d_morton_codes = reinterpret_cast(d_morton_visit); + + size_t intermed = my_arena.mark(); + + uint64_t* d_indices_in = my_arena.alloc(space.num_entries); + + // We compute morton codes for each leaf and sort, labeling if necessary. 
+ bvh_build_morton_codes<<>>(space.entries_buffer, space.offsets, d_global_bounds, space.num_entries, space.num_children, d_morton_codes, d_indices_in, result.labels); + KERNEL_CHECK(stream); + + uint64_t* d_morton_codes_out = d_morton_codes + space.num_entries; + uint64_t* d_indices_out = result.indices; + + void *bvh_temp = nullptr; + size_t bvh_temp_bytes = 0; + cub::DeviceRadixSort::SortPairs(bvh_temp, bvh_temp_bytes, d_morton_codes, d_morton_codes_out, d_indices_in, + d_indices_out, space.num_entries, 0, 64, stream); + bvh_temp = reinterpret_cast(my_arena.alloc(bvh_temp_bytes)); + cub::DeviceRadixSort::SortPairs(bvh_temp, bvh_temp_bytes, d_morton_codes, d_morton_codes_out, d_indices_in, + d_indices_out, space.num_entries, 0, 64, stream); + + std::swap(d_morton_codes, d_morton_codes_out); + + my_arena.rollback(intermed); + + + // Another temporary instance. + int* d_parent = my_arena.alloc(2*space.num_entries - 1); + CUDA_CHECK(cudaMemsetAsync(d_parent, -1, (2*space.num_entries - 1) * sizeof(int), stream), stream); + + // Here's where we actually build the BVH + int n = (int) space.num_entries; + bvh_build_radix_tree_kernel<<< COMPUTE_GRID(space.num_entries - 1), THREADS_PER_BLOCK, 0, stream>>>(d_morton_codes, result.indices, n, result.childLeft, result.childRight, d_parent); + KERNEL_CHECK(stream); + + // Figure out which node didn't get its parent set. + int* d_root = my_arena.alloc(1); + + CUDA_CHECK(cudaMemsetAsync(d_root, -1, sizeof(int), stream), stream); + + bvh_build_root_kernel<<< COMPUTE_GRID(2 * space.num_entries - 1), THREADS_PER_BLOCK, 0, stream>>>(d_root, d_parent, space.num_entries); + KERNEL_CHECK(stream); + + CUDA_CHECK(cudaMemcpyAsync(&result.root, d_root, sizeof(int), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + // Now we materialize the tree into something the client can query. 
+ bvh_init_leaf_boxes_kernel<<>>(space.entries_buffer, result.indices, space.num_entries, result.boxes); + KERNEL_CHECK(stream); + + int* d_visitCount = reinterpret_cast(d_morton_visit); + CUDA_CHECK(cudaMemsetAsync(d_visitCount, 0, (2*space.num_entries - 1) * sizeof(int), stream), stream); + + bvh_merge_internal_boxes_kernel < N, T ><<< COMPUTE_GRID(space.num_entries), THREADS_PER_BLOCK, 0, stream>>>(space.num_entries, result.childLeft, result.childRight, d_parent, result.boxes, d_visitCount); + KERNEL_CHECK(stream); + + // Cleanup. + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + my_arena.rollback(prev); + + } + + // Intersects two collapsed spaces, where lhs is always instances and rhs is either parent or sources/targets. + // If rhs is sources/targets, we mark the intersected rectangles by where they came from. + // If the intersection is costly, we accelerate with a BVH. + template + template + void GPUMicroOp::construct_input_rectlist(const collapsed_space &lhs, const collapsed_space &rhs, out_t* &d_valid_rects, size_t& out_size, uint32_t* counters, uint32_t* out_offsets, Arena &my_arena, cudaStream_t stream) + { + + NVTX_DEPPART(construct_input_rectlist); + CUDA_CHECK(cudaMemsetAsync(counters, 0, (lhs.num_children) * sizeof(uint32_t), stream), stream); + + BVH my_bvh; + bool bvh_valid = rhs.num_children < rhs.num_entries && lhs.num_children < lhs.num_entries && lhs.num_entries > 1000; + if (bvh_valid) { + build_bvh(rhs, my_bvh, my_arena, stream); + } + + // First pass: figure out how many rectangles survive intersection. 
+ if (!bvh_valid) { + intersect_input_rects<<>>(lhs.entries_buffer, rhs.entries_buffer, lhs.offsets, nullptr, rhs.offsets, lhs.num_entries, rhs.num_entries, lhs.num_children, rhs.num_children, counters, nullptr); + } else { + query_input_bvh<<>>(lhs.entries_buffer, lhs.offsets, my_bvh.root, my_bvh.childLeft, my_bvh.childRight, my_bvh.indices, my_bvh.labels, my_bvh.boxes, lhs.num_entries, my_bvh.num_leaves, lhs.num_children, nullptr, counters, nullptr); + } + KERNEL_CHECK(stream); + + + // Prefix sum over instances (small enough to keep on host). + std::vector h_inst_counters(lhs.num_children+1); + h_inst_counters[0] = 0; // prefix sum starts at 0 + CUDA_CHECK(cudaMemcpyAsync(h_inst_counters.data()+1, counters, lhs.num_children * sizeof(uint32_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + for (size_t i = 0; i < lhs.num_children; ++i) { + h_inst_counters[i+1] += h_inst_counters[i]; + } + + out_size = h_inst_counters[lhs.num_children]; + + if (out_size==0) { + return; + } + + //Moving on... + my_arena.flip_parity(); + + // Non-empty rectangles from the intersection. + d_valid_rects = my_arena.alloc(out_size); + + // Where each instance should start writing its rectangles. + CUDA_CHECK(cudaMemcpyAsync(out_offsets, h_inst_counters.data(), (lhs.num_children + 1) * sizeof(uint32_t), cudaMemcpyHostToDevice, stream), stream); + + // Reset counters. + CUDA_CHECK(cudaMemsetAsync(counters, 0, lhs.num_children * sizeof(uint32_t), stream), stream); + + // Second pass: recompute intersection, but this time write to output. 
+ if (!bvh_valid) { + intersect_input_rects<<>>(lhs.entries_buffer, rhs.entries_buffer, lhs.offsets, out_offsets, rhs.offsets, lhs.num_entries, rhs.num_entries, lhs.num_children, rhs.num_children, counters, d_valid_rects); + } else { + query_input_bvh<<>>(lhs.entries_buffer, lhs.offsets, my_bvh.root, my_bvh.childLeft, my_bvh.childRight, my_bvh.indices, my_bvh.labels, my_bvh.boxes, lhs.num_entries, my_bvh.num_leaves, lhs.num_children, out_offsets, counters, d_valid_rects); + } + KERNEL_CHECK(stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + } + + // Prefix sum an array of Rects or RectDescs by volume. + template + template + void GPUMicroOp::volume_prefix_sum(const out_t* d_rects, size_t total_rects, size_t* &d_prefix_rects, size_t& num_pts, Arena &my_arena, cudaStream_t stream) + { + + NVTX_DEPPART(volume_prefix_sum); + d_prefix_rects = my_arena.alloc(total_rects+1); + CUDA_CHECK(cudaMemsetAsync(d_prefix_rects, 0, sizeof(size_t), stream), stream); + + size_t prev = my_arena.mark(); + + // Build the CUB transform‐iterator. + using VolIter = cub::TransformInputIterator< + size_t, // output type + RectVolumeOp, // functor + const out_t* // underlying input iterator + >; + VolIter d_volumes(d_rects, RectVolumeOp()); + + void* d_temp = nullptr; + size_t rect_temp_bytes = 0; + cub::DeviceScan::InclusiveSum( + /* d_temp_storage */ nullptr, + /* temp_bytes */ rect_temp_bytes, + /* d_in */ d_volumes, + /* d_out */ d_prefix_rects + 1, // shift by one so prefix[1]..prefix[n] + /* num_items */ total_rects, stream); + + d_temp = reinterpret_cast(my_arena.alloc(rect_temp_bytes)); + cub::DeviceScan::InclusiveSum( + /* d_temp_storage */ d_temp, + /* temp_bytes */ rect_temp_bytes, + /* d_in */ d_volumes, + /* d_out */ d_prefix_rects + 1, + /* num_items */ total_rects, stream); + + + //Number of points across all rectangles (also our total output count). 
+ CUDA_CHECK(cudaMemcpyAsync(&num_pts, &d_prefix_rects[total_rects], sizeof(size_t), cudaMemcpyDeviceToHost, stream), stream); + + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + my_arena.rollback(prev); + } + + template + struct SegmentedMax { + __device__ __forceinline__ + HiFlag operator()(HiFlag a, HiFlag b) const { + // if b.head==1, start new segment at b; otherwise merge with running max + return b.head + ? b + : HiFlag{ a.hi > b.hi ? a.hi : b.hi , a.head }; + } + }; + + struct SegmentedSum { + __device__ __forceinline__ + DeltaFlag operator()(DeltaFlag a, DeltaFlag b) const { + // if b.head==1, start new segment at b; otherwise merge with running max + return b.head + ? b + : DeltaFlag{ a.delta + b.delta , a.head }; + } + }; + + struct CustomSum + { + template + __device__ __forceinline__ + T operator()(const T &a, const T &b) const { + return b+a; + } + }; + + + /* + * Input: An array of rectangles (potentially overlapping) with associated + * src indices, where all the rectangles with a given src idx together represent an exact covering + * of the partitioning output for that index. + * Output: A disjoint, coalesced array of rectangles sorted by src idx that it then sends off + * to the send output function, which constructs the final sparsity map. + * Approach: The difficult part is constructing a disjoint covering. To do so, collect all the corners from all the + * rectangles as the unique "boundaries" for each dimension and mark them with the parity for the number of dimensions + * in which they are the hi+1 coord (we add 1 to make intervals half-open). This means that if you prefix sum in each dimension, + * for any given rectangle anything internal will sum to 1, and anything external will sum to 0. To understand the intuition, + * see the illustration below for the rectangle [(0,0), (2,2)] + * Corners: (0,0), (0,3), (3,0), (3,3) + * Parities: 0 hi-> +1, 1 hi -> -1, 1 hi -> -1, 2 hi -> +1 + * Computation: + * Initial Markings + * 0 1 2 3 4 ... 
+ * 0 +1 -1 + * 1 + * 2 + * 3 -1 +1 + * 4 + * ... + * Prefix sum by Y + * 0 1 2 3 4 ... + * 0 +1 -1 + * 1 1 -1 + * 2 1 -1 + * 3 0 0 + * 4 0 0 + * ... + * Prefix sum by X + * 0 1 2 3 4 ... + * 0 +1 1 1 0 0 ... + * 1 1 1 1 0 0 ... + * 2 1 1 1 0 0 ... + * 3 0 0 0 0 0 ... + * 4 0 0 0 0 0 ... + * ... + * Note that all the points in the rectangle end up labeled 1, and all the points outside labeled 0. In the actual computation, we use segments + * rather than points, where a segment accounts for all points between two consecutive boundaries. Because a prefix sum is a linear operator, when + * we extend the computation above to multiple overlapping rectangles, you end up with included segments labeled with a count of how many rectangles include them, + * and excluded segments labeled with 0. Thus, for the last dimension, we emit all segments with sums > 0 as disjoint output rectangles. We can then dump these + * into the sort + coalesce pipeline. + */ + template + template + void GPUMicroOp::complete_rect_pipeline(RectDesc* d_rects, size_t total_rects, RectDesc* &d_out_rects, size_t &out_rects, Arena &my_arena, const Container& ctr, IndexFn getIndex, MapFn getMap) + { + + //1D case is much simpler + if (N==1) { + this->complete1d_pipeline(d_rects, total_rects, d_out_rects, out_rects, my_arena, ctr, getIndex, getMap); + return; + } + NVTX_DEPPART(complete_rect_pipeline); + CUstream stream = this->stream->get_stream(); + + assert(!my_arena.get_parity()); + size_t beginning = my_arena.mark(); + + uint32_t* srcs_ptr = my_arena.alloc(4 * total_rects); + T* crds_ptr = my_arena.alloc(4 * total_rects); + uint8_t* heads_ptr = my_arena.alloc(2 * total_rects); + size_t* sums_ptr = my_arena.alloc(2 * total_rects); + + size_t left_restore = my_arena.mark(); + size_t right_restore = my_arena.mark(true); + + size_t *B_starts[N]; + size_t *B_ends[N]; + + T* B_coord[N]; + size_t B_size[N]; + + int threads_per_block = 256; + size_t grid_size = (total_rects + threads_per_block - 1) / 
threads_per_block; + + size_t orig_tmp = 0; + size_t temp_restore = my_arena.mark(); + void *tmp_storage = nullptr; + + //Our first step is to find all the unique "boundaries" in each dimension (lo coord or hi+1 coord) + { + NVTX_DEPPART(mark_endpoints); + for (int d = 0; d < N; ++d) { + + //We need the coordinates to be sorted by our curent dim and separated by src idx + grid_size = (total_rects + threads_per_block - 1) / threads_per_block; + uint32_t* d_srcs_in = srcs_ptr; + uint32_t* d_srcs_out = srcs_ptr + 2* total_rects; + T* d_coord_keys_in = crds_ptr; + T* d_coord_keys_out = crds_ptr + 2 * total_rects; + mark_endpoints<<>>(d_rects, total_rects, d, d_srcs_in, d_coord_keys_in); + KERNEL_CHECK(stream); + size_t temp_bytes; + cub::DeviceRadixSort::SortPairs(nullptr, temp_bytes, + d_coord_keys_in, d_coord_keys_out, + d_srcs_in, d_srcs_out, + 2 * total_rects, 0, 8*sizeof(T), stream); + if (temp_bytes > orig_tmp) { + if (orig_tmp > 0) { + my_arena.rollback(temp_restore); + } + orig_tmp = temp_bytes; + tmp_storage = reinterpret_cast(my_arena.alloc(temp_bytes)); + } + cub::DeviceRadixSort::SortPairs(tmp_storage, temp_bytes, + d_coord_keys_in, d_coord_keys_out, + d_srcs_in, d_srcs_out, + 2 * total_rects, 0, 8*sizeof(T), stream); + std::swap(d_srcs_in, d_srcs_out); + std::swap(d_coord_keys_in, d_coord_keys_out); + cub::DeviceRadixSort::SortPairs(nullptr, temp_bytes, + d_srcs_in, d_srcs_out, + d_coord_keys_in, d_coord_keys_out, + 2 * total_rects, 0, 8*sizeof(uint32_t), stream); + if (temp_bytes > orig_tmp) { + if (orig_tmp > 0) { + my_arena.rollback(temp_restore); + } + orig_tmp = temp_bytes; + tmp_storage = reinterpret_cast(my_arena.alloc(temp_bytes)); + } + cub::DeviceRadixSort::SortPairs(tmp_storage, temp_bytes, + d_srcs_in, d_srcs_out, + d_coord_keys_in, d_coord_keys_out, + 2 * total_rects, 0, 8*sizeof(uint32_t), stream); + + //Now mark the unique keys + grid_size = (2*total_rects + threads_per_block - 1) / threads_per_block; + uint8_t * d_heads = heads_ptr; + 
size_t *d_output = sums_ptr; + mark_heads<<>>(d_srcs_out, d_coord_keys_out, 2 * total_rects, d_heads); + KERNEL_CHECK(stream); + + cub::DeviceScan::ExclusiveSum(nullptr, temp_bytes, d_heads, d_output, 2 * total_rects, stream); + if (temp_bytes > orig_tmp) { + if (orig_tmp > 0) { + my_arena.rollback(temp_restore); + } + orig_tmp = temp_bytes; + tmp_storage = reinterpret_cast(my_arena.alloc(temp_bytes)); + } + cub::DeviceScan::ExclusiveSum(tmp_storage, temp_bytes, d_heads, d_output, 2 * total_rects, stream); + + size_t num_unique; + uint8_t last_bit; + CUDA_CHECK(cudaMemcpyAsync(&num_unique, &d_output[2*total_rects-1], sizeof(size_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaMemcpyAsync(&last_bit, &d_heads[2*total_rects-1], sizeof(uint8_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + num_unique += last_bit; + + my_arena.flip_parity(); + assert(my_arena.get_parity()); + my_arena.rollback(right_restore); + + //Collect all the data we'll need later for this dimension - starts/ends by src, unique boundaries, unique boundaries count + B_starts[d] = my_arena.alloc(2 *ctr.size()); + B_ends[d] = B_starts[d] + ctr.size(); + B_coord[d] = my_arena.alloc(num_unique); + B_size[d] = num_unique; + + right_restore = my_arena.mark(); + my_arena.flip_parity(); + assert(!my_arena.get_parity()); + my_arena.rollback(left_restore); + + CUDA_CHECK(cudaMemsetAsync(B_starts[d], 0, ctr.size() * sizeof(size_t), stream), stream); + CUDA_CHECK(cudaMemsetAsync(B_ends[d], 0, ctr.size() * sizeof(size_t), stream), stream); + scatter_unique<<>>(d_srcs_out, d_coord_keys_out, d_output, d_heads, 2 * total_rects, B_starts[d], B_ends[d], B_coord[d]); + KERNEL_CHECK(stream); + std::vector d_starts_host(ctr.size()), d_ends_host(ctr.size()); + CUDA_CHECK(cudaMemcpyAsync(d_starts_host.data(), B_starts[d], ctr.size() * sizeof(size_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaMemcpyAsync(d_ends_host.data(), B_ends[d], 
ctr.size() * sizeof(size_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + for (size_t i = 1; i < ctr.size(); i++) { + if (d_starts_host[i] < d_ends_host[i-1]) { + d_starts_host[i] = d_ends_host[i-1]; + d_ends_host[i] = d_ends_host[i-1]; + } + } + CUDA_CHECK(cudaMemcpyAsync(B_starts[d], d_starts_host.data(), ctr.size() * sizeof(size_t), cudaMemcpyHostToDevice, stream), stream); + CUDA_CHECK(cudaMemcpyAsync(B_ends[d], d_ends_host.data(), ctr.size() * sizeof(size_t), cudaMemcpyHostToDevice, stream), stream); + } + + assert(!my_arena.get_parity()); + my_arena.rollback(beginning); + + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + } + + orig_tmp = 0; + + my_arena.flip_parity(); + assert(my_arena.get_parity()); + my_arena.rollback(right_restore); + + size_t** B_start_ptrs = my_arena.alloc(2 * N); + size_t** B_end_ptrs = B_start_ptrs + N; + + T** B_coord_ptrs = my_arena.alloc(N); + + right_restore = my_arena.mark(); + + //We need the arrays themselves on the device + CUDA_CHECK(cudaMemcpyAsync(B_coord_ptrs, B_coord, N * sizeof(T*), cudaMemcpyHostToDevice, stream), stream); + CUDA_CHECK(cudaMemcpyAsync(B_start_ptrs, B_starts, N * sizeof(size_t*), cudaMemcpyHostToDevice, stream), stream); + CUDA_CHECK(cudaMemcpyAsync(B_end_ptrs, B_ends, N * sizeof(size_t*), cudaMemcpyHostToDevice, stream), stream); + + //Next up, we generate all the corners of all the rectangles and mark them by parity + size_t num_corners = (1 << N); + CornerDesc* d_corners_in = my_arena.alloc>(2 * num_corners * total_rects); + CornerDesc* d_corners_out = d_corners_in + num_corners * total_rects; + + size_t corner_restore = my_arena.mark(); + + my_arena.flip_parity(); + assert(!my_arena.get_parity()); + my_arena.flip_parity(); + my_arena.rollback(corner_restore); + + populate_corners<<>>(d_rects, total_rects, d_corners_in); + KERNEL_CHECK(stream); + + + // We have a LOT of bookkeeping to do + + size_t alloc_size_1 = std::max({sizeof(size_t), 
sizeof(T), sizeof(int32_t), sizeof(DeltaFlag)}); + size_t align_1 = std::max({alignof(size_t), alignof(T), alignof(int32_t), alignof(DeltaFlag)}); + + char* shared_ptr = reinterpret_cast(my_arena.alloc_bytes(2 * num_corners * total_rects * alloc_size_1, align_1)); + uint8_t* d_flags = my_arena.alloc(num_corners * total_rects); + size_t* d_exc_sum = my_arena.alloc(num_corners * total_rects); + + size_t* d_src_keys_in = reinterpret_cast(shared_ptr); + size_t* d_src_keys_out = d_src_keys_in + num_corners * total_rects; + T* d_coord_keys_in = reinterpret_cast(shared_ptr); + T* d_coord_keys_out = d_coord_keys_in + num_corners * total_rects; + int32_t* d_deltas = reinterpret_cast(shared_ptr); + int32_t* d_deltas_out = d_deltas + num_corners * total_rects; + DeltaFlag* d_delta_flags_in = reinterpret_cast(shared_ptr); + DeltaFlag* d_delta_flags_out = d_delta_flags_in + num_corners * total_rects; + + size_t* seg_starts; + size_t* seg_ends; + + uint32_t* d_seg_counters; + + uint32_t* d_seg_counters_out; + + grid_size = (num_corners * total_rects + threads_per_block - 1) / threads_per_block; + + orig_tmp = 0; + temp_restore = my_arena.mark(); + tmp_storage = nullptr; + + //We need to reduce duplicate corners by their parity, so we sort to get duplicates next to each other and then reduce by key + { + + NVTX_DEPPART(sort_corners); + for (int dim = 0; dim < N; dim++) { + build_coord_key<<>>(d_coord_keys_in, d_corners_in, num_corners * total_rects, dim); + KERNEL_CHECK(stream); + size_t temp_bytes; + cub::DeviceRadixSort::SortPairs(nullptr, temp_bytes, + d_coord_keys_in, d_coord_keys_out, + d_corners_in, d_corners_out, + num_corners * total_rects, 0, 8*sizeof(T), stream); + if (temp_bytes > orig_tmp) { + if (orig_tmp > 0) { + my_arena.rollback(temp_restore); + } + orig_tmp = temp_bytes; + tmp_storage = reinterpret_cast(my_arena.alloc(temp_bytes)); + } + cub::DeviceRadixSort::SortPairs(tmp_storage, temp_bytes, + d_coord_keys_in, d_coord_keys_out, + d_corners_in, d_corners_out, + 
num_corners * total_rects, 0, 8*sizeof(T), stream); + + std::swap(d_corners_in, d_corners_out); + + } + } + + size_t temp_bytes; + build_src_key<<>>(d_src_keys_in, d_corners_in, num_corners * total_rects); + KERNEL_CHECK(stream); + cub::DeviceRadixSort::SortPairs(nullptr, temp_bytes, + d_src_keys_in, d_src_keys_out, + d_corners_in, d_corners_out, + num_corners * total_rects, 0, 8*sizeof(size_t), stream); + if (temp_bytes > orig_tmp) { + if (orig_tmp > 0) { + my_arena.rollback(temp_restore); + } + orig_tmp = temp_bytes; + tmp_storage = reinterpret_cast(my_arena.alloc(temp_bytes)); + } + cub::DeviceRadixSort::SortPairs(tmp_storage, temp_bytes, + d_src_keys_in, d_src_keys_out, + d_corners_in, d_corners_out, + num_corners * total_rects, 0, 8*sizeof(size_t), stream); + + std::swap(d_corners_in, d_corners_out); + get_delta<<>>(d_deltas, d_corners_in, num_corners * total_rects); + KERNEL_CHECK(stream); + + my_arena.rollback(temp_restore); + int* d_num_runs = my_arena.alloc(1); + + //See above, we have custom equality and reduction operators for CornerDesc + CustomSum red_op; + cub::DeviceReduce::ReduceByKey( + nullptr, temp_bytes, + d_corners_in, d_corners_out, + d_deltas, d_deltas_out, + d_num_runs, + red_op, + /*num_items=*/(int) (num_corners * total_rects), + /*stream=*/stream); + + tmp_storage = reinterpret_cast(my_arena.alloc(temp_bytes)); + cub::DeviceReduce::ReduceByKey( + tmp_storage, temp_bytes, + d_corners_in, d_corners_out, + d_deltas, d_deltas_out, + d_num_runs, + red_op, + /*num_items=*/(int) (num_corners * total_rects), + /*stream=*/stream); + + int num_unique_corners; + CUDA_CHECK(cudaMemcpyAsync(&num_unique_corners, d_num_runs, sizeof(int), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + my_arena.rollback(temp_restore); + + grid_size = (num_unique_corners + threads_per_block - 1) / threads_per_block; + set_delta<<>>(d_deltas_out, d_corners_out, num_unique_corners); + KERNEL_CHECK(stream); + + 
std::swap(d_corners_out, d_corners_in); + + size_t num_intermediate = num_unique_corners; + size_t num_segments; + + //This is where the real work is done. In each dimension, we do a segmented prefix sum of the parity markings keyed on (src idx, {every dim but d}) for all active segments. + // Then, for each unique boundary b in dim d, for each segment s keyed on (src idx, {every dim but d}), we evaluate s's prefix sum value at b. If nonzero, we emit a segment + // for s between b and the next boundary in d with all the other coords set to s's coords. These become the active segments for the next pass. In the last pass (d = 0), rather + // than emitting segments, we emit rectangles for all segments with nonzero prefix sums (in fact they must also be nonnegative - recall the model is > 0 for included, 0 for excluded + // by the end). + { + NVTX_DEPPART(collapse_higher_dims); + for (int d = N-1; d >= 0; d--) { + grid_size = (num_intermediate + threads_per_block - 1) / threads_per_block; + + //Our least significant sort is by d. 
+ build_coord_key<<>>(d_coord_keys_in, d_corners_in, num_intermediate, d); + KERNEL_CHECK(stream); + size_t temp_bytes; + cub::DeviceRadixSort::SortPairs(nullptr, temp_bytes, + d_coord_keys_in, d_coord_keys_out, + d_corners_in, d_corners_out, + num_intermediate, 0, 8*sizeof(T), stream); + + my_arena.rollback(temp_restore); + tmp_storage = reinterpret_cast(my_arena.alloc(temp_bytes)); + + cub::DeviceRadixSort::SortPairs(tmp_storage, temp_bytes, + d_coord_keys_in, d_coord_keys_out, + d_corners_in, d_corners_out, + num_intermediate, 0, 8*sizeof(T), stream); + + std::swap(d_corners_in, d_corners_out); + + //We need to key segments on every dimension but d and src idx, so we do a series of stable sorts to get there + for (int dim = 0; dim < N; dim++) { + if (dim == d) { + continue; + } + build_coord_key<<>>(d_coord_keys_in, d_corners_in, num_intermediate, dim); + KERNEL_CHECK(stream); + size_t temp_bytes; + cub::DeviceRadixSort::SortPairs(nullptr, temp_bytes, + d_coord_keys_in, d_coord_keys_out, + d_corners_in, d_corners_out, + num_intermediate, 0, 8*sizeof(T), stream); + + my_arena.rollback(temp_restore); + tmp_storage = reinterpret_cast(my_arena.alloc(temp_bytes)); + + cub::DeviceRadixSort::SortPairs(tmp_storage, temp_bytes, + d_coord_keys_in, d_coord_keys_out, + d_corners_in, d_corners_out, + num_intermediate, 0, 8*sizeof(T), stream); + + std::swap(d_corners_in, d_corners_out); + + } + + build_src_key<<>>(d_src_keys_in, d_corners_in, num_intermediate); + KERNEL_CHECK(stream); + cub::DeviceRadixSort::SortPairs(nullptr, temp_bytes, + d_src_keys_in, d_src_keys_out, + d_corners_in, d_corners_out, + num_intermediate, 0, 8*sizeof(size_t), stream); + + my_arena.rollback(temp_restore); + tmp_storage = reinterpret_cast(my_arena.alloc(temp_bytes)); + + cub::DeviceRadixSort::SortPairs(tmp_storage, temp_bytes, + d_src_keys_in, d_src_keys_out, + d_corners_in, d_corners_out, + num_intermediate, 0, 8*sizeof(size_t), stream); + + std::swap(d_corners_in, d_corners_out); + + //This 
serves 2 purposes + // 1) Our segmented prefix sum needs to know where to start and stop + // 2) We need to know how many unique segments (keyed on (src_idx, {every dimension but d}) we have + mark_deltas_heads<<>>(d_corners_in, num_intermediate, d, d_flags, d_delta_flags_in); + KERNEL_CHECK(stream); + + cub::DeviceScan::InclusiveSum(nullptr, temp_bytes, d_flags, d_exc_sum, num_intermediate, stream); + + my_arena.rollback(temp_restore); + tmp_storage = reinterpret_cast(my_arena.alloc(temp_bytes)); + + cub::DeviceScan::InclusiveSum(tmp_storage, temp_bytes, d_flags, d_exc_sum, num_intermediate, stream); + + CUDA_CHECK(cudaMemcpyAsync(&num_segments, &d_exc_sum[num_intermediate-1], sizeof(size_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + //Mark the beginning and end of each segment for our kernel to use in binary search + seg_starts = my_arena.alloc(2 * num_segments); + seg_ends = seg_starts + num_segments; + + temp_restore = my_arena.mark(); + + seg_boundaries<<>>(d_flags, d_exc_sum, num_intermediate, seg_starts, seg_ends); + KERNEL_CHECK(stream); + + //Segmented prefix sum using our flags constructed above + cub::DeviceScan::InclusiveScan( + /*d_temp=*/ nullptr, + /*bytes=*/ temp_bytes, + /*in=*/ d_delta_flags_in, + /*out=*/ d_delta_flags_out, + /*op=*/ SegmentedSum(), + /*num_items=*/ num_intermediate, + /*stream=*/ stream + ); + + my_arena.rollback(temp_restore); + tmp_storage = reinterpret_cast(my_arena.alloc(temp_bytes)); + + cub::DeviceScan::InclusiveScan( + /*d_temp=*/ tmp_storage, + /*bytes=*/ temp_bytes, + /*in=*/ d_delta_flags_in, + /*out=*/ d_delta_flags_out, + /*op=*/ SegmentedSum(), + /*num_items=*/ num_intermediate, + /*stream=*/ stream + ); + + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + //Per usual, we do a count + emit pass to track active segments and limit memory usage. 
If the evaluated prefix sum for a boundary within a segment + //is 0, we can skip it because it won't contribute anything to future sums and also won't be emitted. + d_seg_counters = my_arena.alloc(2 * num_segments); + d_seg_counters_out = d_seg_counters + num_segments; + CUDA_CHECK(cudaMemsetAsync(d_seg_counters, 0, num_segments * sizeof(uint32_t), stream), stream); + + temp_restore = my_arena.mark(); + + grid_size = ((num_segments*B_size[d]) + threads_per_block - 1) / threads_per_block; + count_segments<<>>(d_delta_flags_out, seg_starts, seg_ends, B_starts[d], B_ends[d], d_corners_in, B_coord[d], B_size[d], num_segments, d, d_seg_counters); + KERNEL_CHECK(stream); + + cub::DeviceScan::ExclusiveSum(nullptr, temp_bytes, d_seg_counters, d_seg_counters_out, num_segments, stream); + + my_arena.rollback(temp_restore); + tmp_storage = reinterpret_cast(my_arena.alloc(temp_bytes)); + + cub::DeviceScan::ExclusiveSum(tmp_storage, temp_bytes, d_seg_counters, d_seg_counters_out, num_segments, stream); + + uint32_t next_round; + uint32_t last_count; + CUDA_CHECK(cudaMemcpyAsync(&next_round, &d_seg_counters_out[num_segments-1], sizeof(uint32_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaMemcpyAsync(&last_count, &d_seg_counters[num_segments-1], sizeof(uint32_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + next_round += last_count; + + num_intermediate = next_round; + + //In this case we exit out to emit rectangles rather than segments + if (d==0) { + break; + } + + my_arena.flip_parity(); + if (my_arena.get_parity()) { + my_arena.rollback(right_restore); + } + + CornerDesc* d_next_corners = my_arena.alloc>(2 * next_round); + CUDA_CHECK(cudaMemsetAsync(d_seg_counters, 0, num_segments*sizeof(uint32_t), stream), stream); + + corner_restore = my_arena.mark(); + my_arena.flip_parity(); + my_arena.flip_parity(); + my_arena.rollback(corner_restore); + + write_segments<<>>(d_delta_flags_out, seg_starts, seg_ends, 
B_starts[d], B_ends[d], d_corners_in, B_coord[d], d_seg_counters_out, B_size[d], num_segments, d, d_seg_counters, d_next_corners); + KERNEL_CHECK(stream); + + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + d_corners_in = d_next_corners; + d_corners_out = d_next_corners + next_round; + + //The segment count in each iter is not monotonic, so we have to realloc each time + shared_ptr = reinterpret_cast(my_arena.alloc_bytes(2 * num_intermediate * alloc_size_1, align_1)); + d_flags = my_arena.alloc(num_intermediate); + d_exc_sum = my_arena.alloc(num_intermediate); + + temp_restore = my_arena.mark(); + + d_src_keys_in = reinterpret_cast(shared_ptr); + d_src_keys_out = reinterpret_cast(shared_ptr) + num_intermediate; + + d_coord_keys_in = reinterpret_cast(shared_ptr); + d_coord_keys_out = reinterpret_cast(shared_ptr) + num_intermediate; + + d_deltas = reinterpret_cast(shared_ptr); + d_deltas_out = reinterpret_cast(shared_ptr) + num_intermediate; + + d_delta_flags_in = reinterpret_cast(shared_ptr); + d_delta_flags_out = reinterpret_cast(shared_ptr) + num_intermediate; + + } + } + + //Get to a known state + my_arena.flip_parity(); + if (my_arena.get_parity()) { + my_arena.rollback(right_restore); + } + + + //For our last dim, we emit rectangles rather than segments. These rectangles are a disjoint, precise covering of the original set. 
+ RectDesc* d_rects_out = my_arena.alloc>(num_intermediate); + CUDA_CHECK(cudaMemsetAsync(d_seg_counters, 0, num_segments*sizeof(uint32_t), stream), stream); + + write_segments<<>>(d_delta_flags_out, seg_starts, seg_ends, B_start_ptrs, B_end_ptrs, d_corners_in, B_coord_ptrs, d_seg_counters_out, B_size[0], num_segments, d_seg_counters, d_rects_out); + KERNEL_CHECK(stream); + + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + //Force the rectangles to the left side of the buffer + if (my_arena.get_parity()) { + my_arena.flip_parity(); + RectDesc* tmp_out = my_arena.alloc>(num_intermediate); + CUDA_CHECK(cudaMemcpyAsync(tmp_out, d_rects_out, num_intermediate * sizeof(RectDesc), cudaMemcpyDeviceToDevice, stream), stream); + } + + //Clear everything out, we should be on the left + my_arena.flip_parity(); + my_arena.flip_parity(); + assert(!my_arena.get_parity()); + + RectDesc* d_rects_in = my_arena.alloc>(2 * num_intermediate); + d_rects_out = d_rects_in + num_intermediate; + + size_t alloc_size_2 = max(sizeof(size_t), sizeof(T)); + size_t align_2 = max(alignof(size_t), alignof(T)); + + + shared_ptr = reinterpret_cast(my_arena.alloc_bytes(2 * num_intermediate * alloc_size_2, align_2)); + + d_src_keys_in = reinterpret_cast(shared_ptr); + d_src_keys_out = reinterpret_cast(shared_ptr) + num_intermediate; + d_coord_keys_in = reinterpret_cast(shared_ptr); + d_coord_keys_out = reinterpret_cast(shared_ptr) + num_intermediate; + + size_t* group_ids = reinterpret_cast(shared_ptr); + + uint8_t* break_points = my_arena.alloc(num_intermediate); + + temp_restore = my_arena.mark(); + + //Now that we have disjoint rectangles, we can do our usual sort and coalesce pass + size_t last = INT_MAX; + { + NVTX_DEPPART(compact_disjoint_rects); + while (last > num_intermediate) { + last = num_intermediate; + + bool done = false; + for (int dim = 1; !done; dim++) { + if (dim == N) { + dim = 0; // wrap around to 0 + done = true; + } + grid_size = (num_intermediate + threads_per_block - 1) 
/ threads_per_block; + + build_lo_key<<>>(d_coord_keys_in, d_rects_in, num_intermediate, dim); + KERNEL_CHECK(stream); + size_t temp_bytes; + cub::DeviceRadixSort::SortPairs(nullptr, temp_bytes, + d_coord_keys_in, d_coord_keys_out, + d_rects_in, d_rects_out, + num_intermediate, 0, 8*sizeof(T), stream); + + my_arena.rollback(temp_restore); + tmp_storage = reinterpret_cast(my_arena.alloc(temp_bytes)); + + cub::DeviceRadixSort::SortPairs(tmp_storage, temp_bytes, + d_coord_keys_in, d_coord_keys_out, + d_rects_in, d_rects_out, + num_intermediate, 0, 8*sizeof(T), stream); + + std::swap(d_rects_in, d_rects_out); + for (int d = 0; d < N; d++) { + if (d == dim) { + continue; + } + build_hi_key<<>>(d_coord_keys_in, d_rects_in, num_intermediate, d); + KERNEL_CHECK(stream); + size_t temp_bytes; + cub::DeviceRadixSort::SortPairs(nullptr, temp_bytes, + d_coord_keys_in, d_coord_keys_out, + d_rects_in, d_rects_out, + num_intermediate, 0, 8*sizeof(T), stream); + + my_arena.rollback(temp_restore); + tmp_storage = reinterpret_cast(my_arena.alloc(temp_bytes)); + + cub::DeviceRadixSort::SortPairs(tmp_storage, temp_bytes, + d_coord_keys_in, d_coord_keys_out, + d_rects_in, d_rects_out, + num_intermediate, 0, 8*sizeof(T), stream); + + std::swap(d_rects_in, d_rects_out); + build_lo_key<<>>(d_coord_keys_in, d_rects_in, num_intermediate, d); + KERNEL_CHECK(stream); + cub::DeviceRadixSort::SortPairs(nullptr, temp_bytes, + d_coord_keys_in, d_coord_keys_out, + d_rects_in, d_rects_out, + num_intermediate, 0, 8*sizeof(T), stream); + + my_arena.rollback(temp_restore); + tmp_storage = reinterpret_cast(my_arena.alloc(temp_bytes)); + + cub::DeviceRadixSort::SortPairs(tmp_storage, temp_bytes, + d_coord_keys_in, d_coord_keys_out, + d_rects_in, d_rects_out, + num_intermediate, 0, 8*sizeof(T), stream); + + std::swap(d_rects_in, d_rects_out); + + } + + build_src_key<<>>(d_src_keys_in, d_rects_in, num_intermediate); + KERNEL_CHECK(stream); + cub::DeviceRadixSort::SortPairs(nullptr, temp_bytes, + 
d_src_keys_in, d_src_keys_out, + d_rects_in, d_rects_out, + num_intermediate, 0, 8*sizeof(size_t), stream); + + my_arena.rollback(temp_restore); + tmp_storage = reinterpret_cast(my_arena.alloc(temp_bytes)); + + cub::DeviceRadixSort::SortPairs(tmp_storage, temp_bytes, + d_src_keys_in, d_src_keys_out, + d_rects_in, d_rects_out, + num_intermediate, 0, 8*sizeof(size_t), stream); + + std::swap(d_rects_in, d_rects_out); + + mark_breaks_dim<<>>(d_rects_in, break_points, num_intermediate, dim); + KERNEL_CHECK(stream); + + cub::DeviceScan::InclusiveSum(nullptr, temp_bytes, break_points, group_ids, num_intermediate, stream); + + my_arena.rollback(temp_restore); + tmp_storage = reinterpret_cast(my_arena.alloc(temp_bytes)); + + cub::DeviceScan::InclusiveSum(tmp_storage, temp_bytes, break_points, group_ids, num_intermediate, stream); + + size_t last_grp; + CUDA_CHECK(cudaMemcpyAsync(&last_grp, &group_ids[num_intermediate-1], sizeof(size_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + init_rects_dim<<>>(d_rects_in, break_points, group_ids, d_rects_out, num_intermediate, dim); + KERNEL_CHECK(stream); + + num_intermediate = last_grp; + std::swap(d_rects_in, d_rects_out); + } + } + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + } + + if (out_rects == 2) { + d_out_rects = d_rects; + if (d_out_rects != d_rects_in) { + CUDA_CHECK(cudaMemcpyAsync(d_out_rects, d_rects_in, num_intermediate * sizeof(RectDesc), cudaMemcpyDeviceToDevice, stream), stream); + } + out_rects = num_intermediate; + } else if (out_rects == 1) { + my_arena.reset(true); + d_out_rects = my_arena.alloc>(num_intermediate); + my_arena.commit(true); + if (d_rects_in + num_intermediate >= d_out_rects) { + assert(d_rects_out < d_rects_in); + CUDA_CHECK(cudaMemcpyAsync(d_rects_out, d_rects_in, num_intermediate * sizeof(RectDesc), cudaMemcpyDeviceToDevice, stream), stream); + std::swap(d_rects_in, d_rects_out); + } + CUDA_CHECK(cudaMemcpyAsync(d_out_rects, 
d_rects_in, num_intermediate * sizeof(RectDesc), cudaMemcpyDeviceToDevice, stream), stream); + out_rects = num_intermediate; + } else { + this->send_output(d_rects_in, num_intermediate, my_arena, ctr, getIndex, getMap); + } + } + + /* + * Input: An array of 1D rectangles (potentially overlapping) with associated + * src indices, where all the rectangles with a given src idx together represent an exact covering + * of the partitioning output for that index. + * Output: A disjoint, coalesced array of rectangles sorted by src idx that it then sends off + * to the send output function, which constructs the final sparsity map. + * Approach: The canonical 1D rectangle merge, in parallel. Sort the rectangles by (src_idx, lo). Then + * prefix max by hi segmented by src_idx to find overlapping rectangles. Then, RLE by starting a new rectangle + * when in a new src or lo > current max hi and merging otherwise. + */ + template + template + void GPUMicroOp::complete1d_pipeline(RectDesc* d_rects, size_t total_rects, RectDesc* &d_out_rects, size_t &out_rects, Arena &my_arena, const Container& ctr, IndexFn getIndex, MapFn getMap) + { + + NVTX_DEPPART(complete1d_pipeline); + CUstream stream = this->stream->get_stream(); + + RectDesc* d_rects_in = d_rects; + + size_t bytes_T = total_rects * sizeof(T); + size_t bytes_S = total_rects * sizeof(size_t); + size_t bytes_HF = total_rects * sizeof(HiFlag); + size_t max_bytes = std::max({bytes_T, bytes_HF, bytes_S}); + size_t max_align = std::max({alignof(T), alignof(HiFlag), alignof(size_t)}); + + char* aux_ptr = reinterpret_cast(my_arena.alloc_bytes(2 * max_bytes, max_align)); + + uint8_t* break_points = my_arena.alloc(total_rects); + size_t* group_ids = my_arena.alloc(total_rects); + + T* d_keys_in = reinterpret_cast(aux_ptr); + T* d_keys_out = reinterpret_cast(aux_ptr + max_bytes); + + size_t* d_src_keys_in = reinterpret_cast(aux_ptr); + size_t* d_src_keys_out = reinterpret_cast(aux_ptr + max_bytes); + + HiFlag* d_hi_flags_in = 
reinterpret_cast*>(aux_ptr); + HiFlag* d_hi_flags_out = reinterpret_cast*>(aux_ptr + max_bytes); + + size_t num_intermediate = total_rects; + + const size_t prev = my_arena.mark(); + RectDesc* d_rects_out = my_arena.alloc>(total_rects); + + size_t t1=0, t2 = 0, t3 = 0, t4 = 0; + cub::DeviceRadixSort::SortPairs(nullptr, t1, + d_keys_in, d_keys_out, d_rects_in, d_rects_out, num_intermediate, + 0, 8*sizeof(T), stream); + // exclusive scan + cub::DeviceScan::ExclusiveScan(nullptr, t2, + d_hi_flags_in, d_hi_flags_out, + SegmentedMax(), HiFlag{std::numeric_limits::min(), 0}, + num_intermediate, stream); + // inclusive sum + cub::DeviceScan::InclusiveSum(nullptr, t3, + break_points, group_ids, + num_intermediate, stream); + + cub::DeviceRadixSort::SortPairs(nullptr, t4, d_src_keys_in, d_src_keys_out, d_rects_in, d_rects_out, num_intermediate, 0, 8*sizeof(size_t), stream); + + size_t temp_bytes = std::max({t1, t2, t3, t4}); + size_t use_bytes = temp_bytes; + void *temp_storage = my_arena.alloc(temp_bytes); + + int threads_per_block = 256; + size_t grid_size = (num_intermediate + threads_per_block - 1) / threads_per_block; + + //Sort the rectangles keyed by (src, lo) + { + NVTX_DEPPART(sort_rects); + + build_lo_key<<>>(d_keys_in, d_rects_in, num_intermediate, 0); + KERNEL_CHECK(stream); + cub::DeviceRadixSort::SortPairs(temp_storage, use_bytes, d_keys_in, d_keys_out, d_rects_in, d_rects_out, num_intermediate, 0, 8*sizeof(T), stream); + std::swap(d_rects_in, d_rects_out); + + build_src_key<<>>(d_src_keys_in, d_rects_in, num_intermediate); + KERNEL_CHECK(stream); + + use_bytes = temp_bytes; + cub::DeviceRadixSort::SortPairs(temp_storage, use_bytes, d_src_keys_in, d_src_keys_out, d_rects_in, d_rects_out, num_intermediate, 0, 8*sizeof(size_t), stream); + std::swap(d_rects_in, d_rects_out); + } + + //Prefix max by hi segmented by src, then RLE to merge. 
+ { + NVTX_DEPPART(run_length_encode); + build_hi_flag<<>>(d_hi_flags_in, d_rects_in, num_intermediate, 0); + KERNEL_CHECK(stream); + + + use_bytes = temp_bytes; + cub::DeviceScan::ExclusiveScan( + /*d_temp=*/ temp_storage, + /*bytes=*/ use_bytes, + /*in=*/ d_hi_flags_in, + /*out=*/ d_hi_flags_out, + /*op=*/ SegmentedMax(), + HiFlag{std::numeric_limits::min(), 0}, + /*num_items=*/ num_intermediate, + /*stream=*/ stream + ); + + threads_per_block = 256; + grid_size = (num_intermediate + threads_per_block - 1) / threads_per_block; + mark_breaks_dim<<>>(d_hi_flags_in, d_hi_flags_out, d_rects_in, break_points, num_intermediate, 0); + KERNEL_CHECK(stream); + use_bytes = temp_bytes; + cub::DeviceScan::InclusiveSum(temp_storage, use_bytes, break_points, group_ids, num_intermediate, stream); + + size_t last_grp; + CUDA_CHECK(cudaMemcpyAsync(&last_grp, &group_ids[num_intermediate-1], sizeof(size_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + my_arena.flip_parity(); + assert(my_arena.get_parity()); + + if (out_rects == 1) { + my_arena.reset(true); + } + d_rects_out = my_arena.alloc>(last_grp); + if (out_rects == 1) { + my_arena.commit(true); + } + + init_rects_dim<<>>(d_rects_in, d_hi_flags_out, break_points, group_ids, d_rects_out, num_intermediate, 0); + KERNEL_CHECK(stream); + + num_intermediate = last_grp; + if (out_rects == 2) { + my_arena.flip_parity(); + d_rects_in = my_arena.alloc>(num_intermediate); + CUDA_CHECK(cudaMemcpyAsync(d_rects_in, d_rects_out, num_intermediate * sizeof(RectDesc), cudaMemcpyDeviceToDevice, stream), stream); + } else { + std::swap(d_rects_in, d_rects_out); + } + + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + } + + if (out_rects > 0) { + d_out_rects = d_rects_in; + out_rects = num_intermediate; + } else { + this->send_output(d_rects_in, num_intermediate, my_arena, ctr, getIndex, getMap); + } + } + + /* + * Input: An array of points (potentially with duplicates) with associated + * 
src indices, where all the points with a given src idx together represent an exact covering + * of the partitioning output for that index. + * Output: A disjoint, coalesced array of rectangles sorted by src idx that it then sends off + * to the send output function, which constructs the final sparsity map. + * Approach: Sort the points by (x0,x1,...,xN-1,src) (right is MSB). Convert them to singleton rects. + * Run-length encode along each dimension (N-1...0). + */ + template + template + void GPUMicroOp::complete_pipeline(PointDesc* d_points, size_t total_pts, RectDesc* &d_out_rects, size_t &out_rects, Arena &my_arena, const Container& ctr, IndexFn getIndex, MapFn getMap) + { + + NVTX_DEPPART(complete_pipeline); + + if (out_rects == 2) { + my_arena.flip_parity(); + } + + + CUstream stream = this->stream->get_stream(); + + size_t bytes_T = total_pts * sizeof(T); + size_t bytes_S = total_pts * sizeof(size_t); + size_t bytes_R = total_pts * sizeof(RectDesc); + size_t bytes_p = total_pts * sizeof(PointDesc); + size_t max_aux_bytes = std::max({bytes_T, bytes_S, bytes_R}); + size_t max_pg_bytes = std::max({bytes_p, bytes_S}); + + // Instance shared by coordinate keys, source keys, and rectangle outputs + char* aux_ptr = my_arena.alloc(2 * max_aux_bytes); + + //Instance shared by group ids (RLE) and intermediate points in sorting + char* pg_ptr = my_arena.alloc(max_pg_bytes); + + uint8_t* break_points = my_arena.alloc(total_pts); + + T* d_keys_in = reinterpret_cast(aux_ptr); + T* d_keys_out = reinterpret_cast(aux_ptr + max_aux_bytes); + + PointDesc* d_points_in = d_points; + PointDesc* d_points_out = reinterpret_cast*>(pg_ptr); + + size_t* group_ids = reinterpret_cast(pg_ptr); + + RectDesc* d_rects_in = reinterpret_cast*>(aux_ptr); + RectDesc *d_rects_out = reinterpret_cast*>(aux_ptr + max_aux_bytes); + + size_t* d_src_keys_in = reinterpret_cast(aux_ptr); + size_t* d_src_keys_out = reinterpret_cast(aux_ptr + max_aux_bytes); + + size_t t1=0, t2=0, t3=0; + 
cub::DeviceRadixSort::SortPairs(nullptr, t1, d_keys_in, d_keys_out, d_points_in, d_points_out, total_pts, 0, 8*sizeof(T), stream); + cub::DeviceRadixSort::SortPairs(nullptr, t2, d_src_keys_in, d_src_keys_out, d_points_in, d_points_out, total_pts, 0, 8*sizeof(size_t), stream); + cub::DeviceScan::InclusiveSum(nullptr, t3, break_points, group_ids, total_pts, stream); + + //Temporary storage instance shared by CUB operations. + size_t temp_bytes = std::max({t1, t2, t3}); + + void *temp_storage = my_arena.alloc(temp_bytes); + + //Sort along each dimension from LSB to MSB (0 to N-1) + size_t use_bytes = temp_bytes; + + { + NVTX_DEPPART(sort_valid_points); + for (int dim = 0; dim < N; ++dim) { + build_coord_key<<>>(d_keys_in, d_points_in, total_pts, dim); + KERNEL_CHECK(stream); + cub::DeviceRadixSort::SortPairs(temp_storage, use_bytes, d_keys_in, d_keys_out, d_points_in, d_points_out, total_pts, 0, 8*sizeof(T), stream); + std::swap(d_keys_in, d_keys_out); + std::swap(d_points_in, d_points_out); + } + + //Sort by source index now to keep individual partitions separate + build_src_key<<>>(d_src_keys_in, d_points_in, total_pts); + KERNEL_CHECK(stream); + use_bytes = temp_bytes; + cub::DeviceRadixSort::SortPairs(temp_storage, use_bytes, d_src_keys_in, d_src_keys_out, d_points_in, d_points_out, total_pts, 0, 8*sizeof(size_t), stream); + } + + + points_to_rects<<>>(d_points_out, d_rects_in, total_pts); + KERNEL_CHECK(stream); + + size_t num_intermediate = total_pts; + + { + NVTX_DEPPART(run_length_encode); + + for (int dim = N-1; dim >= 0; --dim) { + + // Step 1: Mark rectangle starts + // e.g. [1, 2, 4, 5, 6, 8] -> [1, 0, 1, 0, 0, 1] + mark_breaks_dim<<>>(d_rects_in, break_points, num_intermediate, dim); + KERNEL_CHECK(stream); + + // Step 2: Inclusive scan of break points to get group ids + // e.g. 
[1, 0, 1, 0, 0, 1] -> [1, 1, 2, 2, 2, 3] + use_bytes = temp_bytes; + cub::DeviceScan::InclusiveSum(temp_storage, use_bytes, break_points, group_ids, num_intermediate, stream); + + //Determine new number of intermediate rectangles + size_t last_grp; + CUDA_CHECK(cudaMemcpyAsync(&last_grp, &group_ids[num_intermediate-1], sizeof(size_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + //Step 3: Write output rectangles, where rect starts write lo and rect ends write hi + init_rects_dim<<>>(d_rects_in, break_points, group_ids, d_rects_out, num_intermediate, dim); + KERNEL_CHECK(stream); + + num_intermediate = last_grp; + std::swap(d_rects_in, d_rects_out); + } + my_arena.flip_parity(); + if (out_rects == 2) { + assert(!my_arena.get_parity()); + } else if (out_rects == 1) { + assert(my_arena.get_parity()); + my_arena.reset(true); + } + d_out_rects = my_arena.alloc>(num_intermediate); + if (out_rects == 1) { + my_arena.commit(true); + } + CUDA_CHECK(cudaMemcpyAsync(d_out_rects, d_rects_in, num_intermediate * sizeof(RectDesc), cudaMemcpyDeviceToDevice, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + } + + if (out_rects > 0) { + out_rects = num_intermediate; + } else { + this->send_output(d_rects_in, num_intermediate, my_arena, ctr, getIndex, getMap); + } + } + + template + void GPUMicroOp::split_output(RectDesc* d_rects, size_t total_rects, std::vector *> &output_instances, std::vector &output_counts, Arena &my_arena) + { + NVTX_DEPPART(split_output); + + CUstream stream = this->stream->get_stream(); + bool use_sysmem = false; + + Rect* final_rects; + std::vector d_starts_host(output_instances.size()), d_ends_host(output_instances.size()); + + try { + final_rects = my_arena.alloc>(total_rects); + + size_t* d_starts = my_arena.alloc(2 * output_instances.size()); + size_t* d_ends = d_starts + output_instances.size(); + + CUDA_CHECK(cudaMemsetAsync(d_starts, 0, 
output_instances.size()*sizeof(size_t),stream), stream); + CUDA_CHECK(cudaMemsetAsync(d_ends, 0, output_instances.size()*sizeof(size_t),stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + //Convert RectDesc to SparsityMapEntry and determine where each src's rectangles start and end. + build_final_output<<>>(d_rects, nullptr, final_rects, d_starts, d_ends, total_rects); + KERNEL_CHECK(stream); + + + //Copy starts and ends back to host and handle empty partitions + + CUDA_CHECK(cudaMemcpyAsync(d_starts_host.data(), d_starts, output_instances.size() * sizeof(size_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaMemcpyAsync(d_ends_host.data(), d_ends, output_instances.size() * sizeof(size_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + } catch (arena_oom&) { + use_sysmem = true; + RectDesc* h_tmp_rects = deppart_host_alloc>(total_rects); + final_rects = deppart_host_alloc>(total_rects); + CUDA_CHECK(cudaMemcpyAsync(h_tmp_rects, d_rects, total_rects * sizeof(RectDesc), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + for (size_t idx = 0; idx < total_rects; idx++ ) { + final_rects[idx] = h_tmp_rects[idx].rect; + + //Checks if we're the first value for a given src + if (idx == 0 || h_tmp_rects[idx].src_idx != h_tmp_rects[idx-1].src_idx) { + d_starts_host[h_tmp_rects[idx].src_idx] = idx; + } + + //Checks if we're the last value for a given src + if (idx == total_rects-1 || h_tmp_rects[idx].src_idx != h_tmp_rects[idx+1].src_idx) { + d_ends_host[h_tmp_rects[idx].src_idx] = idx+1; + } + } + deppart_host_free(h_tmp_rects); + } + + for (size_t i = 1; i < output_instances.size(); i++) { + if (d_starts_host[i] < d_ends_host[i-1]) { + d_starts_host[i] = d_ends_host[i-1]; + d_ends_host[i] = d_ends_host[i-1]; + } + } + + for (size_t i = 0; i < output_instances.size(); i++) { + if (d_ends_host[i] > d_starts_host[i]) { + size_t end = d_ends_host[i]; 
+ size_t start = d_starts_host[i]; + if (end - start > 0) { + Rect* h_new_rects = deppart_host_alloc>((end - start) + output_counts[i]); + if (output_counts[i] > 0) { + std::memcpy(h_new_rects, output_instances[i], output_counts[i] * sizeof(Rect)); + deppart_host_free(output_instances[i]); + } + if (use_sysmem) { + std::memcpy(h_new_rects + output_counts[i], final_rects + start, (end - start) * sizeof(Rect)); + } else { + CUDA_CHECK(cudaMemcpyAsync(h_new_rects + output_counts[i], final_rects + start, (end - start) * sizeof(Rect), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + } + output_instances[i] = h_new_rects; + output_counts[i] += end - start; + } + } + } + if (use_sysmem) { + deppart_host_free(final_rects); + } + } + + /* + * Input: An array of disjoint rectangles sorted by src idx. + * Output: Fills the sparsity output for each src with a host region instance + * containing the entries/approx entries and calls gpu_finalize on the SparsityMapImpl. + * Approach: Segments the rectangles by their src idx and copies them back to the host, + */ + + template + template + void GPUMicroOp::send_output(RectDesc* d_rects, size_t total_rects, Arena &my_arena, const Container& ctr, IndexFn getIndex, MapFn getMap) + { + NVTX_DEPPART(send_output); + + size_t prev = my_arena.mark(); + + CUstream stream = this->stream->get_stream(); + + SparsityMapEntry* final_entries = my_arena.alloc>(total_rects); + Rect* final_rects = my_arena.alloc>(total_rects); + + size_t* d_starts = my_arena.alloc(2 * ctr.size()); + size_t* d_ends = d_starts + ctr.size(); + + CUDA_CHECK(cudaMemsetAsync(d_starts, 0, ctr.size()*sizeof(size_t),stream), stream); + CUDA_CHECK(cudaMemsetAsync(d_ends, 0, ctr.size()*sizeof(size_t),stream), stream); + + //Convert RectDesc to SparsityMapEntry and determine where each src's rectangles start and end. 
+ build_final_output<<>>(d_rects, final_entries, final_rects, d_starts, d_ends, total_rects); + KERNEL_CHECK(stream); + + + //Copy starts and ends back to host and handle empty partitions + std::vector d_starts_host(ctr.size()), d_ends_host(ctr.size()); + CUDA_CHECK(cudaMemcpyAsync(d_starts_host.data(), d_starts, ctr.size() * sizeof(size_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaMemcpyAsync(d_ends_host.data(), d_ends, ctr.size() * sizeof(size_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + for (size_t i = 1; i < ctr.size(); i++) { + if (d_starts_host[i] < d_ends_host[i-1]) { + d_starts_host[i] = d_ends_host[i-1]; + d_ends_host[i] = d_ends_host[i-1]; + } + } + + Memory sysmem; + assert(find_memory(sysmem, Memory::SYSTEM_MEM, my_arena.location)); + if (!this->exclusive) { + for (auto const& elem : ctr) { + size_t idx = getIndex(elem); + auto mapOpj = getMap(elem); + SparsityMapImpl *impl = SparsityMapImpl::lookup(mapOpj); + if (d_ends_host[idx] > d_starts_host[idx]) { + size_t end = d_ends_host[idx]; + size_t start = d_starts_host[idx]; + Rect *h_rects = deppart_host_alloc>(end - start); + CUDA_CHECK(cudaMemcpyAsync(h_rects, final_rects + start, (end - start) * sizeof(Rect), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + span> h_rects_span(h_rects, end - start); + bool disjoint = !this->is_image_microop(); + impl->contribute_dense_rect_list(h_rects_span, disjoint); + deppart_host_free(h_rects); + } else { + impl->contribute_nothing(); + } + } + } else { + std::vector *> local_finalizations; + + //Use provided lambdas to iterate over sparsity output container (map or vector) + for (auto const& elem : ctr) { + size_t idx = getIndex(elem); + auto mapOpj = getMap(elem); + SparsityMapImpl *impl = SparsityMapImpl::lookup(mapOpj); + NodeID owner = ID(mapOpj).sparsity_creator_node(); + assert(owner == Network::my_node_id); + if (d_ends_host[idx] > 
d_starts_host[idx]) { + size_t end = d_ends_host[idx]; + size_t start = d_starts_host[idx]; + SparsityMapEntry *h_entries = deppart_host_alloc>(end - start); + CUDA_CHECK(cudaMemcpyAsync(h_entries, final_entries + start, (end - start) * sizeof(SparsityMapEntry), cudaMemcpyDeviceToHost, stream), stream); + + Rect *approx_rects; + size_t num_approx; + if (end - start <= ((size_t) DeppartConfig::cfg_max_rects_in_approximation)) { + approx_rects = final_rects + start; + num_approx = end - start; + } else { + //TODO: Maybe add a better GPU approx here when given more rectangles + //Use CUB to compute a bad approx on the GPU (union of all rectangles) + approx_rects = my_arena.alloc>(1); + num_approx = 1; + void* d_temp = nullptr; + size_t temp_sz = 0; + Rect identity_rect; + for(int d=0; d::max(); + identity_rect.hi[d] = std::numeric_limits::min(); + } + cub::DeviceReduce::Reduce( + d_temp, temp_sz, + final_rects + start, + approx_rects, + (end - start), + UnionRectOp(), + identity_rect, + stream + ); + d_temp = reinterpret_cast(my_arena.alloc(temp_sz)); + cub::DeviceReduce::Reduce( + d_temp, temp_sz, + final_rects + start, + approx_rects, + end - start, + UnionRectOp(), + identity_rect, + stream + ); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + } + Rect *h_approx_entries = deppart_host_alloc>(num_approx); + CUDA_CHECK(cudaMemcpyAsync(h_approx_entries, approx_rects, num_approx * sizeof(Rect), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + if(owner == Network::my_node_id) { + impl->set_gpu_entries(h_entries, end - start); + impl->set_gpu_approx_rects(h_approx_entries, num_approx); + local_finalizations.push_back(impl); + } else { + size_t payload_bytes = ((end - start) * sizeof(SparsityMapEntry)) + + (num_approx * sizeof(Rect)); + ActiveMessage::RemoteGpuFinalizeMessage> + amsg(owner, payload_bytes); + amsg->sparsity = mapOpj; + amsg->num_entries = end - start; + amsg->num_approx = num_approx; + 
amsg.add_payload(h_entries, (end - start) * sizeof(SparsityMapEntry), + PAYLOAD_COPY); + amsg.add_payload(h_approx_entries, num_approx * sizeof(Rect), + PAYLOAD_COPY); + amsg.commit(); + deppart_host_free(h_entries); + deppart_host_free(h_approx_entries); + } + } else { + if(owner == Network::my_node_id) { + local_finalizations.push_back(impl); + } else { + ActiveMessage::RemoteGpuFinalizeMessage> + amsg(owner); + amsg->sparsity = mapOpj; + amsg->num_entries = 0; + amsg->num_approx = 0; + amsg.commit(); + } + } + } + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + for (SparsityMapImpl *impl : local_finalizations) { + impl->gpu_finalize(); + } + } + my_arena.rollback(prev); + } + + +} diff --git a/src/realm/deppart/partitions_gpu_kernels.hpp b/src/realm/deppart/partitions_gpu_kernels.hpp new file mode 100644 index 0000000000..b3bd280be4 --- /dev/null +++ b/src/realm/deppart/partitions_gpu_kernels.hpp @@ -0,0 +1,814 @@ +#pragma once +#include "realm/deppart/partitions.h" + +namespace Realm { + +template +__device__ __forceinline__ size_t bsearch(const T* arr, size_t len, T val) { + size_t low = 0, high = len; + while (low < high) { + size_t mid = low + ((high - low) >> 1); + if (arr[mid + 1] <= val) + low = mid + 1; + else + high = mid; + } + return low; +} + +template +__global__ void subtract_const( + T* d_data, + size_t num_elems, + T value +) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= num_elems) return; + d_data[idx] = d_data[idx] <= value ? 0 : d_data[idx] - value; +} + +// Intersect all instance rectangles with all parent rectangles in parallel. +// Used for both count and emit depending on whether the output array is null. 
+ +template +__global__ void intersect_input_rects( + const SparsityMapEntry* d_lhs_entries, + const SparsityMapEntry* d_rhs_entries, + const size_t *d_lhs_offsets, + const uint32_t *d_lhs_prefix, + const size_t* d_rhs_offsets, + size_t numLHSRects, + size_t numRHSRects, + size_t numLHSChildren, + size_t numRHSChildren, + uint32_t *d_lhs_counters, + out_t* d_rects +) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= numLHSRects * numRHSRects) return; + size_t idx_x = idx % numRHSRects; + size_t idx_y = idx / numRHSRects; + assert(idx_x < numRHSRects); + assert(idx_y < numLHSRects); + const SparsityMapEntry rhs_entry = d_rhs_entries[idx_x]; + const SparsityMapEntry lhs_entry = d_lhs_entries[idx_y]; + Rect rect_output = lhs_entry.bounds.intersection(rhs_entry.bounds); + if (rect_output.empty()) { + return; + } + size_t lhs_idx = bsearch(d_lhs_offsets, numLHSChildren, idx_y); + uint32_t local = atomicAdd(&d_lhs_counters[lhs_idx], 1); + if (d_rects != nullptr) { + // If d_rects is not null, we write the output rect + uint32_t out_idx = d_lhs_prefix[lhs_idx] + local; + if constexpr (std::is_same_v>) { + d_rects[out_idx].src_idx = bsearch(d_rhs_offsets, numRHSChildren, idx_x); + d_rects[out_idx].rect = rect_output; + } else { + d_rects[out_idx] = rect_output; + } + } +} + +template +__device__ __forceinline__ uint64_t bvh_morton_code(const Rect& rect, + const Rect& globalBounds) { + // bits per axis (floor) + constexpr int bits = 64 / N; + constexpr uint64_t maxQ = (bits == 64 ? 
~0ULL + : (1ULL << bits) - 1); + + uint64_t coords[N]; +#pragma unroll + for(int d = 0; d < N; ++d) { + // 1) compute centroid in dimension d + float center = 0.5f * (float(rect.lo[d]) + float(rect.hi[d]) + 1.0f); + + // 2) normalize into [0,1] using globalBounds + float span = float(globalBounds.hi[d] + 1 - globalBounds.lo[d]); + float norm = (center - float(globalBounds.lo[d])) / span; + + // 3) quantize to [0 … maxQ] + uint64_t q = uint64_t(norm * float(maxQ) + 0.5f); + coords[d] = (q > maxQ ? maxQ : q); + } + + // 4) interleave bits MSB→LSB across all dims + uint64_t code = 0; + for(int b = bits - 1; b >= 0; --b) { +#pragma unroll + for(int d = 0; d < N; ++d) { + code = (code << 1) | ((coords[d] >> b) & 1ULL); + } + } + + return code; +} + +template +__global__ void bvh_build_morton_codes( + const SparsityMapEntry* d_targets_entries, + const size_t* d_offsets_rects, + const Rect* d_global_bounds, + size_t total_rects, + size_t num_targets, + uint64_t* d_morton_codes, + uint64_t* d_indices, + uint64_t* d_targets_indices) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= total_rects) return; + const auto &entry = d_targets_entries[idx]; + d_morton_codes[idx] = bvh_morton_code(entry.bounds, *d_global_bounds); + d_indices[idx] = idx; + if (d_offsets_rects != nullptr) { + d_targets_indices[idx] = bsearch(d_offsets_rects, num_targets, idx); + } +} + + __global__ +void bvh_build_radix_tree_kernel( + const uint64_t *morton, // [n] + const uint64_t *leafIdx, // [n] (unused here but kept for symmetry) + int n, + int *childLeft, // [2n−1] + int *childRight, // [2n−1] + int *parent); // [2n−1], pre‐initialized to −1 + +__global__ +void bvh_build_root_kernel( + int *root, + int *parent, + size_t total_rects); + +template +__global__ +void bvh_init_leaf_boxes_kernel( + const SparsityMapEntry *rects, // [G] all flattened Rects + const uint64_t *leafIdx, // [n] maps leaf→orig Rect index + size_t total_rects, + Rect *boxes) // [(2n−1)] +{ + int k = 
blockIdx.x*blockDim.x + threadIdx.x; + if (k >= total_rects) return; + + size_t orig = leafIdx[k]; + boxes[k + total_rects - 1] = rects[orig].bounds; +} + +template +__global__ +void bvh_merge_internal_boxes_kernel( + size_t total_rects, + const int *childLeft, // [(2n−1)] + const int *childRight, // [(2n−1)] + const int *parent, // [(2n−1)] + Rect *boxes, // [(2n−1)×N] + int *visitCount) // [(2n−1)] initialized to zero +{ + int leaf = blockIdx.x*blockDim.x + threadIdx.x; + if (leaf >= total_rects) return; + + int cur = leaf + total_rects - 1; + int p = parent[cur]; + + while(p >= 0) { + // increment visit count; the second arrival merges + int prev = atomicAdd(&visitCount[p], 1); + if (prev == 1) { + // both children ready, do the merge + int c0 = childLeft[p], c1 = childRight[p]; + boxes[p] = boxes[c0].union_bbox(boxes[c1]); + // climb + cur = p; + p = parent[cur]; + } else { + // first child arrived, wait for sibling + break; + } + } +} + +template +__global__ +void query_input_bvh( + SparsityMapEntry* queries, + size_t* d_query_offsets, + int root, + int *childLeft, + int *childRight, + uint64_t *indices, + uint64_t *labels, + Rect *boxes, + size_t numQueries, + size_t numBoxes, + size_t numLHSChildren, + uint32_t* d_inst_prefix, + uint32_t* d_inst_counters, + out_t *d_rects +) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= numQueries) return; + Rect in_rect = queries[idx].bounds; + size_t lhs_idx = bsearch(d_query_offsets, numLHSChildren, idx); + + constexpr int MAX_STACK = 64; // max stack size for BVH traversal + int stack[MAX_STACK]; + int sp = 0; + + // start at the root + stack[sp++] = -1; + int node = root; + do + { + + int left = childLeft[node]; + int right = childRight[node]; + + bool overlapL = boxes[left].overlaps(in_rect); + bool overlapR = boxes[right].overlaps(in_rect); + + if (overlapL && left >= numBoxes - 1) { + uint64_t rect_idx = indices[left - (numBoxes - 1)]; + uint32_t local = atomicAdd(&d_inst_counters[lhs_idx], 1); 
+ if (d_rects != nullptr) { + uint32_t out_idx = d_inst_prefix[lhs_idx] + local; + Rect out_rect = boxes[left].intersection(in_rect); + if constexpr (std::is_same_v>) { + d_rects[out_idx].rect = out_rect; + d_rects[out_idx].src_idx = labels[rect_idx]; + } else { + d_rects[out_idx] = out_rect; + } + } + } + if (overlapR && right >= numBoxes - 1) { + uint64_t rect_idx = indices[right - (numBoxes - 1)]; + uint32_t local = atomicAdd(&d_inst_counters[lhs_idx], 1); + if (d_rects != nullptr) { + uint32_t out_idx = d_inst_prefix[lhs_idx] + local; + Rect out_rect = boxes[right].intersection(in_rect); + if constexpr (std::is_same_v>) { + d_rects[out_idx].rect = out_rect; + d_rects[out_idx].src_idx = labels[rect_idx]; + } else { + d_rects[out_idx] = out_rect; + } + } + } + + bool traverseL = overlapL && left < numBoxes - 1; + bool traverseR = overlapR && right < numBoxes - 1; + + if (!traverseL && !traverseR) { + node = stack[--sp]; + } else { + node = (traverseL ? left : right); + if (traverseL && traverseR) { + stack[sp++] = right; + } + } + } while (node != -1); +} + +template +struct CornerDesc { + uint32_t src_idx; + T coord[N]; + int32_t delta; + + // Equality for ReduceByKey: compare key fields only (src_idx, coords) + __host__ __device__ __forceinline__ + bool operator==(const CornerDesc& rhs) const { + if (src_idx != rhs.src_idx) return false; + for (int d = 0; d < N; ++d) + if (coord[d] != rhs.coord[d]) return false; + return true; + } +}; + +template +__global__ void mark_endpoints(const RectDesc* d_rects, + size_t M, + int dim, + uint32_t* d_src_keys, + T* d_crd_keys) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if(i >= M) return; + d_src_keys[2*i] = d_rects[i].src_idx; + d_src_keys[2*i+1] = d_rects[i].src_idx; + d_crd_keys[2*i] = d_rects[i].rect.lo[dim]; + d_crd_keys[2*i+1] = d_rects[i].rect.hi[dim] + 1; +} + +template +__global__ void mark_heads(const uint32_t* d_src_keys, + const T* d_crd_keys, + size_t M, + uint8_t* d_heads) { + size_t i = 
blockIdx.x * blockDim.x + threadIdx.x; + if(i >= M) return; + if (i==0) d_heads[0] = 1; + else { + d_heads[i] = d_src_keys[i] != d_src_keys[i-1] || d_crd_keys[i] != d_crd_keys[i-1]; + } +} + +template +__global__ void seg_boundaries(const uint8_t* d_flags, + const T* d_exc_sum, + size_t M, + size_t *d_starts, + size_t *d_ends) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if(i >= M) return; + if (d_flags[i]) { + d_starts[d_exc_sum[i]-1] = i; + } + if (i== M-1 || d_flags[i+1]) { + d_ends[d_exc_sum[i]-1] = i + 1; + } +} + +template +__global__ void scatter_unique(const uint32_t* d_src_keys, + const T* d_crd_keys, + const size_t* d_output, + const uint8_t* d_heads, + size_t M, + size_t *d_starts, + size_t *d_ends, + T* d_boundaries) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if(i >= M) return; + size_t u = d_output[i] - (d_heads[i] ? 0 : 1); + d_boundaries[u] = d_crd_keys[i]; + if (i == 0 || d_src_keys[i] != d_src_keys[i-1]) { + d_starts[d_src_keys[i]] = u; + } + if (i== M-1 || d_src_keys[i] != d_src_keys[i+1]) { + d_ends[d_src_keys[i]] = u + 1; + } +} + +template +__global__ void mark_deltas_heads(const CornerDesc* d_corners, + size_t M, + int dim, + uint8_t* d_heads, + DeltaFlag* d_deltas) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if(i >= M) return; + uint8_t head = 1; + if (i>0) { + head = 0; + for (int j = 0; j < N; j++) { + if (j== dim) continue; + if (d_corners[i].coord[j] != d_corners[i-1].coord[j]) { + head = 1; + break; + } + } + head = head || d_corners[i].src_idx != d_corners[i-1].src_idx; + } + d_heads[i] = head; + d_deltas[i].delta = d_corners[i].delta; + d_deltas[i].head = head; +} + +// For each segment and each boundary, determine whether to emit a new subsegment +template +__global__ void count_segments(const DeltaFlag* d_delta_flags, + const size_t *d_segment_starts, + const size_t *d_segment_ends, + const size_t *d_boundary_starts, + const size_t *d_boundary_ends, + const CornerDesc* d_corners, + const T* 
d_boundaries, + size_t num_boundaries, + size_t num_segments, + int dim, + uint32_t *seg_counters) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if(i >= num_segments * num_boundaries) return; + size_t bnd_idx = i % num_boundaries; + size_t seg_idx = i / num_boundaries; + int my_src = d_corners[d_segment_starts[seg_idx]].src_idx; + + //No boundaries for this src + if (d_boundary_starts[my_src]>= d_boundary_ends[my_src]) return; + + //This boundary is not a subsegment start for this segment's src + if (bnd_idx < d_boundary_starts[my_src] || bnd_idx >= d_boundary_ends[my_src]-1) return; + + //Binary search the segment to find the first subsegment whose start is > boundary + size_t low = d_segment_starts[seg_idx]; + size_t high = d_segment_ends[seg_idx]; + while (low < high) { + int mid = (low + high) / 2; + if (d_corners[mid].coord[dim] <= d_boundaries[bnd_idx]) { + low = mid + 1; + } else { + high = mid; + } + } + + //The prefix sum for this boundary within this segment is the delta of the corner just before it (if any) + int my_delta = (low == d_segment_starts[seg_idx] ? 0 : d_delta_flags[low-1].delta); + + //We emit if it's non-zero, and strengthen the requirement to > 0 for dim 0. 
+ if (my_delta != 0 && (dim !=0 || my_delta > 0)) { + atomicAdd(&seg_counters[seg_idx], 1); + } +} + +//Do the same computation as above, but this time emit the actual subsegment +template +__global__ void write_segments(const DeltaFlag* d_delta_flags, + const size_t *d_segment_starts, + const size_t *d_segment_ends, + const size_t *d_boundary_starts, + const size_t *d_boundary_ends, + const CornerDesc* d_corners, + const T* d_boundaries, + const uint32_t *seg_offsets, + size_t num_boundaries, + size_t num_segments, + int dim, + uint32_t *seg_counters, + CornerDesc* d_out_corners) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if(i >= num_segments * num_boundaries) return; + size_t bnd_idx = i % num_boundaries; + size_t seg_idx = i / num_boundaries; + int my_src = d_corners[d_segment_starts[seg_idx]].src_idx; + if (d_boundary_starts[my_src]>= d_boundary_ends[my_src]) return; + if (bnd_idx < d_boundary_starts[my_src] || bnd_idx >= d_boundary_ends[my_src]-1) return; + size_t low = d_segment_starts[seg_idx]; + size_t high = d_segment_ends[seg_idx]; + while (low < high) { + int mid = (low + high) / 2; + if (d_corners[mid].coord[dim] <= d_boundaries[bnd_idx]) { + low = mid + 1; + } else { + high = mid; + } + } + int my_delta = (low == d_segment_starts[seg_idx] ? 
0 : d_delta_flags[low-1].delta); + + //To emit, we keep everything the same except the current dim - set that to the boundary value + if (my_delta != 0 && (dim !=0 || my_delta > 0)) { + uint32_t my_idx = seg_offsets[seg_idx] + atomicAdd(&seg_counters[seg_idx], 1); + CornerDesc my_corner = d_corners[low-1]; + my_corner.coord[dim] = d_boundaries[bnd_idx]; + my_corner.delta = my_delta; + d_out_corners[my_idx] = my_corner; + } +} + +//Again, do the same computation as above, but this time emit the actual rectangle +template +__global__ void write_segments(const DeltaFlag* d_delta_flags, + const size_t *d_segment_starts, + const size_t *d_segment_ends, + size_t **d_boundary_starts, + size_t **d_boundary_ends, + const CornerDesc* d_corners, + T** d_boundaries, + const uint32_t *seg_offsets, + size_t num_boundaries, + size_t num_segments, + uint32_t *seg_counters, + RectDesc* d_out_rects) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if(i >= num_segments * num_boundaries) return; + size_t bnd_idx = i % num_boundaries; + size_t seg_idx = i / num_boundaries; + int my_src = d_corners[d_segment_starts[seg_idx]].src_idx; + if (d_boundary_starts[0][my_src]>= d_boundary_ends[0][my_src]) return; + if (bnd_idx < d_boundary_starts[0][my_src] || bnd_idx >= d_boundary_ends[0][my_src]-1) return; + + size_t low = d_segment_starts[seg_idx]; + size_t high = d_segment_ends[seg_idx]; + while (low < high) { + int mid = (low + high) / 2; + if (d_corners[mid].coord[0] <= d_boundaries[0][bnd_idx]) { + low = mid + 1; + } else { + high = mid; + } + } + int my_delta = (low == d_segment_starts[seg_idx] ? 
0 : d_delta_flags[low-1].delta); + if (my_delta==0) return; + int my_corner_idx = low - 1; + uint32_t my_idx = seg_offsets[seg_idx] + atomicAdd(&seg_counters[seg_idx], 1); + RectDesc my_output; + my_output.src_idx = my_src; + my_output.rect.lo[0] = d_boundaries[0][bnd_idx]; + + //Remember we marked each boundary as hi+1, so need to revert + my_output.rect.hi[0] = d_boundaries[0][bnd_idx+1] - 1; + + //For every other dimension, map segment -> rect by finding the two boundaries that surround the segment's corner + for (int d = 1; d < N; d++) { + low = d_boundary_starts[d][my_src]; + high = d_boundary_ends[d][my_src]; + while (low < high) { + int mid = (low + high) / 2; + if (d_boundaries[d][mid] <= d_corners[my_corner_idx].coord[d]) { + low = mid + 1; + } else { + high = mid; + } + } + my_output.rect.lo[d] = d_boundaries[d][low-1]; + my_output.rect.hi[d] = d_boundaries[d][low] - 1; + } + d_out_rects[my_idx] = my_output; +} + + template + __global__ void populate_corners(const RectDesc* __restrict__ d_rects, + size_t M, + CornerDesc* __restrict__ d_corners) +{ + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= M) return; + + const auto& r = d_rects[i]; // assumes r.rect.lo[d], r.rect.hi[d], r.src_idx + const uint32_t src = r.src_idx; + + const size_t corners_per_rect = size_t(1) << N; + const size_t base = i * corners_per_rect; + + // emit 2^N corners. Each 1 in the mask -> use hi[d]+1, each 0 -> use lo[d] + for (unsigned mask = 0; mask < corners_per_rect; ++mask) { + CornerDesc c; + c.src_idx = src; + // sign = +1 for even popcount(mask), -1 for odd + c.delta = (__popc(mask) & 1) ? -1 : +1; + + #pragma unroll + for (int d = 0; d < N; ++d) { + const T lo = r.rect.lo[d]; + const T hip1 = r.rect.hi[d] + T(1); // half-open (hi+1) + c.coord[d] = ( (mask & (1u << d)) ? 
hip1 : lo ); + } + + d_corners[base + mask] = c; + } +} + + +template +__global__ void build_coord_key(T* d_keys, + const PointDesc* d_pts, + size_t M, + int dim) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if(i < M) d_keys[i] = d_pts[i].point[dim]; +} + + +template +__global__ void build_coord_key(T* d_keys, + const CornerDesc* d_corners, + size_t M, + int dim) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if(i < M) d_keys[i] = d_corners[i].coord[dim]; +} + +template +__global__ void get_delta(int32_t* d_deltas, + const CornerDesc* d_corners, + size_t M) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if(i < M) d_deltas[i] = d_corners[i].delta; +} + +template +__global__ void set_delta(const int32_t* d_deltas, + CornerDesc* d_corners, + size_t M) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if(i < M) d_corners[i].delta = d_deltas[i]; +} + + + template +__global__ void build_lo_key(T* d_keys, + const RectDesc* d_rects, + size_t M, + int dim) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if(i < M) d_keys[i] = d_rects[i].rect.lo[dim]; +} + + template +__global__ void build_hi_key(T* d_keys, + const RectDesc* d_rects, + size_t M, + int dim) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if(i < M) d_keys[i] = d_rects[i].rect.hi[dim]; +} + + template +__global__ void build_hi_flag(HiFlag* d_flags, + const RectDesc* d_rects, + size_t M, + int dim) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if(i >= M) return; + d_flags[i].hi = d_rects[i].rect.hi[dim]; + d_flags[i].head = i==0 || d_rects[i].src_idx != d_rects[i-1].src_idx; +} + + template +__global__ void build_src_key(size_t* d_keys, + const RectDesc* d_rects, + size_t M) { + size_t i = blockIdx.x*blockDim.x + threadIdx.x; + if(i < M) d_keys[i] = d_rects[i].src_idx; +} + + template +__global__ void build_src_key(size_t* d_keys, + const CornerDesc *d_corners, + size_t M) { + size_t i = blockIdx.x*blockDim.x + threadIdx.x; + if(i < M) d_keys[i] = 
d_corners[i].src_idx; +} + +template +__global__ void build_src_key(size_t* d_keys, + const PointDesc* d_pts, + size_t M) { + size_t i = blockIdx.x*blockDim.x + threadIdx.x; + if(i < M) d_keys[i] = d_pts[i].src_idx; +} + + +template +__global__ +void points_to_rects(const PointDesc* pts, + RectDesc* rects, + size_t M) +{ + size_t i = blockIdx.x*blockDim.x + threadIdx.x; + if(i >= M) return; + rects[i].src_idx = pts[i].src_idx; + rects[i].rect.lo = pts[i].point; + rects[i].rect.hi = pts[i].point; +} + +// 1) mark breaks on RectDesc array at pass d +// Starts a new rectangle if src or lo/hi in any dimension but d doesn't match, +// or if dim d doesn't match or advance by +1 +//NOTE: ONLY WORKS IF WE STARTED WITH DISJOINT RECTANGLES +template +__global__ +void mark_breaks_dim(const RectDesc* in, + uint8_t* brk, + size_t M, + int d) +{ + size_t i = blockIdx.x*blockDim.x + threadIdx.x; + + if(i >= M) return; + if(i == 0) { brk[0] = 1; return; } + + const auto &p = in[i].rect, &q = in[i-1].rect; + bool split = (in[i].src_idx != in[i-1].src_idx); + + // more‐significant dims 0..d-1 must match [lo,hi] + #pragma unroll + for(int k = 0; k < d && !split; ++k) + if(p.lo[k] != q.lo[k] || p.hi[k] != q.hi[k]) split = true; + + // already‐processed dims d+1..N-1 must match [lo,hi] + #pragma unroll + for(int k = d+1; k < N && !split; ++k) + if((p.lo[k] != q.lo[k]) || (p.hi[k] != q.hi[k])) + split = true; + + // current dim d must equal or advance by +1 in lo + if(!split && (p.lo[d] != (q.hi[d] + 1)) && (p.lo[d] != q.lo[d])) + split = true; + + brk[i] = split ? 
1 : 0; +} + +//1) Mark breaks for 1D rectangle merge - if low > hi + 1, must start new rect + template +__global__ +void mark_breaks_dim(const HiFlag* hi_flag_in, + const HiFlag* hi_flag_out, + const RectDesc* in, + uint8_t* brk, + size_t M, + int d) +{ + size_t i = blockIdx.x*blockDim.x + threadIdx.x; + if(i >= M) return; + brk[i] = hi_flag_in[i].head || in[i].rect.lo[d] > hi_flag_out[i].hi + 1; +} + +// 2) Write output rectangles for ND disjoint rects RLE +// Starts write lo, ends write hi, everyone else no-ops +template +__global__ +void init_rects_dim(const RectDesc* in, + const uint8_t* brk, + const size_t* gid, + RectDesc* out, + size_t M, + int d) +{ + size_t i = blockIdx.x*blockDim.x + threadIdx.x; + if(i >= M) return; + + bool is_end = (i == M-1) || (gid[i+1] != gid[i]); + if (!brk[i] && !is_end) return; + + size_t g = gid[i] - 1; // zero-based rectangle index + const Rect &r = in[i].rect; + out[g].src_idx = in[i].src_idx; + + #pragma unroll + for(int k = 0; k < N; ++k) { + if (brk[i]) { + out[g].rect.lo[k] = r.lo[k]; + } + if (is_end) { + out[g].rect.hi[k] = r.hi[k]; + } + } +} + + // 2) Write output rectangles for 1D rects RLE + // Starts write lo, ends write max(hi, prefix max hi) because the max was exclusive + template + __global__ + void init_rects_dim(const RectDesc* in, + const HiFlag *hi_flag_out, + const uint8_t* brk, + const size_t* gid, + RectDesc* out, + size_t M, + int d) +{ + size_t i = blockIdx.x*blockDim.x + threadIdx.x; + if(i >= M) return; + + bool is_end = (i == M-1) || (gid[i+1] != gid[i]); + if (!brk[i] && !is_end) return; + + size_t g = gid[i] - 1; // zero-based + const auto &r = in[i].rect; + out[g].src_idx = in[i].src_idx; + + // copy dims ≠ d +#pragma unroll + for(int k = 0; k < N; ++k) { + if (brk[i]) { + out[g].rect.lo[k] = r.lo[k]; + } + if (k != d || (brk[i] && is_end)) { + out[g].rect.hi[k] = r.hi[k]; + } else if (is_end) { + out[g].rect.hi[k] = r.hi[k] > hi_flag_out[i].hi ? 
r.hi[k] : hi_flag_out[i].hi; + } + } +} + +//Convert RectDesc to sparsity output and determine [d_start[i], d_end[i]) for each src i +template +__global__ +void build_final_output(const RectDesc* d_rects, + SparsityMapEntry* d_entries_out, + Rect* d_rects_out, + size_t* d_starts, + size_t* d_ends, + size_t numRects) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= numRects) return; + d_rects_out[idx] = d_rects[idx].rect; + if (d_entries_out != nullptr) { + d_entries_out[idx].bounds = d_rects[idx].rect; + d_entries_out[idx].sparsity.id = 0; + d_entries_out[idx].bitmap = 0; + } + + //Checks if we're the first value for a given src + if (idx == 0 || d_rects[idx].src_idx != d_rects[idx-1].src_idx) { + d_starts[d_rects[idx].src_idx] = idx; + } + + //Checks if we're the last value for a given src + if (idx == numRects-1 || d_rects[idx].src_idx != d_rects[idx+1].src_idx) { + d_ends[d_rects[idx].src_idx] = idx+1; + } +} + +} // namespace Realm \ No newline at end of file diff --git a/src/realm/deppart/preimage.cc b/src/realm/deppart/preimage.cc index 0e43956865..e283a3ec47 100644 --- a/src/realm/deppart/preimage.cc +++ b/src/realm/deppart/preimage.cc @@ -17,20 +17,81 @@ // preimage operations for Realm dependent partitioning -#include "realm/deppart/preimage.h" - -#include "realm/deppart/deppart_config.h" -#include "realm/deppart/rectlist.h" -#include "realm/deppart/inst_helper.h" -#include "realm/deppart/image.h" -#include "realm/logging.h" +#include "preimage.h" + +#include "deppart_config.h" +#include "rectlist.h" +#include "inst_helper.h" +#include "image.h" +#include "../logging.h" +#include #include +#include "realm/cuda/cuda_internal.h" namespace Realm { extern Logger log_part; extern Logger log_uop_timing; + template + template + void IndexSpace::by_preimage_buffer_requirements( + const std::vector>& target_spaces, + const std::vector>& inputs, + std::vector& requirements) const { + size_t minimal_size = 0; + size_t source_entries = 0; + bool 
bvh = false; + for (auto subspace : target_spaces) { + source_entries += subspace.entries == 0 ? 1 : subspace.entries; + if (subspace.entries > 1) { + bvh = true; + } + } + minimal_size += sizeof(Rect) * source_entries; + if (this->dense()) { + minimal_size += sizeof(Rect); + } else { + minimal_size += sizeof(Rect) * this->sparsity.impl()->get_entries().size(); + } + if (bvh) { + minimal_size += + (source_entries * sizeof(uint64_t)) + + (source_entries * sizeof(size_t)) + + ((2*source_entries - 1) * sizeof(Rect)) + + (2 * (2*source_entries - 1) * sizeof(int)) + + sizeof(Rect) + + (2 * source_entries * sizeof(uint64_t)) + + (source_entries * sizeof(uint64_t)); + } + requirements = std::vector(inputs.size()); + for (size_t i = 0; i < inputs.size(); i++) { + IndexSpace is = inputs[i].space; + Memory mem = inputs[i].location; + if (mem.kind() == Memory::GPU_FB_MEM || + mem.kind() == Memory::Z_COPY_MEM) { + const char* val = std::getenv("MIN_SIZE"); // or any env var + size_t device_size = 2000000; //default + if (val) { + device_size = atoi(val); + } + minimal_size = max(minimal_size, device_size); + size_t optimal_size = is.bounds.volume() * sizeof(Rect) * target_spaces.size() * 20 + minimal_size; + Processor best_proc = Processor::NO_PROC; + assert(choose_proc(best_proc, mem)); + requirements[i].affinity_processor = best_proc; + requirements[i].lower_bound = minimal_size; + requirements[i].upper_bound = optimal_size; + requirements[i].minimum_alignment = 128; + } else { + requirements[i].affinity_processor = Processor::NO_PROC; + requirements[i].lower_bound = 0; + requirements[i].upper_bound = 0; + requirements[i].minimum_alignment = 0; + } + } + } + template template Event IndexSpace::create_subspaces_by_preimage( @@ -151,6 +212,13 @@ namespace Realm { { TimeStamp ts("PreimageMicroOp::execute", true, &log_uop_timing); std::map *> rect_map; + if (is_ranged || N2 > 1) { + for (const IndexSpace& target : targets) { + if (!target.dense()) { + 
target.sparsity.impl()->request_bvh(); + } + } + } if(is_ranged) populate_bitmasks_ranges(rect_map); @@ -165,529 +233,801 @@ namespace Realm { std::cout << " " << targets[it->first] << " = " << it->second->rects.size() << " rectangles" << std::endl; #endif - // iterate over sparsity outputs and contribute to all (even if we didn't have any - // points found for it) - int empty_count = 0; - for(size_t i = 0; i < sparsity_outputs.size(); i++) { - SparsityMapImpl *impl = SparsityMapImpl::lookup(sparsity_outputs[i]); - typename std::map *>::const_iterator it2 = rect_map.find(i); - if(it2 != rect_map.end()) { - impl->contribute_dense_rect_list(it2->second->rects, true /*disjoint*/); - delete it2->second; - } else { - impl->contribute_nothing(); - empty_count++; - } - } - if(empty_count > 0) - log_part.info() << empty_count << " empty preimages (out of " << sparsity_outputs.size() << ")"; - } - - template - void PreimageMicroOp::dispatch(PartitioningOperation *op, bool inline_ok) - { - // a PreimageMicroOp should always be executed on whichever node the field data lives - NodeID exec_node = ID(inst).instance_owner_node(); - - if(exec_node != Network::my_node_id) { - forward_microop >(exec_node, op, this); - return; - } - - // Need valid data for the instance space - if (!inst_space.dense()) { - // it's safe to add the count after the registration only because we initialized - // the count to 2 instead of 1 - bool registered = SparsityMapImpl::lookup(inst_space.sparsity)->add_waiter(this, true /*precise*/); - if(registered) - wait_count.fetch_add(1); - } - - // need valid data for each target - for(size_t i = 0; i < targets.size(); i++) { - if(!targets[i].dense()) { - // it's safe to add the count after the registration only because we initialized - // the count to 2 instead of 1 - bool registered = SparsityMapImpl::lookup(targets[i].sparsity)->add_waiter(this, true /*precise*/); - if(registered) - wait_count.fetch_add(1); - } - } + // iterate over sparsity outputs and 
contribute to all (even if we didn't have any + // points found for it) + int empty_count = 0; + for (size_t i = 0; i < sparsity_outputs.size(); i++) { + SparsityMapImpl *impl = SparsityMapImpl::lookup(sparsity_outputs[i]); + typename std::map *>::const_iterator it2 = rect_map.find(i); + if (it2 != rect_map.end()) { + impl->contribute_dense_rect_list(it2->second->rects, true /*disjoint*/); + delete it2->second; + } else { + impl->contribute_nothing(); + empty_count++; + } + } + if (empty_count > 0) { + log_part.info() << empty_count << " empty preimages (out of " << sparsity_outputs.size() << ")"; + } + } - // need valid data for the parent space too - if(!parent_space.dense()) { - // it's safe to add the count after the registration only because we initialized - // the count to 2 instead of 1 - bool registered = SparsityMapImpl::lookup(parent_space.sparsity)->add_waiter(this, true /*precise*/); - if(registered) - wait_count.fetch_add(1); - } - - finish_dispatch(op, inline_ok); - } + template + void PreimageMicroOp::dispatch(PartitioningOperation *op, bool inline_ok) { + // a PreimageMicroOp should always be executed on whichever node the field data lives + NodeID exec_node = ID(inst).instance_owner_node(); + + if (exec_node != Network::my_node_id) { + forward_microop >(exec_node, op, this); + return; + } + + // Need valid data for the instance space + if (!inst_space.dense()) { + // it's safe to add the count after the registration only because we initialized + // the count to 2 instead of 1 + bool registered = SparsityMapImpl::lookup(inst_space.sparsity)->add_waiter(this, true /*precise*/); + if (registered) + wait_count.fetch_add(1); + } + + // need valid data for each target + for (size_t i = 0; i < targets.size(); i++) { + if (!targets[i].dense()) { + // it's safe to add the count after the registration only because we initialized + // the count to 2 instead of 1 + bool registered = SparsityMapImpl::lookup(targets[i].sparsity)->add_waiter( + this, true 
/*precise*/); + if (registered) + wait_count.fetch_add(1); + } + } + + // need valid data for the parent space too + if (!parent_space.dense()) { + // it's safe to add the count after the registration only because we initialized + // the count to 2 instead of 1 + bool registered = SparsityMapImpl::lookup(parent_space.sparsity)->add_waiter(this, true /*precise*/); + if (registered) + wait_count.fetch_add(1); + } + + finish_dispatch(op, inline_ok); + } - template - template - bool PreimageMicroOp::serialize_params(S& s) const - { - return((s << parent_space) && - (s << inst_space) && - (s << inst) && - (s << field_offset) && - (s << is_ranged) && - (s << targets) && - (s << sparsity_outputs)); - } + template + template + bool PreimageMicroOp::serialize_params(S &s) const { + return ((s << parent_space) && + (s << inst_space) && + (s << inst) && + (s << field_offset) && + (s << is_ranged) && + (s << targets) && + (s << sparsity_outputs)); + } - template - template - PreimageMicroOp::PreimageMicroOp(NodeID _requestor, - AsyncMicroOp *_async_microop, S& s) - : PartitioningMicroOp(_requestor, _async_microop) - { - bool ok = ((s >> parent_space) && - (s >> inst_space) && - (s >> inst) && - (s >> field_offset) && - (s >> is_ranged) && - (s >> targets) && - (s >> sparsity_outputs)); - assert(ok); - (void)ok; - } + template + template + PreimageMicroOp::PreimageMicroOp(NodeID _requestor, + AsyncMicroOp *_async_microop, S &s) + : PartitioningMicroOp(_requestor, _async_microop) { + bool ok = ((s >> parent_space) && + (s >> inst_space) && + (s >> inst) && + (s >> field_offset) && + (s >> is_ranged) && + (s >> targets) && + (s >> sparsity_outputs)); + assert(ok); + (void) ok; + } - template - ActiveMessageHandlerReg > > PreimageMicroOp::areg; + template + ActiveMessageHandlerReg > > PreimageMicroOp::areg; + + + //////////////////////////////////////////////////////////////////////// + // + // class PreimageOperation + + template + PreimageOperation::PreimageOperation( + const 
IndexSpace &_parent, + const DomainTransform &_domain_transform, + const ProfilingRequestSet &reqs, GenEventImpl *_finish_event, + EventImpl::gen_t _finish_gen) + : PartitioningOperation(reqs, _finish_event, _finish_gen), + parent(_parent), + domain_transform(_domain_transform), + overlap_tester(0), + dummy_overlap_uop(0), + exclusive_gpu_owner(exclusive_gpu_exec_node()) { + areg.force_instantiation(); + } + template + PreimageOperation::~PreimageOperation(void) { + if (overlap_tester) + delete overlap_tester; + } - //////////////////////////////////////////////////////////////////////// - // - // class PreimageOperation + template + NodeID PreimageOperation::exclusive_gpu_exec_node(void) const { + size_t gpu_ptrs = 0, gpu_rects = 0, cpu_ptrs = 0, cpu_rects = 0; + for(size_t i = 0; i < domain_transform.ptr_data.size(); i++) { + Memory::Kind kind = domain_transform.ptr_data[i].inst.get_location().kind(); + if((kind == Memory::GPU_FB_MEM) || (kind == Memory::Z_COPY_MEM)) + gpu_ptrs++; + else + cpu_ptrs++; + } + for(size_t i = 0; i < domain_transform.range_data.size(); i++) { + Memory::Kind kind = domain_transform.range_data[i].inst.get_location().kind(); + if((kind == Memory::GPU_FB_MEM) || (kind == Memory::Z_COPY_MEM)) + gpu_rects++; + else + cpu_rects++; + } + size_t opcount = gpu_ptrs + gpu_rects + cpu_ptrs + cpu_rects; + if((gpu_ptrs + gpu_rects) == 0 || (opcount != 1)) + return -1; + if(gpu_ptrs == 1) + return ID(domain_transform.ptr_data[0].inst).instance_owner_node(); + if(gpu_rects == 1) + return ID(domain_transform.range_data[0].inst).instance_owner_node(); + return -1; + } - template - PreimageOperation::PreimageOperation( - const IndexSpace &_parent, - const DomainTransform &_domain_transform, - const ProfilingRequestSet &reqs, GenEventImpl *_finish_event, - EventImpl::gen_t _finish_gen) - : PartitioningOperation(reqs, _finish_event, _finish_gen), - parent(_parent), - domain_transform(_domain_transform), - overlap_tester(0), - dummy_overlap_uop(0) { - 
areg.force_instantiation(); - } + template + IndexSpace PreimageOperation::add_target(const IndexSpace &target) { + // try to filter out obviously empty targets + if (parent.empty() || target.empty()) + return IndexSpace::make_empty(); + + // otherwise it'll be something smaller than the current parent + IndexSpace preimage; + preimage.bounds = parent.bounds; + + // if the target has a sparsity map, use the same node - otherwise + // get a sparsity ID by round-robin'ing across the nodes that have field data + int target_node; + if (exclusive_gpu_owner >= 0) + target_node = exclusive_gpu_owner; + else if (!target.dense()) + target_node = ID(target.sparsity).sparsity_creator_node(); + else if (!domain_transform.ptr_data.empty()) + target_node = + ID(domain_transform + .ptr_data[targets.size() % domain_transform.ptr_data.size()] + .inst) + .instance_owner_node(); + else + target_node = + ID(domain_transform + .range_data[targets.size() % domain_transform.range_data.size()] + .inst) + .instance_owner_node(); + if (exclusive_gpu_owner >= 0) + assert(target_node == exclusive_gpu_exec_node()); + SparsityMap sparsity = + create_deppart_output_sparsity(target_node).convert>(); + preimage.sparsity = sparsity; + + targets.push_back(target); + preimages.push_back(sparsity); + + return preimage; + } - template - PreimageOperation::~PreimageOperation(void) - { - if(overlap_tester) - delete overlap_tester; - } + template + void PreimageOperation::execute(void) { + std::vector,Point> > gpu_ptr_data; + std::vector,Point> > cpu_ptr_data; + std::vector,Rect> > gpu_rect_data; + std::vector,Rect> > cpu_rect_data; + for (size_t i = 0; i < domain_transform.ptr_data.size(); i++) { + if (domain_transform.ptr_data[i].inst.get_location().kind() == + Memory::GPU_FB_MEM || domain_transform.ptr_data[i].inst.get_location().kind() == Memory::Z_COPY_MEM) { + gpu_ptr_data.push_back(domain_transform.ptr_data[i]); + } else { + cpu_ptr_data.push_back(domain_transform.ptr_data[i]); + } + } + for 
(size_t i = 0; i < domain_transform.range_data.size(); i++) { + if (domain_transform.range_data[i].inst.get_location().kind() == + Memory::GPU_FB_MEM || domain_transform.range_data[i].inst.get_location().kind() == Memory::Z_COPY_MEM) { + gpu_rect_data.push_back(domain_transform.range_data[i]); + } else { + cpu_rect_data.push_back(domain_transform.range_data[i]); + } + } + bool gpu_data = !gpu_ptr_data.empty() || !gpu_rect_data.empty(); + size_t opcount = cpu_ptr_data.size() + cpu_rect_data.size() + gpu_ptr_data.size() + gpu_rect_data.size(); + bool exclusive = (gpu_data && (opcount == 1)); + if (domain_transform.type == + DomainTransform::DomainTransformType::STRUCTURED && !gpu_data) { + for (size_t i = 0; i < preimages.size(); i++) { + SparsityMapImpl::lookup(preimages[i])->set_contributor_count(1); + } + + StructuredPreimageMicroOp *micro_op = + new StructuredPreimageMicroOp( + domain_transform.structured_transform, parent); + + for (size_t j = 0; j < targets.size(); j++) { + micro_op->add_sparsity_output(targets[j], preimages[j]); + } + micro_op->dispatch(this, true); + } else if (!DeppartConfig::cfg_disable_intersection_optimization && !gpu_data) { + // build the overlap tester based on the targets, since they're at least + // known + ComputeOverlapMicroOp *uop = + new ComputeOverlapMicroOp(this); + + remaining_sparse_images.store(domain_transform.ptr_data.size() + + domain_transform.range_data.size()); + contrib_counts.resize(preimages.size(), atomic(0)); + + // create a dummy async microop that lives until we've received all the + // sparse images + dummy_overlap_uop = new AsyncMicroOp(this, 0); + add_async_work_item(dummy_overlap_uop); + + // add each target, but also generate a bounding box for all of them + Rect target_bbox; + for (size_t i = 0; i < targets.size(); i++) { + uop->add_input_space(targets[i]); + if (i == 0) + target_bbox = targets[i].bounds; + else + target_bbox = target_bbox.union_bbox(targets[i].bounds); + } + + for (size_t i = 0; i < 
domain_transform.ptr_data.size(); i++) { + // in parallel, we will request the approximate images of each instance's + // data (ideally limited to the target_bbox) + ImageMicroOp *img = new ImageMicroOp( + target_bbox, domain_transform.ptr_data[i].index_space, + domain_transform.ptr_data[i].inst, + domain_transform.ptr_data[i].field_offset, false /*ptrs*/); + img->add_approx_output(i, this); + img->dispatch(this, false /* do not run in this thread */); + } + + for (size_t i = 0; i < domain_transform.range_data.size(); i++) { + // in parallel, we will request the approximate images of each instance's + // data (ideally limited to the target_bbox) + ImageMicroOp *img = new ImageMicroOp( + target_bbox, domain_transform.range_data[i].index_space, + domain_transform.range_data[i].inst, + domain_transform.range_data[i].field_offset, true /*ranges*/); + img->add_approx_output(i + domain_transform.ptr_data.size(), this); + img->dispatch(this, false /* do not run in this thread */); + } + + uop->dispatch(this, true /* ok to run in this thread */); + } else { + if (!exclusive) { + for (size_t i = 0; i < preimages.size(); i++) + SparsityMapImpl::lookup(preimages[i]) + ->set_contributor_count(opcount); + } + for (size_t i = 0; i < cpu_ptr_data.size(); i++) { + PreimageMicroOp *uop = new PreimageMicroOp( + parent, cpu_ptr_data[i].index_space, + cpu_ptr_data[i].inst, + cpu_ptr_data[i].field_offset, false /*ptrs*/); + for (size_t j = 0; j < targets.size(); j++) + uop->add_sparsity_output(targets[j], preimages[j]); + uop->dispatch(this, true /* ok to run in this thread */); + } + for (size_t i = 0; i < cpu_rect_data.size(); i++) { + PreimageMicroOp *uop = new PreimageMicroOp( + parent, cpu_rect_data[i].index_space, + cpu_rect_data[i].inst, + cpu_rect_data[i].field_offset, true /*ranges*/); + for (size_t j = 0; j < targets.size(); j++) + uop->add_sparsity_output(targets[j], preimages[j]); + uop->dispatch(this, true /* ok to run in this thread */); + } +#ifdef REALM_USE_CUDA + for 
(auto ptr_fdd : gpu_ptr_data) { + domain_transform.ptr_data = {ptr_fdd}; + GPUPreimageMicroOp *micro_op = + new GPUPreimageMicroOp( + domain_transform, parent, exclusive); + for (size_t j = 0; j < targets.size(); j++) { + micro_op->add_sparsity_output(targets[j], preimages[j]); + } + micro_op->dispatch(this, true); + } + for (auto range_fdd : gpu_rect_data) { + domain_transform.range_data = {range_fdd}; + GPUPreimageMicroOp *micro_op = + new GPUPreimageMicroOp( + domain_transform, parent, exclusive); + for (size_t j = 0; j < targets.size(); j++) { + micro_op->add_sparsity_output(targets[j], preimages[j]); + } + micro_op->dispatch(this, true); + } +#else + assert(!gpu_data); +#endif - template - IndexSpace PreimageOperation::add_target(const IndexSpace& target) - { - // try to filter out obviously empty targets - if(parent.empty() || target.empty()) - return IndexSpace::make_empty(); - - // otherwise it'll be something smaller than the current parent - IndexSpace preimage; - preimage.bounds = parent.bounds; - - // if the target has a sparsity map, use the same node - otherwise - // get a sparsity ID by round-robin'ing across the nodes that have field data - int target_node; - if(!target.dense()) - target_node = ID(target.sparsity).sparsity_creator_node(); - else if (!domain_transform.ptr_data.empty()) - target_node = - ID(domain_transform - .ptr_data[targets.size() % domain_transform.ptr_data.size()] - .inst) - .instance_owner_node(); - else - target_node = - ID(domain_transform - .range_data[targets.size() % domain_transform.range_data.size()] - .inst) - .instance_owner_node(); - SparsityMap sparsity = get_runtime()->get_available_sparsity_impl(target_node)->me.convert >(); - preimage.sparsity = sparsity; - - targets.push_back(target); - preimages.push_back(sparsity); - - return preimage; - } + } + } - template - void PreimageOperation::execute(void) { - if (domain_transform.type == - DomainTransform::DomainTransformType::STRUCTURED) { - for (size_t i = 0; i < 
preimages.size(); i++) { - SparsityMapImpl::lookup(preimages[i])->set_contributor_count(1); - } + template + void PreimageOperation::provide_sparse_image(int index, const Rect *rects, size_t count) { + // atomically check the overlap tester's readiness and queue us if not + bool tester_ready = false; + { + AutoLock<> al(mutex); + if (overlap_tester != 0) { + tester_ready = true; + } else { + std::vector > &r = pending_sparse_images[index]; + r.insert(r.end(), rects, rects + count); + } + } + + if (tester_ready) { + // see which of the targets this image overlaps + std::set overlaps; + overlap_tester->test_overlap(rects, count, overlaps); + if ((size_t) index < domain_transform.ptr_data.size()) { + log_part.info() << "image of ptr_data[" << index << "] overlaps " << overlaps.size() << " targets"; + PreimageMicroOp *uop = new PreimageMicroOp( + parent, domain_transform.ptr_data[index].index_space, + domain_transform.ptr_data[index].inst, + domain_transform.ptr_data[index].field_offset, false /*ptrs*/); + for (std::set::const_iterator it2 = overlaps.begin(); + it2 != overlaps.end(); + it2++) { + int j = *it2; + contrib_counts[j].fetch_add(1); + uop->add_sparsity_output(targets[j], preimages[j]); + } + uop->dispatch(this, false /* do not run in this thread */); + } else { + size_t rel_index = index - domain_transform.ptr_data.size(); + assert(rel_index < domain_transform.range_data.size()); + log_part.info() << "image of range_data[" << rel_index << "] overlaps " << overlaps.size() << + " targets"; + PreimageMicroOp *uop = new PreimageMicroOp( + parent, domain_transform.range_data[rel_index].index_space, + domain_transform.range_data[rel_index].inst, + domain_transform.range_data[rel_index].field_offset, + true /*ranges*/); + for (std::set::const_iterator it2 = overlaps.begin(); + it2 != overlaps.end(); + it2++) { + int j = *it2; + contrib_counts[j].fetch_add(1); + uop->add_sparsity_output(targets[j], preimages[j]); + } + uop->dispatch(this, false /* do not run in this 
thread */); + } + + // if these were the last sparse images, we can now set the contributor counts + int v = remaining_sparse_images.fetch_sub(1) - 1; + if (v == 0) { + for (size_t j = 0; j < preimages.size(); j++) { + log_part.info() << contrib_counts[j].load() << " total contributors to preimage " << j; + SparsityMapImpl::lookup(preimages[j])->set_contributor_count(contrib_counts[j].load()); + } + dummy_overlap_uop->mark_finished(true /*successful*/); + } + } + } - StructuredPreimageMicroOp *micro_op = - new StructuredPreimageMicroOp( - domain_transform.structured_transform, parent); + template + void PreimageOperation::set_overlap_tester(void *tester) { + // atomically set the overlap tester and see if there are any pending entries + std::map > > pending; + { + AutoLock<> al(mutex); + assert(overlap_tester == 0); + overlap_tester = static_cast *>(tester); + pending.swap(pending_sparse_images); + } + + // now issue work for any sparse images we got before the tester was ready + if (!pending.empty()) { + for (typename std::map > >::const_iterator it = pending.begin(); + it != pending.end(); + it++) { + // see which instance this is an image from + size_t idx = it->first; + // see which of the targets that image overlaps + std::set overlaps; + overlap_tester->test_overlap(&it->second[0], it->second.size(), overlaps); + if (idx < domain_transform.ptr_data.size()) { + log_part.info() << "image of ptr_data[" << idx << "] overlaps " << overlaps.size() << " targets"; + PreimageMicroOp *uop = + new PreimageMicroOp( + parent, domain_transform.ptr_data[idx].index_space, + domain_transform.ptr_data[idx].inst, + domain_transform.ptr_data[idx].field_offset, false /*ptrs*/); + for (std::set::const_iterator it2 = overlaps.begin(); + it2 != overlaps.end(); + it2++) { + int j = *it2; + contrib_counts[j].fetch_add(1); + uop->add_sparsity_output(targets[j], preimages[j]); + } + uop->dispatch(this, true /* ok to run in this thread */); + } else { + size_t rel_index = idx - 
domain_transform.ptr_data.size(); + assert(rel_index < domain_transform.range_data.size()); + log_part.info() << "image of range_data[" << rel_index << "] overlaps " << overlaps.size() << + " targets"; + PreimageMicroOp *uop = + new PreimageMicroOp( + parent, domain_transform.range_data[rel_index].index_space, + domain_transform.range_data[rel_index].inst, + domain_transform.range_data[rel_index].field_offset, + true /*ranges*/); + for (std::set::const_iterator it2 = overlaps.begin(); + it2 != overlaps.end(); + it2++) { + int j = *it2; + contrib_counts[j].fetch_add(1); + uop->add_sparsity_output(targets[j], preimages[j]); + } + uop->dispatch(this, true /* ok to run in this thread */); + } + } + + // if these were the last sparse images, we can now set the contributor counts + int v = remaining_sparse_images.fetch_sub(pending.size()) - pending.size(); + if (v == 0) { + for (size_t j = 0; j < preimages.size(); j++) { + log_part.info() << contrib_counts[j].load() << " total contributors to preimage " << j; + SparsityMapImpl::lookup(preimages[j])->set_contributor_count(contrib_counts[j].load()); + } + dummy_overlap_uop->mark_finished(true /*successful*/); + } + } + } - for (size_t j = 0; j < targets.size(); j++) { - micro_op->add_sparsity_output(targets[j], preimages[j]); - } - micro_op->dispatch(this, true); - } else { - if (!DeppartConfig::cfg_disable_intersection_optimization) { - // build the overlap tester based on the targets, since they're at least - // known - ComputeOverlapMicroOp *uop = - new ComputeOverlapMicroOp(this); - - remaining_sparse_images.store(domain_transform.ptr_data.size() + - domain_transform.range_data.size()); - contrib_counts.resize(preimages.size(), atomic(0)); - - // create a dummy async microop that lives until we've received all the - // sparse images - dummy_overlap_uop = new AsyncMicroOp(this, 0); - add_async_work_item(dummy_overlap_uop); - - // add each target, but also generate a bounding box for all of them - Rect target_bbox; - for 
(size_t i = 0; i < targets.size(); i++) { - uop->add_input_space(targets[i]); - if (i == 0) - target_bbox = targets[i].bounds; - else - target_bbox = target_bbox.union_bbox(targets[i].bounds); - } + template + void PreimageOperation::print(std::ostream &os) const { + os << "PreimageOperation(" << parent << ")"; + } - for (size_t i = 0; i < domain_transform.ptr_data.size(); i++) { - // in parallel, we will request the approximate images of each instance's - // data (ideally limited to the target_bbox) - ImageMicroOp *img = new ImageMicroOp( - target_bbox, domain_transform.ptr_data[i].index_space, - domain_transform.ptr_data[i].inst, - domain_transform.ptr_data[i].field_offset, false /*ptrs*/); - img->add_approx_output(i, this); - img->dispatch(this, false /* do not run in this thread */); - } + template + ActiveMessageHandlerReg > > PreimageOperation::areg; - for (size_t i = 0; i < domain_transform.range_data.size(); i++) { - // in parallel, we will request the approximate images of each instance's - // data (ideally limited to the target_bbox) - ImageMicroOp *img = new ImageMicroOp( - target_bbox, domain_transform.range_data[i].index_space, - domain_transform.range_data[i].inst, - domain_transform.range_data[i].field_offset, true /*ranges*/); - img->add_approx_output(i + domain_transform.ptr_data.size(), this); - img->dispatch(this, false /* do not run in this thread */); - } - uop->dispatch(this, true /* ok to run in this thread */); - } else { - for (size_t i = 0; i < preimages.size(); i++) - SparsityMapImpl::lookup(preimages[i]) - ->set_contributor_count(domain_transform.ptr_data.size() + - domain_transform.range_data.size()); - - for (size_t i = 0; i < domain_transform.ptr_data.size(); i++) { - PreimageMicroOp *uop = new PreimageMicroOp( - parent, domain_transform.ptr_data[i].index_space, - domain_transform.ptr_data[i].inst, - domain_transform.ptr_data[i].field_offset, false /*ptrs*/); - for (size_t j = 0; j < targets.size(); j++) - 
uop->add_sparsity_output(targets[j], preimages[j]); - uop->dispatch(this, true /* ok to run in this thread */); - } + //////////////////////////////////////////////////////////////////////// + // + // class ApproxImageResponseMessage - for (size_t i = 0; i < domain_transform.range_data.size(); i++) { - PreimageMicroOp *uop = new PreimageMicroOp( - parent, domain_transform.range_data[i].index_space, - domain_transform.range_data[i].inst, - domain_transform.range_data[i].field_offset, true /*ranges*/); - for (size_t j = 0; j < targets.size(); j++) - uop->add_sparsity_output(targets[j], preimages[j]); - uop->dispatch(this, true /* ok to run in this thread */); - } - } - } - } + template + /*static*/ void ApproxImageResponseMessage::handle_message(NodeID sender, + const ApproxImageResponseMessage &msg, + const void *data, size_t datalen) { + T *op = reinterpret_cast(msg.approx_output_op); + op->provide_sparse_image(msg.approx_output_index, + static_cast *>(data), + datalen / sizeof(Rect)); + } - template - void PreimageOperation::provide_sparse_image(int index, const Rect *rects, size_t count) - { - // atomically check the overlap tester's readiness and queue us if not - bool tester_ready = false; - { - AutoLock<> al(mutex); - if(overlap_tester != 0) { - tester_ready = true; - } else { - std::vector >& r = pending_sparse_images[index]; - r.insert(r.end(), rects, rects + count); - } - } + //////////////////////////////////////////////////////////////////////// + // + // class StructuredPreimageMicroOp - if(tester_ready) { - // see which of the targets this image overlaps - std::set overlaps; - overlap_tester->test_overlap(rects, count, overlaps); - if((size_t)index < domain_transform.ptr_data.size()) { - log_part.info() << "image of ptr_data[" << index << "] overlaps " << overlaps.size() << " targets"; - PreimageMicroOp *uop = new PreimageMicroOp( - parent, domain_transform.ptr_data[index].index_space, - domain_transform.ptr_data[index].inst, - 
domain_transform.ptr_data[index].field_offset, false /*ptrs*/); - for(std::set::const_iterator it2 = overlaps.begin(); - it2 != overlaps.end(); - it2++) { - int j = *it2; - contrib_counts[j].fetch_add(1); - uop->add_sparsity_output(targets[j], preimages[j]); + template + StructuredPreimageMicroOp::StructuredPreimageMicroOp( + const StructuredTransform &_transform, + IndexSpace _parent_space) + : transform(_transform), parent_space(_parent_space) { } - uop->dispatch(this, false /* do not run in this thread */); - } else { - size_t rel_index = index - domain_transform.ptr_data.size(); - assert(rel_index < domain_transform.range_data.size()); - log_part.info() << "image of range_data[" << rel_index << "] overlaps " << overlaps.size() << " targets"; - PreimageMicroOp *uop = new PreimageMicroOp( - parent, domain_transform.range_data[rel_index].index_space, - domain_transform.range_data[rel_index].inst, - domain_transform.range_data[rel_index].field_offset, - true /*ranges*/); - for(std::set::const_iterator it2 = overlaps.begin(); - it2 != overlaps.end(); - it2++) { - int j = *it2; - contrib_counts[j].fetch_add(1); - uop->add_sparsity_output(targets[j], preimages[j]); - } - uop->dispatch(this, false /* do not run in this thread */); - } - // if these were the last sparse images, we can now set the contributor counts - int v = remaining_sparse_images.fetch_sub(1) - 1; - if(v == 0) { - for(size_t j = 0; j < preimages.size(); j++) { - log_part.info() << contrib_counts[j].load() << " total contributors to preimage " << j; - SparsityMapImpl::lookup(preimages[j])->set_contributor_count(contrib_counts[j].load()); + template + StructuredPreimageMicroOp::~StructuredPreimageMicroOp(void) { } - dummy_overlap_uop->mark_finished(true /*successful*/); - } - } - } - - template - void PreimageOperation::set_overlap_tester(void *tester) - { - // atomically set the overlap tester and see if there are any pending entries - std::map > > pending; - { - AutoLock<> al(mutex); - 
assert(overlap_tester == 0); - overlap_tester = static_cast *>(tester); - pending.swap(pending_sparse_images); - } - // now issue work for any sparse images we got before the tester was ready - if(!pending.empty()) { - for(typename std::map > >::const_iterator it = pending.begin(); - it != pending.end(); - it++) { - // see which instance this is an image from - size_t idx = it->first; - // see which of the targets that image overlaps - std::set overlaps; - overlap_tester->test_overlap(&it->second[0], it->second.size(), overlaps); - if(idx < domain_transform.ptr_data.size()) { - log_part.info() << "image of ptr_data[" << idx << "] overlaps " << overlaps.size() << " targets"; - PreimageMicroOp *uop = - new PreimageMicroOp( - parent, domain_transform.ptr_data[idx].index_space, - domain_transform.ptr_data[idx].inst, - domain_transform.ptr_data[idx].field_offset, false /*ptrs*/); - for(std::set::const_iterator it2 = overlaps.begin(); - it2 != overlaps.end(); - it2++) { - int j = *it2; - contrib_counts[j].fetch_add(1); - uop->add_sparsity_output(targets[j], preimages[j]); - } - uop->dispatch(this, true /* ok to run in this thread */); - } else { - size_t rel_index = idx - domain_transform.ptr_data.size(); - assert(rel_index < domain_transform.range_data.size()); - log_part.info() << "image of range_data[" << rel_index << "] overlaps " << overlaps.size() << " targets"; - PreimageMicroOp *uop = - new PreimageMicroOp( - parent, domain_transform.range_data[rel_index].index_space, - domain_transform.range_data[rel_index].inst, - domain_transform.range_data[rel_index].field_offset, - true /*ranges*/); - for(std::set::const_iterator it2 = overlaps.begin(); - it2 != overlaps.end(); - it2++) { - int j = *it2; - contrib_counts[j].fetch_add(1); - uop->add_sparsity_output(targets[j], preimages[j]); - } - uop->dispatch(this, true /* ok to run in this thread */); + template + void StructuredPreimageMicroOp::add_sparsity_output( + IndexSpace _target, SparsityMap _sparsity) { + 
targets.push_back(_target); + sparsity_outputs.push_back(_sparsity); } - } - // if these were the last sparse images, we can now set the contributor counts - int v = remaining_sparse_images.fetch_sub(pending.size()) - pending.size(); - if(v == 0) { - for(size_t j = 0; j < preimages.size(); j++) { - log_part.info() << contrib_counts[j].load() << " total contributors to preimage " << j; - SparsityMapImpl::lookup(preimages[j])->set_contributor_count(contrib_counts[j].load()); + template + template + void StructuredPreimageMicroOp::populate_bitmasks( + std::map &bitmasks) { + Rect target_bbox = targets[0].bounds; + for (size_t i = 1; i < targets.size(); i++) { + target_bbox = target_bbox.union_bbox(targets[i].bounds); + } + for (IndexSpaceIterator it2(parent_space); it2.valid; it2.step()) { + Rect parent_bbox; + parent_bbox.lo = transform[it2.rect.lo]; + parent_bbox.hi = transform[it2.rect.hi]; + + if (target_bbox.intersection(parent_bbox).empty()) continue; + + for (PointInRectIterator pir(it2.rect); pir.valid; pir.step()) { + Point target_point = transform[pir.p]; + for (size_t i = 0; i < targets.size(); i++) { + if (targets[i].contains(target_point)) { + BM *&bmp = bitmasks[i]; + if (!bmp) bmp = new BM; + bmp->add_point(pir.p); + } + } + } + } } - dummy_overlap_uop->mark_finished(true /*successful*/); - } - } - } - template - void PreimageOperation::print(std::ostream& os) const - { - os << "PreimageOperation(" << parent << ")"; - } - - template - ActiveMessageHandlerReg > > PreimageOperation::areg; + template + void StructuredPreimageMicroOp::execute(void) { + TimeStamp ts("PreimageMicroOp::execute", true, &log_uop_timing); + std::map *> rect_map; + if (N2 > 1) { + for (const IndexSpace& target : targets) { + if (!target.dense()) { + target.sparsity.impl()->request_bvh(); + } + } + } - //////////////////////////////////////////////////////////////////////// - // - // class ApproxImageResponseMessage - - template - /*static*/ void 
ApproxImageResponseMessage::handle_message(NodeID sender, - const ApproxImageResponseMessage &msg, - const void *data, size_t datalen) - { - T *op = reinterpret_cast(msg.approx_output_op); - op->provide_sparse_image(msg.approx_output_index, - static_cast *>(data), - datalen / sizeof(Rect)); - } - - //////////////////////////////////////////////////////////////////////// - // - // class StructuredPreimageMicroOp + populate_bitmasks(rect_map); +#ifdef DEBUG_PARTITIONING + std::cout << rect_map.size() << " non-empty preimages present in instance " + << inst << std::endl; + for (typename std::map *>::const_iterator it = + rect_map.begin(); + it != rect_map.end(); it++) + std::cout << " " << targets[it->first] << " = " + << it->second->rects.size() << " rectangles" << std::endl; +#endif + // iterate over sparsity outputs and contribute to all (even if we + // didn't have any points found for it) + int empty_count = 0; + for (size_t i = 0; i < sparsity_outputs.size(); i++) { + SparsityMapImpl *impl = + SparsityMapImpl::lookup(sparsity_outputs[i]); + typename std::map *>::const_iterator it2 = + rect_map.find(i); + if (it2 != rect_map.end()) { + impl->contribute_dense_rect_list(it2->second->rects, true /*disjoint*/); + delete it2->second; + } else { + impl->contribute_nothing(); + empty_count++; + } + } + + if (empty_count > 0) { + log_part.info() << empty_count << " empty preimages (out of " + << sparsity_outputs.size() << ")"; + } + } - template - StructuredPreimageMicroOp::StructuredPreimageMicroOp( - const StructuredTransform &_transform, - IndexSpace _parent_space) - : transform(_transform), parent_space(_parent_space) {} + template + void StructuredPreimageMicroOp::dispatch( + PartitioningOperation *op, bool inline_ok) { + // need valid data for each target + for (size_t i = 0; i < targets.size(); i++) { + if (!targets[i].dense()) { + // it's safe to add the count after the registration only because we + // initialized the count to 2 instead of 1 + bool registered = 
SparsityMapImpl::lookup(targets[i].sparsity) + ->add_waiter(this, true /*precise*/); + if (registered) wait_count.fetch_add(1); + } + } + + // need valid data for the parent space too + if (!parent_space.dense()) { + // it's safe to add the count after the registration only because we + // initialized the count to 2 instead of 1 + bool registered = SparsityMapImpl::lookup(parent_space.sparsity) + ->add_waiter(this, true /*precise*/); + if (registered) wait_count.fetch_add(1); + } + + this->finish_dispatch(op, inline_ok); + } - template - StructuredPreimageMicroOp::~StructuredPreimageMicroOp(void) {} + //////////////////////////////////////////////////////////////////////// + // + // class GPUPreimageMicroOp +#ifdef REALM_USE_CUDA + + template + GPUPreimageMicroOp::GPUPreimageMicroOp( + const DomainTransform &_domain_transform, + IndexSpace _parent_space, bool _exclusive) + : domain_transform(_domain_transform), parent_space(_parent_space) { + this->exclusive = _exclusive; + areg.force_instantiation(); + // GPU setup (this->gpu, this->stream) deferred to execute(), which runs on the + // correct node after dispatch() has forwarded to the instance owner if needed. + } - template - void StructuredPreimageMicroOp::add_sparsity_output( - IndexSpace _target, SparsityMap _sparsity) { - targets.push_back(_target); - sparsity_outputs.push_back(_sparsity); - } + template + template + GPUPreimageMicroOp::GPUPreimageMicroOp( + NodeID _requestor, AsyncMicroOp *_async_microop, S& s) + : GPUMicroOp(_requestor, _async_microop) { + bool ok = true; + // domain_transform is always UNSTRUCTURED; only one of ptr_data/range_data + // is populated — a single bool distinguishes the two cases. 
+ bool use_ptr_data = false; + ok = ok && (s >> use_ptr_data); + if(use_ptr_data) { + domain_transform.type = + DomainTransform::DomainTransformType::UNSTRUCTURED_PTR; + size_t np = 0; + ok = ok && (s >> np); + domain_transform.ptr_data.resize(np); + for(size_t i = 0; i < np && ok; i++) + ok = ok && (s >> domain_transform.ptr_data[i].index_space) && + (s >> domain_transform.ptr_data[i].inst) && + (s >> domain_transform.ptr_data[i].field_offset) && + (s >> domain_transform.ptr_data[i].scratch_buffer); + } else { + domain_transform.type = + DomainTransform::DomainTransformType::UNSTRUCTURED_RANGE; + size_t nr = 0; + ok = ok && (s >> nr); + domain_transform.range_data.resize(nr); + for(size_t i = 0; i < nr && ok; i++) + ok = ok && (s >> domain_transform.range_data[i].index_space) && + (s >> domain_transform.range_data[i].inst) && + (s >> domain_transform.range_data[i].field_offset) && + (s >> domain_transform.range_data[i].scratch_buffer); + } + ok = ok && (s >> parent_space); + ok = ok && (s >> this->exclusive); + ok = ok && (s >> targets); + ok = ok && (s >> sparsity_outputs); + assert(ok); + (void)ok; + } - template - template - void StructuredPreimageMicroOp::populate_bitmasks( - std::map &bitmasks) { - Rect target_bbox = targets[0].bounds; - for (size_t i = 1; i < targets.size(); i++) { - target_bbox = target_bbox.union_bbox(targets[i].bounds); - } - for (IndexSpaceIterator it2(parent_space); it2.valid; it2.step()) { - Rect parent_bbox; - parent_bbox.lo = transform[it2.rect.lo]; - parent_bbox.hi = transform[it2.rect.hi]; - - if (target_bbox.intersection(parent_bbox).empty()) continue; - - for (PointInRectIterator pir(it2.rect); pir.valid; pir.step()) { - Point target_point = transform[pir.p]; - for (size_t i = 0; i < targets.size(); i++) { - if (targets[i].contains(target_point)) { - BM *&bmp = bitmasks[i]; - if (!bmp) bmp = new BM; - bmp->add_point(pir.p); - } - } - } - } - } + template + template + bool GPUPreimageMicroOp::serialize_params(S& s) const { + bool 
ok = true; + // domain_transform is always UNSTRUCTURED; only one of ptr_data/range_data + // is populated — a single bool distinguishes the two cases. + bool use_ptr_data = !domain_transform.ptr_data.empty(); + ok = ok && (s << use_ptr_data); + if(use_ptr_data) { + ok = ok && (s << domain_transform.ptr_data.size()); + for(size_t i = 0; i < domain_transform.ptr_data.size() && ok; i++) + ok = ok && (s << domain_transform.ptr_data[i].index_space) && + (s << domain_transform.ptr_data[i].inst) && + (s << domain_transform.ptr_data[i].field_offset) && + (s << domain_transform.ptr_data[i].scratch_buffer); + } else { + ok = ok && (s << domain_transform.range_data.size()); + for(size_t i = 0; i < domain_transform.range_data.size() && ok; i++) + ok = ok && (s << domain_transform.range_data[i].index_space) && + (s << domain_transform.range_data[i].inst) && + (s << domain_transform.range_data[i].field_offset) && + (s << domain_transform.range_data[i].scratch_buffer); + } + ok = ok && (s << parent_space); + ok = ok && (s << this->exclusive); + ok = ok && (s << targets); + ok = ok && (s << sparsity_outputs); + return ok; + } - template - void StructuredPreimageMicroOp::execute(void) - { - TimeStamp ts("PreimageMicroOp::execute", true, &log_uop_timing); - std::map *> rect_map; + template + GPUPreimageMicroOp::~GPUPreimageMicroOp(void) { + } - populate_bitmasks(rect_map); -#ifdef DEBUG_PARTITIONING - std::cout << rect_map.size() << " non-empty preimages present in instance " - << inst << std::endl; - for (typename std::map *>::const_iterator it = - rect_map.begin(); - it != rect_map.end(); it++) - std::cout << " " << targets[it->first] << " = " - << it->second->rects.size() << " rectangles" << std::endl; -#endif - // iterate over sparsity outputs and contribute to all (even if we - // didn't have any points found for it) - int empty_count = 0; - for (size_t i = 0; i < sparsity_outputs.size(); i++) { - SparsityMapImpl *impl = - SparsityMapImpl::lookup(sparsity_outputs[i]); - 
typename std::map *>::const_iterator it2 = - rect_map.find(i); - if (it2 != rect_map.end()) { - impl->contribute_dense_rect_list(it2->second->rects, true /*disjoint*/); - delete it2->second; - } else { - impl->contribute_nothing(); - empty_count++; - } - } + template + void GPUPreimageMicroOp::add_sparsity_output( + IndexSpace _target, SparsityMap _sparsity) { + targets.push_back(_target); + sparsity_outputs.push_back(_sparsity); + } - if (empty_count > 0) { - log_part.info() << empty_count << " empty preimages (out of " - << sparsity_outputs.size() << ")"; - } - } + template + void GPUPreimageMicroOp::execute(void) { + TimeStamp ts("GPUPreimageMicroOp::execute", true, &log_uop_timing); + // Resolve the local GPU processor now that we are guaranteed to be on the + // correct node (dispatch() forwarded us here if the instance was remote). + { + Memory my_mem = domain_transform.ptr_data.empty() ? + domain_transform.range_data[0].inst.get_location() : + domain_transform.ptr_data[0].inst.get_location(); + Processor best_proc; + assert(choose_proc(best_proc, my_mem)); + Cuda::GPUProcessor *gpu_proc = + dynamic_cast(get_runtime()->get_processor_impl(best_proc)); + assert(gpu_proc); + this->gpu = gpu_proc->gpu; + this->stream = gpu_proc->gpu->get_deppart_stream(); + } + Cuda::AutoGPUContext agc(this->gpu); + if (domain_transform.ptr_data.size() > 0) { + gpu_populate_bitmasks(); + } else if (domain_transform.range_data.size() > 0) { + gpu_populate_ranges(); + } + } - template - void StructuredPreimageMicroOp::dispatch( - PartitioningOperation *op, bool inline_ok) { - // need valid data for each target - for (size_t i = 0; i < targets.size(); i++) { - if (!targets[i].dense()) { - // it's safe to add the count after the registration only because we - // initialized the count to 2 instead of 1 - bool registered = SparsityMapImpl::lookup(targets[i].sparsity) - ->add_waiter(this, true /*precise*/); - if (registered) wait_count.fetch_add(1); - } - } + template + void 
GPUPreimageMicroOp::dispatch( + PartitioningOperation *op, bool inline_ok) { + // GPU preimage must execute on the node that owns the GPU memory + NodeID exec_node = domain_transform.ptr_data.empty() ? + ID(domain_transform.range_data[0].inst).instance_owner_node() : + ID(domain_transform.ptr_data[0].inst).instance_owner_node(); + if(this->exclusive) { + for(size_t i = 0; i < sparsity_outputs.size(); i++) + assert(NodeID(ID(sparsity_outputs[i]).sparsity_creator_node()) == exec_node); + } + if(exec_node != Network::my_node_id) { + PartitioningMicroOp::template forward_microop >(exec_node, op, this); + return; + } + + for (size_t i = 0; i < domain_transform.ptr_data.size(); i++) { + IndexSpace inst_space = domain_transform.ptr_data[i].index_space; + if (!inst_space.dense()) { + // it's safe to add the count after the registration only because we initialized + // the count to 2 instead of 1 + bool registered = SparsityMapImpl::lookup(inst_space.sparsity)->add_waiter(this, true /*precise*/); + if(registered) + this->wait_count.fetch_add(1); + } + } + for (size_t i = 0; i < domain_transform.range_data.size(); i++) { + IndexSpace inst_space = domain_transform.range_data[i].index_space; + if (!inst_space.dense()) { + // it's safe to add the count after the registration only because we initialized + // the count to 2 instead of 1 + bool registered = SparsityMapImpl::lookup(inst_space.sparsity)->add_waiter(this, true /*precise*/); + if(registered) + this->wait_count.fetch_add(1); + } + } + + // need valid data for each target + for (size_t i = 0; i < targets.size(); i++) { + if (!targets[i].dense()) { + // it's safe to add the count after the registration only because we + // initialized the count to 2 instead of 1 + bool registered = SparsityMapImpl::lookup(targets[i].sparsity) + ->add_waiter(this, true /*precise*/); + if (registered) this->wait_count.fetch_add(1); + } + } + + // need valid data for the parent space too + if (!parent_space.dense()) { + // it's safe to add 
the count after the registration only because we + // initialized the count to 2 instead of 1 + bool registered = SparsityMapImpl::lookup(parent_space.sparsity) + ->add_waiter(this, true /*precise*/); + if (registered) this->wait_count.fetch_add(1); + } + + this->finish_dispatch(op, inline_ok); + } - // need valid data for the parent space too - if (!parent_space.dense()) { - // it's safe to add the count after the registration only because we - // initialized the count to 2 instead of 1 - bool registered = SparsityMapImpl::lookup(parent_space.sparsity) - ->add_waiter(this, true /*precise*/); - if (registered) wait_count.fetch_add(1); - } + template + ActiveMessageHandlerReg > > + GPUPreimageMicroOp::areg; - finish_dispatch(op, inline_ok); - } +#endif - // instantiations of templates handled in preimage_tmpl.cc + // instantiations of templates handled in preimage_tmpl.cc }; // namespace Realm diff --git a/src/realm/deppart/preimage.h b/src/realm/deppart/preimage.h index c08c0dfd30..01032d2517 100644 --- a/src/realm/deppart/preimage.h +++ b/src/realm/deppart/preimage.h @@ -20,7 +20,8 @@ #ifndef REALM_DEPPART_PREIMAGE_H #define REALM_DEPPART_PREIMAGE_H -#include "realm/deppart/partitions.h" +#include "partitions.h" +#include "realm/deppart/rectlist.h" namespace Realm { @@ -99,6 +100,7 @@ namespace Realm { protected: static ActiveMessageHandlerReg > > areg; + NodeID exclusive_gpu_exec_node(void) const; IndexSpace parent; DomainTransform domain_transform; @@ -110,6 +112,7 @@ namespace Realm { atomic remaining_sparse_images; std::vector > contrib_counts; AsyncMicroOp *dummy_overlap_uop; + int exclusive_gpu_owner; }; template @@ -152,6 +155,50 @@ namespace Realm { std::vector > sparsity_outputs; }; + #ifdef REALM_USE_CUDA + + template + class GPUPreimageMicroOp : public GPUMicroOp { + public: + static const int DIM = N; + typedef T IDXTYPE; + static const int DIM2 = N2; + typedef T2 IDXTYPE2; + + GPUPreimageMicroOp(const DomainTransform &_domain_transform, + IndexSpace 
_parent_space, bool _exclusive); + + virtual ~GPUPreimageMicroOp(void); + + void add_sparsity_output(IndexSpace _target, SparsityMap _sparsity); + + virtual void execute(void); + + void dispatch(PartitioningOperation *op, bool inline_ok); + + protected: + friend struct RemoteMicroOpMessage >; + static ActiveMessageHandlerReg > > areg; + + friend class PartitioningMicroOp; + template + REALM_ATTR_WARN_UNUSED(bool serialize_params(S& s) const); + + // construct from received packet + template + GPUPreimageMicroOp(NodeID _requestor, AsyncMicroOp *_async_microop, S& s); + + void gpu_populate_ranges(); + void gpu_populate_bitmasks(); + + DomainTransform domain_transform; + IndexSpace parent_space; + std::vector > targets; + std::vector > sparsity_outputs; + }; + +#endif + }; // namespace Realm #endif // REALM_DEPPART_PREIMAGE_H diff --git a/src/realm/deppart/preimage_gpu_impl.hpp b/src/realm/deppart/preimage_gpu_impl.hpp new file mode 100644 index 0000000000..6934772fe4 --- /dev/null +++ b/src/realm/deppart/preimage_gpu_impl.hpp @@ -0,0 +1,631 @@ +#pragma once +#include "realm/deppart/preimage.h" +#include "realm/deppart/preimage_gpu_kernels.hpp" +#include "realm/deppart/byfield_gpu_kernels.hpp" +#include "realm/deppart/partitions_gpu_impl.hpp" +#include +#include +#include "realm/nvtx.h" + +namespace Realm { + + template + void GPUPreimageMicroOp::gpu_populate_ranges() { + if (targets.size() == 0) { + assert(sparsity_outputs.empty()); + return; + } + + RegionInstance buffer = domain_transform.range_data[0].scratch_buffer; + + size_t tile_size = buffer.get_layout()->bytes_used; + //std::cout << "Using tile size of " << tile_size << " bytes." 
<< std::endl; + Arena buffer_arena(buffer); + + NVTX_DEPPART(gpu_preimage_range); + + Memory sysmem; + assert(find_memory(sysmem, Memory::SYSTEM_MEM, buffer_arena.location)); + + CUstream stream = this->stream->get_stream(); + + collapsed_space inst_space; + + // We combine all of our instances into one to batch work, tracking the offsets between instances. + inst_space.offsets = buffer_arena.alloc(domain_transform.range_data.size() + 1); + inst_space.num_children = domain_transform.range_data.size(); + + Arena sys_arena; + GPUMicroOp::collapse_multi_space(domain_transform.range_data, inst_space, sys_arena, stream); + + collapsed_space collapsed_parent; + + // We collapse the parent space to undifferentiate between dense and sparse and match downstream APIs. + GPUMicroOp::collapse_parent_space(parent_space, collapsed_parent, buffer_arena, stream); + + + // This is used for count + emit: first pass counts how many rectangles survive intersection, second pass uses the counter + // to figure out where to write each rectangle. + uint32_t* d_inst_counters = buffer_arena.alloc(2 * domain_transform.range_data.size() + 1); + + // This will be a prefix sum over the counters, used first to figure out where to write in the emit phase, and second + // to track which instance each rectangle came from in the populate phase. 
+ uint32_t* d_inst_prefix = d_inst_counters + domain_transform.range_data.size(); + + collapsed_space target_space; + target_space.offsets = buffer_arena.alloc(targets.size() + 1); + target_space.num_children = targets.size(); + + GPUMicroOp::collapse_multi_space(targets, target_space, buffer_arena, stream); + + std::vector,N,T>> h_accessors(domain_transform.range_data.size()); + for (size_t i = 0; i < domain_transform.range_data.size(); ++i) { + h_accessors[i] = AffineAccessor,N,T>(domain_transform.range_data[i].inst, domain_transform.range_data[i].field_offset); + } + AffineAccessor,N,T>* d_accessors = + buffer_arena.alloc,N,T>>(domain_transform.range_data.size()); + CUDA_CHECK(cudaMemcpyAsync(d_accessors, h_accessors.data(), + domain_transform.range_data.size() * sizeof(AffineAccessor,N,T>), + cudaMemcpyHostToDevice, stream), stream); + + uint32_t* d_target_counters = buffer_arena.alloc(2*targets.size() + 1); + uint32_t* d_targets_prefix = d_target_counters + targets.size(); + CUDA_CHECK(cudaMemsetAsync(d_target_counters, 0, targets.size() * sizeof(uint32_t), stream), stream); + + buffer_arena.commit(false); + + size_t num_output = 0; + RectDesc* output_start = nullptr; + size_t num_completed = 0; + size_t curr_tile = tile_size / 2; + int count = 0; + if (count) {} + bool host_fallback = false; + std::vector*> host_rect_buffers(targets.size(), nullptr); + std::vector entry_counts(targets.size(), 0); + while (num_completed < inst_space.num_entries) { + try { + + //std::cout << "Preimage iteration " << count++ << ", completed " << num_completed << " / " << inst_space.num_entries << " entries." 
<< std::endl; + buffer_arena.start(); + if (num_completed + curr_tile > inst_space.num_entries) { + curr_tile = inst_space.num_entries - num_completed; + } + + collapsed_space inst_space_tile = inst_space; + inst_space_tile.num_entries = curr_tile; + inst_space_tile.entries_buffer = buffer_arena.alloc>(curr_tile); + CUDA_CHECK(cudaMemcpyAsync(inst_space_tile.entries_buffer, inst_space.entries_buffer + num_completed, curr_tile * sizeof(SparsityMapEntry), cudaMemcpyHostToDevice, stream), stream); + + size_t num_valid_rects; + Rect* d_valid_rects; + // Here we intersect the instance spaces with the parent space, and make sure we know which instance each resulting rectangle came from. + GPUMicroOp::template construct_input_rectlist>(inst_space_tile, collapsed_parent, d_valid_rects, num_valid_rects, d_inst_counters, d_inst_prefix, buffer_arena, stream); + + if (num_valid_rects == 0) { + num_completed += curr_tile; + subtract_const<<>>(inst_space.offsets, domain_transform.range_data.size()+1, curr_tile); + KERNEL_CHECK(stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + curr_tile = tile_size / 2; + continue; + } + + // Prefix sum the valid rectangles by volume. 
+ size_t total_pts; + size_t* d_prefix_rects; + GPUMicroOp::volume_prefix_sum(d_valid_rects, num_valid_rects, d_prefix_rects, total_pts, buffer_arena, stream); + + PointDesc* d_points; + size_t num_valid_points; + + CUDA_CHECK(cudaMemsetAsync(d_target_counters, 0, targets.size() * sizeof(uint32_t), stream), stream); + + if (target_space.num_entries > targets.size()) { + + BVH preimage_bvh; + GPUMicroOp::build_bvh(target_space, preimage_bvh, buffer_arena, stream); + + preimage_gpuPopulateBitmasksPtrsKernel < N, T, N2, T2 ><<>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, preimage_bvh.root, preimage_bvh.childLeft, preimage_bvh.childRight, preimage_bvh.indices, + preimage_bvh.labels, preimage_bvh.boxes, total_pts, num_valid_rects, domain_transform.range_data.size(), preimage_bvh.num_leaves, nullptr, d_target_counters, nullptr); + KERNEL_CHECK(stream); + + std::vector h_target_counters(targets.size()+1); + h_target_counters[0] = 0; // prefix sum starts at 0 + CUDA_CHECK(cudaMemcpyAsync(h_target_counters.data()+1, d_target_counters, targets.size() * sizeof(uint32_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + for (size_t i = 0; i < targets.size(); ++i) { + h_target_counters[i+1] += h_target_counters[i]; + } + + num_valid_points = h_target_counters[targets.size()]; + + if (num_valid_points == 0) { + num_completed += curr_tile; + subtract_const<<>>(inst_space.offsets, domain_transform.range_data.size()+1, curr_tile); + KERNEL_CHECK(stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + curr_tile = tile_size / 2; + continue; + } + + CUDA_CHECK(cudaMemcpyAsync(d_targets_prefix, h_target_counters.data(), (targets.size() + 1) * sizeof(uint32_t), cudaMemcpyHostToDevice, stream), stream); + + buffer_arena.flip_parity(); + d_points = buffer_arena.alloc>(num_valid_points); + + CUDA_CHECK(cudaMemsetAsync(d_target_counters, 0, (targets.size()) * sizeof(uint32_t), stream), stream); + + 
preimage_gpuPopulateBitmasksPtrsKernel < N, T, N2, T2 ><<>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, preimage_bvh.root, preimage_bvh.childLeft, preimage_bvh.childRight, preimage_bvh.indices, + preimage_bvh.labels, preimage_bvh.boxes, total_pts, num_valid_rects, domain_transform.range_data.size(), preimage_bvh.num_leaves, d_targets_prefix, d_target_counters, d_points); + KERNEL_CHECK(stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + } else { + preimage_dense_populate_bitmasks_kernel< N, T, N2, T2 ><<< COMPUTE_GRID(total_pts), THREADS_PER_BLOCK, 0, stream>>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, target_space.entries_buffer, target_space.offsets, total_pts, + num_valid_rects, domain_transform.range_data.size(), targets.size(), nullptr, d_target_counters, nullptr); + KERNEL_CHECK(stream); + + std::vector h_target_counters(targets.size()+1); + h_target_counters[0] = 0; // prefix sum starts at 0 + CUDA_CHECK(cudaMemcpyAsync(h_target_counters.data()+1, d_target_counters, targets.size() * sizeof(uint32_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + for (size_t i = 0; i < targets.size(); ++i) { + h_target_counters[i+1] += h_target_counters[i]; + } + + num_valid_points = h_target_counters[targets.size()]; + + if (num_valid_points == 0) { + num_completed += curr_tile; + subtract_const<<>>(inst_space.offsets, domain_transform.range_data.size()+1, curr_tile); + KERNEL_CHECK(stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + curr_tile = tile_size / 2; + continue; + } + + CUDA_CHECK(cudaMemcpyAsync(d_targets_prefix, h_target_counters.data(), (targets.size() + 1) * sizeof(uint32_t), cudaMemcpyHostToDevice, stream), stream); + + buffer_arena.flip_parity(); + d_points = buffer_arena.alloc>(num_valid_points); + + CUDA_CHECK(cudaMemsetAsync(d_target_counters, 0, (targets.size()) * sizeof(uint32_t), stream), stream); + + preimage_dense_populate_bitmasks_kernel< N, T, 
N2, T2 ><<< COMPUTE_GRID(total_pts), THREADS_PER_BLOCK, 0, stream>>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, target_space.entries_buffer, target_space.offsets, total_pts, + num_valid_rects, domain_transform.range_data.size(), targets.size(), d_targets_prefix, d_target_counters, d_points); + KERNEL_CHECK(stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + } + + buffer_arena.flip_parity(); + buffer_arena.flip_parity(); + d_points = buffer_arena.alloc>(num_valid_points); + + size_t num_new_rects = num_output == 0 ? 1 : 2; + assert(!buffer_arena.get_parity()); + RectDesc* d_new_rects; + + this->complete_pipeline(d_points, num_valid_points, d_new_rects, num_new_rects, buffer_arena, + /* the Container: */ sparsity_outputs, + /* getIndex: */ [&](auto const& elem){ + // elem is a SparsityMap from the vector + return size_t(&elem - sparsity_outputs.data()); + }, + /* getMap: */ [&](auto const& elem){ + // return the SparsityMap key itself + return elem; + }); + + if (host_fallback) { + this->split_output(d_new_rects, num_new_rects, host_rect_buffers, entry_counts, buffer_arena); + } + + if (num_output==0 || host_fallback) { + num_output = num_new_rects; + num_completed += curr_tile; + output_start = d_new_rects; + subtract_const<<>>(inst_space.offsets, domain_transform.range_data.size()+1, curr_tile); + KERNEL_CHECK(stream); + curr_tile = tile_size / 2; + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + continue; + } + + RectDesc* d_old_rects = buffer_arena.alloc>(num_output); + assert(d_old_rects == d_new_rects + num_new_rects); + CUDA_CHECK(cudaMemcpyAsync(d_old_rects, output_start, num_output * sizeof(RectDesc), cudaMemcpyDeviceToDevice, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + size_t num_final_rects = 1; + + //Send it off for processing + this->complete_rect_pipeline(d_new_rects, num_output + num_new_rects, output_start, num_final_rects, buffer_arena, + /* the Container: */ sparsity_outputs, + /* 
getIndex: */ [&](auto const& elem){ + // elem is a SparsityMap from the vector + return size_t(&elem - sparsity_outputs.data()); + }, + /* getMap: */ [&](auto const& elem){ + // return the SparsityMap key itself + return elem; + }); + num_completed += curr_tile; + num_output = num_final_rects; + subtract_const<<>>(inst_space.offsets, domain_transform.range_data.size()+1, curr_tile); + KERNEL_CHECK(stream); + curr_tile = tile_size / 2; + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + } catch (arena_oom&) { + //std::cout << "Caught arena_oom, reducing tile size from " << curr_tile << " to " << curr_tile / 2 << std::endl; + curr_tile /= 2; + if (curr_tile == 0) { + if (host_fallback) { + GPUMicroOp::shatter_rects(inst_space, num_completed, stream); + curr_tile = 1; + } else { + host_fallback = true; + if (num_output > 0) { + this->split_output(output_start, num_output, host_rect_buffers, entry_counts, buffer_arena); + } + curr_tile = tile_size / 2; + } + } + } + } + if (num_output == 0) { + for (SparsityMap it : sparsity_outputs) { + SparsityMapImpl *impl = SparsityMapImpl::lookup(it); + if (this->exclusive) { + impl->gpu_finalize(); + } else { + impl->contribute_nothing(); + } + } + return; + } + + if (!host_fallback) { + try { + this->send_output(output_start, num_output, buffer_arena, sparsity_outputs, + /* getIndex: */ [&](auto const& elem){ + // elem is a SparsityMap from the vector + return size_t(&elem - sparsity_outputs.data()); + }, + /* getMap: */ [&](auto const& elem){ + // return the SparsityMap key itself + return elem; + }); + } catch (arena_oom&) { + this->split_output(output_start, num_output, host_rect_buffers, entry_counts, buffer_arena); + host_fallback = true; + } + } + + if (host_fallback) { + for (size_t idx = 0; idx < sparsity_outputs.size(); ++idx) { + SparsityMapImpl *impl = SparsityMapImpl::lookup(sparsity_outputs[idx]); + if (this->exclusive) { + impl->set_contributor_count(1); + } + if (entry_counts[idx] > 0) { + span> 
h_rects_span(host_rect_buffers[idx], entry_counts[idx]); + impl->contribute_dense_rect_list(h_rects_span, true); + deppart_host_free(host_rect_buffers[idx]); + } else { + impl->contribute_nothing(); + } + } + } + } + + template + void GPUPreimageMicroOp::gpu_populate_bitmasks() { + if (targets.size() == 0) { + assert(sparsity_outputs.empty()); + return; + } + + RegionInstance buffer = domain_transform.ptr_data[0].scratch_buffer; + + size_t tile_size = buffer.get_layout()->bytes_used; + //std::cout << "Using tile size of " << tile_size << " bytes." << std::endl; + Arena buffer_arena(buffer); + + Memory sysmem; + assert(find_memory(sysmem, Memory::SYSTEM_MEM, buffer_arena.location)); + + CUstream stream = this->stream->get_stream(); + + NVTX_DEPPART(gpu_preimage); + + collapsed_space inst_space; + + // We combine all of our instances into one to batch work, tracking the offsets between instances. + inst_space.offsets = buffer_arena.alloc(domain_transform.ptr_data.size() + 1); + inst_space.num_children = domain_transform.ptr_data.size(); + + Arena sys_arena; + GPUMicroOp::collapse_multi_space(domain_transform.ptr_data, inst_space, sys_arena, stream); + + collapsed_space collapsed_parent; + + // We collapse the parent space to undifferentiate between dense and sparse and match downstream APIs. + GPUMicroOp::collapse_parent_space(parent_space, collapsed_parent, buffer_arena, stream); + + + // This is used for count + emit: first pass counts how many rectangles survive intersection, second pass uses the counter + // to figure out where to write each rectangle. + uint32_t* d_inst_counters = buffer_arena.alloc(2 * domain_transform.ptr_data.size() + 1); + + // This will be a prefix sum over the counters, used first to figure out where to write in the emit phase, and second + // to track which instance each rectangle came from in the populate phase. 
+ uint32_t* d_inst_prefix = d_inst_counters + domain_transform.ptr_data.size(); + + collapsed_space target_space; + target_space.offsets = buffer_arena.alloc(targets.size() + 1); + target_space.num_children = targets.size(); + + GPUMicroOp::collapse_multi_space(targets, target_space, buffer_arena, stream); + + std::vector,N,T>> h_accessors(domain_transform.ptr_data.size()); + for (size_t i = 0; i < domain_transform.ptr_data.size(); ++i) { + h_accessors[i] = AffineAccessor,N,T>(domain_transform.ptr_data[i].inst, domain_transform.ptr_data[i].field_offset); + } + AffineAccessor,N,T>* d_accessors = + buffer_arena.alloc,N,T>>(domain_transform.ptr_data.size()); + CUDA_CHECK(cudaMemcpyAsync(d_accessors, h_accessors.data(), + domain_transform.ptr_data.size() * sizeof(AffineAccessor,N,T>), + cudaMemcpyHostToDevice, stream), stream); + + uint32_t* d_target_counters = buffer_arena.alloc(2*targets.size() + 1); + uint32_t* d_targets_prefix = d_target_counters + targets.size(); + CUDA_CHECK(cudaMemsetAsync(d_target_counters, 0, targets.size() * sizeof(uint32_t), stream), stream); + + buffer_arena.commit(false); + + size_t num_output = 0; + RectDesc* output_start = nullptr; + size_t num_completed = 0; + size_t curr_tile = tile_size / 2; + int count = 0; + if (count) {} + bool host_fallback = false; + std::vector*> host_rect_buffers(targets.size(), nullptr); + std::vector entry_counts(targets.size(), 0); + while (num_completed < inst_space.num_entries) { + try { + + //std::cout << "Preimage iteration " << count++ << ", completed " << num_completed << " / " << inst_space.num_entries << " entries." 
<< std::endl; + buffer_arena.start(); + if (num_completed + curr_tile > inst_space.num_entries) { + curr_tile = inst_space.num_entries - num_completed; + } + + collapsed_space inst_space_tile = inst_space; + inst_space_tile.num_entries = curr_tile; + inst_space_tile.entries_buffer = buffer_arena.alloc>(curr_tile); + CUDA_CHECK(cudaMemcpyAsync(inst_space_tile.entries_buffer, inst_space.entries_buffer + num_completed, curr_tile * sizeof(SparsityMapEntry), cudaMemcpyHostToDevice, stream), stream); + + size_t num_valid_rects; + Rect* d_valid_rects; + // Here we intersect the instance spaces with the parent space, and make sure we know which instance each resulting rectangle came from. + GPUMicroOp::template construct_input_rectlist>(inst_space_tile, collapsed_parent, d_valid_rects, num_valid_rects, d_inst_counters, d_inst_prefix, buffer_arena, stream); + + if (num_valid_rects == 0) { + num_completed += curr_tile; + subtract_const<<>>(inst_space.offsets, domain_transform.ptr_data.size()+1, curr_tile); + KERNEL_CHECK(stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + curr_tile = tile_size / 2; + continue; + } + + // Prefix sum the valid rectangles by volume. 
+ size_t total_pts; + size_t* d_prefix_rects; + GPUMicroOp::volume_prefix_sum(d_valid_rects, num_valid_rects, d_prefix_rects, total_pts, buffer_arena, stream); + + PointDesc* d_points; + size_t num_valid_points; + + CUDA_CHECK(cudaMemsetAsync(d_target_counters, 0, (targets.size()) * sizeof(uint32_t), stream), stream); + + if (target_space.num_entries > targets.size()) { + + BVH preimage_bvh; + GPUMicroOp::build_bvh(target_space, preimage_bvh, buffer_arena, stream); + + preimage_gpuPopulateBitmasksPtrsKernel < N, T, N2, T2 ><<>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, preimage_bvh.root, preimage_bvh.childLeft, preimage_bvh.childRight, preimage_bvh.indices, + preimage_bvh.labels, preimage_bvh.boxes, total_pts, num_valid_rects, domain_transform.ptr_data.size(), preimage_bvh.num_leaves, nullptr, d_target_counters, nullptr); + KERNEL_CHECK(stream); + + std::vector h_target_counters(targets.size()+1); + h_target_counters[0] = 0; // prefix sum starts at 0 + CUDA_CHECK(cudaMemcpyAsync(h_target_counters.data()+1, d_target_counters, targets.size() * sizeof(uint32_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + for (size_t i = 0; i < targets.size(); ++i) { + h_target_counters[i+1] += h_target_counters[i]; + } + + num_valid_points = h_target_counters[targets.size()]; + + if (num_valid_points == 0) { + num_completed += curr_tile; + subtract_const<<>>(inst_space.offsets, domain_transform.ptr_data.size()+1, curr_tile); + KERNEL_CHECK(stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + curr_tile = tile_size / 2; + continue; + } + + CUDA_CHECK(cudaMemcpyAsync(d_targets_prefix, h_target_counters.data(), (targets.size() + 1) * sizeof(uint32_t), cudaMemcpyHostToDevice, stream), stream); + + buffer_arena.flip_parity(); + d_points = buffer_arena.alloc>(num_valid_points); + + CUDA_CHECK(cudaMemsetAsync(d_target_counters, 0, (targets.size()) * sizeof(uint32_t), stream), stream); + + 
preimage_gpuPopulateBitmasksPtrsKernel < N, T, N2, T2 ><<>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, preimage_bvh.root, preimage_bvh.childLeft, preimage_bvh.childRight, preimage_bvh.indices, + preimage_bvh.labels, preimage_bvh.boxes, total_pts, num_valid_rects, domain_transform.ptr_data.size(), preimage_bvh.num_leaves, d_targets_prefix, d_target_counters, d_points); + KERNEL_CHECK(stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + } else { + preimage_dense_populate_bitmasks_kernel< N, T, N2, T2 ><<< COMPUTE_GRID(total_pts), THREADS_PER_BLOCK, 0, stream>>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, target_space.entries_buffer, target_space.offsets, total_pts, + num_valid_rects, domain_transform.ptr_data.size(), targets.size(), nullptr, d_target_counters, nullptr); + KERNEL_CHECK(stream); + + std::vector h_target_counters(targets.size()+1); + h_target_counters[0] = 0; // prefix sum starts at 0 + CUDA_CHECK(cudaMemcpyAsync(h_target_counters.data()+1, d_target_counters, targets.size() * sizeof(uint32_t), cudaMemcpyDeviceToHost, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + for (size_t i = 0; i < targets.size(); ++i) { + h_target_counters[i+1] += h_target_counters[i]; + } + + num_valid_points = h_target_counters[targets.size()]; + + if (num_valid_points == 0) { + num_completed += curr_tile; + subtract_const<<>>(inst_space.offsets, domain_transform.ptr_data.size()+1, curr_tile); + KERNEL_CHECK(stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + curr_tile = tile_size / 2; + continue; + } + + CUDA_CHECK(cudaMemcpyAsync(d_targets_prefix, h_target_counters.data(), (targets.size() + 1) * sizeof(uint32_t), cudaMemcpyHostToDevice, stream), stream); + + buffer_arena.flip_parity(); + d_points = buffer_arena.alloc>(num_valid_points); + + CUDA_CHECK(cudaMemsetAsync(d_target_counters, 0, (targets.size()) * sizeof(uint32_t), stream), stream); + + preimage_dense_populate_bitmasks_kernel< N, T, N2, T2 
><<< COMPUTE_GRID(total_pts), THREADS_PER_BLOCK, 0, stream>>>(d_accessors, d_valid_rects, d_prefix_rects, d_inst_prefix, target_space.entries_buffer, target_space.offsets, total_pts, + num_valid_rects, domain_transform.ptr_data.size(), targets.size(), d_targets_prefix, d_target_counters, d_points); + KERNEL_CHECK(stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + } + + buffer_arena.flip_parity(); + buffer_arena.flip_parity(); + d_points = buffer_arena.alloc>(num_valid_points); + + size_t num_new_rects = num_output == 0 ? 1 : 2; + assert(!buffer_arena.get_parity()); + RectDesc* d_new_rects; + + this->complete_pipeline(d_points, num_valid_points, d_new_rects, num_new_rects, buffer_arena, + /* the Container: */ sparsity_outputs, + /* getIndex: */ [&](auto const& elem){ + // elem is a SparsityMap from the vector + return size_t(&elem - sparsity_outputs.data()); + }, + /* getMap: */ [&](auto const& elem){ + // return the SparsityMap key itself + return elem; + }); + + if (host_fallback) { + this->split_output(d_new_rects, num_new_rects, host_rect_buffers, entry_counts, buffer_arena); + } + + if (num_output==0 || host_fallback) { + num_output = num_new_rects; + num_completed += curr_tile; + output_start = d_new_rects; + subtract_const<<>>(inst_space.offsets, domain_transform.ptr_data.size()+1, curr_tile); + KERNEL_CHECK(stream); + curr_tile = tile_size / 2; + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + continue; + } + + RectDesc* d_old_rects = buffer_arena.alloc>(num_output); + assert(d_old_rects == d_new_rects + num_new_rects); + CUDA_CHECK(cudaMemcpyAsync(d_old_rects, output_start, num_output * sizeof(RectDesc), cudaMemcpyDeviceToDevice, stream), stream); + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + size_t num_final_rects = 1; + + //Send it off for processing + this->complete_rect_pipeline(d_new_rects, num_output + num_new_rects, output_start, num_final_rects, buffer_arena, + /* the Container: */ sparsity_outputs, + /* getIndex: */ 
[&](auto const& elem){ + // elem is a SparsityMap from the vector + return size_t(&elem - sparsity_outputs.data()); + }, + /* getMap: */ [&](auto const& elem){ + // return the SparsityMap key itself + return elem; + }); + num_completed += curr_tile; + num_output = num_final_rects; + subtract_const<<>>(inst_space.offsets, domain_transform.ptr_data.size()+1, curr_tile); + KERNEL_CHECK(stream); + curr_tile = tile_size / 2; + CUDA_CHECK(cudaStreamSynchronize(stream), stream); + + } catch (arena_oom&) { + //std::cout << "Caught arena_oom, reducing tile size from " << curr_tile << " to " << curr_tile / 2 << std::endl; + curr_tile /= 2; + if (curr_tile == 0) { + if (host_fallback) { + GPUMicroOp::shatter_rects(inst_space, num_completed, stream); + curr_tile = 1; + } else { + host_fallback = true; + if (num_output > 0) { + this->split_output(output_start, num_output, host_rect_buffers, entry_counts, buffer_arena); + } + curr_tile = tile_size / 2; + } + } + } + } + if (num_output == 0) { + for (SparsityMap it : sparsity_outputs) { + SparsityMapImpl *impl = SparsityMapImpl::lookup(it); + if (this->exclusive) { + impl->gpu_finalize(); + } else { + impl->contribute_nothing(); + } + } + return; + } + + if (!host_fallback) { + try { + this->send_output(output_start, num_output, buffer_arena, sparsity_outputs, + /* getIndex: */ [&](auto const& elem){ + // elem is a SparsityMap from the vector + return size_t(&elem - sparsity_outputs.data()); + }, + /* getMap: */ [&](auto const& elem){ + // return the SparsityMap key itself + return elem; + }); + } catch (arena_oom&) { + this->split_output(output_start, num_output, host_rect_buffers, entry_counts, buffer_arena); + host_fallback = true; + } + } + + if (host_fallback) { + for (size_t idx = 0; idx < sparsity_outputs.size(); ++idx) { + SparsityMapImpl *impl = SparsityMapImpl::lookup(sparsity_outputs[idx]); + if (this->exclusive) { + impl->set_contributor_count(1); + } + if (entry_counts[idx] > 0) { + span> 
h_rects_span(host_rect_buffers[idx], entry_counts[idx]); + impl->contribute_dense_rect_list(h_rects_span, true); + deppart_host_free(host_rect_buffers[idx]); + } else { + impl->contribute_nothing(); + } + } + } + } +} diff --git a/src/realm/deppart/preimage_gpu_kernels.hpp b/src/realm/deppart/preimage_gpu_kernels.hpp new file mode 100644 index 0000000000..10d9c5225c --- /dev/null +++ b/src/realm/deppart/preimage_gpu_kernels.hpp @@ -0,0 +1,256 @@ +#pragma once +#include "realm/deppart/preimage.h" + +namespace Realm { + + +template +__global__ void preimage_build_morton_codes( + const SparsityMapEntry* d_targets_entries, + const size_t* d_offsets_rects, + const Rect* d_global_bounds, + size_t total_rects, + size_t num_targets, + uint64_t* d_morton_codes, + uint64_t* d_indices, + uint64_t* d_targets_indices) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= total_rects) return; + const auto &entry = d_targets_entries[idx]; + d_morton_codes[idx] = bvh_morton_code(entry.bounds, *d_global_bounds); + d_indices[idx] = idx; + size_t low = 0, high = num_targets; + while (low < high) { + size_t mid = (low + high) >> 1; + if (d_offsets_rects[mid+1] <= idx) low = mid + 1; + else high = mid; + } + d_targets_indices[idx] = low; +} + +// +// 2) Initialize leaf boxes +// +template +__global__ +void preimage_init_leaf_boxes_kernel( + const SparsityMapEntry *rects, // [G] all flattened Rects + const uint64_t *leafIdx, // [n] maps leaf→orig Rect index + size_t total_rects, + Rect *boxes) // [(2n−1)] +{ + int k = blockIdx.x*blockDim.x + threadIdx.x; + if (k >= total_rects) return; + + size_t orig = leafIdx[k]; + boxes[k + total_rects - 1] = rects[orig].bounds; +} + + template +__device__ void preimage_queryBVH( + const Rect *boxes, + const int* childLeft, + const int* childRight, + const uint64_t* leafIdx, + const size_t* targets_indices, + int root, + size_t numTargetRects, + const Q& in_query, + Point out_point, + uint32_t* d_targets_prefix, + uint32_t* 
d_target_counters, + PointDesc *d_points) +{ + constexpr int MAX_STACK = 64; // max stack size for BVH traversal + int stack[MAX_STACK]; + int sp = 0; + + // start at the root + stack[sp++] = -1; + int node = root; + do + { + + int left = childLeft[node]; + int right = childRight[node]; + + bool overlapL; + bool overlapR; + + if constexpr (std::is_same_v>) { + overlapL = boxes[left].overlaps(in_query); + overlapR = boxes[right].overlaps(in_query); + } else { + static_assert(std::is_same_v>, + "Q must be Rect or Point"); + overlapL = boxes[left].contains(in_query); + overlapR = boxes[right].contains(in_query); + } + + + if (overlapL && left >= numTargetRects - 1) { + // left child is a leaf + uint64_t rect_idx = leafIdx[left - (numTargetRects - 1)]; + size_t target_idx = targets_indices[rect_idx]; + uint32_t local = atomicAdd(&d_target_counters[target_idx], 1); + if (d_points != nullptr) { + PointDesc point_desc; + point_desc.src_idx = target_idx; + point_desc.point = out_point; + uint32_t out_idx = d_targets_prefix[target_idx] + local; + d_points[out_idx] = point_desc; + } + } + if (overlapR && right >= numTargetRects - 1) { + uint64_t rect_idx = leafIdx[right - (numTargetRects - 1)]; + size_t target_idx = targets_indices[rect_idx]; + uint32_t local = atomicAdd(&d_target_counters[target_idx], 1); + if (d_points != nullptr) { + PointDesc point_desc; + point_desc.src_idx = target_idx; + point_desc.point = out_point; + uint32_t out_idx = d_targets_prefix[target_idx] + local; + d_points[out_idx] = point_desc; + } + } + + bool traverseL = overlapL && left < numTargetRects - 1; + bool traverseR = overlapR && right < numTargetRects - 1; + + if (!traverseL && !traverseR) { + node = stack[--sp]; + } else { + node = (traverseL ? 
left : right); + if (traverseL && traverseR) { + stack[sp++] = right; + } + } + } while (node != -1); +} + +template < + int N, typename T, + int N2, typename T2, typename Q +> +__global__ +void preimage_gpuPopulateBitmasksPtrsKernel( + AffineAccessor *accessors, + Rect* rects, + size_t* prefix, + uint32_t* inst_offsets, + int root, + int *childLeft, + int *childRight, + uint64_t *indices, + uint64_t *targets_indices, + Rect *boxes, + size_t numPoints, + size_t numRects, + size_t numInsts, + size_t numTargetRects, + uint32_t* d_targets_prefix, + uint32_t* d_target_counters, + PointDesc *d_points +) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= numPoints) return; + size_t low = 0, high = numRects; + while (low < high) { + size_t mid = (low + high) >> 1; + if (prefix[mid+1] <= idx) low = mid + 1; + else high = mid; + } + size_t r = low; + low = 0, high = numInsts; + while (low < high) { + size_t mid = (low + high) >> 1; + if (inst_offsets[mid+1] <= r) low = mid + 1; + else high = mid; + } + size_t inst_idx = low; + size_t offset = idx - prefix[r]; + Point p; + for (int k = N-1; k >= 0; --k) { + size_t dim = rects[r].hi[k] + 1 - rects[r].lo[k]; + p[k] = rects[r].lo[k] + (offset % dim); + offset /= dim; + } + Q ptr = accessors[inst_idx].read(p); + preimage_queryBVH(boxes, childLeft, childRight, indices, targets_indices, root, numTargetRects, ptr, p, d_targets_prefix, d_target_counters, d_points); +} + +template < + int N, typename T, + int N2, typename T2, typename Q +> +__global__ +void preimage_dense_populate_bitmasks_kernel( + AffineAccessor* accessors, + Rect* rects, + size_t* prefix, + uint32_t* inst_offsets, + SparsityMapEntry* targets_entries, + size_t* target_offsets, + size_t numPoints, + size_t numRects, + size_t numInsts, + size_t numTargets, + uint32_t *d_targets_prefix, + uint32_t *d_target_counters, + PointDesc *d_points +) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= numPoints) return; + size_t low = 0, high 
= numRects; + while (low < high) { + size_t mid = (low + high) >> 1; + if (prefix[mid+1] <= idx) low = mid + 1; + else high = mid; + } + size_t r = low; + low = 0, high = numInsts; + while (low < high) { + size_t mid = (low + high) >> 1; + if (inst_offsets[mid+1] <= r) low = mid + 1; + else high = mid; + } + size_t inst_idx = low; + size_t offset = idx - prefix[r]; + Point p; + for (int k = N-1; k >= 0; --k) { + size_t dim = rects[r].hi[k] + 1 - rects[r].lo[k]; + p[k] = rects[r].lo[k] + (offset % dim); + offset /= dim; + } + Q ptr = accessors[inst_idx].read(p); + for (size_t i = 0; i < numTargets; i++) { + bool inside = false; + for (size_t j = target_offsets[i]; j < target_offsets[i+1]; j++) { + if constexpr (std::is_same_v>) { + if (targets_entries[j].bounds.overlaps(ptr)) { + inside = true; + break; + } + } else { + static_assert(std::is_same_v>, + "Q must be Rect or Point"); + if (targets_entries[j].bounds.contains(ptr)) { + inside = true; + break; + } + } + } + if (inside) { + uint32_t local = atomicAdd(&d_target_counters[i], 1); + if (d_points != nullptr) { + PointDesc point_desc; + point_desc.src_idx = i; + point_desc.point = p; + uint32_t out_idx = d_targets_prefix[i] + local; + d_points[out_idx] = point_desc; + } + } + } +} + +} // namespace Realm \ No newline at end of file diff --git a/src/realm/deppart/preimage_gpu_tmpl.cu b/src/realm/deppart/preimage_gpu_tmpl.cu new file mode 100644 index 0000000000..be634fcc34 --- /dev/null +++ b/src/realm/deppart/preimage_gpu_tmpl.cu @@ -0,0 +1,59 @@ +/* Copyright 2024 Stanford University, NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#define REALM_TEMPLATES_ONLY +#include "realm/deppart/preimage_gpu_kernels.hpp" +#include "realm/deppart/preimage_gpu_impl.hpp" + +#ifndef INST_N1 + #error "INST_N1 must be defined before including preimage_gpu_tmpl.cu" +#endif +#ifndef INST_N2 + #error "INST_N2 must be defined before including preimage_gpu_tmpl.cu" +#endif + +// same set of T1,T2 pairs you use on the CPU side: +#define FOREACH_TT(__func__) \ + __func__(int, int) \ + __func__(int, unsigned) \ + __func__(int, long long) \ + __func__(unsigned,int) \ + __func__(unsigned,unsigned) \ + __func__(unsigned,long long) \ + __func__(long long, int) \ + __func__(long long, unsigned) \ + __func__(long long, long long) + +#define FOREACH_T(__func__) \ + __func__(int) \ + __func__(unsigned) \ + __func__(long long) + +namespace Realm { + #define N1 INST_N1 + #define N2 INST_N2 + + #define DO_DOUBLE(T1,T2) \ + template class GPUPreimageMicroOp; \ + template class PreimageMicroOp; + + FOREACH_TT(DO_DOUBLE) + + #undef DO_DOUBLE + #undef N1 + #undef N2 + +} // namespace Realm \ No newline at end of file diff --git a/src/realm/deppart/preimage_tmpl.cc b/src/realm/deppart/preimage_tmpl.cc index 50bc3a1ba8..dadf4b8aa6 100644 --- a/src/realm/deppart/preimage_tmpl.cc +++ b/src/realm/deppart/preimage_tmpl.cc @@ -1,5 +1,5 @@ /* - * Copyright 2025 Stanford University, NVIDIA Corporation +* Copyright 2025 Stanford University, NVIDIA Corporation * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -28,32 +28,43 @@ #endif #define 
FOREACH_TT(__func__) \ - __func__(int,int) \ - __func__(int,unsigned) \ - __func__(int,long long) \ - __func__(unsigned,int) \ - __func__(unsigned,unsigned) \ - __func__(unsigned,long long) \ - __func__(long long,int) \ - __func__(long long,unsigned) \ - __func__(long long,long long) +__func__(int,int) \ +__func__(int,unsigned) \ +__func__(int,long long) \ +__func__(unsigned,int) \ +__func__(unsigned,unsigned) \ +__func__(unsigned,long long) \ +__func__(long long,int) \ +__func__(long long,unsigned) \ +__func__(long long,long long) namespace Realm { #define N1 INST_N1 #define N2 INST_N2 +#ifdef REALM_USE_CUDA + #define GPU_PREIMAGE_LINE(N1,T1,N2,T2) template class GPUPreimageMicroOp; +#else + #define GPU_PREIMAGE_LINE(N1,T1,N2,T2) /* no CUDA */ +#endif + #define DOIT(T1,T2) \ - template class PreimageMicroOp; \ - template class StructuredPreimageMicroOp; \ - template class PreimageOperation; \ - template PreimageMicroOp::PreimageMicroOp(NodeID, AsyncMicroOp *, Serialization::FixedBufferDeserializer&); \ - template Event IndexSpace::create_subspaces_by_preimage( \ - const DomainTransform &, const std::vector > &, \ - std::vector > &, const ProfilingRequestSet &, Event) \ - const; +template class PreimageMicroOp; \ +GPU_PREIMAGE_LINE(N1,T1,N2,T2) \ +template class StructuredPreimageMicroOp; \ +template class PreimageOperation; \ +template PreimageMicroOp::PreimageMicroOp(NodeID, AsyncMicroOp *, Serialization::FixedBufferDeserializer&); \ +template void IndexSpace::by_preimage_buffer_requirements( \ + const std::vector>&, \ + const std::vector>&, \ + std::vector&) const; \ +template Event IndexSpace::create_subspaces_by_preimage( \ +const DomainTransform &, const std::vector > &, \ +std::vector > &, const ProfilingRequestSet &, Event) \ +const; FOREACH_TT(DOIT) -}; +}; \ No newline at end of file diff --git a/src/realm/deppart/rectlist.inl b/src/realm/deppart/rectlist.inl index 621476e511..233d14c5c2 100644 --- a/src/realm/deppart/rectlist.inl +++ 
b/src/realm/deppart/rectlist.inl @@ -647,8 +647,10 @@ namespace Realm { // as_map.rbegin()->second << "\n"; // bigger than everything - see if we can merge with the last guy T &last = as_map.rbegin()->second; - if(last == (r.lo[0] - 1)) - last = r.hi[0]; + if(last >= (r.lo[0] - 1)) { + if (last < r.hi[0]) + last = r.hi[0]; + } else if(last < (r.lo[0] - 1)) as_map[r.lo[0]] = r.hi[0]; } else { diff --git a/src/realm/deppart/setops.cc b/src/realm/deppart/setops.cc index 2ab367f13a..d8cdbc902d 100644 --- a/src/realm/deppart/setops.cc +++ b/src/realm/deppart/setops.cc @@ -1073,15 +1073,14 @@ namespace Realm { bitmask.add_rect(it->bounds); } else { SparsityMapImpl *impl = SparsityMapImpl::lookup(it->sparsity); - const std::vector >& entries = impl->get_entries(); - for(typename std::vector >::const_iterator it2 = entries.begin(); - it2 != entries.end(); - it2++) { - Rect isect = it->bounds.intersection(it2->bounds); + span> entries = impl->get_entries(); + for(size_t i = 0; i < entries.size(); i++) { + SparsityMapEntry entry = entries[i]; + Rect isect = it->bounds.intersection(entry.bounds); if(isect.empty()) continue; - assert(!it2->sparsity.exists()); - assert(it2->bitmap == 0); + assert(!entry.sparsity.exists()); + assert(entry.bitmap == 0); bitmask.add_rect(isect); } } @@ -1440,15 +1439,14 @@ namespace Realm { todo.push_back(lhs.bounds); } else { SparsityMapImpl *l_impl = SparsityMapImpl::lookup(lhs.sparsity); - const std::vector >& entries = l_impl->get_entries(); - for(typename std::vector >::const_iterator it = entries.begin(); - it != entries.end(); - it++) { - Rect isect = lhs.bounds.intersection(it->bounds); + span> entries = l_impl->get_entries(); + for(size_t i = 0; i < entries.size(); i++) { + SparsityMapEntry entry = entries[i]; + Rect isect = lhs.bounds.intersection(entry.bounds); if(isect.empty()) continue; - assert(!it->sparsity.exists()); - assert(it->bitmap == 0); + assert(!entry.sparsity.exists()); + assert(entry.bitmap == 0); todo.push_back(isect); } 
} diff --git a/src/realm/deppart/sparsity_impl.cc b/src/realm/deppart/sparsity_impl.cc index e1cf66c2c9..20c655a62c 100644 --- a/src/realm/deppart/sparsity_impl.cc +++ b/src/realm/deppart/sparsity_impl.cc @@ -25,9 +25,181 @@ #include "realm/deppart/rectlist.h" #include "realm/deppart/inst_helper.h" #include "realm/logging.h" +#include "realm/machine.h" +#ifdef REALM_USE_CUDA +#include +#endif +#include +#include +#include +#include +#include +#include namespace Realm { + namespace { + struct PendingOutputSparsityAllocation { + std::mutex mutex; + std::condition_variable cv; + ID result{ID::ID_NULL}; + bool ready{false}; + }; + + atomic next_output_sparsity_request{1}; + std::mutex pending_output_sparsity_mutex; + std::unordered_map + pending_output_sparsity_allocations; + + struct OutputSparsityAllocationRequest { + uint64_t request_id; + + static void handle_message(NodeID sender, + const OutputSparsityAllocationRequest &msg, + const void *data, + size_t datalen); + }; + + struct OutputSparsityAllocationResponse { + uint64_t request_id; + ID sparsity; + + static void handle_message(NodeID sender, + const OutputSparsityAllocationResponse &msg, + const void *data, + size_t datalen); + }; + + ActiveMessageHandlerReg + output_sparsity_allocation_request_reg; + ActiveMessageHandlerReg + output_sparsity_allocation_response_reg; + + template + inline T *deppart_gpu_host_alloc(size_t count) + { + if(count == 0) return nullptr; +#ifdef REALM_USE_CUDA + void *ptr = nullptr; + cudaError_t err = cudaHostAlloc(&ptr, count * sizeof(T), cudaHostAllocPortable); + assert(err == cudaSuccess); + return reinterpret_cast(ptr); +#else + return static_cast(std::malloc(count * sizeof(T))); +#endif + } + + inline void deppart_gpu_host_free(void *ptr) + { + if(ptr == nullptr) return; +#ifdef REALM_USE_CUDA + cudaError_t err = cudaFreeHost(ptr); + assert(err == cudaSuccess); +#else + std::free(ptr); +#endif + } + + inline bool deppart_sparsity_trace_enabled(void) + { + static int enabled = 
-1; + if(enabled < 0) + enabled = (std::getenv("REALM_DEPPART_SPARSITY_TRACE") != nullptr) ? 1 : 0; + return (enabled == 1); + } + + inline void deppart_sparsity_trace(const char *tag, + ::realm_id_t sparsity, + NodeID owner, + NodeID node, + int remaining_contrib, + int total_pieces, + int remaining_pieces, + size_t extra0 = 0, + size_t extra1 = 0) + { + if(!deppart_sparsity_trace_enabled()) + return; + std::fprintf(stderr, + "[deppart-trace] %s map=%llx owner=%d node=%d rem_contrib=%d " + "total_pieces=%d rem_pieces=%d extra0=%zu extra1=%zu\n", + tag, + static_cast(sparsity), + owner, + node, + remaining_contrib, + total_pieces, + remaining_pieces, + extra0, + extra1); + std::fflush(stderr); + } + } + + ID create_deppart_output_sparsity(NodeID target_node) + { + if(target_node == Network::my_node_id) { + SparsityMapImplWrapper *wrap = + get_runtime()->get_available_sparsity_impl(target_node); + wrap->add_references(1); + return ID(wrap->me); + } + + PendingOutputSparsityAllocation pending; + uint64_t request_id = next_output_sparsity_request.fetch_add(1); + { + std::lock_guard lock(pending_output_sparsity_mutex); + pending_output_sparsity_allocations.emplace(request_id, &pending); + } + + ActiveMessage amsg(target_node); + amsg->request_id = request_id; + amsg.commit(); + + std::unique_lock lock(pending.mutex); + pending.cv.wait(lock, [&pending]() { return pending.ready; }); + return pending.result; + } + + void OutputSparsityAllocationRequest::handle_message( + NodeID sender, + const OutputSparsityAllocationRequest &msg, + const void *data, + size_t datalen) + { + SparsityMapImplWrapper *wrap = + get_runtime()->get_available_sparsity_impl(Network::my_node_id); + wrap->add_references(1); + + ActiveMessage amsg(sender); + amsg->request_id = msg.request_id; + amsg->sparsity = wrap->me; + amsg.commit(); + } + + void OutputSparsityAllocationResponse::handle_message( + NodeID sender, + const OutputSparsityAllocationResponse &msg, + const void *data, + size_t datalen) 
+ { + PendingOutputSparsityAllocation *pending = nullptr; + { + std::lock_guard lock(pending_output_sparsity_mutex); + auto it = pending_output_sparsity_allocations.find(msg.request_id); + assert(it != pending_output_sparsity_allocations.end()); + pending = it->second; + pending_output_sparsity_allocations.erase(it); + } + + { + std::lock_guard lock(pending->mutex); + pending->result = msg.sparsity; + pending->ready = true; + } + pending->cv.notify_one(); + } + extern Logger log_part; //////////////////////////////////////////////////////////////////////// @@ -353,6 +525,7 @@ namespace Realm { if(map_impl.compare_exchange(impl, new_impl)) { map_deleter = [](void *map_impl) { + delete static_cast *>(map_impl); }; return new_impl; @@ -416,36 +589,30 @@ namespace Realm { // full cross-product test for now - for larger rectangle lists, consider // an acceleration structure? if(approx) { - const std::vector> &rects1 = get_approx_rects(); - const std::vector> &rects2 = other->get_approx_rects(); - for(typename std::vector>::const_iterator it1 = rects1.begin(); - it1 != rects1.end(); it1++) { - Rect isect = it1->intersection(bounds); + span> rects1 = get_approx_rects(); + span> rects2 = other->get_approx_rects(); + for(size_t i = 0; i < rects1.size(); i++) { + Rect isect = rects1[i].intersection(bounds); if(isect.empty()) continue; - for(typename std::vector>::const_iterator it2 = rects2.begin(); - it2 != rects2.end(); it2++) { - if(it2->overlaps(isect)) + for(size_t j = 0; j < rects2.size(); j++) { + if(rects2[j].overlaps(isect)) return true; } } } else { - const std::vector> &entries1 = get_entries(); - const std::vector> &entries2 = other->get_entries(); - for(typename std::vector>::const_iterator it1 = - entries1.begin(); - it1 != entries1.end(); it1++) { - Rect isect = it1->bounds.intersection(bounds); + span> entries1 = get_entries(); + span> entries2 = other->get_entries(); + for(size_t i = 0; i < entries1.size(); i++) { + Rect isect = 
entries1[i].bounds.intersection(bounds); if(isect.empty()) continue; - for(typename std::vector>::const_iterator it2 = - entries2.begin(); - it2 != entries2.end(); it2++) { - if(!it2->bounds.overlaps(isect)) + for(size_t j = 0; j < entries2.size(); j++) { + if(!entries2[j].bounds.overlaps(isect)) continue; // TODO: handle further sparsity in either side - assert(!it1->sparsity.exists() && (it1->bitmap == 0) && - !it2->sparsity.exists() && (it2->bitmap == 0)); + assert(!entries1[i].sparsity.exists() && (entries1[i].bitmap == 0) && + !entries2[j].sparsity.exists() && (entries2[j].bitmap == 0)); return true; } } @@ -888,6 +1055,334 @@ namespace Realm { } } + template + int SparsityMapPublicImpl::choose_bvh_split_axis( + const std::vector& entry_ids, + size_t lo, size_t hi) const + { + assert(lo < hi); + + Rect bbox = entries[entry_ids[lo]].bounds; + for(size_t i = lo + 1; i < hi; i++) + bbox = bbox.union_bbox(entries[entry_ids[i]].bounds); + + int split_axis = 0; + long double best_extent = + static_cast(bbox.hi[0]) - static_cast(bbox.lo[0]); + + for(int d = 1; d < N; d++) { + long double extent = + static_cast(bbox.hi[d]) - static_cast(bbox.lo[d]); + if(extent > best_extent) { + best_extent = extent; + split_axis = d; + } + } + + return split_axis; + } + + template +bool SparsityMapPublicImpl::bvh_centroid_less(int axis, + uint32_t a, + uint32_t b) const + { + const Rect& ra = entries[a].bounds; + const Rect& rb = entries[b].bounds; + + // comparing (lo + hi) is equivalent to comparing centroids along the axis + const auto sa = ra.lo[axis] + ra.hi[axis]; + const auto sb = rb.lo[axis] + rb.hi[axis]; + if(sa != sb) + return (sa < sb); + + // deterministic tie-break + for(int i = 0; i < N; i++) { + if(ra.lo[i] != rb.lo[i]) return (ra.lo[i] < rb.lo[i]); + if(ra.hi[i] != rb.hi[i]) return (ra.hi[i] < rb.hi[i]); + } + + return (a < b); + } + + template + int SparsityMapPublicImpl::build_bvh_subtree(CPU_BVH& bvh, + std::vector& entry_ids, + size_t lo, + size_t hi) const + { 
+ assert(lo < hi); + + // leaf: exactly one sparsity-map entry + if((hi - lo) == 1) { + const uint32_t entry_idx = entry_ids[lo]; + const uint32_t leaf_slot = static_cast(bvh.leaf_entries.size()); + bvh.leaf_entries.push_back(entry_idx); + + typename CPU_BVH::Node node; + node.bounds = entries[entry_idx].bounds; + node.left = -1; + node.right = -1; + node.begin = leaf_slot; + node.end = leaf_slot + 1; + + const int node_idx = static_cast(bvh.nodes.size()); + bvh.nodes.push_back(node); + return node_idx; + } + + const int split_axis = choose_bvh_split_axis(entry_ids, lo, hi); + const size_t mid = lo + ((hi - lo) >> 1); + + std::nth_element(entry_ids.begin() + lo, + entry_ids.begin() + mid, + entry_ids.begin() + hi, + [this, split_axis](uint32_t a, uint32_t b) { + return bvh_centroid_less(split_axis, a, b); + }); + + const int left_idx = build_bvh_subtree(bvh, entry_ids, lo, mid); + const int right_idx = build_bvh_subtree(bvh, entry_ids, mid, hi); + + typename CPU_BVH::Node node; + node.left = left_idx; + node.right = right_idx; + node.begin = bvh.nodes[left_idx].begin; + node.end = bvh.nodes[right_idx].end; + node.bounds = bvh.nodes[left_idx].bounds.union_bbox(bvh.nodes[right_idx].bounds); + + const int node_idx = static_cast(bvh.nodes.size()); + bvh.nodes.push_back(node); + return node_idx; + } + + template + void SparsityMapPublicImpl::request_bvh(void) + { + // fast path + if(bvh_valid.load_acquire()) + return; + + // the BVH indexes the entry list, so entries must already exist + if(!entries_valid.load_acquire()) + assert(false); + + if (from_gpu) { + auto gpu_entries = get_entries(); + entries = std::vector>(gpu_entries.data(), gpu_entries.data() + gpu_entries.size()); + } + + std::lock_guard lock(bvh_mutex); + + // somebody else may have built it while we were waiting + if(bvh_valid.load()) + return; + + CPU_BVH new_bvh; + new_bvh.clear(); + + const size_t count = entries.size(); + + // empty sparsity map: publish an empty-but-valid BVH + if(count == 0) { + 
entries_bvh = std::move(new_bvh); + bvh_valid.store_release(true); + return; + } + + // one leaf per sparsity-map entry + std::vector entry_ids(count); + for(uint32_t i = 0; i < count; i++) { + assert(!entries[i].sparsity.exists() && (entries[i].bitmap == 0)); + entry_ids[i] = i; + } + + // exact upper bounds for a binary tree with one entry per leaf + new_bvh.nodes.reserve((2 * count) - 1); + new_bvh.leaf_entries.reserve(count); + + new_bvh.root = build_bvh_subtree(new_bvh, entry_ids, 0, count); + + // publish only after construction is complete + entries_bvh = std::move(new_bvh); + bvh_valid.store_release(true); + } + + template bool SparsityMapPublicImpl::has_bvh() const + { + return bvh_valid.load_acquire(); + } + + + template + bool CPU_BVH::contains(const span>& entries, + const Point& p) const + { + if(!valid()) + return false; + + // Root bbox reject. + if(!nodes[root].bounds.contains(p)) + return false; + + std::vector stack; + stack.reserve(64); + stack.push_back(root); + + while(!stack.empty()) { + const int node_idx = stack.back(); + stack.pop_back(); + + const Node& node = nodes[node_idx]; + if(!node.bounds.contains(p)) + continue; + + if(node.is_leaf()) { + // Leaves currently correspond to exactly one entry, but use the range + // to keep the code compatible with future small-bucket leaves. + for(uint32_t i = node.begin; i < node.end; i++) { + const uint32_t entry_idx = leaf_entries[i]; + const SparsityMapEntry& entry = entries[entry_idx]; + + if(!entry.bounds.contains(p)) + continue; + + if(entry.sparsity.exists()) { + assert(0); + } else if(entry.bitmap != 0) { + assert(0); + } else { + return true; + } + } + } else { + // Push children whose bbox might still contain the point. 
+ const int left = node.left; + const int right = node.right; + + if((right >= 0) && nodes[right].bounds.contains(p)) + stack.push_back(right); + if((left >= 0) && nodes[left].bounds.contains(p)) + stack.push_back(left); + } + } + + return false; + } + + template + bool CPU_BVH::contains_any(const span>& entries, + const Rect& r) const + { + if(!valid()) + return false; + + // Root bbox reject. + if(!nodes[root].bounds.overlaps(r)) + return false; + + std::vector stack; + stack.reserve(64); + stack.push_back(root); + + while(!stack.empty()) { + const int node_idx = stack.back(); + stack.pop_back(); + + const Node& node = nodes[node_idx]; + if(!node.bounds.overlaps(r)) + continue; + + if(node.is_leaf()) { + for(uint32_t i = node.begin; i < node.end; i++) { + const uint32_t entry_idx = leaf_entries[i]; + const SparsityMapEntry& entry = entries[entry_idx]; + + if(!entry.bounds.overlaps(r)) + continue; + + if(entry.sparsity.exists()) { + assert(0); + } else if(entry.bitmap != 0) { + assert(0); + } else { + return true; + } + } + } else { + const int left = node.left; + const int right = node.right; + + if((right >= 0) && nodes[right].bounds.overlaps(r)) + stack.push_back(right); + if((left >= 0) && nodes[left].bounds.overlaps(r)) + stack.push_back(left); + } + } + + return false; + } + + template + bool CPU_BVH::contains_all(const span>& entries, + const Rect& r) const + { + if(!valid()) + return false; + + // Root bbox reject. 
+ if(!nodes[root].bounds.contains(r)) + return false; + + size_t total_volume = 0; + + std::vector stack; + stack.reserve(64); + stack.push_back(root); + + while(!stack.empty()) { + const int node_idx = stack.back(); + stack.pop_back(); + + const Node& node = nodes[node_idx]; + if(!node.bounds.overlaps(r)) + continue; + + if(node.is_leaf()) { + for(uint32_t i = node.begin; i < node.end; i++) { + const uint32_t entry_idx = leaf_entries[i]; + const SparsityMapEntry& entry = entries[entry_idx]; + + if(!entry.bounds.overlaps(r)) + continue; + + if(entry.sparsity.exists()) { + assert(0); + } else if(entry.bitmap != 0) { + assert(0); + } else { + Rect isect = entry.bounds.intersection(r); + total_volume += isect.volume(); + + // Early out as soon as we know we've covered enough. + if(total_volume >= r.volume()) + return true; + } + } + } else { + const int left = node.left; + const int right = node.right; + + if((right >= 0) && nodes[right].bounds.overlaps(r)) + stack.push_back(right); + if((left >= 0) && nodes[left].bounds.overlaps(r)) + stack.push_back(left); + } + } + + return (total_volume >= r.volume()); + } + //////////////////////////////////////////////////////////////////////// // // class SparsityMapImpl @@ -907,6 +1402,13 @@ namespace Realm { , sparsity_comm(_sparsity_comm) {} +template +SparsityMapImpl::~SparsityMapImpl(void) +{ + deppart_gpu_host_free(this->gpu_entries); + deppart_gpu_host_free(this->gpu_approx_rects); +} + template inline /*static*/ SparsityMapImpl * SparsityMapImpl::lookup(SparsityMap sparsity) @@ -989,6 +1491,14 @@ namespace Realm { template void SparsityMapImpl::set_contributor_count(int count) { + deppart_sparsity_trace("set_contributor_count.enter", + me.id, + ID(me).sparsity_creator_node(), + Network::my_node_id, + remaining_contributor_count.load(), + total_piece_count.load(), + remaining_piece_count.load(), + count); if(NodeID(ID(me).sparsity_creator_node()) == Network::my_node_id) { // increment the count atomically - if it brings 
the total up to 0 // (which covers count == 0), immediately propagate the total piece @@ -1006,8 +1516,23 @@ namespace Realm { } } else { // send the contributor count to the owner node - sparsity_comm->send_contribute(me, count, 0, false); + // NOTE: must use SetContribCountMessage, not send_contribute! + // send_contribute arrives as contribute_raw_rects which DECREMENTS + // remaining_contributor_count by 1 (treating it as one contributor's piece), + // but set_contributor_count should INCREMENT by count. + ActiveMessage amsg(ID(me).sparsity_creator_node()); + amsg->sparsity = me; + amsg->count = count; + amsg.commit(); } + deppart_sparsity_trace("set_contributor_count.exit", + me.id, + ID(me).sparsity_creator_node(), + Network::my_node_id, + remaining_contributor_count.load(), + total_piece_count.load(), + remaining_piece_count.load(), + count); } template @@ -1075,6 +1600,13 @@ namespace Realm { template void SparsityMapImpl::contribute_nothing(void) { + deppart_sparsity_trace("contribute_nothing.enter", + me.id, + ID(me).sparsity_creator_node(), + Network::my_node_id, + remaining_contributor_count.load(), + total_piece_count.load(), + remaining_piece_count.load()); NodeID owner = ID(me).sparsity_creator_node(); if(owner != Network::my_node_id) { @@ -1097,6 +1629,13 @@ namespace Realm { if(have_all_pieces) finalize(); } + deppart_sparsity_trace("contribute_nothing.exit", + me.id, + ID(me).sparsity_creator_node(), + Network::my_node_id, + remaining_contributor_count.load(), + total_piece_count.load(), + remaining_piece_count.load()); } template @@ -1137,11 +1676,33 @@ namespace Realm { contribute_raw_rects((rects.empty() ? 
0 : &rects[0]), rects.size(), 1, disjoint, 0); } + template + void + SparsityMapImpl::contribute_dense_rect_list(const span> &rects, + bool disjoint) + { + + HybridRectangleList h_rect_list; + for (size_t i = 0; i < rects.size(); ++i) { + h_rect_list.add_rect(rects[i]); + } + contribute_dense_rect_list(h_rect_list.convert_to_vector(), disjoint); + } + template void SparsityMapImpl::contribute_raw_rects(const Rect *rects, size_t count, size_t piece_count, bool disjoint, size_t total_count) { + deppart_sparsity_trace("contribute_raw_rects.enter", + me.id, + ID(me).sparsity_creator_node(), + Network::my_node_id, + remaining_contributor_count.load(), + total_piece_count.load(), + remaining_piece_count.load(), + count, + piece_count); if(count > 0) { AutoLock<> al(mutex); @@ -1192,8 +1753,7 @@ namespace Realm { old_data.swap(this->entries); size_t i = 0; size_t n = 0; - typename std::vector>::const_iterator old_it = - old_data.begin(); + typename std::vector>::iterator old_it = old_data.begin(); while((i < count) && (old_it != old_data.end())) { if(rects[i].hi[0] < (old_it->bounds.lo[0] - 1)) { this->entries.resize(n + 1); @@ -1380,6 +1940,15 @@ namespace Realm { finalize(); } + deppart_sparsity_trace("contribute_raw_rects.exit", + me.id, + ID(me).sparsity_creator_node(), + Network::my_node_id, + remaining_contributor_count.load(), + total_piece_count.load(), + remaining_piece_count.load(), + count, + piece_count); } // adds a microop as a waiter for valid sparsity map data - returns true @@ -1388,6 +1957,14 @@ namespace Realm { template bool SparsityMapImpl::add_waiter(PartitioningMicroOp *uop, bool precise) { + deppart_sparsity_trace("add_waiter.enter", + me.id, + ID(me).sparsity_creator_node(), + Network::my_node_id, + remaining_contributor_count.load(), + total_piece_count.load(), + remaining_piece_count.load(), + precise ? 1 : 0); // early out if(precise ? 
this->entries_valid.load_acquire() : this->approx_valid.load_acquire()) @@ -1437,6 +2014,15 @@ namespace Realm { sparsity_comm->send_request(me, request_precise, request_approx); } + deppart_sparsity_trace("add_waiter.exit", + me.id, + ID(me).sparsity_creator_node(), + Network::my_node_id, + remaining_contributor_count.load(), + total_piece_count.load(), + remaining_piece_count.load(), + precise ? 1 : 0, + registered ? 1 : 0); return registered; } @@ -1480,6 +2066,15 @@ namespace Realm { void SparsityMapImpl::remote_data_reply(NodeID requestor, bool reply_precise, bool reply_approx) { + deppart_sparsity_trace("remote_data_reply.enter", + me.id, + ID(me).sparsity_creator_node(), + Network::my_node_id, + remaining_contributor_count.load(), + total_piece_count.load(), + remaining_piece_count.load(), + reply_precise ? 1 : 0, + reply_approx ? 1 : 0); if(reply_approx) { // TODO if(!this->approx_valid.load_acquire()) @@ -1494,17 +2089,16 @@ namespace Realm { assert(false); // scan the entry list, sending bitmaps first and making a list of rects std::vector> rects; - for(typename std::vector>::const_iterator it = - this->entries.begin(); - it != this->entries.end(); it++) { - if(it->bitmap) { + for(size_t i = 0; i < this->get_entries().size(); i++) { + const SparsityMapEntry &entry = this->get_entries()[i]; + if(entry.bitmap) { // TODO: send bitmap assert(0); - } else if(it->sparsity.exists()) { + } else if(entry.sparsity.exists()) { // TODO: ? assert(0); } else { - rects.push_back(it->bounds); + rects.push_back(entry.bounds); } } @@ -1533,6 +2127,15 @@ namespace Realm { sparsity_comm->send_contribute(requestor, me, num_pieces + 1, total_count, /*disjoint=*/true, rdata, bytes); } + deppart_sparsity_trace("remote_data_reply.exit", + me.id, + ID(me).sparsity_creator_node(), + Network::my_node_id, + remaining_contributor_count.load(), + total_piece_count.load(), + remaining_piece_count.load(), + reply_precise ? 1 : 0, + reply_approx ? 
1 : 0); } template @@ -1557,7 +2160,7 @@ namespace Realm { }; template - static void compute_approximation(const std::vector> &entries, + static void compute_approximation(const span> &entries, std::vector> &approx_rects, int max_rects) { size_t n = entries.size(); @@ -1579,7 +2182,7 @@ namespace Realm { } template - static void compute_approximation(const std::vector> &entries, + static void compute_approximation(const span> &entries, std::vector> &approx_rects, int max_rects) { int n = entries.size(); @@ -1693,6 +2296,17 @@ namespace Realm { template void SparsityMapImpl::finalize(void) { + deppart_sparsity_trace("finalize.enter", + me.id, + ID(me).sparsity_creator_node(), + Network::my_node_id, + remaining_contributor_count.load(), + total_piece_count.load(), + remaining_piece_count.load(), + this->entries.size()); + + this->from_gpu = false; + // in order to organize the data a little better and handle common coalescing // cases, we do N sort/merging passes, with each dimension appearing last // in the sort order at least once (so that we can merge in that dimension) @@ -1748,7 +2362,7 @@ namespace Realm { // now that we've got our entries nice and tidy, build a bounded approximation of them if(true /*ID(me).sparsity_creator_node() == Network::my_node_id*/) { assert(!this->approx_valid.load()); - compute_approximation(this->entries, this->approx_rects, + compute_approximation(span>(this->entries.data(), this->entries.size()), this->approx_rects, DeppartConfig::cfg_max_rects_in_approximation); this->approx_valid.store_release(true); } @@ -1830,6 +2444,146 @@ namespace Realm { if(trigger_precise.exists()) GenEventImpl::trigger(trigger_precise, false /*!poisoned*/); + + deppart_sparsity_trace("finalize.exit", + me.id, + ID(me).sparsity_creator_node(), + Network::my_node_id, + remaining_contributor_count.load(), + total_piece_count.load(), + remaining_piece_count.load(), + this->entries.size()); + + } + + + //Here, we copy everything the CPU finalize does except 
manipulating the entries further + //and we indicate that the sparsity map was constructed from the cpu + + template + void SparsityMapImpl::gpu_finalize(void) + { + deppart_sparsity_trace("gpu_finalize.enter", + me.id, + ID(me).sparsity_creator_node(), + Network::my_node_id, + remaining_contributor_count.load(), + total_piece_count.load(), + remaining_piece_count.load(), + this->num_entries, + this->num_approx); + this->from_gpu = ((this->gpu_entries != nullptr) || (this->gpu_approx_rects != nullptr)); + + if(true /*ID(me).sparsity_creator_node() == Network::my_node_id*/) { + assert(!this->approx_valid.load()); + this->approx_valid.store_release(true); + } + + { + LoggerMessage msg = log_part.info(); + if(msg.is_active()) { + msg << "finalizing " << me << "(" << this << "), " << this->entries.size() + << " entries"; + for(size_t i = 0; i < this->entries.size(); i++) + msg << "\n [" << i << "]: bounds=" << this->entries[i].bounds + << " sparsity=" << this->entries[i].sparsity + << " bitmap=" << this->entries[i].bitmap; + } + } + +#ifdef DEBUG_PARTITIONING + std::cout << "finalizing " << this << ", " << this->entries.size() << " entries" + << std::endl; + for(size_t i = 0; i < this->entries.size(); i++) + std::cout << " [" << i << "]: bounds=" << this->entries[i].bounds + << " sparsity=" << this->entries[i].sparsity + << " bitmap=" << this->entries[i].bitmap << std::endl; +#endif + NodeSet sendto_precise, sendto_approx; + Event trigger_precise = Event::NO_EVENT; + Event trigger_approx = Event::NO_EVENT; + std::vector precise_waiters_copy, approx_waiters_copy; + { + AutoLock<> al(mutex); + + assert(!this->entries_valid.load()); + this->entries_valid.store_release(true); + + precise_requested = false; + if(precise_ready_event.exists()) { + trigger_precise = precise_ready_event; + precise_ready_event = Event::NO_EVENT; + } + + precise_waiters_copy.swap(precise_waiters); + approx_waiters_copy.swap(approx_waiters); + + remote_precise_waiters.swap(sendto_precise); + 
remote_approx_waiters.swap(sendto_approx); + } + + for(std::vector::const_iterator it = + precise_waiters_copy.begin(); + it != precise_waiters_copy.end(); it++) + (*it)->sparsity_map_ready(this, true); + + for(std::vector::const_iterator it = + approx_waiters_copy.begin(); + it != approx_waiters_copy.end(); it++) + (*it)->sparsity_map_ready(this, false); + + if(!sendto_approx.empty()) { + for(NodeID i = 0; (i <= Network::max_node_id) && !sendto_approx.empty(); i++) + if(sendto_approx.contains(i)) { + bool also_precise = sendto_precise.contains(i); + if(also_precise) + sendto_precise.remove(i); + remote_data_reply(i, also_precise, true); + sendto_approx.remove(i); + } + } + + if(!sendto_precise.empty()) { + for(NodeID i = 0; (i <= Network::max_node_id) && !sendto_precise.empty(); i++) + if(sendto_precise.contains(i)) { + remote_data_reply(i, true, false); + sendto_precise.remove(i); + } + } + + if(trigger_approx.exists()) + GenEventImpl::trigger(trigger_approx, false /*!poisoned*/); + + if(trigger_precise.exists()) + GenEventImpl::trigger(trigger_precise, false /*!poisoned*/); + deppart_sparsity_trace("gpu_finalize.exit", + me.id, + ID(me).sparsity_creator_node(), + Network::my_node_id, + remaining_contributor_count.load(), + total_piece_count.load(), + remaining_piece_count.load(), + this->num_entries, + this->num_approx); + } + + + template + void SparsityMapImpl::set_gpu_entries(SparsityMapEntry *entries, size_t size) + { + deppart_gpu_host_free(this->gpu_entries); + this->gpu_entries = entries; + this->entries.clear(); + this->num_entries = size; + } + + template + void SparsityMapImpl::set_gpu_approx_rects(Rect *approx_rects, size_t size) + { + deppart_gpu_host_free(this->gpu_approx_rects); + this->gpu_approx_rects = approx_rects; + this->approx_rects.clear(); + this->num_approx = size; } template @@ -1844,6 +2598,10 @@ namespace Realm { /*static*/ ActiveMessageHandlerReg< typename SparsityMapImpl::SetContribCountMessage> 
SparsityMapImpl::set_contrib_count_msg_reg; + template + /*static*/ ActiveMessageHandlerReg< + typename SparsityMapImpl::RemoteGpuFinalizeMessage> + SparsityMapImpl::remote_gpu_finalize_msg_reg; /*static*/ ActiveMessageHandlerReg< typename SparsityMapRefCounter::SparsityMapAddReferenceMessage> @@ -1901,10 +2659,47 @@ namespace Realm { SparsityMapImpl::lookup(msg.sparsity)->set_contributor_count(msg.count); } + //////////////////////////////////////////////////////////////////////// + // + // class SparsityMapImpl::RemoteGpuFinalizeMessage + + template + inline /*static*/ void SparsityMapImpl::RemoteGpuFinalizeMessage::handle_message( + NodeID sender, const SparsityMapImpl::RemoteGpuFinalizeMessage &msg, + const void *data, size_t datalen) + { + size_t expected = (msg.num_entries * sizeof(SparsityMapEntry)) + + (msg.num_approx * sizeof(Rect)); + assert(datalen == expected); + (void)sender; + + const char *payload = static_cast(data); + SparsityMapImpl *impl = SparsityMapImpl::lookup(msg.sparsity); + + if(msg.num_entries > 0) { + SparsityMapEntry *entries = deppart_gpu_host_alloc>(msg.num_entries); + std::memcpy(entries, payload, msg.num_entries * sizeof(SparsityMapEntry)); + impl->set_gpu_entries(entries, msg.num_entries); + payload += msg.num_entries * sizeof(SparsityMapEntry); + } else { + impl->set_gpu_entries(nullptr, 0); + } + + if(msg.num_approx > 0) { + Rect *approx = deppart_gpu_host_alloc>(msg.num_approx); + std::memcpy(approx, payload, msg.num_approx * sizeof(Rect)); + impl->set_gpu_approx_rects(approx, msg.num_approx); + } else { + impl->set_gpu_approx_rects(nullptr, 0); + } + impl->gpu_finalize(); + } + #define DOIT(N, T) \ template class SparsityMapPublicImpl; \ template class SparsityMapImpl; \ - template class SparsityMap; + template class SparsityMap; \ + template struct CPU_BVH; FOREACH_NT(DOIT) }; // namespace Realm diff --git a/src/realm/deppart/sparsity_impl.h b/src/realm/deppart/sparsity_impl.h index 4a3ed14349..aa94d7200f 100644 --- 
a/src/realm/deppart/sparsity_impl.h +++ b/src/realm/deppart/sparsity_impl.h @@ -33,6 +33,9 @@ namespace Realm { + REALM_INTERNAL_API_EXTERNAL_LINKAGE + ID create_deppart_output_sparsity(NodeID target_node); + class PartitioningMicroOp; /** @@ -109,6 +112,8 @@ namespace Realm { SparsityMapImpl(SparsityMap _me, NodeSet &subscribers, SparsityMapCommunicator *_sparsity_comm); + ~SparsityMapImpl(); + // actual implementation - SparsityMapPublicImpl's version just calls this one Event make_valid(bool precise = true); @@ -125,6 +130,7 @@ namespace Realm { void contribute_nothing(void); void contribute_dense_rect_list(const std::vector> &rects, bool disjoint); + void contribute_dense_rect_list(const span> &rects, bool disjoint); void contribute_raw_rects(const Rect *rects, size_t count, size_t piece_count, bool disjoint, size_t total_count); @@ -136,6 +142,10 @@ namespace Realm { void remote_data_request(NodeID requestor, bool send_precise, bool send_approx); void remote_data_reply(NodeID requestor, bool send_precise, bool send_approx); + void set_gpu_entries(SparsityMapEntry *entries, size_t size); + void set_gpu_approx_rects(Rect *approx_rects, size_t size); + void gpu_finalize(void); + SparsityMap me; struct RemoteSparsityRequest { @@ -167,12 +177,22 @@ namespace Realm { const void *data, size_t datalen); }; + struct RemoteGpuFinalizeMessage { + SparsityMap sparsity; + size_t num_entries; + size_t num_approx; + + static void handle_message(NodeID sender, const RemoteGpuFinalizeMessage &msg, + const void *data, size_t datalen); + }; + protected: void finalize(void); static ActiveMessageHandlerReg remote_sparsity_request_reg; static ActiveMessageHandlerReg remote_sparsity_contrib_reg; static ActiveMessageHandlerReg set_contrib_count_msg_reg; + static ActiveMessageHandlerReg remote_gpu_finalize_msg_reg; atomic remaining_contributor_count{0}; atomic total_piece_count{0}, remaining_piece_count{0}; diff --git a/src/realm/deppart/untemplated_gpu_kernels.cu 
b/src/realm/deppart/untemplated_gpu_kernels.cu new file mode 100644 index 0000000000..a45e8f8962 --- /dev/null +++ b/src/realm/deppart/untemplated_gpu_kernels.cu @@ -0,0 +1,119 @@ +#include "realm/deppart/partitions.h" + +namespace Realm { + +__device__ __forceinline__ +int bvh_common_prefix(const uint64_t *morton, const uint64_t *leafIdx, int i, int j, int n) { + if (j < 0 || j >= n) return -1; + uint64_t x = morton[i] ^ morton[j]; + uint64_t y = leafIdx[i] ^ leafIdx[j]; + if (x == 0) { + return 64 + __clzll(y); + } + return __clzll(x); +} + +__global__ +void bvh_build_radix_tree_kernel( + const uint64_t *morton, // [n] + const uint64_t *leafIdx, // [n] (unused here but kept for symmetry) + int n, + int *childLeft, // [2n−1] + int *childRight, // [2n−1] + int *parent) // [2n−1], pre‐initialized to −1 +{ + int idx = blockIdx.x*blockDim.x + threadIdx.x; + int i = idx; + if (i >= n-1) return; // we only build n−1 internal nodes + + int left, right; + int dL = bvh_common_prefix(morton, leafIdx, i, i-1, n); + int dR = bvh_common_prefix(morton, leafIdx, i, i+1, n); + int d = (dR > dL ? +1 : -1); + int deltaMin = (dR > dL ? 
dL : dR); + + // 3) find j by exponential + binary search + int l_max = 2; + int delta = -1; + int i_tmp = i + d * l_max; + if (0 <= i_tmp && i_tmp < n) { + delta = bvh_common_prefix(morton, leafIdx, i, i_tmp, n); + } + while (delta > deltaMin) { + l_max <<= 1; + i_tmp = i + d * l_max; + delta = -1; + if (0 <= i_tmp && i_tmp < n) { + delta = bvh_common_prefix(morton, leafIdx, i, i_tmp, n); + } + } + int l = 0; + int t = (l_max) >> 1; + while (t > 0) { + i_tmp = i + d*(l + t); + delta = -1; + if (0 <= i_tmp && i_tmp < n) { + delta = bvh_common_prefix(morton, leafIdx, i, i_tmp, n); + } + if (delta > deltaMin) { + l += t; + } + t >>= 1; + } + if (d < 0) { + right = i; + left = i + d*l; + } else { + left = i; + right = i + d*l; + } + + int gamma; + if (morton[left] == morton[right] && leafIdx[left] == leafIdx[right]) { + gamma = (left+right) >> 1; + } else { + int deltaNode = bvh_common_prefix(morton, leafIdx, left, right, n); + int split = left; + int stride = right - left; + do { + stride = (stride + 1) >> 1; + int middle = split + stride; + if (middle < right) { + int delta = bvh_common_prefix(morton, leafIdx, left, middle, n); + if (delta > deltaNode) { + split = middle; + } + } + } while (stride > 1); + gamma = split; + } + + int left_node = gamma; + int right_node = gamma + 1; + if (left == gamma) { + left_node += n-1; + } + if (right == gamma + 1) { + right_node += n-1; + } + + childLeft [idx] = left_node; + childRight[idx] = right_node; + parent[left_node] = idx; + parent[right_node] = idx; +} + +__global__ +void bvh_build_root_kernel( + int *root, + int *parent, + size_t total_rects) { + + int tid = blockIdx.x*blockDim.x + threadIdx.x; + if (tid >= 2 * total_rects - 1) return; + if (parent[tid] == -1) { + *root = tid; + } +} + +} \ No newline at end of file diff --git a/src/realm/indexspace.h b/src/realm/indexspace.h index 842213c467..c1a61b21cb 100644 --- a/src/realm/indexspace.h +++ b/src/realm/indexspace.h @@ -29,6 +29,7 @@ #include "realm/realm_c.h" 
#include "realm/realm_config.h" +#include "realm/realm_assert.h" #include "realm/sparsity.h" #include "realm/dynamic_templates.h" @@ -108,6 +109,26 @@ namespace Realm { IS index_space; RegionInstance inst; size_t field_offset; + RegionInstance scratch_buffer = RegionInstance::NO_INST; + }; + + template + struct DeppartSubspace { + IndexSpace space; + size_t entries; + }; + + template + struct DeppartEstimateInput { + IndexSpace space; + Memory location; + }; + + struct DeppartBufferRequirements { + size_t lower_bound = 0; + size_t upper_bound = 0; + size_t minimum_alignment = 0; + Processor affinity_processor; }; /** @@ -716,6 +737,10 @@ namespace Realm { const std::vector &colors, std::vector> &subspaces, const ProfilingRequestSet &reqs, Event wait_on = Event::NO_EVENT) const; + REALM_PUBLIC_API void by_field_buffer_requirements( + const std::vector>& inputs, + std::vector& requirements) const; + ///@{ /** * Allows the "function" described by the field to be composed with a @@ -780,8 +805,17 @@ namespace Realm { const std::vector> &sources, std::vector> &images, const ProfilingRequestSet &reqs, Event wait_on = Event::NO_EVENT) const; + + template + REALM_PUBLIC_API void by_image_buffer_requirements( + const std::vector>& source_spaces, + const std::vector>& inputs, + std::vector& requirements) const; + + ///@} + ///@{ /** * Computes subspaces of this index space by determining what subsets are @@ -898,6 +932,12 @@ namespace Realm { const std::vector> &targets, std::vector> &preimages, const ProfilingRequestSet &reqs, Event wait_on = Event::NO_EVENT) const; + + template + REALM_PUBLIC_API void by_preimage_buffer_requirements( + const std::vector>& target_spaces, + const std::vector>& inputs, + std::vector& requirements) const; ///@} ///@{ diff --git a/src/realm/indexspace.inl b/src/realm/indexspace.inl index cb0a83e6cb..b55e8b1aee 100644 --- a/src/realm/indexspace.inl +++ b/src/realm/indexspace.inl @@ -488,13 +488,12 @@ namespace Realm { SparsityMapPublicImpl *impl 
= sparsity.impl(); // if we don't have the data, it's too late - somebody should have waited - // we should have the metadata valid REALM_ASSERT(impl->is_valid(precise)); // always use precise info if it's available if(impl->is_valid(true /*precise*/)) { IndexSpace result; - const std::vector> &entries = impl->get_entries(); + span> entries = impl->get_entries(); // three cases: // 1) empty index space if(entries.empty()) { @@ -534,7 +533,7 @@ namespace Realm { log_dpops.info() << "tighten: " << *this << " = " << result; return result; } else { - const std::vector> &approx_rects = impl->get_approx_rects(); + span> approx_rects = impl->get_approx_rects(); // two cases: // 1) empty index space @@ -561,7 +560,7 @@ namespace Realm { // the index of the entry that contains the point, or the first one to appear after // that point template - static size_t bsearch_map_entries(const std::vector> &entries, + static size_t bsearch_map_entries(const span> &entries, const Point &p) { assert(N == 1); @@ -592,41 +591,45 @@ namespace Realm { if(dense()) return true; - SparsityMapPublicImpl *impl = sparsity.impl(); - const std::vector> &entries = impl->get_entries(); + SparsityMapPublicImpl *impl = sparsity.impl(); + span> entries = impl->get_entries(); if(N == 1) { // binary search to find the element we want - size_t idx = bsearch_map_entries(entries, p); - if(idx >= entries.size()) - return false; + size_t idx = bsearch_map_entries(entries, p); + if(idx >= entries.size()) return false; - const SparsityMapEntry &e = entries[idx]; + const SparsityMapEntry& e = entries[idx]; // the search guaranteed we're below the upper bound of the returned entry, // but we might be below the lower bound if(p[0] < e.bounds.lo[0]) - return false; + return false; if(e.sparsity.exists()) { - assert(0); + assert(0); } if(e.bitmap != 0) { - assert(0); + assert(0); } return true; } else { - for(typename std::vector>::const_iterator it = - entries.begin(); - it != entries.end(); it++) { - 
if(!it->bounds.contains(p)) - continue; - if(it->sparsity.exists()) { - assert(0); - } else if(it->bitmap != 0) { - assert(0); - } else { - return true; - } + + if (impl->has_bvh()) { + return impl->entries_bvh.contains(entries, p); + } + + for(size_t i = 0; i < entries.size(); i++) { + SparsityMapEntry entry = entries[i]; + if(!entry.bounds.contains(p)) { + continue; + } + if(entry.sparsity.exists()) { + assert(0); + } else if(entry.bitmap != 0) { + assert(0); + } else { + return true; + } } } @@ -641,32 +644,34 @@ namespace Realm { if(!bounds.contains(r)) return false; - if(!dense()) { - // test against sparsity map too - size_t total_volume = 0; - SparsityMapPublicImpl *impl = sparsity.impl(); - const std::vector> &entries = impl->get_entries(); - for(typename std::vector>::const_iterator it = - entries.begin(); - it != entries.end(); it++) { - if(!it->bounds.overlaps(r)) - continue; - if(it->sparsity.exists()) { - assert(0); - } else if(it->bitmap != 0) { - assert(0); - } else { - Rect isect = it->bounds.intersection(r); - total_volume += isect.volume(); - } - } + if(dense()) { + return true; + } + // test against sparsity map too + size_t total_volume = 0; + SparsityMapPublicImpl *impl = sparsity.impl(); + span> entries = impl->get_entries(); - // did we miss anything? - if(total_volume < r.volume()) - return false; + if(impl->has_bvh()) { + return impl->entries_bvh.contains_all(entries, r); } - return true; + for(size_t i = 0; i < entries.size(); i++) { + SparsityMapEntry entry = entries[i]; + if(!entry.bounds.overlaps(r)) + continue; + if(entry.sparsity.exists()) { + assert(0); + } else if(entry.bitmap != 0) { + assert(0); + } else { + Rect isect = entry.bounds.intersection(r); + total_volume += isect.volume(); + } + } + + // did we miss anything? 
+ return (total_volume == r.volume()); } template @@ -676,28 +681,31 @@ namespace Realm { if(!bounds.overlaps(r)) return false; - if(!dense()) { - // test against sparsity map too - SparsityMapPublicImpl *impl = sparsity.impl(); - const std::vector> &entries = impl->get_entries(); - for(typename std::vector>::const_iterator it = - entries.begin(); - it != entries.end(); it++) { - if(!it->bounds.overlaps(r)) - continue; - if(it->sparsity.exists()) { - assert(0); - } else if(it->bitmap != 0) { - assert(0); - } else { - return true; - } - } + if(dense()) { + return true; + } + // test against sparsity map too + SparsityMapPublicImpl *impl = sparsity.impl(); + span> entries = impl->get_entries(); - return false; + if(impl->has_bvh()) { + return impl->entries_bvh.contains_any(entries, r); } - return true; + for(size_t i = 0; i < entries.size(); i++) { + SparsityMapEntry entry = entries[i]; + if(!entry.bounds.overlaps(r)) + continue; + if(entry.sparsity.exists()) { + assert(0); + } else if(entry.bitmap != 0) { + assert(0); + } else { + return true; + } + } + + return false; } template @@ -732,15 +740,15 @@ namespace Realm { size_t total = 0; SparsityMapPublicImpl *impl = sparsity.impl(); - const std::vector> &entries = impl->get_entries(); - for(typename std::vector>::const_iterator it = entries.begin(); - it != entries.end(); it++) { - Rect isect = bounds.intersection(it->bounds); + span> entries = impl->get_entries(); + for(size_t i = 0; i < entries.size(); i++) { + SparsityMapEntry entry = entries[i]; + Rect isect = bounds.intersection(entry.bounds); if(isect.empty()) continue; - if(it->sparsity.exists()) { + if(entry.sparsity.exists()) { assert(0); - } else if(it->bitmap != 0) { + } else if(entry.bitmap != 0) { assert(0); } else { total += isect.volume(); @@ -764,19 +772,20 @@ namespace Realm { if(dense()) return true; - SparsityMapPublicImpl *impl = sparsity.impl(); - const std::vector> &approx_rects = impl->get_approx_rects(); - for(typename 
std::vector>::const_iterator it = approx_rects.begin(); - it != approx_rects.end(); it++) - if(it->contains(p)) - return true; + SparsityMapPublicImpl *impl = sparsity.impl(); + span> approx_rects = impl->get_approx_rects(); + for(size_t i = 0; i < approx_rects.size(); i++) { + Rect entry = approx_rects[i]; + if(entry.contains(p)) + return true; + } // no entries matched, so the point is definitely not contained in this space return false; } template - inline bool IndexSpace::contains_all_approx(const Rect &r) const + inline bool IndexSpace::contains_all_approx(const Rect& r) const { // test on bounding box first if(!bounds.contains(r)) @@ -786,14 +795,14 @@ namespace Realm { if(dense()) return true; - SparsityMapPublicImpl *impl = sparsity.impl(); - const std::vector> &approx_rects = impl->get_approx_rects(); - for(typename std::vector>::const_iterator it = approx_rects.begin(); - it != approx_rects.end(); it++) { - if(it->contains(r)) - return true; - if(it->overlaps(r)) - assert(0); + SparsityMapPublicImpl *impl = sparsity.impl(); + span> approx_rects = impl->get_approx_rects(); + for(size_t i = 0; i < approx_rects.size(); i++) { + Rect entry = approx_rects[i]; + if(entry.contains(r)) + return true; + if(entry.overlaps(r)) + assert(0); } // no entries matched, so the point is definitely not contained in this space @@ -811,12 +820,12 @@ namespace Realm { if(dense()) return true; - SparsityMapPublicImpl *impl = sparsity.impl(); - const std::vector> &approx_rects = impl->get_approx_rects(); - for(typename std::vector>::const_iterator it = approx_rects.begin(); - it != approx_rects.end(); it++) { - if(it->overlaps(r)) - return true; + SparsityMapPublicImpl *impl = sparsity.impl(); + span> approx_rects = impl->get_approx_rects(); + for(size_t i = 0; i < approx_rects.size(); i++) { + Rect entry = approx_rects[i]; + if(entry.overlaps(r)) + return true; } // no entries matched, so the point is definitely not contained in this space @@ -838,29 +847,27 @@ namespace Realm 
{ return contains_any_approx(other.bounds); // both sparse case can be expensive... - SparsityMapPublicImpl *impl = sparsity.impl(); - SparsityMapPublicImpl *other_impl = other.sparsity.impl(); + SparsityMapPublicImpl *impl = sparsity.impl(); + SparsityMapPublicImpl *other_impl = other.sparsity.impl(); // overlap can only be within intersecion of bounds - Rect isect = bounds.intersection(other.bounds); + Rect isect = bounds.intersection(other.bounds); return impl->overlaps(other_impl, isect, true /*approx*/); } - // approximage number of points in index space (may be less than volume of bounding box, - // but larger than + // approximage number of points in index space (may be less than volume of bounding box, but larger than // actual volume) template - inline size_t IndexSpace::volume_approx(void) const + inline size_t IndexSpace::volume_approx(void) const { if(dense()) return bounds.volume(); size_t total = 0; - SparsityMapPublicImpl *impl = sparsity.impl(); - const std::vector> &approx_rects = impl->get_approx_rects(); - for(typename std::vector>::const_iterator it = approx_rects.begin(); - it != approx_rects.end(); it++) - total += it->volume(); + SparsityMapPublicImpl *impl = sparsity.impl(); + span> approx_rects = impl->get_approx_rects(); + for(size_t i = 0; i < approx_rects.size(); i++) + total += approx_rects[i].volume(); return total; } @@ -1320,7 +1327,7 @@ namespace Realm { rect = Rect::make_empty(); - const std::vector> &entries = s_impl->get_entries(); + span> entries = s_impl->get_entries(); // find the first entry that overlaps our restriction - speed this up with a // binary search on the low end of the restriction if we're 1-D @@ -1356,7 +1363,7 @@ namespace Realm { // TODO: handle iteration within a sparsity entry // move onto the next sparsity entry (that overlaps our restriction) - const std::vector> &entries = s_impl->get_entries(); + const span> entries = s_impl->get_entries(); for(cur_entry++; cur_entry < entries.size(); cur_entry++) { 
const SparsityMapEntry &e = entries[cur_entry]; rect = restriction.intersection(e.bounds); diff --git a/src/realm/inst_layout.inl b/src/realm/inst_layout.inl index 0ee4db6960..acb2896e41 100644 --- a/src/realm/inst_layout.inl +++ b/src/realm/inst_layout.inl @@ -90,13 +90,13 @@ namespace Realm { // we need precise data for non-dense index spaces (the original // 'bounds' on the IndexSpace is often VERY conservative) SparsityMapPublicImpl *impl = is.sparsity.impl(); - const std::vector> &entries = impl->get_entries(); + span> entries = impl->get_entries(); if(!entries.empty()) { // TODO: set some sort of threshold for merging entries - typename std::vector>::const_iterator it = entries.begin(); - Rect bbox = is.bounds.intersection(it->bounds); - while(++it != entries.end()) - bbox = bbox.union_bbox(is.bounds.intersection(it->bounds)); + size_t i = 0; + Rect bbox = is.bounds.intersection(entries[i].bounds); + while(++i < entries.size()) + bbox = bbox.union_bbox(is.bounds.intersection(entries[i].bounds)); if(!bbox.empty()) piece_bounds.push_back(bbox); } diff --git a/src/realm/sparsity.h b/src/realm/sparsity.h index 1dc402a709..b16fce5ed7 100644 --- a/src/realm/sparsity.h +++ b/src/realm/sparsity.h @@ -30,6 +30,7 @@ #include "realm/atomics.h" #include +#include #include /** @@ -153,6 +154,44 @@ namespace Realm { HierarchicalBitMap *bitmap; }; + template + struct REALM_INTERNAL_API_EXTERNAL_LINKAGE CPU_BVH { + struct Node { + Rect bounds; + int left = -1; + int right = -1; + + // range in leaf_entries covered by this subtree + uint32_t begin = 0; + uint32_t end = 0; + + bool is_leaf() const { return left < 0; } + }; + + std::vector nodes; + std::vector leaf_entries; + int root = -1; + + bool valid() const { + return root >= 0; + } + + void clear() { + nodes.clear(); + leaf_entries.clear(); + root = -1; + } + + bool contains(const span>& entries, + const Point& p) const; + + bool contains_any(const span>& entries, + const Rect& r) const; + + bool contains_all(const 
span>& entries, + const Rect& r) const; + }; + template REALM_PUBLIC_API std::ostream &operator<<(std::ostream &os, const SparsityMapEntry &entry); @@ -173,6 +212,12 @@ namespace Realm { // cannot be constructed directly SparsityMapPublicImpl(void); + int choose_bvh_split_axis(const std::vector& entry_ids, + size_t lo, size_t hi) const; + bool bvh_centroid_less(int axis, uint32_t a, uint32_t b) const; + int build_bvh_subtree(CPU_BVH &bvh, std::vector &entry_ids, + size_t lo, size_t hi) const; + public: /** * Make this sparsity map valid. @@ -205,7 +250,7 @@ namespace Realm { * @return the entries of this sparsity map */ REALM_PUBLIC_API - const std::vector> &get_entries(void); + const span> get_entries(void); /** * Get the approximate rectangles of this sparsity map. @@ -215,7 +260,7 @@ namespace Realm { * @return the approximate rectangles of this sparsity map */ REALM_PUBLIC_API - const std::vector> &get_approx_rects(void); + const span> get_approx_rects(void); /** * Check if this sparsity map overlaps another sparsity map. @@ -244,10 +289,44 @@ namespace Realm { bool compute_covering(const Rect &bounds, size_t max_rects, int max_overhead, std::vector> &covering); + /** + * If this sparsity map doesn't already have an acceleration structure, + * build a BVH over the entries. + */ + REALM_PUBLIC_API + void request_bvh(void); + + /** + * Determine whether this sparsity map has an acceleration structure. + * @return true if the sparsity map has a valid bvh, false otherwise + */ + bool has_bvh() const; + + CPU_BVH entries_bvh; + + + protected: - atomic entries_valid{false}, approx_valid{false}; - std::vector> entries; - std::vector> approx_rects; + atomic entries_valid{false}, approx_valid{false}, bvh_valid{false}; + + std::mutex bvh_mutex; + + //BOTH RegionInstance and vector are returned as a span + //only on can be valid (i.e. 
only finalize or gpu_finalize can be called, not both) + + //Stores rectangles for CPU deppart (easy manipulation for sort/merge entries) + std::vector > entries; + std::vector > approx_rects; + + // Stores rectangles for GPU deppart in host buffers owned by the sparsity map. + SparsityMapEntry *gpu_entries = nullptr; + size_t num_entries = 0; + + Rect *gpu_approx_rects = nullptr; + size_t num_approx = 0; + + //Tracks whether to use instance or vector + bool from_gpu = false; }; }; // namespace Realm diff --git a/src/realm/sparsity.inl b/src/realm/sparsity.inl index a4a72fec05..60ffa41a70 100644 --- a/src/realm/sparsity.inl +++ b/src/realm/sparsity.inl @@ -18,9 +18,9 @@ // sparsity maps for Realm // nop, but helps IDEs +#include "realm/inst_layout.h" #include "realm/sparsity.h" -#include "realm/realm_assert.h" #include "realm/serialize.h" TEMPLATE_TYPE_IS_SERIALIZABLE2(int N, typename T, Realm::SparsityMap); @@ -84,19 +84,31 @@ namespace Realm { } template - inline const std::vector> & - SparsityMapPublicImpl::get_entries(void) + inline const span> SparsityMapPublicImpl::get_entries(void) { REALM_ASSERT(entries_valid.load_acquire()); - return entries; + if(from_gpu) { + if (num_entries == 0) { + return span>(); + } + return span>(gpu_entries, num_entries); + } else { + return span>(entries.data(), entries.size()); + } } template - inline const std::vector> & - SparsityMapPublicImpl::get_approx_rects(void) + inline const span> SparsityMapPublicImpl::get_approx_rects(void) { REALM_ASSERT(approx_valid.load_acquire()); - return approx_rects; + if(from_gpu) { + if (num_approx == 0) { + return span>(); + } + return span>(gpu_approx_rects, num_approx); + } else { + return span>(approx_rects.data(), approx_rects.size()); + } } }; // namespace Realm diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index a6213d8b46..bc6123b299 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -277,6 +277,7 @@ add_integration_test(transpose 
"${REALM_TEST_DIR}/transpose.cc") set(proc_group_ARGS -ll:cpu 4) add_integration_test(proc_group "${REALM_TEST_DIR}/proc_group.cc") add_integration_test(deppart "${REALM_TEST_DIR}/deppart.cc") +add_integration_test(benchmark "${REALM_TEST_DIR}/benchmark.cc") set(scatter_ARGS -p1 2 -p2 2) add_integration_test(scatter "${REALM_TEST_DIR}/scatter.cc") set(proc_group_ARGS -ll:cpu 4) @@ -439,6 +440,10 @@ if(TEST_USE_GPU) task_stream "${REALM_TEST_DIR}/task_stream.cc" "${REALM_TEST_DIR}/task_stream_gpu.cu" ) target_link_libraries(task_stream ${TEST_GPU_LIBS}) + set(gpu_deppart_1d_ARGS -ll:gpu 1) + set(gpu_deppart_1d_RESOURCE_LOCK gpu) + add_integration_test(gpu_deppart_1d "${REALM_TEST_DIR}/gpu_deppart_1d.cc") + target_link_libraries(gpu_deppart_1d ${TEST_GPU_LIBS}) endif() #### C API tests diff --git a/tests/benchmark.cc b/tests/benchmark.cc new file mode 100644 index 0000000000..9277436a9f --- /dev/null +++ b/tests/benchmark.cc @@ -0,0 +1,2107 @@ +/* + * Copyright 2025 Stanford University, NVIDIA Corporation + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "realm.h" + +#include +#include +#include +#include +#include +#include + +#include + +#include "osdep.h" + +#include "philox.h" + +using namespace Realm; + +#define USE_IMAGE_DIFF + +Logger log_app("app"); + +// Task IDs, some IDs are reserved so start at first available number +enum +{ + TOP_LEVEL_TASK = Processor::TASK_ID_FIRST_AVAILABLE + 0, + INIT_BYFIELD_DATA_TASK, + INIT_IMAGE_DATA_TASK, + INIT_IMAGE_RANGE_DATA_TASK, + INIT_PREIMAGE_DATA_TASK, + INIT_PREIMAGE_RANGE_DATA_TASK +}; + +namespace std { + template + std::ostream &operator<<(std::ostream &os, const std::vector &v) + { + os << v.size() << "{"; + if(v.empty()) { + os << "}"; + } else { + os << " "; + typename std::vector::const_iterator it = v.begin(); + os << *it; + ++it; + while(it != v.end()) { + os << ", " << *it; + ++it; + } + os << " }"; + } + return os; + } +}; // namespace std + +// we're going to use alarm() as a watchdog to detect deadlocks +void sigalrm_handler(int sig) +{ + fprintf(stderr, "HELP! 
Alarm triggered - likely deadlock!\n"); + exit(1); +} + +class TestInterface { +public: + virtual ~TestInterface(void) {} + + virtual void print_info(void) = 0; + + virtual Event initialize_data(const std::vector &memories, + const std::vector &procs) = 0; + + virtual Event perform_partitioning(void) = 0; + + virtual int perform_dynamic_checks(void) = 0; + + virtual int check_partitioning(void) = 0; +}; + +// generic configuration settings +namespace { + int random_seed = 12345; + bool random_colors = false; + bool wait_on_events = false; + bool show_graph = false; + bool skip_check = false; + int dimension1 = 1; + int dimension2 = 1; + std::string op; + TestInterface *testcfg = 0; +}; // namespace + +template +Event copy_piece(FieldDataDescriptor src_data, FieldDataDescriptor &dst_data, const std::vector &fields, size_t field_idx, Memory dst_memory) +{ + size_t offset = 0; + for (size_t i = 0; i < field_idx; i++) { + offset += fields[i]; + } + size_t size = fields[field_idx]; + dst_data.index_space = src_data.index_space; + RegionInstance::create_instance(dst_data.inst, + dst_memory, + src_data.index_space, + fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + CopySrcDstField src_field, dst_field; + src_field.inst = src_data.inst; + src_field.size = size; + src_field.field_id = offset; + dst_field.inst = dst_data.inst; + dst_field.size = size; + dst_field.field_id = offset; + dst_data.field_offset = src_data.field_offset; + std::vector src_fields = {src_field}; + std::vector dst_fields = {dst_field}; + return src_data.index_space.copy(src_fields, dst_fields, Realm::ProfilingRequestSet()); +} + +Event alloc_piece(RegionInstance &result, size_t size, Memory location) { + assert(location != Memory::NO_MEMORY); + assert(size > 0); + std::vector byte_fields = {sizeof(char)}; + IndexSpace<1, long long> instance_index_space(Rect<1, long long>(0, size-1)); + return RegionInstance::create_instance(result, location, instance_index_space, byte_fields, 0, 
Realm::ProfilingRequestSet()); +} + +template +IndexSpace create_sparse_index_space(const Rect &bounds, size_t sparse_factor, + bool randomize, size_t idx) +{ + std::vector> points; + for(PointInRectIterator it(bounds); it.valid; it.step()) { + size_t flattened = idx * bounds.volume(); + size_t stride = 1; + for (int d = 0; d < N; d++) { + flattened += (it.p[d] - bounds.lo[d]) * stride; + stride *= (bounds.hi[d] - bounds.lo[d] + 1); + } + if(randomize) { + if(Philox_2x32<>::rand_int(random_seed, flattened, 0, 100) < sparse_factor) { + points.push_back(it.p); + } + } else { + if( (99 * flattened) % 100 < sparse_factor) { + points.push_back(it.p); + } + } + } + return IndexSpace(points, true); +} + +/* + * Byfield test - create a graph, partition it by + * node subgraph id and then check that the partitioning + * is correct + */ +template +class ByfieldTest : public TestInterface { +public: + // graph config parameters + int num_nodes = 1000; + int num_pieces = 4; + int num_colors = 4; + size_t buffer_size = 100; + std::string filename; + + ByfieldTest(int argc, const char *argv[]) + { + for(int i = 1; i < argc; i++) { + + if(!strcmp(argv[i], "-p")) { + num_pieces = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-n")) { + num_nodes = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-c")) { + num_colors = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-b")) { + buffer_size = atoi(argv[++i]); + continue; + } + } + + + if (num_nodes <= 0 || num_pieces <= 0 || num_colors <= 0 || buffer_size <= 0 || buffer_size > 100) { + log_app.error() << "Invalid config: nodes=" << num_nodes << " colors=" << num_colors << " pieces=" << num_pieces << " buffer size=" << buffer_size << "\n"; + exit(1); + } + } + + struct InitDataArgs { + int index; + RegionInstance ri_colors; + }; + + enum PRNGStreams + { + NODE_SUBGRAPH_STREAM, + }; + + // assign subgraph ids to nodes + void color_point(int idx, int& color) + { + if(random_colors) + color = 
Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_colors); + else + color = (idx * num_colors / num_nodes) % num_colors; + } + + static void init_data_task_wrapper(const void *args, size_t arglen, + const void *userdata, size_t userlen, Processor p) + { + ByfieldTest *me = (ByfieldTest *)testcfg; + me->init_data_task(args, arglen, p); + } + + //Each piece has a task to initialize its data + void init_data_task(const void *args, size_t arglen, Processor p) + { + const InitDataArgs &i_args = *(const InitDataArgs *)args; + + log_app.info() << "init task #" << i_args.index << " (ri_nodes=" << i_args.ri_colors + << ")"; + + i_args.ri_colors.fetch_metadata(p).wait(); + + IndexSpace colors_space = i_args.ri_colors.template get_indexspace(); + + log_app.debug() << "N: " << is_colors; + + //For each node in the graph, mark it with a random (or deterministic) subgraph id + { + AffineAccessor a_piece_id(i_args.ri_colors, 0 /* offset */); + + for (IndexSpaceIterator it(is_colors); it.valid; it.step()) { + for (PointInRectIterator point(it.rect); point.valid; point.step()) { + int idx = 0; + int stride = 1; + for (int d = 0; d < N; d++) { + idx += (point.p[d] - is_colors.bounds.lo[d]) * stride; + stride *= (is_colors.bounds.hi[d] - is_colors.bounds.lo[d] + 1); + } + int subgraph; + color_point(idx, subgraph); + a_piece_id.write(point.p, subgraph); + } + } + } + } + + IndexSpace is_colors; + std::vector ri_colors; + std::vector, int> > piece_id_field_data; + + virtual void print_info(void) + { + //printf("Realm %dD Byfield dependent partitioning test: %d nodes, %d colors, %d pieces, %lu tile size\n", (int) N, + //(int)num_nodes, (int) num_colors, (int)num_pieces, buffer_size); + } + + virtual Event initialize_data(const std::vector &memories, + const std::vector &procs) + { + // now create index space for nodes + Point lo, hi; + for (int d = 0; d < N; d++) { + lo[d] = 0; + hi[d] = num_nodes - 1; + } + is_colors = Rect(lo, hi); + + // equal partition is used to 
do initial population of edges and nodes + std::vector > ss_nodes_eq; + + log_app.info() << "Creating equal subspaces\n"; + + is_colors.create_equal_subspaces(num_pieces, 1, ss_nodes_eq, Realm::ProfilingRequestSet()).wait(); + + // create instances for each of these subspaces + std::vector node_fields; + node_fields.push_back(sizeof(int)); + + ri_colors.resize(num_pieces); + piece_id_field_data.resize(num_pieces); + + for(size_t i = 0; i < ss_nodes_eq.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, memories[i % memories.size()], ss_nodes_eq[i], + node_fields, 0 /*SOA*/, + Realm::ProfilingRequestSet()) + .wait(); + ri_colors[i] = ri; + + piece_id_field_data[i].index_space = ss_nodes_eq[i]; + piece_id_field_data[i].inst = ri_colors[i]; + piece_id_field_data[i].field_offset = 0; + } + + // fire off tasks to initialize data + std::set events; + for(int i = 0; i < num_pieces; i++) { + Processor p = procs[i % procs.size()]; + InitDataArgs args; + args.index = i; + args.ri_colors = ri_colors[i]; + Event e = p.spawn(INIT_BYFIELD_DATA_TASK, &args, sizeof(args)); + events.insert(e); + } + + return Event::merge_events(events); + } + + // the outputs of our partitioning will be: + // p_nodes - nodes partitioned by subgraph id (from GPU) + // p_nodes_cpu - nodes partitioned by subgraph id (from CPU) + + + std::vector > p_nodes, p_garbage_nodes, p_nodes_cpu; + + virtual Event perform_partitioning(void) + { + // Partition nodes by subgraph id - do this twice, once on CPU and once on GPU + // Ensure that the results are identical + + std::vector colors(num_colors); + for(int i = 0; i < num_colors; i++) + colors[i] = i; + + // We need a GPU memory for GPU partitioning + Memory gpu_memory; + bool found_gpu_memory = false; + Machine machine = Machine::get_machine(); + std::set all_memories; + machine.get_all_memories(all_memories); + for(Memory memory : all_memories) { + if(memory.kind() == Memory::GPU_FB_MEM) { + gpu_memory = memory; + found_gpu_memory = 
true; + break; + } + } + if (!found_gpu_memory) { + log_app.error() << "No GPU memory found for partitioning test\n"; + return Event::NO_EVENT; + } + + + std::vector node_fields; + node_fields.push_back(sizeof(int)); + + std::vector, int> > piece_field_data_gpu; + piece_field_data_gpu.resize(num_pieces); + + for (int i = 0; i < num_pieces; i++) { + copy_piece(piece_id_field_data[i], piece_field_data_gpu[i], node_fields, 0, gpu_memory).wait(); + } + + std::vector> byfield_inputs(num_pieces); + std::vector byfield_requirements(num_pieces); + + for (int i = 0; i < num_pieces; i++) { + byfield_inputs[i].location = piece_field_data_gpu[i].inst.get_location(); + byfield_inputs[i].space = piece_field_data_gpu[i].index_space; + } + + is_colors.by_field_buffer_requirements(byfield_inputs, byfield_requirements); + + + for (int i = 0; i < num_pieces; i++) { + size_t alloc_size = byfield_requirements[i].lower_bound + (byfield_requirements[i].upper_bound - byfield_requirements[i].lower_bound) * buffer_size / 100; + alloc_piece(piece_field_data_gpu[i].scratch_buffer, alloc_size, gpu_memory).wait(); + } + + log_app.info() << "warming up" << Clock::current_time_in_microseconds() << "\n"; + Event warmup = is_colors.create_subspaces_by_field(piece_field_data_gpu, + colors, + p_garbage_nodes, + Realm::ProfilingRequestSet()); + warmup.wait(); + + long long start_gpu = Clock::current_time_in_microseconds(); + Event gpu_call = is_colors.create_subspaces_by_field(piece_field_data_gpu, + colors, + p_nodes, + Realm::ProfilingRequestSet()); + + gpu_call.wait(); + long long gpu_time = Clock::current_time_in_microseconds() - start_gpu; + long long start_cpu = Clock::current_time_in_microseconds(); + + Event cpu_call = is_colors.create_subspaces_by_field(piece_id_field_data, + colors, + p_nodes_cpu, + Realm::ProfilingRequestSet()); + + cpu_call.wait(); + long long cpu_time = Clock::current_time_in_microseconds() - start_cpu; + + 
printf("RESULT,op=byfield,d1=%d,num_nodes=%d,buffer_size=%zu,gpu_us=%lld,cpu_us=%lld\n", + N, num_nodes, buffer_size, gpu_time, cpu_time); + + return Event::merge_events({gpu_call, cpu_call}); + + } + + virtual int perform_dynamic_checks(void) + { + // Nothing to do here + return 0; + } + + virtual int check_partitioning(void) + { + int errors = 0; + + if (!p_nodes.size()) { + return p_nodes.size() == p_nodes_cpu.size(); + } + + log_app.info() << "Checking correctness of partitioning " << "\n"; + + for(int i = 0; i < num_pieces; i++) { + if (!p_nodes[i].dense() && (N > 1)) { + p_nodes[i].sparsity.impl()->request_bvh(); + if (!p_nodes_cpu[i].dense()) { + p_nodes_cpu[i].sparsity.impl()->request_bvh(); + } + } + for(IndexSpaceIterator it(p_nodes[i]); it.valid; it.step()) { + for(PointInRectIterator point(it.rect); point.valid; point.step()) { + if (!p_nodes_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU has extra byfield point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for(IndexSpaceIterator it(p_nodes_cpu[i]); it.valid; it.step()) { + for(PointInRectIterator point(it.rect); point.valid; point.step()) { + if (!p_nodes[i].contains(point.p)) { + log_app.error() << "Mismatch! 
GPU is missing byfield point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + + } + return errors; + } +}; + +template +class ImageTest : public TestInterface { +public: + // graph config parameters + int num_nodes = 1000; + int num_edges = 1000; + int sparse_factor = 50; + int num_spaces = 4; + int num_pieces = 4; + size_t buffer_size = 100; + std::string filename; + + ImageTest(int argc, const char *argv[]) + { + for(int i = 1; i < argc; i++) { + + if(!strcmp(argv[i], "-p")) { + num_pieces = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-n")) { + num_nodes = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-e")) { + num_edges = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-s")) { + num_spaces = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-f")) { + sparse_factor = atoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "-b")) { + buffer_size = atoi(argv[++i]); + continue; + } + } + + + if (num_nodes <= 0 || num_pieces <= 0 || num_edges <= 0 || num_spaces <= 0) { + log_app.error() << "Invalid config: nodes=" << num_nodes << " colors=" << num_edges << " pieces=" << num_pieces << " sources=" << num_spaces << " buffer size=" << buffer_size << "\n"; + exit(1); + } + } + + struct InitDataArgs { + int index; + RegionInstance ri_nodes; + }; + + enum PRNGStreams + { + NODE_SUBGRAPH_STREAM, + }; + + // assign subgraph ids to nodes + void chase_point(int idx, Point& color) + { + for (int d = 0; d < N1; d++) { + if(random_colors) + color[d] = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_edges); + else + color[d] = (idx * num_edges / num_nodes) % num_edges; + } + } + + static void init_data_task_wrapper(const void *args, size_t arglen, + const void *userdata, size_t userlen, Processor p) + { + ImageTest *me = (ImageTest *)testcfg; + me->init_data_task(args, arglen, p); + } + + //Each piece has a task to initialize its data + void init_data_task(const void *args, size_t arglen, Processor p) + { 
+ const InitDataArgs &i_args = *(const InitDataArgs *)args; + + log_app.info() << "init task #" << i_args.index << " (ri_nodes=" << i_args.ri_nodes + << ")"; + + i_args.ri_nodes.fetch_metadata(p).wait(); + + IndexSpace nodes_space = i_args.ri_nodes.template get_indexspace(); + + log_app.debug() << "N: " << is_nodes; + + //For each node in the graph, mark it with a random (or deterministic) subgraph id + { + AffineAccessor, N2> a_point(i_args.ri_nodes, 0 /* offset */); + + for (IndexSpaceIterator it(is_nodes); it.valid; it.step()) { + for (PointInRectIterator point(it.rect); point.valid; point.step()) { + int idx = 0; + int stride = 1; + for (int d = 0; d < N2; d++) { + idx += (point.p[d] - is_nodes.bounds.lo[d]) * stride; + stride *= (is_nodes.bounds.hi[d] - is_nodes.bounds.lo[d] + 1); + } + Point destination; + chase_point(idx, destination); + a_point.write(point.p, destination); + } + } + } + } + + IndexSpace is_nodes; + IndexSpace is_edges; + std::vector ri_nodes; + std::vector, Point> > point_field_data; + + virtual void print_info(void) + { + //printf("Realm %dD -> %dD Image dependent partitioning test: %d nodes, %d edges, %d pieces ,%d sources, %d sparse factor, %lu tile size\n", (int) N2, (int) N1, + //(int)num_nodes, (int) num_edges, (int)num_pieces, (int) num_spaces, (int) sparse_factor, buffer_size); + } + + virtual Event initialize_data(const std::vector &memories, + const std::vector &procs) + { + // now create index space for nodes + Point node_lo, node_hi; + for (int d = 0; d < N2; d++) { + node_lo[d] = 0; + node_hi[d] = num_nodes - 1; + } + is_nodes = Rect(node_lo, node_hi); + + Point edge_lo, edge_hi; + for (int d = 0; d < N1; d++) { + edge_lo[d] = 0; + edge_hi[d] = num_edges - 1; + } + is_edges = Rect(edge_lo, edge_hi); + + // equal partition is used to do initial population of edges and nodes + std::vector > ss_nodes_eq; + + log_app.info() << "Creating equal subspaces\n"; + + is_nodes.create_equal_subspaces(num_pieces, 1, ss_nodes_eq, 
Realm::ProfilingRequestSet()).wait(); + + // create instances for each of these subspaces + std::vector node_fields; + node_fields.push_back(sizeof(Point)); + + ri_nodes.resize(num_pieces); + point_field_data.resize(num_pieces); + + for(size_t i = 0; i < ss_nodes_eq.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, memories[i % memories.size()], ss_nodes_eq[i], + node_fields, 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + ri_nodes[i] = ri; + + point_field_data[i].index_space = ss_nodes_eq[i]; + point_field_data[i].inst = ri_nodes[i]; + point_field_data[i].field_offset = 0; + } + + // fire off tasks to initialize data + std::set events; + for(int i = 0; i < num_pieces; i++) { + Processor p = procs[i % procs.size()]; + InitDataArgs args; + args.index = i; + args.ri_nodes = ri_nodes[i]; + Event e = p.spawn(INIT_IMAGE_DATA_TASK, &args, sizeof(args)); + events.insert(e); + } + + return Event::merge_events(events); + } + + // the outputs of our partitioning will be: + // p_nodes - nodes partitioned by subgraph id (from GPU) + // p_nodes_cpu - nodes partitioned by subgraph id (from CPU) + + + std::vector > p_edges, p_garbage_edges, p_edges_cpu; + + virtual Event perform_partitioning(void) + { + // Partition nodes by subgraph id - do this twice, once on CPU and once on GPU + // Ensure that the results are identical + + std::vector> sources(num_spaces); + for(int i = 0; i < num_spaces; i++) { + if (sparse_factor <= 1) { + sources[i] = point_field_data[i % num_pieces].index_space; + } else { + sources[i] = create_sparse_index_space(is_nodes.bounds, sparse_factor, random_colors, i); + } + } + + // We need a GPU memory for GPU partitioning + Memory gpu_memory; + bool found_gpu_memory = false; + Machine machine = Machine::get_machine(); + std::set all_memories; + machine.get_all_memories(all_memories); + for(Memory memory : all_memories) { + if(memory.kind() == Memory::GPU_FB_MEM) { + gpu_memory = memory; + found_gpu_memory = true; + break; + } + } 
+ if (!found_gpu_memory) { + log_app.error() << "No GPU memory found for partitioning test\n"; + return Event::NO_EVENT; + } + + + std::vector node_fields; + node_fields.push_back(sizeof(Point)); + + std::vector, Point>> point_field_data_gpu; + point_field_data_gpu.resize(num_pieces); + + for (int i = 0; i < num_pieces; i++) { + copy_piece(point_field_data[i], point_field_data_gpu[i], node_fields, 0, gpu_memory).wait(); + } + + std::vector> image_inputs(num_pieces); + std::vector> image_subspaces(num_spaces); + std::vector image_requirements(num_pieces); + + for (int i = 0; i < num_pieces; i++) { + image_inputs[i].location = point_field_data_gpu[i].inst.get_location(); + image_inputs[i].space = point_field_data_gpu[i].index_space; + } + + for (int i = 0; i < num_spaces; i++) { + image_subspaces[i].space = sources[i]; + image_subspaces[i].entries = sources[i].dense() ? 1 : sources[i].sparsity.impl()->get_entries().size(); + } + + is_edges.by_image_buffer_requirements(image_subspaces, image_inputs, image_requirements); + + for (int i = 0; i < num_pieces; i++) { + size_t alloc_size = image_requirements[i].lower_bound + (image_requirements[i].upper_bound - image_requirements[i].lower_bound) * buffer_size / 100; + alloc_piece(point_field_data_gpu[i].scratch_buffer, alloc_size, gpu_memory).wait(); + } + + log_app.info() << "warming up" << Clock::current_time_in_microseconds() << "\n"; + Event warmup = is_edges.create_subspaces_by_image(point_field_data_gpu, + sources, + p_garbage_edges, + Realm::ProfilingRequestSet()); + warmup.wait(); + + long long start_gpu = Clock::current_time_in_microseconds(); + Event gpu_call = is_edges.create_subspaces_by_image(point_field_data_gpu, + sources, + p_edges, + Realm::ProfilingRequestSet()); + + gpu_call.wait(); + long long gpu_us = Clock::current_time_in_microseconds() - start_gpu; + long long start_cpu = Clock::current_time_in_microseconds(); + Event cpu_call = is_edges.create_subspaces_by_image(point_field_data, + sources, + 
p_edges_cpu, + Realm::ProfilingRequestSet()); + + cpu_call.wait(); + long long cpu_us = Clock::current_time_in_microseconds() - start_cpu; + printf("RESULT,op=image,d1=%d,d2=%d,num_nodes=%d,num_edges=%d,num_spaces=%d,sparse_factor=%d,buffer_size=%zu,gpu_us=%lld,cpu_us=%lld\n", + N1, N2, num_nodes, num_edges, num_spaces, sparse_factor, buffer_size, gpu_us, cpu_us); + + return Event::merge_events({gpu_call, cpu_call}); + + } + + virtual int perform_dynamic_checks(void) + { + // Nothing to do here + return 0; + } + + virtual int check_partitioning(void) + { + int errors = 0; + + if (!p_edges.size()) { + return p_edges.size() == p_edges_cpu.size(); + } + + log_app.info() << "Checking correctness of partitioning " << "\n"; + + for(int i = 0; i < num_pieces; i++) { + if (N1 > 1) { + if (!p_edges[i].dense()) { + p_edges[i].sparsity.impl()->request_bvh(); + } + if (!p_edges_cpu[i].dense()) { + p_edges_cpu[i].sparsity.impl()->request_bvh(); + } + } + for(IndexSpaceIterator it(p_edges[i]); it.valid; it.step()) { + for(PointInRectIterator point(it.rect); point.valid; point.step()) { + if (!p_edges_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU has extra image point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for(IndexSpaceIterator it(p_edges_cpu[i]); it.valid; it.step()) { + for(PointInRectIterator point(it.rect); point.valid; point.step()) { + if (!p_edges[i].contains(point.p)) { + log_app.error() << "Mismatch! 
GPU is missing image point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + + } + return errors; + } +}; + +template +class ImageRangeTest : public TestInterface { +public: + // graph config parameters + int num_nodes = 1000; + int num_edges = 1000; + int rect_size = 10; + int num_spaces = 4; + int num_pieces = 4; + int sparse_factor = 50; + size_t buffer_size = 100; + std::string filename; + + ImageRangeTest(int argc, const char *argv[]) + { + for(int i = 1; i < argc; i++) { + + if(!strcmp(argv[i], "-p")) { + num_pieces = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-n")) { + num_nodes = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-e")) { + num_edges = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-r")) { + rect_size = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-s")) { + num_spaces = atoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "-f")) { + sparse_factor = atoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "-b")) { + buffer_size = atoi(argv[++i]); + continue; + } + } + + + if (num_nodes <= 0 || num_pieces <= 0 || num_edges <= 0 || num_spaces <= 0 || rect_size <= 0 || sparse_factor < 0 || sparse_factor > 100 || buffer_size < 0 || buffer_size > 100) { + log_app.error() << "Invalid config: nodes=" << num_nodes << " colors=" << num_edges << " pieces=" << num_pieces << " sources=" << num_spaces << " rect size=" << rect_size << " sparse factor=" << sparse_factor << " buffer_size=" << buffer_size << "\n"; + exit(1); + } + } + + struct InitDataArgs { + int index; + RegionInstance ri_nodes; + }; + + enum PRNGStreams + { + NODE_SUBGRAPH_STREAM, + }; + + // assign subgraph ids to nodes + void chase_rect(int idx, Rect& color) + { + for (int d = 0; d < N1; d++) { + if(random_colors) { + color.lo[d] = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_edges); + color.hi[d] = color.lo[d] + Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, 2 * rect_size); + } else { + 
color.lo[d] = (idx * num_edges / num_nodes) % num_edges; + color.hi[d] = color.lo[d] + rect_size; + } + } + } + + static void init_data_task_wrapper(const void *args, size_t arglen, + const void *userdata, size_t userlen, Processor p) + { + ImageRangeTest *me = (ImageRangeTest *)testcfg; + me->init_data_task(args, arglen, p); + } + + //Each piece has a task to initialize its data + void init_data_task(const void *args, size_t arglen, Processor p) + { + const InitDataArgs &i_args = *(const InitDataArgs *)args; + + log_app.info() << "init task #" << i_args.index << " (ri_nodes=" << i_args.ri_nodes + << ")"; + + i_args.ri_nodes.fetch_metadata(p).wait(); + + IndexSpace nodes_space = i_args.ri_nodes.template get_indexspace(); + + log_app.debug() << "N: " << is_nodes; + + //For each node in the graph, mark it with a random (or deterministic) subgraph id + { + AffineAccessor, N2> a_rect(i_args.ri_nodes, 0 /* offset */); + + for (IndexSpaceIterator it(is_nodes); it.valid; it.step()) { + for (PointInRectIterator point(it.rect); point.valid; point.step()) { + int idx = 0; + int stride = 1; + for (int d = 0; d < N2; d++) { + idx += (point.p[d] - is_nodes.bounds.lo[d]) * stride; + stride *= (is_nodes.bounds.hi[d] - is_nodes.bounds.lo[d] + 1); + } + Rect destination; + chase_rect(idx, destination); + a_rect.write(point.p, destination); + } + } + } + } + + IndexSpace is_nodes; + IndexSpace is_edges; + std::vector ri_nodes; + std::vector, Rect> > rect_field_data; + + virtual void print_info(void) + { + //printf("Realm %dD -> %dD Image Range dependent partitioning test: %d nodes, %d edges, %d pieces ,%d sources, %d rect size, %d sparse factor, %lu tile size\n", (int) N2, (int) N1, + // (int)num_nodes, (int) num_edges, (int)num_pieces, (int) num_spaces, (int) rect_size, (int) sparse_factor, buffer_size); + } + + virtual Event initialize_data(const std::vector &memories, + const std::vector &procs) + { + // now create index space for nodes + Point node_lo, node_hi; + for (int d = 0; 
d < N2; d++) { + node_lo[d] = 0; + node_hi[d] = num_nodes - 1; + } + is_nodes = Rect(node_lo, node_hi); + + Point edge_lo, edge_hi; + for (int d = 0; d < N1; d++) { + edge_lo[d] = 0; + edge_hi[d] = num_edges - 1; + } + is_edges = Rect(edge_lo, edge_hi); + + // equal partition is used to do initial population of edges and nodes + std::vector > ss_nodes_eq; + + log_app.info() << "Creating equal subspaces\n"; + + is_nodes.create_equal_subspaces(num_pieces, 1, ss_nodes_eq, Realm::ProfilingRequestSet()).wait(); + + // create instances for each of these subspaces + std::vector node_fields; + node_fields.push_back(sizeof(Rect)); + + ri_nodes.resize(num_pieces); + rect_field_data.resize(num_pieces); + + for(size_t i = 0; i < ss_nodes_eq.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, memories[i % memories.size()], ss_nodes_eq[i], + node_fields, 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + ri_nodes[i] = ri; + + rect_field_data[i].index_space = ss_nodes_eq[i]; + rect_field_data[i].inst = ri_nodes[i]; + rect_field_data[i].field_offset = 0; + } + + // fire off tasks to initialize data + std::set events; + for(int i = 0; i < num_pieces; i++) { + Processor p = procs[i % procs.size()]; + InitDataArgs args; + args.index = i; + args.ri_nodes = ri_nodes[i]; + Event e = p.spawn(INIT_IMAGE_RANGE_DATA_TASK, &args, sizeof(args)); + events.insert(e); + } + + return Event::merge_events(events); + } + + // the outputs of our partitioning will be: + // p_nodes - nodes partitioned by subgraph id (from GPU) + // p_nodes_cpu - nodes partitioned by subgraph id (from CPU) + + + std::vector > p_edges, p_garbage_edges, p_edges_cpu; + + virtual Event perform_partitioning(void) + { + // Partition nodes by subgraph id - do this twice, once on CPU and once on GPU + // Ensure that the results are identical + + std::vector> sources(num_spaces); + for(int i = 0; i < num_spaces; i++) { + if (sparse_factor <= 1) { + sources[i] = rect_field_data[i % num_pieces].index_space; 
+ } else { + sources[i] = create_sparse_index_space(is_nodes.bounds, sparse_factor, random_colors, i); + } + } + + // We need a GPU memory for GPU partitioning + Memory gpu_memory; + bool found_gpu_memory = false; + Machine machine = Machine::get_machine(); + std::set all_memories; + machine.get_all_memories(all_memories); + for(Memory memory : all_memories) { + if(memory.kind() == Memory::GPU_FB_MEM) { + gpu_memory = memory; + found_gpu_memory = true; + break; + } + } + if (!found_gpu_memory) { + log_app.error() << "No GPU memory found for partitioning test\n"; + return Event::NO_EVENT; + } + + + std::vector node_fields; + node_fields.push_back(sizeof(Rect)); + + std::vector, Rect>> rect_field_data_gpu; + rect_field_data_gpu.resize(num_pieces); + + for (int i = 0; i < num_pieces; i++) { + copy_piece(rect_field_data[i], rect_field_data_gpu[i], node_fields, 0, gpu_memory).wait(); + } + + std::vector> image_inputs(num_pieces); + std::vector> image_subspaces(num_spaces); + std::vector image_requirements(num_pieces); + + for (int i = 0; i < num_pieces; i++) { + image_inputs[i].location = rect_field_data_gpu[i].inst.get_location(); + image_inputs[i].space = rect_field_data_gpu[i].index_space; + } + + for (int i = 0; i < num_spaces; i++) { + image_subspaces[i].space = sources[i]; + image_subspaces[i].entries = sources[i].dense() ? 
1 : sources[i].sparsity.impl()->get_entries().size(); + } + + is_edges.by_image_buffer_requirements(image_subspaces, image_inputs, image_requirements); + + for (int i = 0; i < num_pieces; i++) { + size_t alloc_size = image_requirements[i].lower_bound + (image_requirements[i].upper_bound - image_requirements[i].lower_bound) * buffer_size / 100; + alloc_piece(rect_field_data_gpu[i].scratch_buffer, alloc_size, gpu_memory).wait(); + } + + log_app.info() << "warming up" << Clock::current_time_in_microseconds() << "\n"; + Event warmup = is_edges.create_subspaces_by_image(rect_field_data_gpu, + sources, + p_garbage_edges, + Realm::ProfilingRequestSet()); + warmup.wait(); + + long long start_gpu = Clock::current_time_in_microseconds(); + Event gpu_call = is_edges.create_subspaces_by_image(rect_field_data_gpu, + sources, + p_edges, + Realm::ProfilingRequestSet()); + + + gpu_call.wait(); + long long gpu_us = Clock::current_time_in_microseconds() - start_gpu; + long long start_cpu = Clock::current_time_in_microseconds(); + Event cpu_call = is_edges.create_subspaces_by_image(rect_field_data, + sources, + p_edges_cpu, + Realm::ProfilingRequestSet()); + + cpu_call.wait(); + long long cpu_us = Clock::current_time_in_microseconds() - start_cpu; + + printf("RESULT,op=image,d1=%d,d2=%d,num_nodes=%d,num_edges=%d,num_spaces=%d,sparse_factor=%d,buffer_size=%zu,gpu_us=%lld,cpu_us=%lld\n", + N1, N2, num_nodes, num_edges, num_spaces, sparse_factor, buffer_size, gpu_us, cpu_us); + + return Event::merge_events({gpu_call, cpu_call}); + + } + + virtual int perform_dynamic_checks(void) + { + // Nothing to do here + return 0; + } + + virtual int check_partitioning(void) + { + int errors = 0; + + if (!p_edges.size()) { + return p_edges.size() == p_edges_cpu.size(); + } + + log_app.info() << "Checking correctness of partitioning " << "\n"; + + for(int i = 0; i < num_spaces; i++) { + + if (N1 > 1) { + if (!p_edges[i].dense()) { + p_edges[i].sparsity.impl()->request_bvh(); + } + if 
(!p_edges_cpu[i].dense()) { + p_edges_cpu[i].sparsity.impl()->request_bvh(); + } + } + + for(IndexSpaceIterator it(p_edges[i]); it.valid; it.step()) { + for(PointInRectIterator point(it.rect); point.valid; point.step()) { + if (!p_edges_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU has extra image point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for(IndexSpaceIterator it(p_edges_cpu[i]); it.valid; it.step()) { + for(PointInRectIterator point(it.rect); point.valid; point.step()) { + if (!p_edges[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing image point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + + } + return errors; + } +}; + +template +class PreimageTest : public TestInterface { +public: + // graph config parameters + int num_nodes = 1000; + int num_edges = 1000; + int num_spaces = 4; + int num_pieces = 4; + int sparse_factor = 50; + size_t buffer_size = 100; + std::string filename; + + PreimageTest(int argc, const char *argv[]) + { + for(int i = 1; i < argc; i++) { + + if(!strcmp(argv[i], "-p")) { + num_pieces = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-n")) { + num_nodes = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-e")) { + num_edges = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-s")) { + num_spaces = atoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "-f")) { + sparse_factor = atoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "-b")) { + buffer_size = atoi(argv[++i]); + continue; + } + } + + + if (num_nodes <= 0 || num_pieces <= 0 || num_edges <= 0 || num_spaces <= 0 || sparse_factor < 0 || sparse_factor > 100 || buffer_size < 0 || buffer_size > 100) { + log_app.error() << "Invalid config: nodes=" << num_nodes << " colors=" << num_edges << " pieces=" << num_pieces << " targets=" << num_spaces << " sparse factor=" << sparse_factor << " buffer size=" << buffer_size << "\n"; + exit(1); + } + } + + struct InitDataArgs { 
+ int index; + RegionInstance ri_nodes; + }; + + enum PRNGStreams + { + NODE_SUBGRAPH_STREAM, + }; +ci + // assign subgraph ids to nodes + void chase_point(int idx, Point& color) + { + for (int d = 0; d < N2; d++) { + if(random_colors) + color[d] = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_edges); + else + color[d] = (idx * num_edges / num_nodes) % num_edges; + } + } + + static void init_data_task_wrapper(const void *args, size_t arglen, + const void *userdata, size_t userlen, Processor p) + { + PreimageTest *me = (PreimageTest *)testcfg; + me->init_data_task(args, arglen, p); + } + + //Each piece has a task to initialize its data + void init_data_task(const void *args, size_t arglen, Processor p) + { + const InitDataArgs &i_args = *(const InitDataArgs *)args; + + log_app.info() << "init task #" << i_args.index << " (ri_nodes=" << i_args.ri_nodes + << ")"; + + i_args.ri_nodes.fetch_metadata(p).wait(); + + IndexSpace nodes_space = i_args.ri_nodes.template get_indexspace(); + + log_app.debug() << "N: " << is_nodes; + + //For each node in the graph, mark it with a random (or deterministic) subgraph id + { + AffineAccessor, N1> a_point(i_args.ri_nodes, 0 /* offset */); + + for (IndexSpaceIterator it(is_nodes); it.valid; it.step()) { + for (PointInRectIterator point(it.rect); point.valid; point.step()) { + int idx = 0; + int stride = 1; + for (int d = 0; d < N1; d++) { + idx += (point.p[d] - is_nodes.bounds.lo[d]) * stride; + stride *= (is_nodes.bounds.hi[d] - is_nodes.bounds.lo[d] + 1); + } + Point destination; + chase_point(idx, destination); + a_point.write(point.p, destination); + } + } + } + } + + IndexSpace is_nodes; + IndexSpace is_edges; + std::vector ri_nodes; + std::vector, Point> > point_field_data; + + virtual void print_info(void) + { + //printf("Realm %dD -> %dD Preimage dependent partitioning test: %d nodes, %d edges, %d pieces ,%d targets, %d sparse factor, %lu tile size\n", (int) N1, (int) N2, + //(int)num_nodes, (int) 
num_edges, (int)num_pieces, (int) num_spaces, (int) sparse_factor, buffer_size); + } + + virtual Event initialize_data(const std::vector &memories, + const std::vector &procs) + { + // now create index space for nodes + Point node_lo, node_hi; + for (int d = 0; d < N1; d++) { + node_lo[d] = 0; + node_hi[d] = num_nodes - 1; + } + is_nodes = Rect(node_lo, node_hi); + + Point edge_lo, edge_hi; + for (int d = 0; d < N2; d++) { + edge_lo[d] = 0; + edge_hi[d] = num_edges - 1; + } + is_edges = Rect(edge_lo, edge_hi); + + // equal partition is used to do initial population of edges and nodes + std::vector > ss_nodes_eq; + + log_app.info() << "Creating equal subspaces\n"; + + is_nodes.create_equal_subspaces(num_pieces, 1, ss_nodes_eq, Realm::ProfilingRequestSet()).wait(); + + // create instances for each of these subspaces + std::vector node_fields; + node_fields.push_back(sizeof(Point)); + + ri_nodes.resize(num_pieces); + point_field_data.resize(num_pieces); + + for(size_t i = 0; i < ss_nodes_eq.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, memories[i % memories.size()], ss_nodes_eq[i], + node_fields, 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + ri_nodes[i] = ri; + + point_field_data[i].index_space = ss_nodes_eq[i]; + point_field_data[i].inst = ri_nodes[i]; + point_field_data[i].field_offset = 0; + } + + // fire off tasks to initialize data + std::set events; + for(int i = 0; i < num_pieces; i++) { + Processor p = procs[i % procs.size()]; + InitDataArgs args; + args.index = i; + args.ri_nodes = ri_nodes[i]; + Event e = p.spawn(INIT_PREIMAGE_DATA_TASK, &args, sizeof(args)); + events.insert(e); + } + + return Event::merge_events(events); + } + + // the outputs of our partitioning will be: + // p_nodes - nodes partitioned by subgraph id (from GPU) + // p_nodes_cpu - nodes partitioned by subgraph id (from CPU) + + + std::vector > p_nodes, p_garbage_nodes, p_nodes_cpu; + + virtual Event perform_partitioning(void) + { + // Partition nodes by 
subgraph id - do this twice, once on CPU and once on GPU + // Ensure that the results are identical + + std::vector> targets; + if (sparse_factor <= 1) { + is_edges.create_equal_subspaces(num_spaces, 1, targets, Realm::ProfilingRequestSet()).wait(); + } else { + targets.resize(num_spaces); + for (int i = 0; i < num_spaces; i++) { + targets[i] = create_sparse_index_space(is_edges.bounds, sparse_factor, random_colors, i); + } + } + + // We need a GPU memory for GPU partitioning + Memory gpu_memory; + bool found_gpu_memory = false; + Machine machine = Machine::get_machine(); + std::set all_memories; + machine.get_all_memories(all_memories); + for(Memory memory : all_memories) { + if(memory.kind() == Memory::GPU_FB_MEM) { + gpu_memory = memory; + found_gpu_memory = true; + break; + } + } + if (!found_gpu_memory) { + log_app.error() << "No GPU memory found for partitioning test\n"; + return Event::NO_EVENT; + } + + + std::vector node_fields; + node_fields.push_back(sizeof(Point)); + + std::vector, Point>> point_field_data_gpu; + point_field_data_gpu.resize(num_pieces); + + for (int i = 0; i < num_pieces; i++) { + copy_piece(point_field_data[i], point_field_data_gpu[i], node_fields, 0, gpu_memory).wait(); + } + + std::vector> preimage_inputs(num_pieces); + std::vector> preimage_subspaces(num_spaces); + std::vector preimage_requirements(num_pieces); + + for (int i = 0; i < num_pieces; i++) { + preimage_inputs[i].location = point_field_data_gpu[i].inst.get_location(); + preimage_inputs[i].space = point_field_data_gpu[i].index_space; + } + + for (int i = 0; i < num_spaces; i++) { + preimage_subspaces[i].space = targets[i]; + preimage_subspaces[i].entries = targets[i].dense() ? 
1 : targets[i].sparsity.impl()->get_entries().size(); + } + + is_nodes.by_preimage_buffer_requirements(preimage_subspaces, preimage_inputs, preimage_requirements); + + for (int i = 0; i < num_pieces; i++) { + size_t alloc_size = preimage_requirements[i].lower_bound + (preimage_requirements[i].upper_bound - preimage_requirements[i].lower_bound) * buffer_size / 100; + alloc_piece(point_field_data_gpu[i].scratch_buffer, alloc_size, gpu_memory).wait(); + } + + log_app.info() << "warming up" << Clock::current_time_in_microseconds() << "\n"; + Event warmup = is_nodes.create_subspaces_by_preimage(point_field_data_gpu, + targets, + p_garbage_nodes, + Realm::ProfilingRequestSet()); + warmup.wait(); + + long long gpu_start = Clock::current_time_in_microseconds(); + Event gpu_call = is_nodes.create_subspaces_by_preimage(point_field_data_gpu, + targets, + p_nodes, + Realm::ProfilingRequestSet()); + + gpu_call.wait(); + long long gpu_us = Clock::current_time_in_microseconds() - gpu_start; + long long cpu_start = Clock::current_time_in_microseconds(); + Event cpu_call = is_nodes.create_subspaces_by_preimage(point_field_data, + targets, + p_nodes_cpu, + Realm::ProfilingRequestSet()); + + cpu_call.wait(); + long long cpu_us = Clock::current_time_in_microseconds() - cpu_start; + printf("RESULT,op=preimage,d1=%d,d2=%d,num_nodes=%d,num_edges=%d,sparse_factor=%d,buffer_size=%zu,gpu_us=%lld,cpu_us=%lld\n", + N1, N2, num_nodes, num_edges, sparse_factor, buffer_size, gpu_us, cpu_us); + return Event::merge_events({gpu_call, cpu_call}); + + } + + virtual int perform_dynamic_checks(void) + { + // Nothing to do here + return 0; + } + + virtual int check_partitioning(void) + { + int errors = 0; + + if (!p_nodes.size()) { + return p_nodes.size() != p_nodes_cpu.size(); + } + + log_app.info() << "Checking correctness of partitioning " << "\n"; + + for(int i = 0; i < num_spaces; i++) { + if (!p_nodes[i].dense() && (N1 > 1)) { + p_nodes[i].sparsity.impl()->request_bvh(); + if 
(!p_nodes_cpu[i].dense()) { + p_nodes_cpu[i].sparsity.impl()->request_bvh(); + } + } + for(IndexSpaceIterator it(p_nodes[i]); it.valid; it.step()) { + for(PointInRectIterator point(it.rect); point.valid; point.step()) { + if (!p_nodes_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU has extra image point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for(IndexSpaceIterator it(p_nodes_cpu[i]); it.valid; it.step()) { + for(PointInRectIterator point(it.rect); point.valid; point.step()) { + if (!p_nodes[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing image point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + + } + return errors; + } +}; + +template +class PreimageRangeTest : public TestInterface { +public: + // graph config parameters + int num_nodes = 1000; + int num_edges = 1000; + int rect_size = 10; + int num_spaces = 4; + int num_pieces = 4; + int sparse_factor = 50; + size_t buffer_size = 100; + std::string filename; + + PreimageRangeTest(int argc, const char *argv[]) + { + for(int i = 1; i < argc; i++) { + + if(!strcmp(argv[i], "-p")) { + num_pieces = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-n")) { + num_nodes = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-e")) { + num_edges = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-r")) { + rect_size = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-s")) { + num_spaces = atoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "-f")) { + sparse_factor = atoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "-b")) { + buffer_size = atoi(argv[++i]); + continue; + } + } + + + if (num_nodes <= 0 || num_pieces <= 0 || num_edges <= 0 || num_spaces <= 0 || rect_size <= 0 || sparse_factor < 0 || sparse_factor > 100 || buffer_size < 0 || buffer_size > 100) { + log_app.error() << "Invalid config: nodes=" << num_nodes << " colors=" << num_edges << " pieces=" << num_pieces << " targets=" << num_spaces 
<< " rect size=" << rect_size << " sparse factor=" << sparse_factor << " buffer size=" << buffer_size << "\n"; + exit(1); + } + } + + struct InitDataArgs { + int index; + RegionInstance ri_nodes; + }; + + enum PRNGStreams + { + NODE_SUBGRAPH_STREAM, + }; + + // assign subgraph ids to nodes + void chase_rect(int idx, Rect& color) + { + for (int d = 0; d < N2; d++) { + if(random_colors) { + color.lo[d] = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_edges); + color.hi[d] = color.lo[d] + Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, 2 * rect_size); + } else { + color.lo[d] = (idx * num_edges / num_nodes) % num_edges; + color.hi[d] = color.lo[d] + rect_size; + } + } + } + + static void init_data_task_wrapper(const void *args, size_t arglen, + const void *userdata, size_t userlen, Processor p) + { + PreimageRangeTest *me = (PreimageRangeTest *)testcfg; + me->init_data_task(args, arglen, p); + } + + //Each piece has a task to initialize its data + void init_data_task(const void *args, size_t arglen, Processor p) + { + const InitDataArgs &i_args = *(const InitDataArgs *)args; + + log_app.info() << "init task #" << i_args.index << " (ri_nodes=" << i_args.ri_nodes + << ")"; + + i_args.ri_nodes.fetch_metadata(p).wait(); + + IndexSpace nodes_space = i_args.ri_nodes.template get_indexspace(); + + log_app.debug() << "N: " << is_nodes; + + //For each node in the graph, mark it with a random (or deterministic) subgraph id + { + AffineAccessor, N1> a_rect(i_args.ri_nodes, 0 /* offset */); + + for (IndexSpaceIterator it(is_nodes); it.valid; it.step()) { + for (PointInRectIterator point(it.rect); point.valid; point.step()) { + int idx = 0; + int stride = 1; + for (int d = 0; d < N1; d++) { + idx += (point.p[d] - is_nodes.bounds.lo[d]) * stride; + stride *= (is_nodes.bounds.hi[d] - is_nodes.bounds.lo[d] + 1); + } + Rect destination; + chase_rect(idx, destination); + a_rect.write(point.p, destination); + } + } + } + } + + IndexSpace is_nodes; + 
IndexSpace is_edges; + std::vector ri_nodes; + std::vector, Rect> > rect_field_data; + + virtual void print_info(void) + { + printf("Realm %dD -> %dD Preimage Range dependent partitioning test: %d nodes, %d edges, %d pieces ,%d targets, %d rect size, %d sparse factor, %lu tile size\n", (int) N1, (int) N2, + (int)num_nodes, (int) num_edges, (int)num_pieces, (int) num_spaces, (int) rect_size, (int) sparse_factor, buffer_size); + } + + virtual Event initialize_data(const std::vector &memories, + const std::vector &procs) + { + // now create index space for nodes + Point node_lo, node_hi; + for (int d = 0; d < N1; d++) { + node_lo[d] = 0; + node_hi[d] = num_nodes - 1; + } + is_nodes = Rect(node_lo, node_hi); + + Point edge_lo, edge_hi; + for (int d = 0; d < N2; d++) { + edge_lo[d] = 0; + edge_hi[d] = num_edges - 1; + } + is_edges = Rect(edge_lo, edge_hi); + + // equal partition is used to do initial population of edges and nodes + std::vector > ss_nodes_eq; + + log_app.info() << "Creating equal subspaces\n"; + + is_nodes.create_equal_subspaces(num_pieces, 1, ss_nodes_eq, Realm::ProfilingRequestSet()).wait(); + + // create instances for each of these subspaces + std::vector node_fields; + node_fields.push_back(sizeof(Rect)); + + ri_nodes.resize(num_pieces); + rect_field_data.resize(num_pieces); + + for(size_t i = 0; i < ss_nodes_eq.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, memories[i % memories.size()], ss_nodes_eq[i], + node_fields, 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + ri_nodes[i] = ri; + + rect_field_data[i].index_space = ss_nodes_eq[i]; + rect_field_data[i].inst = ri_nodes[i]; + rect_field_data[i].field_offset = 0; + } + + // fire off tasks to initialize data + std::set events; + for(int i = 0; i < num_pieces; i++) { + Processor p = procs[i % procs.size()]; + InitDataArgs args; + args.index = i; + args.ri_nodes = ri_nodes[i]; + Event e = p.spawn(INIT_PREIMAGE_RANGE_DATA_TASK, &args, sizeof(args)); + events.insert(e); + 
} + + return Event::merge_events(events); + } + + // the outputs of our partitioning will be: + // p_nodes - nodes partitioned by subgraph id (from GPU) + // p_nodes_cpu - nodes partitioned by subgraph id (from CPU) + + std::vector > p_nodes, p_garbage_nodes, p_nodes_cpu; + + virtual Event perform_partitioning(void) + { + // Partition nodes by subgraph id - do this twice, once on CPU and once on GPU + // Ensure that the results are identical + + std::vector> targets; + if (sparse_factor <= 1) { + is_edges.create_equal_subspaces(num_spaces, 1, targets, Realm::ProfilingRequestSet()).wait(); + } else { + targets.resize(num_spaces); + for (int i = 0; i < num_spaces; i++) { + targets[i] = create_sparse_index_space(is_edges.bounds, sparse_factor, random_colors, i); + } + } + + // We need a GPU memory for GPU partitioning + Memory gpu_memory; + bool found_gpu_memory = false; + Machine machine = Machine::get_machine(); + std::set all_memories; + machine.get_all_memories(all_memories); + for(Memory memory : all_memories) { + if(memory.kind() == Memory::GPU_FB_MEM) { + gpu_memory = memory; + found_gpu_memory = true; + break; + } + } + if (!found_gpu_memory) { + log_app.error() << "No GPU memory found for partitioning test\n"; + return Event::NO_EVENT; + } + + + std::vector node_fields; + node_fields.push_back(sizeof(Rect)); + + std::vector, Rect>> rect_field_data_gpu; + rect_field_data_gpu.resize(num_pieces); + + for (int i = 0; i < num_pieces; i++) { + copy_piece(rect_field_data[i], rect_field_data_gpu[i], node_fields, 0, gpu_memory).wait(); + } + + std::vector> preimage_inputs(num_pieces); + std::vector> preimage_subspaces(num_spaces); + std::vector preimage_requirements(num_pieces); + + for (int i = 0; i < num_pieces; i++) { + preimage_inputs[i].location = rect_field_data_gpu[i].inst.get_location(); + preimage_inputs[i].space = rect_field_data_gpu[i].index_space; + } + + for (int i = 0; i < num_spaces; i++) { + preimage_subspaces[i].space = targets[i]; + 
preimage_subspaces[i].entries = targets[i].dense() ? 1 : targets[i].sparsity.impl()->get_entries().size(); + } + + is_nodes.by_preimage_buffer_requirements(preimage_subspaces, preimage_inputs, preimage_requirements); + + for (int i = 0; i < num_pieces; i++) { + size_t alloc_size = preimage_requirements[i].lower_bound + (preimage_requirements[i].upper_bound - preimage_requirements[i].lower_bound) * buffer_size / 100; + alloc_piece(rect_field_data_gpu[i].scratch_buffer, alloc_size, gpu_memory).wait(); + } + + log_app.info() << "warming up" << Clock::current_time_in_microseconds() << "\n"; + Event warmup = is_nodes.create_subspaces_by_preimage(rect_field_data_gpu, + targets, + p_garbage_nodes, + Realm::ProfilingRequestSet()); + warmup.wait(); + + Event gpu_call = is_nodes.create_subspaces_by_preimage(rect_field_data_gpu, + targets, + p_nodes, + Realm::ProfilingRequestSet()); + + if ( wait_on_events ) { + gpu_call.wait(); + } + Event cpu_call = is_nodes.create_subspaces_by_preimage(rect_field_data, + targets, + p_nodes_cpu, + Realm::ProfilingRequestSet()); + + if ( wait_on_events ) { + cpu_call.wait(); + } + + return Event::merge_events({gpu_call, cpu_call}); + } + + virtual int perform_dynamic_checks(void) + { + // Nothing to do here + return 0; + } + + virtual int check_partitioning(void) + { + int errors = 0; + + if (!p_nodes.size()) { + return p_nodes.size() != p_nodes_cpu.size(); + } + + log_app.info() << "Checking correctness of partitioning " << "\n"; + + for(int i = 0; i < num_spaces; i++) { + if (!p_nodes[i].dense() && (N1 > 1)) { + p_nodes[i].sparsity.impl()->request_bvh(); + if (!p_nodes_cpu[i].dense()) { + p_nodes_cpu[i].sparsity.impl()->request_bvh(); + } + } + for(IndexSpaceIterator it(p_nodes[i]); it.valid; it.step()) { + for(PointInRectIterator point(it.rect); point.valid; point.step()) { + if (!p_nodes_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! 
GPU has extra image point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for(IndexSpaceIterator it(p_nodes_cpu[i]); it.valid; it.step()) { + for(PointInRectIterator point(it.rect); point.valid; point.step()) { + if (!p_nodes[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing image point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + + } + return errors; + } +}; + +void top_level_task(const void *args, size_t arglen, const void *userdata, size_t userlen, + Processor p) +{ + int errors = 0; + + testcfg->print_info(); + + // find all the system memories - we'll stride our data across them + // for each memory, we'll need one CPU that can do the initialization of the data + std::vector sysmems; + std::vector procs; + + Machine machine = Machine::get_machine(); + { + std::set all_memories; + machine.get_all_memories(all_memories); + for(std::set::const_iterator it = all_memories.begin(); + it != all_memories.end(); it++) { + Memory m = *it; + + // skip memories with no capacity for creating instances + if(m.capacity() == 0) + continue; + + if(m.kind() == Memory::SYSTEM_MEM) { + sysmems.push_back(m); + std::set pset; + machine.get_shared_processors(m, pset); + Processor p = Processor::NO_PROC; + for(std::set::const_iterator it2 = pset.begin(); it2 != pset.end(); + it2++) { + if(it2->kind() == Processor::LOC_PROC) { + p = *it2; + break; + } + } + assert(p.exists()); + procs.push_back(p); + log_app.debug() << "System mem #" << (sysmems.size() - 1) << " = " + << *sysmems.rbegin() << " (" << *procs.rbegin() << ")"; + } + } + } + assert(sysmems.size() > 0); + + { + Realm::TimeStamp ts("initialization", true, &log_app); + + Event e = testcfg->initialize_data(sysmems, procs); + // wait for all initialization to be done + e.wait(); + } + + // now actual partitioning work + { + Realm::TimeStamp ts("dependent partitioning work", true, &log_app); + + Event e = testcfg->perform_partitioning(); + + e.wait(); + } + + 
// dynamic checks (which would be eliminated by compiler) + { + Realm::TimeStamp ts("dynamic checks", true, &log_app); + errors += testcfg->perform_dynamic_checks(); + } + + if(!skip_check) { + log_app.print() << "checking correctness of partitioning"; + Realm::TimeStamp ts("verification", true, &log_app); + errors += testcfg->check_partitioning(); + } + + if(errors > 0) { + printf("Exiting with errors\n"); + exit(1); + } + +} + +// Constructor function-pointer type +using CtorFn = TestInterface* (*)(int, const char** argv); + +// ---- Byfield constructors ---- +template +static TestInterface* make_byfield(int argc, const char** argv) { + return new ByfieldTest(argc, argv); +} + +static constexpr CtorFn BYFIELD_CTORS[3] = { + &make_byfield<1>, + &make_byfield<2>, + &make_byfield<3>, +}; + +// ---- Image constructors ---- +template +static TestInterface* make_image(int argc, const char** argv) { + return new ImageTest(argc, argv); +} + +static constexpr CtorFn IMAGE_CTORS[3][3] = { + { &make_image<1,1>, &make_image<1,2>, &make_image<1,3> }, + { &make_image<2,1>, &make_image<2,2>, &make_image<2,3> }, + { &make_image<3,1>, &make_image<3,2>, &make_image<3,3> }, +}; + +// ---- Image Range constructors ---- +template +static TestInterface* make_image_range(int argc, const char** argv) { + return new ImageRangeTest(argc, argv); +} + +static constexpr CtorFn IMAGE_RANGE_CTORS[3][3] = { + { &make_image_range<1,1>, &make_image_range<1,2>, &make_image_range<1,3> }, + { &make_image_range<2,1>, &make_image_range<2,2>, &make_image_range<2,3> }, + { &make_image_range<3,1>, &make_image_range<3,2>, &make_image_range<3,3> }, +}; + +// ---- Image constructors ---- +template +static TestInterface* make_preimage(int argc, const char** argv) { + return new PreimageTest(argc, argv); +} + +static constexpr CtorFn PREIMAGE_CTORS[3][3] = { + { &make_preimage<1,1>, &make_preimage<1,2>, &make_preimage<1,3> }, + { &make_preimage<2,1>, &make_preimage<2,2>, &make_preimage<2,3> }, + { 
&make_preimage<3,1>, &make_preimage<3,2>, &make_preimage<3,3> }, +}; + +// ---- Image constructors ---- +template +static TestInterface* make_preimage_range(int argc, const char** argv) { + return new PreimageRangeTest(argc, argv); +} + +static constexpr CtorFn PREIMAGE_RANGE_CTORS[3][3] = { + { &make_preimage_range<1,1>, &make_preimage_range<1,2>, &make_preimage_range<1,3> }, + { &make_preimage_range<2,1>, &make_preimage_range<2,2>, &make_preimage_range<2,3> }, + { &make_preimage_range<3,1>, &make_preimage_range<3,2>, &make_preimage_range<3,3> }, +}; + +using TaskWrapperFn = void (*)(const void*, size_t, const void*, size_t, Processor); + +static constexpr TaskWrapperFn BYFIELD_INIT_TBL[3] = { + &ByfieldTest<1>::init_data_task_wrapper, + &ByfieldTest<2>::init_data_task_wrapper, + &ByfieldTest<3>::init_data_task_wrapper, +}; + +static constexpr TaskWrapperFn IMAGE_INIT_TBL[3][3] = { + { &ImageTest<1,1>::init_data_task_wrapper, &ImageTest<1,2>::init_data_task_wrapper, &ImageTest<1,3>::init_data_task_wrapper }, + { &ImageTest<2,1>::init_data_task_wrapper, &ImageTest<2,2>::init_data_task_wrapper, &ImageTest<2,3>::init_data_task_wrapper }, + { &ImageTest<3,1>::init_data_task_wrapper, &ImageTest<3,2>::init_data_task_wrapper, &ImageTest<3,3>::init_data_task_wrapper }, +}; + +static constexpr TaskWrapperFn IMAGE_RANGE_INIT_TBL[3][3] = { + { &ImageRangeTest<1,1>::init_data_task_wrapper, &ImageRangeTest<1,2>::init_data_task_wrapper, &ImageRangeTest<1,3>::init_data_task_wrapper }, + { &ImageRangeTest<2,1>::init_data_task_wrapper, &ImageRangeTest<2,2>::init_data_task_wrapper, &ImageRangeTest<2,3>::init_data_task_wrapper }, + { &ImageRangeTest<3,1>::init_data_task_wrapper, &ImageRangeTest<3,2>::init_data_task_wrapper, &ImageRangeTest<3,3>::init_data_task_wrapper }, +}; + +static constexpr TaskWrapperFn PREIMAGE_INIT_TBL[3][3] = { + { &PreimageTest<1,1>::init_data_task_wrapper, &PreimageTest<1,2>::init_data_task_wrapper, &PreimageTest<1,3>::init_data_task_wrapper }, + { 
&PreimageTest<2,1>::init_data_task_wrapper, &PreimageTest<2,2>::init_data_task_wrapper, &PreimageTest<2,3>::init_data_task_wrapper }, + { &PreimageTest<3,1>::init_data_task_wrapper, &PreimageTest<3,2>::init_data_task_wrapper, &PreimageTest<3,3>::init_data_task_wrapper }, +}; + +static constexpr TaskWrapperFn PREIMAGE_RANGE_INIT_TBL[3][3] = { + { &PreimageRangeTest<1,1>::init_data_task_wrapper, &PreimageRangeTest<1,2>::init_data_task_wrapper, &PreimageRangeTest<1,3>::init_data_task_wrapper }, + { &PreimageRangeTest<2,1>::init_data_task_wrapper, &PreimageRangeTest<2,2>::init_data_task_wrapper, &PreimageRangeTest<2,3>::init_data_task_wrapper }, + { &PreimageRangeTest<3,1>::init_data_task_wrapper, &PreimageRangeTest<3,2>::init_data_task_wrapper, &PreimageRangeTest<3,3>::init_data_task_wrapper }, +}; + +int main(int argc, char **argv) +{ + Runtime rt; + + rt.init(&argc, &argv); + + // parse global options + for(int i = 1; i < argc; i++) { + if(!strcmp(argv[i], "-seed")) { + random_seed = atoi(argv[++i]); + continue; + } + + if(!strcmp(argv[i], "-random")) { + random_colors = true; + continue; + } + + if(!strcmp(argv[i], "-wait")) { + wait_on_events = true; + continue; + } + + if(!strcmp(argv[i], "-show")) { + show_graph = true; + continue; + } + + if(!strcmp(argv[i], "-nocheck")) { + skip_check = true; + continue; + } + + if(!strcmp(argv[i], "-d1")) { + dimension1 = atoi(argv[++i]); + continue; + } + + if(!strcmp(argv[i], "-d2")) { + dimension2 = atoi(argv[++i]); + continue; + } + + if(!strcmp(argv[i], "byfield")) { + if (dimension1 < 1 || dimension1 > 3) + assert(false && "invalid dimension"); + + op = "byfield"; + testcfg = BYFIELD_CTORS[dimension1 - 1](argc - i, const_cast(argv + i)); + break; + } + + if(!strcmp(argv[i], "image")) { + if (dimension1 < 1 || dimension1 > 3 || dimension2 < 1 || dimension2 > 3) + assert(false && "invalid dimension"); + op = "image"; + testcfg = IMAGE_CTORS[dimension1 - 1][dimension2 - 1](argc - i, const_cast(argv + i)); + break; + } + + 
if(!strcmp(argv[i], "irange")) { + if (dimension1 < 1 || dimension1 > 3 || dimension2 < 1 || dimension2 > 3) + assert(false && "invalid dimension"); + op = "irange"; + testcfg = IMAGE_RANGE_CTORS[dimension1 - 1][dimension2 - 1](argc - i, const_cast(argv + i)); + break; + } + + if(!strcmp(argv[i], "preimage")) { + if (dimension1 < 1 || dimension1 > 3 || dimension2 < 1 || dimension2 > 3) + assert(false && "invalid dimension"); + op = "preimage"; + testcfg = PREIMAGE_CTORS[dimension1 - 1][dimension2 - 1](argc - i, const_cast(argv + i)); + break; + } + + if(!strcmp(argv[i], "prange")) { + if (dimension1 < 1 || dimension1 > 3 || dimension2 < 1 || dimension2 > 3) + assert(false && "invalid dimension"); + op = "prange"; + testcfg = PREIMAGE_RANGE_CTORS[dimension1 - 1][dimension2 - 1](argc - i, const_cast(argv + i)); + break; + } + + // printf("unknown parameter: %s\n", argv[i]); + } + + // if no test specified, use circuit (with default parameters) + if(!testcfg) { + assert(false); + } + + rt.register_task(TOP_LEVEL_TASK, top_level_task); + + if (dimension1 < 1 || dimension1 > 3 || dimension2 < 1 || dimension2 > 3) + assert(false && "invalid dimension"); + + rt.register_task(INIT_BYFIELD_DATA_TASK, BYFIELD_INIT_TBL[dimension1 - 1]); + rt.register_task(INIT_IMAGE_DATA_TASK, IMAGE_INIT_TBL[dimension1 - 1][dimension2 - 1]); + rt.register_task(INIT_IMAGE_RANGE_DATA_TASK, IMAGE_RANGE_INIT_TBL[dimension1 - 1][dimension2 - 1]); + rt.register_task(INIT_PREIMAGE_DATA_TASK, PREIMAGE_INIT_TBL[dimension1 - 1][dimension2 - 1]); + rt.register_task(INIT_PREIMAGE_RANGE_DATA_TASK, PREIMAGE_RANGE_INIT_TBL[dimension1 - 1][dimension2 - 1]); + + signal(SIGALRM, sigalrm_handler); + + Processor p = Machine::ProcessorQuery(Machine::get_machine()) + .only_kind(Processor::LOC_PROC) + .first(); + assert(p.exists()); + + // collective launch of a single task - everybody gets the same finish + // event + Event e = rt.collective_spawn(p, TOP_LEVEL_TASK, 0, 0); + + // request shutdown once that task is 
complete + rt.shutdown(e); + + // now sleep this thread until that shutdown actually happens + rt.wait_for_shutdown(); + + delete testcfg; + + return 0; +} diff --git a/tests/deppart.cc b/tests/deppart.cc index e33708daf0..b6847f5513 100644 --- a/tests/deppart.cc +++ b/tests/deppart.cc @@ -41,6 +41,10 @@ enum { TOP_LEVEL_TASK = Processor::TASK_ID_FIRST_AVAILABLE + 0, INIT_CIRCUIT_DATA_TASK, + INIT_BASIC_DATA_TASK, + INIT_TILE_DATA_TASK, + INIT_RANGE_DATA_TASK, + INIT_RANGE2D_DATA_TASK, INIT_PENNANT_DATA_TASK, INIT_MINIAERO_DATA_TASK, }; @@ -87,14 +91,14 @@ void dump_sparse_index_space(const char *pfx, IndexSpace is) if(!is.sparsity.exists()) return; SparsityMapPublicImpl *impl = is.sparsity.impl(); - const std::vector> &entries = impl->get_entries(); - for(typename std::vector>::const_iterator it = entries.begin(); - it != entries.end(); it++) { - std::cout << " " << it->bounds; - if(it->bitmap) - std::cout << " bitmap(" << it->bitmap << ")"; - if(it->sparsity.exists()) - std::cout << " sparsity(" << it->sparsity << ")"; + span> entries = impl->get_entries(); + for(size_t i = 0; i < entries.size(); i++) { + SparsityMapEntry entry = entries[i]; + std::cout << " " << entry.bounds; + if(entry.bitmap) + std::cout << " bitmap(" << entry.bitmap << ")"; + if(entry.sparsity.exists()) + std::cout << " sparsity(" << entry.sparsity << ")"; std::cout << "\n"; } } @@ -161,6 +165,2109 @@ int find_split(const std::vector &cuts, T v) return 0; } +/* + * Basic test - create a graph, partition it by + * node subgraph id and then check that the partitioning + * is correct + */ +class BasicTest : public TestInterface { +public: + // graph config parameters + int num_nodes = 1000; + int num_edges = 1000; + int num_pieces = 4; + std::string filename; + + BasicTest(int argc, const char *argv[]) + { + for(int i = 1; i < argc; i++) { + + if(!strcmp(argv[i], "-p")) { + num_pieces = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-n")) { + num_nodes = atoi(argv[++i]); + continue; + } 
+ if(!strcmp(argv[i], "-e")) { + num_edges = atoi(argv[++i]); + continue; + } + } + + + if (num_nodes <= 0 || num_pieces <= 0 || num_edges <= 0) { + log_app.error() << "Invalid config: nodes=" << num_nodes << " edges=" << num_edges << " pieces=" << num_pieces << "\n"; + exit(1); + } + } + + struct InitDataArgs { + int index; + RegionInstance ri_nodes; + RegionInstance ri_edges; + }; + + enum PRNGStreams + { + NODE_SUBGRAPH_STREAM, + }; + + // assign subgraph ids to nodes + void random_node_data(int idx, int &subgraph) + { + if(random_colors) + subgraph = + Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_pieces); + else + subgraph = idx * num_pieces / num_nodes; + } + + void random_edge_data(int idx, int& src, int& dst) + { + src = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_nodes); + dst = Philox_2x32<>::rand_int(random_seed, idx + 1, NODE_SUBGRAPH_STREAM, num_nodes); + } + + static void init_data_task_wrapper(const void *args, size_t arglen, + const void *userdata, size_t userlen, Processor p) + { + BasicTest *me = (BasicTest *)testcfg; + me->init_data_task(args, arglen, p); + } + + //Each piece has a task to initialize its data + void init_data_task(const void *args, size_t arglen, Processor p) + { + const InitDataArgs &i_args = *(const InitDataArgs *)args; + + log_app.info() << "init task #" << i_args.index << " (ri_nodes=" << i_args.ri_nodes + << ")"; + + i_args.ri_nodes.fetch_metadata(p).wait(); + i_args.ri_edges.fetch_metadata(p).wait(); + + IndexSpace<1> is_nodes = i_args.ri_nodes.get_indexspace<1>(); + IndexSpace<1> is_edges = i_args.ri_edges.get_indexspace<1>(); + + log_app.debug() << "N: " << is_nodes; + log_app.debug() << "E: " << is_edges; + + //For each node in the graph, mark it with a random (or deterministic) subgraph id + { + AffineAccessor a_piece_id(i_args.ri_nodes, 0 /* offset */); + + for(int i = is_nodes.bounds.lo; i <= is_nodes.bounds.hi; i++) { + int subgraph; + random_node_data(i, subgraph); + 
a_piece_id.write(i, subgraph); + } + + AffineAccessor,1> a_src(i_args.ri_edges, 0 /* offset */); + AffineAccessor,1> a_dst(i_args.ri_edges, sizeof(Point<1>)/* offset */); + + for(int i = is_edges.bounds.lo; i <= is_edges.bounds.hi; i++) { + int src, dst; + random_edge_data(i, src, dst); + a_src.write(i, Point<1>(src)); + a_dst.write(i, Point<1>(dst)); + } + } + + //Optionally print out the assigned subgraph ids + if(show_graph) { + AffineAccessor a_piece_id(i_args.ri_nodes, 0 /* offset */); + + for(int i = is_nodes.bounds.lo; i <= is_nodes.bounds.hi; i++) + log_app.info() << "piece_id[" << i << "] = " << a_piece_id.read(i) << "\n"; + + AffineAccessor,1> a_src(i_args.ri_edges, 0 /* offset */); + AffineAccessor,1> a_dst(i_args.ri_edges, sizeof(Point<1>)/* offset */); + + for(int i = is_edges.bounds.lo; i <= is_edges.bounds.hi; i++) + log_app.info() << "src, dst[" << i << "] = " << a_src.read(i) << ", " << a_dst.read(i) << "\n"; + } + } + + IndexSpace<1> is_nodes, is_edges; + std::vector ri_nodes, ri_edges; + std::vector, int> > piece_id_field_data; + std::vector, Point<1> > > src_node_field_data, dst_node_field_data; + + virtual void print_info(void) + { + printf("Realm dependent partitioning test - basic: %d nodes, %d edges, %d pieces\n", + (int)num_nodes, (int) num_edges, (int)num_pieces); + } + + virtual Event initialize_data(const std::vector &memories, + const std::vector &procs) + { + // now create index space for nodes + is_nodes = Rect<1>(0, num_nodes - 1); + is_edges = Rect<1>(0, num_edges - 1); + + // equal partition is used to do initial population of edges and nodes + std::vector > ss_nodes_eq; + std::vector > ss_edges_eq; + + log_app.info() << "Creating equal subspaces\n"; + + is_nodes.create_equal_subspaces(num_pieces, 1, ss_nodes_eq, Realm::ProfilingRequestSet()).wait(); + is_edges.create_equal_subspaces(num_pieces, 1, ss_edges_eq, Realm::ProfilingRequestSet()).wait(); + + log_app.debug() << "Initial partitions:"; + for(size_t i = 0; i < 
ss_nodes_eq.size(); i++) + log_app.debug() << " Nodes #" << i << ": " << ss_nodes_eq[i]; + for(size_t i = 0; i < ss_edges_eq.size(); i++) + log_app.debug() << " Edges #" << i << ": " << ss_edges_eq[i]; + + // create instances for each of these subspaces + std::vector node_fields, edge_fields; + node_fields.push_back(sizeof(int)); // piece_id + assert(sizeof(int) == sizeof(Point<1>)); + edge_fields.push_back(sizeof(Point<1>)); // src_node + edge_fields.push_back(sizeof(Point<1>)); // dst_node + + ri_nodes.resize(num_pieces); + piece_id_field_data.resize(num_pieces); + + for(size_t i = 0; i < ss_nodes_eq.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, memories[i % memories.size()], ss_nodes_eq[i], + node_fields, 0 /*SOA*/, + Realm::ProfilingRequestSet()) + .wait(); + ri_nodes[i] = ri; + + piece_id_field_data[i].index_space = ss_nodes_eq[i]; + piece_id_field_data[i].inst = ri_nodes[i]; + piece_id_field_data[i].field_offset = 0; + } + + + // Fire off tasks to initialize data + ri_edges.resize(num_pieces); + src_node_field_data.resize(num_pieces); + dst_node_field_data.resize(num_pieces); + + for(size_t i = 0; i < ss_edges_eq.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, + memories[i % memories.size()], + ss_edges_eq[i], + edge_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + ri_edges[i] = ri; + + src_node_field_data[i].index_space = ss_edges_eq[i]; + src_node_field_data[i].inst = ri_edges[i]; + src_node_field_data[i].field_offset = 0 * sizeof(Point<1>); + + dst_node_field_data[i].index_space = ss_edges_eq[i]; + dst_node_field_data[i].inst = ri_edges[i]; + dst_node_field_data[i].field_offset = 1 * sizeof(Point<1>); + } + + // fire off tasks to initialize data + std::set events; + for(int i = 0; i < num_pieces; i++) { + Processor p = procs[i % procs.size()]; + InitDataArgs args; + args.index = i; + args.ri_nodes = ri_nodes[i]; + args.ri_edges = ri_edges[i]; + Event e = p.spawn(INIT_BASIC_DATA_TASK, 
&args, sizeof(args)); + events.insert(e); + } + + return Event::merge_events(events); + } + + // the outputs of our partitioning will be: + // p_nodes - nodes partitioned by subgraph id (from GPU) + // p_nodes_cpu - nodes partitioned by subgraph id (from CPU) + + + std::vector > p_nodes, p_rd; + std::vector > p_edges, p_preimage_edges; + + std::vector > p_nodes_cpu, p_rd_cpu; + std::vector > p_edges_cpu, p_preimage_edges_cpu; + + virtual Event perform_partitioning(void) + { + // Partition nodes by subgraph id - do this twice, once on CPU and once on GPU + // Ensure that the results are identical + + std::vector colors(num_pieces); + for(int i = 0; i < num_pieces; i++) + colors[i] = i; + + // We need a GPU memory for GPU partitioning + Memory gpu_memory; + bool found_gpu_memory = false; + Machine machine = Machine::get_machine(); + std::set all_memories; + machine.get_all_memories(all_memories); + for(Memory memory : all_memories) { + if(memory.kind() == Memory::GPU_FB_MEM) { + gpu_memory = memory; + found_gpu_memory = true; + break; + } + } + if (!found_gpu_memory) { + log_app.error() << "No GPU memory found for partitioning test\n"; + return Event::NO_EVENT; + } + std::vector edge_fields; + edge_fields.push_back(sizeof(Point<1>)); + edge_fields.push_back(sizeof(Point<1>)) ; + std::vector node_fields; + node_fields.push_back(sizeof(int)); + + std::vector, Point<1> > > src_field_data_gpu; + std::vector, Point<1> > > dst_field_data_gpu; + std::vector, int> > piece_field_data_gpu; + piece_field_data_gpu.resize(num_pieces); + src_field_data_gpu.resize(num_pieces); + dst_field_data_gpu.resize(num_pieces); + for (int i = 0; i < num_pieces; i++) { + RegionInstance src_gpu_instance; + RegionInstance dst_gpu_instance; + RegionInstance piece_gpu_instance; + RegionInstance::create_instance(src_gpu_instance, + gpu_memory, + src_node_field_data[i].index_space, + edge_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + RegionInstance::create_instance(dst_gpu_instance, 
+ gpu_memory, + dst_node_field_data[i].index_space, + edge_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + RegionInstance::create_instance(piece_gpu_instance, + gpu_memory, + piece_id_field_data[i].index_space, + node_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + CopySrcDstField src_gpu_field, src_cpu_field, dst_gpu_field, dst_cpu_field, piece_gpu_field, piece_cpu_field; + src_gpu_field.inst = src_gpu_instance; + src_gpu_field.size = sizeof(Point<1>); + src_gpu_field.field_id = 0; + src_cpu_field.inst = src_node_field_data[i].inst; + src_cpu_field.size = sizeof(Point<1>); + src_cpu_field.field_id = 0; + dst_gpu_field.inst = dst_gpu_instance; + dst_gpu_field.size = sizeof(Point<1>); + dst_gpu_field.field_id = sizeof(Point<1>); + dst_cpu_field.inst = dst_node_field_data[i].inst; + dst_cpu_field.size = sizeof(Point<1>); + dst_cpu_field.field_id = sizeof(Point<1>); + piece_gpu_field.inst = piece_gpu_instance; + piece_gpu_field.size = sizeof(int); + piece_gpu_field.field_id = 0; + piece_cpu_field.inst = piece_id_field_data[i].inst; + piece_cpu_field.size = sizeof(int); + piece_cpu_field.field_id = 0; + std::vector src_cpu_data, src_gpu_data, dst_cpu_data, dst_gpu_data, piece_cpu_data, piece_gpu_data; + src_cpu_data.push_back(src_cpu_field); + dst_cpu_data.push_back(dst_cpu_field); + src_gpu_data.push_back(src_gpu_field); + dst_gpu_data.push_back(dst_gpu_field); + piece_gpu_data.push_back(piece_gpu_field); + piece_cpu_data.push_back(piece_cpu_field); + Event copy_event = src_node_field_data[i].index_space.copy(src_cpu_data, src_gpu_data, Realm::ProfilingRequestSet()); + copy_event.wait(); + Event second_copy_event = dst_node_field_data[i].index_space.copy(dst_cpu_data, dst_gpu_data, Realm::ProfilingRequestSet()); + second_copy_event.wait(); + Event third_copy_event = piece_id_field_data[i].index_space.copy(piece_cpu_data, piece_gpu_data, Realm::ProfilingRequestSet()); + third_copy_event.wait(); + src_field_data_gpu[i].inst = 
src_gpu_instance; + src_field_data_gpu[i].index_space = src_node_field_data[i].index_space; + src_field_data_gpu[i].field_offset = 0; + dst_field_data_gpu[i].inst = dst_gpu_instance; + dst_field_data_gpu[i].index_space = dst_node_field_data[i].index_space; + dst_field_data_gpu[i].field_offset = 1 * sizeof(Point<1>); + piece_field_data_gpu[i].inst = piece_gpu_instance; + piece_field_data_gpu[i].index_space = piece_id_field_data[i].index_space; + piece_field_data_gpu[i].field_offset = 0; + } + wait_on_events = true; + log_app.info() << "warming up" << Clock::current_time_in_microseconds() << "\n"; + const char* val = std::getenv("TILE_SIZE"); // or any env var + size_t tile_size = 10000000; //default + if (val) { + tile_size = atoi(val); + } + std::vector byte_fields = {sizeof(char)}; + IndexSpace<1> instance_index_space(Rect<1>(0, tile_size-1)); + IndexSpace<1> dst_index_space(Rect<1>(0, tile_size/100-1)); + for (size_t i = 0; i < piece_field_data_gpu.size(); i++) { + RegionInstance::create_instance(piece_field_data_gpu[i].scratch_buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); + } + for (size_t i = 0; i < src_field_data_gpu.size(); i++) { + RegionInstance::create_instance(src_field_data_gpu[i].scratch_buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); + } + for (size_t i = 0; i < dst_field_data_gpu.size(); i++) { + RegionInstance::create_instance(dst_field_data_gpu[i].scratch_buffer, gpu_memory, dst_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); + } + std::vector > p_garbage_nodes, p_garbage_edges, p_garbage_rd, p_garbage_preimage_edges; + Event e01 = is_nodes.create_subspaces_by_field(piece_field_data_gpu, + colors, + p_garbage_nodes, + Realm::ProfilingRequestSet()); + if (wait_on_events) e01.wait(); + Event e02 = is_edges.create_subspaces_by_preimage(dst_field_data_gpu, + p_garbage_nodes, + p_garbage_edges, + Realm::ProfilingRequestSet(), + e01); + 
if(wait_on_events) e02.wait(); + + // an image of p_edges through out_node gives us all the shared nodes, along + // with some private nodes + Event e03 = is_nodes.create_subspaces_by_image(src_field_data_gpu, + p_garbage_edges, + p_garbage_rd, + Realm::ProfilingRequestSet(), + e02); + if(wait_on_events) e03.wait(); + + Event e04 = is_edges.create_subspaces_by_preimage(dst_field_data_gpu, + p_garbage_rd, + p_garbage_preimage_edges, + Realm::ProfilingRequestSet(), + e03); + e04.wait(); + log_app.info() << "warming up complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting GPU Partitioning " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting GPU By Field " << Clock::current_time_in_microseconds() << "\n"; + Event e1 = is_nodes.create_subspaces_by_field(piece_field_data_gpu, + colors, + p_nodes, + Realm::ProfilingRequestSet()); + if(wait_on_events) e1.wait(); + log_app.info() << "GPU By Field complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting GPU Preimage " << Clock::current_time_in_microseconds() << "\n"; + // now compute p_edges based on the color of their in_node (i.e. 
a preimage) + Event e2 = is_edges.create_subspaces_by_preimage(dst_field_data_gpu, + p_nodes, + p_edges, + Realm::ProfilingRequestSet(), + e1); + if(wait_on_events) e2.wait(); + log_app.info() << "GPU Preimage complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting GPU Image " << Clock::current_time_in_microseconds() << "\n"; + + std::vector> spaces = {}; + std::vector requirements; + is_nodes.by_field_buffer_requirements(spaces, requirements); + // an image of p_edges through out_node gives us all the shared nodes, along + // with some private nodes + Event e3 = is_nodes.create_subspaces_by_image(src_field_data_gpu, + p_edges, + p_rd, + Realm::ProfilingRequestSet(), + e2); + if(wait_on_events) e3.wait(); + log_app.info() << "GPU Image complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting second GPU preimage " << Clock::current_time_in_microseconds() << "\n"; + + Event e4 = is_edges.create_subspaces_by_preimage(dst_field_data_gpu, + p_rd, + p_preimage_edges, + Realm::ProfilingRequestSet(), + e3); + e4.wait(); + log_app.info() << "Second GPU preimage complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "GPU Partitioning complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting CPU Partitioning " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting CPU By Field " << Clock::current_time_in_microseconds() << "\n"; + Event e5 = is_nodes.create_subspaces_by_field(piece_id_field_data, + colors, + p_nodes_cpu, + Realm::ProfilingRequestSet()); + if(wait_on_events) e5.wait(); + log_app.info() << "CPU By Field complete " << Clock::current_time_in_microseconds() << "\n"; + // now compute p_edges based on the color of their in_node (i.e. 
a preimage) + log_app.info() << "Starting CPU Preimage " << Clock::current_time_in_microseconds() << "\n"; + Event e6 = is_edges.create_subspaces_by_preimage(dst_node_field_data, + p_nodes_cpu, + p_edges_cpu, + Realm::ProfilingRequestSet(), + e5); + if(wait_on_events) e6.wait(); + log_app.info() << "CPU Preimage complete " << Clock::current_time_in_microseconds() << "\n"; + + // an image of p_edges through out_node gives us all the shared nodes, along + // with some private nodes + log_app.info() << "Starting CPU Image " << Clock::current_time_in_microseconds() << "\n"; + Event e7 = is_nodes.create_subspaces_by_image(src_node_field_data, + p_edges_cpu, + p_rd_cpu, + Realm::ProfilingRequestSet(), + e6); + if(wait_on_events) e7.wait(); + log_app.info() << "CPU Image complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting second CPU preimage " << Clock::current_time_in_microseconds() << "\n"; + + Event e8 = is_edges.create_subspaces_by_preimage(dst_node_field_data, + p_rd_cpu, + p_preimage_edges_cpu, + Realm::ProfilingRequestSet(), + e7); + e8.wait(); + log_app.info() << "Second CPU preimage complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "CPU Partitioning complete " << Clock::current_time_in_microseconds() << "\n"; + return e8; + } + + virtual int perform_dynamic_checks(void) + { + // Nothing to do here + return 0; + } + + virtual int check_partitioning(void) + { + int errors = 0; + + if (!p_nodes.size()) { + return 0; + } + + log_app.info() << "Checking correctness of partitioning " << "\n"; + + for(int i = 0; i < num_pieces; i++) { + for(IndexSpaceIterator<1> it(p_nodes[i]); it.valid; it.step()) { + for(PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_nodes_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! 
GPU has extra byfield point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for(IndexSpaceIterator<1> it(p_nodes_cpu[i]); it.valid; it.step()) { + for(PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_nodes[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing byfield point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_edges[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_edges_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU has extra preimage edge " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_edges_cpu[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_edges[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing preimage edge " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_rd[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_rd_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU has extra image node " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_rd_cpu[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_rd[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing image node " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_preimage_edges[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_preimage_edges_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! 
GPU has extra second preimage edge " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_preimage_edges_cpu[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_preimage_edges[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing second preimage edge " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + + } + return errors; + } +}; + +class TileTest : public TestInterface { +public: + // graph config parameters + int num_nodes = 1000; + int num_edges = 1000; + int num_pieces = 4; + int num_tiles = 1; + std::string filename; + + TileTest(int argc, const char *argv[]) + { + for(int i = 1; i < argc; i++) { + + if(!strcmp(argv[i], "-p")) { + num_pieces = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-n")) { + num_nodes = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-e")) { + num_edges = atoi(argv[++i]); + continue; + } + if(!strcmp(argv[i], "-t")) { + num_tiles = atoi(argv[++i]); + continue; + } + } + + + if (num_nodes <= 0 || num_pieces <= 0 || num_edges <= 0) { + log_app.error() << "Invalid config: nodes=" << num_nodes << " edges=" << num_edges << " pieces=" << num_pieces << "\n"; + exit(1); + } + } + + struct InitDataArgs { + int index; + RegionInstance ri_nodes; + RegionInstance ri_edges; + }; + + enum PRNGStreams + { + NODE_SUBGRAPH_STREAM, + }; + + // assign subgraph ids to nodes + void random_node_data(int idx, int &subgraph) + { + if(random_colors) + subgraph = + Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_pieces); + else + subgraph = idx * num_pieces / num_nodes; + } + + void random_edge_data(int idx, int& src, int& dst) + { + src = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_nodes); + dst = Philox_2x32<>::rand_int(random_seed, idx + 1, NODE_SUBGRAPH_STREAM, num_nodes); + } + + static void init_data_task_wrapper(const void *args, size_t arglen, + const 
void *userdata, size_t userlen, Processor p) + { + TileTest *me = (TileTest *)testcfg; + me->init_data_task(args, arglen, p); + } + + //Each piece has a task to initialize its data + void init_data_task(const void *args, size_t arglen, Processor p) + { + const InitDataArgs &i_args = *(const InitDataArgs *)args; + + log_app.info() << "init task #" << i_args.index << " (ri_nodes=" << i_args.ri_nodes + << ")"; + + i_args.ri_nodes.fetch_metadata(p).wait(); + i_args.ri_edges.fetch_metadata(p).wait(); + + IndexSpace<1> is_nodes = i_args.ri_nodes.get_indexspace<1>(); + IndexSpace<1> is_edges = i_args.ri_edges.get_indexspace<1>(); + + log_app.debug() << "N: " << is_nodes; + log_app.debug() << "E: " << is_edges; + + //For each node in the graph, mark it with a random (or deterministic) subgraph id + { + AffineAccessor a_piece_id(i_args.ri_nodes, 0 /* offset */); + + for(int i = is_nodes.bounds.lo; i <= is_nodes.bounds.hi; i++) { + int subgraph; + random_node_data(i, subgraph); + a_piece_id.write(i, subgraph); + } + + AffineAccessor,1> a_src(i_args.ri_edges, 0 /* offset */); + AffineAccessor,1> a_dst(i_args.ri_edges, sizeof(Point<1>)/* offset */); + + for(int i = is_edges.bounds.lo; i <= is_edges.bounds.hi; i++) { + int src, dst; + random_edge_data(i, src, dst); + a_src.write(i, Point<1>(src)); + a_dst.write(i, Point<1>(dst)); + } + } + + //Optionally print out the assigned subgraph ids + if(show_graph) { + AffineAccessor a_piece_id(i_args.ri_nodes, 0 /* offset */); + + for(int i = is_nodes.bounds.lo; i <= is_nodes.bounds.hi; i++) + log_app.info() << "piece_id[" << i << "] = " << a_piece_id.read(i) << "\n"; + + AffineAccessor,1> a_src(i_args.ri_edges, 0 /* offset */); + AffineAccessor,1> a_dst(i_args.ri_edges, sizeof(Point<1>)/* offset */); + + for(int i = is_edges.bounds.lo; i <= is_edges.bounds.hi; i++) + log_app.info() << "src, dst[" << i << "] = " << a_src.read(i) << ", " << a_dst.read(i) << "\n"; + } + } + + IndexSpace<1> is_nodes, is_edges; + std::vector ri_nodes, 
ri_edges; + std::vector, int> > piece_id_field_data; + std::vector, Point<1> > > src_node_field_data, dst_node_field_data; + + virtual void print_info(void) + { + printf("Realm dependent partitioning test - tile: %d nodes, %d edges, %d pieces, %d tiles\n", + (int)num_nodes, (int) num_edges, (int)num_pieces, (int)num_tiles); + } + + virtual Event initialize_data(const std::vector &memories, + const std::vector &procs) + { + // now create index space for nodes + is_nodes = Rect<1>(0, num_nodes - 1); + is_edges = Rect<1>(0, num_edges - 1); + // equal partition is used to do initial population of edges and nodes + std::vector > ss_nodes_eq; + std::vector > ss_edges_eq; + + log_app.info() << "Creating equal subspaces\n"; + + is_nodes.create_equal_subspaces(num_pieces, 1, ss_nodes_eq, Realm::ProfilingRequestSet()).wait(); + is_edges.create_equal_subspaces(num_pieces, 1, ss_edges_eq, Realm::ProfilingRequestSet()).wait(); + + log_app.debug() << "Initial partitions:"; + for(size_t i = 0; i < ss_nodes_eq.size(); i++) + log_app.debug() << " Nodes #" << i << ": " << ss_nodes_eq[i]; + for(size_t i = 0; i < ss_edges_eq.size(); i++) + log_app.debug() << " Edges #" << i << ": " << ss_edges_eq[i]; + + // create instances for each of these subspaces + std::vector node_fields, edge_fields; + node_fields.push_back(sizeof(int)); // piece_id + assert(sizeof(int) == sizeof(Point<1>)); + edge_fields.push_back(sizeof(Point<1>)); // src_node + edge_fields.push_back(sizeof(Point<1>)); // dst_node + + ri_nodes.resize(num_pieces); + piece_id_field_data.resize(num_pieces); + + for(size_t i = 0; i < ss_nodes_eq.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, memories[i % memories.size()], ss_nodes_eq[i], + node_fields, 0 /*SOA*/, + Realm::ProfilingRequestSet()) + .wait(); + ri_nodes[i] = ri; + + piece_id_field_data[i].index_space = ss_nodes_eq[i]; + piece_id_field_data[i].inst = ri_nodes[i]; + piece_id_field_data[i].field_offset = 0; + } + + + // Fire off tasks to 
initialize data + ri_edges.resize(num_pieces); + src_node_field_data.resize(num_pieces); + dst_node_field_data.resize(num_pieces); + + for(size_t i = 0; i < ss_edges_eq.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, + memories[i % memories.size()], + ss_edges_eq[i], + edge_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + ri_edges[i] = ri; + + src_node_field_data[i].index_space = ss_edges_eq[i]; + src_node_field_data[i].inst = ri_edges[i]; + src_node_field_data[i].field_offset = 0 * sizeof(Point<1>); + + dst_node_field_data[i].index_space = ss_edges_eq[i]; + dst_node_field_data[i].inst = ri_edges[i]; + dst_node_field_data[i].field_offset = 1 * sizeof(Point<1>); + } + + // fire off tasks to initialize data + std::set events; + for(int i = 0; i < num_pieces; i++) { + Processor p = procs[i % procs.size()]; + InitDataArgs args; + args.index = i; + args.ri_nodes = ri_nodes[i]; + args.ri_edges = ri_edges[i]; + Event e = p.spawn(INIT_TILE_DATA_TASK, &args, sizeof(args)); + events.insert(e); + } + + return Event::merge_events(events); + } + + // the outputs of our partitioning will be: + // p_nodes - nodes partitioned by subgraph id (from GPU) + // p_nodes_cpu - nodes partitioned by subgraph id (from CPU) + + + std::vector > p_nodes, p_rd; + std::vector > p_edges, p_preimage_edges; + + std::vector > p_nodes_cpu, p_rd_cpu; + std::vector > p_edges_cpu, p_preimage_edges_cpu; + + virtual Event perform_partitioning(void) + { + // Partition nodes by subgraph id - do this twice, once on CPU and once on GPU + // Ensure that the results are identical + + std::vector colors(num_pieces); + for(int i = 0; i < num_pieces; i++) + colors[i] = i; + + // We need a GPU memory for GPU partitioning + Memory gpu_memory; + bool found_gpu_memory = false; + Machine machine = Machine::get_machine(); + std::set all_memories; + machine.get_all_memories(all_memories); + for(Memory memory : all_memories) { + if(memory.kind() == Memory::GPU_FB_MEM) { + gpu_memory 
= memory; + found_gpu_memory = true; + break; + } + } + if (!found_gpu_memory) { + log_app.error() << "No GPU memory found for partitioning test\n"; + return Event::NO_EVENT; + } + std::vector edge_fields; + edge_fields.push_back(sizeof(Point<1>)); + edge_fields.push_back(sizeof(Point<1>)) ; + std::vector node_fields; + node_fields.push_back(sizeof(int)); + + std::vector, Point<1> > > src_field_data_gpu; + std::vector, Point<1> > > dst_field_data_gpu; + std::vector, int> > piece_field_data_gpu; + piece_field_data_gpu.resize(num_pieces); + src_field_data_gpu.resize(num_pieces); + dst_field_data_gpu.resize(num_pieces); + for (int i = 0; i < num_pieces; i++) { + RegionInstance src_gpu_instance; + RegionInstance dst_gpu_instance; + RegionInstance piece_gpu_instance; + RegionInstance::create_instance(src_gpu_instance, + gpu_memory, + src_node_field_data[i].index_space, + edge_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + RegionInstance::create_instance(dst_gpu_instance, + gpu_memory, + dst_node_field_data[i].index_space, + edge_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + RegionInstance::create_instance(piece_gpu_instance, + gpu_memory, + piece_id_field_data[i].index_space, + node_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + CopySrcDstField src_gpu_field, src_cpu_field, dst_gpu_field, dst_cpu_field, piece_gpu_field, piece_cpu_field; + src_gpu_field.inst = src_gpu_instance; + src_gpu_field.size = sizeof(Point<1>); + src_gpu_field.field_id = 0; + src_cpu_field.inst = src_node_field_data[i].inst; + src_cpu_field.size = sizeof(Point<1>); + src_cpu_field.field_id = 0; + dst_gpu_field.inst = dst_gpu_instance; + dst_gpu_field.size = sizeof(Point<1>); + dst_gpu_field.field_id = sizeof(Point<1>); + dst_cpu_field.inst = dst_node_field_data[i].inst; + dst_cpu_field.size = sizeof(Point<1>); + dst_cpu_field.field_id = sizeof(Point<1>); + piece_gpu_field.inst = piece_gpu_instance; + piece_gpu_field.size = sizeof(int); + 
piece_gpu_field.field_id = 0; + piece_cpu_field.inst = piece_id_field_data[i].inst; + piece_cpu_field.size = sizeof(int); + piece_cpu_field.field_id = 0; + std::vector src_cpu_data, src_gpu_data, dst_cpu_data, dst_gpu_data, piece_cpu_data, piece_gpu_data; + src_cpu_data.push_back(src_cpu_field); + dst_cpu_data.push_back(dst_cpu_field); + src_gpu_data.push_back(src_gpu_field); + dst_gpu_data.push_back(dst_gpu_field); + piece_gpu_data.push_back(piece_gpu_field); + piece_cpu_data.push_back(piece_cpu_field); + Event copy_event = src_node_field_data[i].index_space.copy(src_cpu_data, src_gpu_data, Realm::ProfilingRequestSet()); + copy_event.wait(); + Event second_copy_event = dst_node_field_data[i].index_space.copy(dst_cpu_data, dst_gpu_data, Realm::ProfilingRequestSet()); + second_copy_event.wait(); + Event third_copy_event = piece_id_field_data[i].index_space.copy(piece_cpu_data, piece_gpu_data, Realm::ProfilingRequestSet()); + third_copy_event.wait(); + src_field_data_gpu[i].inst = src_gpu_instance; + src_field_data_gpu[i].index_space = src_node_field_data[i].index_space; + src_field_data_gpu[i].field_offset = 0; + dst_field_data_gpu[i].inst = dst_gpu_instance; + dst_field_data_gpu[i].index_space = dst_node_field_data[i].index_space; + dst_field_data_gpu[i].field_offset = 1 * sizeof(Point<1>); + piece_field_data_gpu[i].inst = piece_gpu_instance; + piece_field_data_gpu[i].index_space = piece_id_field_data[i].index_space; + piece_field_data_gpu[i].field_offset = 0; + } + wait_on_events = true; + log_app.info() << "warming up" << Clock::current_time_in_microseconds() << "\n"; + std::vector > p_garbage_nodes, p_garbage_edges, p_garbage_rd, p_garbage_preimage_edges; + Event e01 = is_nodes.create_subspaces_by_field(piece_field_data_gpu, + colors, + p_garbage_nodes, + Realm::ProfilingRequestSet()); + if (wait_on_events) e01.wait(); + Event e02 = is_edges.create_subspaces_by_preimage(dst_node_field_data, + p_garbage_nodes, + p_garbage_edges, + Realm::ProfilingRequestSet(), + 
e01); + if(wait_on_events) e02.wait(); + + // an image of p_edges through out_node gives us all the shared nodes, along + // with some private nodes + Event e03 = is_nodes.create_subspaces_by_image(src_field_data_gpu, + p_garbage_edges, + p_garbage_rd, + Realm::ProfilingRequestSet(), + e02); + if(wait_on_events) e03.wait(); + + Event e04 = is_edges.create_subspaces_by_preimage(dst_node_field_data, + p_garbage_rd, + p_garbage_preimage_edges, + Realm::ProfilingRequestSet(), + e03); + e04.wait(); + log_app.info() << "warming up complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting GPU Partitioning " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting GPU By Field " << Clock::current_time_in_microseconds() << "\n"; + Event e1 = is_nodes.create_subspaces_by_field(piece_field_data_gpu, + colors, + p_nodes, + Realm::ProfilingRequestSet()); + if(wait_on_events) e1.wait(); + log_app.info() << "GPU By Field complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting GPU Preimage " << Clock::current_time_in_microseconds() << "\n"; + // now compute p_edges based on the color of their in_node (i.e. 
a preimage) + Event e2 = is_edges.create_subspaces_by_preimage(dst_node_field_data, + p_nodes, + p_edges, + Realm::ProfilingRequestSet(), + e1); + if(wait_on_events) e2.wait(); + log_app.info() << "GPU Preimage complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting GPU Image " << Clock::current_time_in_microseconds() << "\n"; + + // an image of p_edges through out_node gives us all the shared nodes, along + // with some private nodes + Event e3 = is_nodes.create_subspaces_by_image(src_field_data_gpu, + p_edges, + p_rd, + Realm::ProfilingRequestSet(), + e2); + if(wait_on_events) e3.wait(); + log_app.info() << "GPU Image complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting second GPU preimage " << Clock::current_time_in_microseconds() << "\n"; + + Event e4 = is_edges.create_subspaces_by_preimage(dst_node_field_data, + p_rd, + p_preimage_edges, + Realm::ProfilingRequestSet(), + e3); + e4.wait(); + log_app.info() << "Second GPU preimage complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "GPU Partitioning complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting CPU Partitioning " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting CPU By Field " << Clock::current_time_in_microseconds() << "\n"; + Event e5 = is_nodes.create_subspaces_by_field(piece_id_field_data, + colors, + p_nodes_cpu, + Realm::ProfilingRequestSet()); + if(wait_on_events) e5.wait(); + log_app.info() << "CPU By Field complete " << Clock::current_time_in_microseconds() << "\n"; + // now compute p_edges based on the color of their in_node (i.e. 
a preimage) + log_app.info() << "Starting CPU Preimage " << Clock::current_time_in_microseconds() << "\n"; + Event e6 = is_edges.create_subspaces_by_preimage(dst_node_field_data, + p_nodes_cpu, + p_edges_cpu, + Realm::ProfilingRequestSet(), + e5); + if(wait_on_events) e6.wait(); + log_app.info() << "CPU Preimage complete " << Clock::current_time_in_microseconds() << "\n"; + + // an image of p_edges through out_node gives us all the shared nodes, along + // with some private nodes + log_app.info() << "Starting CPU Image " << Clock::current_time_in_microseconds() << "\n"; + Event e7 = is_nodes.create_subspaces_by_image(src_node_field_data, + p_edges_cpu, + p_rd_cpu, + Realm::ProfilingRequestSet(), + e6); + if(wait_on_events) e7.wait(); + log_app.info() << "CPU Image complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "Starting second CPU preimage " << Clock::current_time_in_microseconds() << "\n"; + + Event e8 = is_edges.create_subspaces_by_preimage(dst_node_field_data, + p_rd_cpu, + p_preimage_edges_cpu, + Realm::ProfilingRequestSet(), + e7); + e8.wait(); + log_app.info() << "Second CPU preimage complete " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "CPU Partitioning complete " << Clock::current_time_in_microseconds() << "\n"; + return e8; + } + + virtual int perform_dynamic_checks(void) + { + // Nothing to do here + return 0; + } + + virtual int check_partitioning(void) + { + int errors = 0; + + if (!p_nodes.size()) { + return 0; + } + + log_app.info() << "Checking correctness of partitioning " << "\n"; + + for(int i = 0; i < num_pieces; i++) { + for(IndexSpaceIterator<1> it(p_nodes[i]); it.valid; it.step()) { + for(PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_nodes_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! 
GPU has extra byfield point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for(IndexSpaceIterator<1> it(p_nodes_cpu[i]); it.valid; it.step()) { + for(PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_nodes[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing byfield point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_edges[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_edges_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU has extra preimage edge " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_edges_cpu[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_edges[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing preimage edge " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_rd[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_rd_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU has extra image node " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_rd_cpu[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_rd[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing image node " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_preimage_edges[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_preimage_edges_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! 
GPU has extra second preimage edge " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_preimage_edges_cpu[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_preimage_edges[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing second preimage edge " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + + } + return errors; + } +}; + +class RangeTest : public TestInterface { +public: + // graph config parameters + int num_nodes = 1000; + int num_rects = 1000; + int max_rect_size = 10; + int num_pieces = 4; + std::string filename; + + RangeTest(int argc, const char *argv[]) + { + for(int i = 1; i < argc; i++) { + + if(!strcmp(argv[i], "-p")) { + num_pieces = atoi(argv[++i]); + continue; + } + + if(!strcmp(argv[i], "-n")) { + num_nodes = atoi(argv[++i]); + continue; + } + + if(!strcmp(argv[i], "-r")) { + num_rects = atoi(argv[++i]); + continue; + } + + if(!strcmp(argv[i], "-m")) { + max_rect_size = atoi(argv[++i]); + continue; + } + } + + + + if (num_nodes <= 0 || num_rects <= 0) { + log_app.error() << "Invalid graph dimensions in input file: rects=" << num_rects << " nodes=" << num_nodes; + exit(1); + } + + } + + + + struct InitDataArgs { + int index; + RegionInstance ri_nodes; + RegionInstance ri_rects; + }; + + enum PRNGStreams { + NODE_SUBGRAPH_STREAM, + }; + + void random_rect_data(int idx, int& subgraph) + { + if(random_colors) + subgraph = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_pieces); + else + subgraph = idx * num_pieces / num_rects; + } + + void random_node_data(int idx, int& subgraph) + { + if(true) + subgraph = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_pieces); + else + subgraph = idx * num_pieces / num_nodes; + } + + void initialize_rect_data(int idx, Rect<1> &rect, int max_rect_size = 10) + { + + int first = Philox_2x32<>::rand_int(random_seed, idx, 
NODE_SUBGRAPH_STREAM, num_nodes); + int amount = Philox_2x32<>::rand_int(random_seed, idx + 1, NODE_SUBGRAPH_STREAM, max_rect_size); + rect = Rect<1>(first, first + amount); + } + + + static void init_data_task_wrapper(const void *args, size_t arglen, + const void *userdata, size_t userlen, Processor p) + { + RangeTest *me = (RangeTest *)testcfg; + me->init_data_task(args, arglen, p); + } + + void init_data_task(const void *args, size_t arglen, Processor p) + { + const InitDataArgs& i_args = *(const InitDataArgs *)args; + + log_app.info() << "init task #" << i_args.index << " (ri_nodes=" << i_args.ri_nodes << ", ri_rects=" << i_args.ri_rects << ")"; + + i_args.ri_nodes.fetch_metadata(p).wait(); + i_args.ri_rects.fetch_metadata(p).wait(); + + IndexSpace<1> is_nodes = i_args.ri_nodes.get_indexspace<1>(); + IndexSpace<1> is_rects = i_args.ri_rects.get_indexspace<1>(); + + log_app.debug() << "N: " << is_nodes; + log_app.debug() << "E: " << is_rects; + + //Write out colors and rectangles + + { + AffineAccessor a_rect_id(i_args.ri_rects, 0 /* offset */); + + for(int i = is_rects.bounds.lo; i <= is_rects.bounds.hi; i++) { + int subgraph; + random_rect_data(i, subgraph); + a_rect_id.write(i, subgraph); + } + } + { + AffineAccessor a_piece_id(i_args.ri_nodes, 0 /* offset */); + + for(int i = is_nodes.bounds.lo; i <= is_nodes.bounds.hi; i++) { + int subgraph; + random_node_data(i, subgraph); + a_piece_id.write(i, subgraph); + } + } + + + { + + AffineAccessor, 1> a_rect_val(i_args.ri_rects, 1 * sizeof(int) /* offset */); + + for(int i = is_rects.bounds.lo; i <= is_rects.bounds.hi; i++) { + Rect<1> rect; + initialize_rect_data(i, rect, max_rect_size); + a_rect_val.write(i, rect); + } + } + + if(show_graph) { + AffineAccessor a_piece_id(i_args.ri_nodes, 0 /* offset */); + + for(int i = is_nodes.bounds.lo; i <= is_nodes.bounds.hi; i++) + log_app.info() << "node_id[" << i << "] = " << a_piece_id.read(i) << "\n"; + + AffineAccessor a_rect_id(i_args.ri_rects, 0 * sizeof(Point<1>) 
/* offset */); + + for(int i = is_rects.bounds.lo; i <= is_rects.bounds.hi; i++) + log_app.info() << "rect_id[" << i << "] = " << a_rect_id.read(i) << "\n"; + + AffineAccessor,1> a_rect_val(i_args.ri_rects, 1 * sizeof(int) /* offset */); + + for(int i = is_rects.bounds.lo; i <= is_rects.bounds.hi; i++) + log_app.info() << "rect_val[" << i << "] = " << a_rect_val.read(i) << "\n"; + } + } + + IndexSpace<1> is_nodes, is_rects; + std::vector ri_nodes; + std::vector, int> > node_id_field_data; + std::vector ri_rects; + std::vector, int> > rect_id_field_data; + std::vector, Rect<1> > > rect_val_field_data; + + virtual void print_info(void) + { + printf("Realm dependent partitioning test - ranges: %d nodes, %d rects, %d pieces\n", + (int)num_nodes, (int)num_rects, (int)num_pieces); + } + + virtual Event initialize_data(const std::vector& memories, + const std::vector& procs) + { + // now create index spaces for nodes and edges + is_nodes = Rect<1>(0, num_nodes - 1); + is_rects = Rect<1>(0, num_rects - 1); + + // equal partition is used to do initial population of edges and nodes + std::vector > ss_nodes_eq; + std::vector > ss_rects_eq; + + log_app.info() << "Creating equal subspaces" << "\n"; + + is_nodes.create_equal_subspaces(num_pieces, 1, ss_nodes_eq, Realm::ProfilingRequestSet()).wait(); + is_rects.create_equal_subspaces(num_pieces, 1, ss_rects_eq, Realm::ProfilingRequestSet()).wait(); + + log_app.debug() << "Initial partitions:"; + for(size_t i = 0; i < ss_nodes_eq.size(); i++) + log_app.debug() << " Nodes #" << i << ": " << ss_nodes_eq[i]; + for(size_t i = 0; i < ss_rects_eq.size(); i++) + log_app.debug() << " Rects #" << i << ": " << ss_rects_eq[i]; + + // create instances for each of these subspaces + std::vector node_fields, rect_fields; + node_fields.push_back(sizeof(int)); // piece_id + rect_fields.push_back(sizeof(int)); // src_node + rect_fields.push_back(sizeof(Rect<1>)); // dst_node + + ri_nodes.resize(num_pieces); + node_id_field_data.resize(num_pieces); 
+ + for(size_t i = 0; i < ss_nodes_eq.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, + memories[i % memories.size()], + ss_nodes_eq[i], + node_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + ri_nodes[i] = ri; + + node_id_field_data[i].index_space = ss_nodes_eq[i]; + node_id_field_data[i].inst = ri_nodes[i]; + node_id_field_data[i].field_offset = 0; + } + + ri_rects.resize(num_pieces); + rect_id_field_data.resize(num_pieces); + rect_val_field_data.resize(num_pieces); + + for(size_t i = 0; i < ss_rects_eq.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, + memories[i % memories.size()], + ss_rects_eq[i], + rect_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + ri_rects[i] = ri; + + rect_id_field_data[i].index_space = ss_rects_eq[i]; + rect_id_field_data[i].inst = ri_rects[i]; + rect_id_field_data[i].field_offset = 0; + + rect_val_field_data[i].index_space = ss_rects_eq[i]; + rect_val_field_data[i].inst = ri_rects[i]; + rect_val_field_data[i].field_offset = 1 * sizeof(int); + } + + // fire off tasks to initialize data + std::set events; + for(int i = 0; i < num_pieces; i++) { + Processor p = procs[i % procs.size()]; + InitDataArgs args; + args.index = i; + args.ri_nodes = ri_nodes[i]; + args.ri_rects = ri_rects[i]; + Event e = p.spawn(INIT_RANGE_DATA_TASK, &args, sizeof(args)); + events.insert(e); + } + + return Event::merge_events(events); + } + + // the outputs of our partitioning will be: + //p_colored_rects -> all of our rectangles marked with the color given by random_rect_data + //p_rects -> image range by p colored rects into nodes + + std::vector > p_colored_rects, p_rects; + std::vector > p_colored_rects_cpu, p_rects_cpu; + + virtual Event perform_partitioning(void) + { + + std::vector colors(num_pieces); + for(int i = 0; i < num_pieces; i++) + colors[i] = i; + + Memory gpu_memory; + bool found_gpu_memory = false; + Machine machine = Machine::get_machine(); + std::set all_memories; 
+ machine.get_all_memories(all_memories); + for(auto& memory : all_memories) { + if(memory.kind() == Memory::GPU_FB_MEM) { + gpu_memory = memory; + found_gpu_memory = true; + break; + } + } + assert(found_gpu_memory); + std::vector rect_fields; + rect_fields.push_back(sizeof(int)); + rect_fields.push_back(sizeof(Rect<1>)); + std::vector node_fields; + node_fields.push_back(sizeof(int)); + + std::vector, int > > node_id_data_gpu; + std::vector, int > > rect_id_data_gpu; + std::vector, Rect<1>>> rect_val_data_gpu; + node_id_data_gpu.resize(num_pieces); + rect_id_data_gpu.resize(num_pieces); + rect_val_data_gpu.resize(num_pieces); + for (int i = 0; i < num_pieces; i++) { + RegionInstance node_id_instance; + RegionInstance rect_id_instance; + RegionInstance rect_val_instance; + RegionInstance::create_instance(node_id_instance, + gpu_memory, + node_id_field_data[i].index_space, + node_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + RegionInstance::create_instance(rect_id_instance, + gpu_memory, + rect_id_field_data[i].index_space, + rect_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + RegionInstance::create_instance(rect_val_instance, + gpu_memory, + rect_val_field_data[i].index_space, + rect_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + CopySrcDstField node_id_gpu_field, node_id_cpu_field, rect_id_gpu_field, rect_id_cpu_field, rect_val_gpu_field, rect_val_cpu_field; + node_id_gpu_field.inst = node_id_instance; + node_id_gpu_field.size = sizeof(int); + node_id_gpu_field.field_id = 0; + node_id_cpu_field.inst = node_id_field_data[i].inst; + node_id_cpu_field.size = sizeof(int); + node_id_cpu_field.field_id = 0; + rect_id_gpu_field.inst = rect_id_instance; + rect_id_gpu_field.size = sizeof(int); + rect_id_gpu_field.field_id = 0; + rect_id_cpu_field.inst = rect_id_field_data[i].inst; + rect_id_cpu_field.size = sizeof(int); + rect_id_cpu_field.field_id = 0; + rect_val_gpu_field.inst = rect_val_instance; + 
rect_val_gpu_field.size = sizeof(Rect<1>); + rect_val_gpu_field.field_id = sizeof(int); + rect_val_cpu_field.inst = rect_val_field_data[i].inst; + rect_val_cpu_field.size = sizeof(Rect<1>); + rect_val_cpu_field.field_id = sizeof(int); + std::vector node_id_gpu_data, node_id_cpu_data, rect_id_gpu_data, rect_id_cpu_data, rect_val_gpu_data, rect_val_cpu_data; + node_id_gpu_data.push_back(node_id_gpu_field); + node_id_cpu_data.push_back(node_id_cpu_field); + rect_id_gpu_data.push_back(rect_id_gpu_field); + rect_id_cpu_data.push_back(rect_id_cpu_field); + rect_val_gpu_data.push_back(rect_val_gpu_field); + rect_val_cpu_data.push_back(rect_val_cpu_field); + Event copy_event = node_id_field_data[i].index_space.copy(node_id_cpu_data, node_id_gpu_data, Realm::ProfilingRequestSet()); + copy_event.wait(); + Event second_copy_event = rect_id_field_data[i].index_space.copy(rect_id_cpu_data, rect_id_gpu_data, Realm::ProfilingRequestSet()); + second_copy_event.wait(); + Event third_copy_event = rect_val_field_data[i].index_space.copy(rect_val_cpu_data, rect_val_gpu_data, Realm::ProfilingRequestSet()); + third_copy_event.wait(); + node_id_data_gpu[i].inst = node_id_instance; + node_id_data_gpu[i].index_space = node_id_field_data[i].index_space; + node_id_data_gpu[i].field_offset = 0; + rect_id_data_gpu[i].inst = rect_id_instance; + rect_id_data_gpu[i].index_space = rect_id_field_data[i].index_space; + rect_id_data_gpu[i].field_offset = 0; + rect_val_data_gpu[i].inst = rect_val_instance; + rect_val_data_gpu[i].index_space = rect_val_field_data[i].index_space; + rect_val_data_gpu[i].field_offset = sizeof(int); + } + wait_on_events = true; + std::vector> p_garbage_rects, p_garbage_colors; + log_app.info() << "WARMING UP " << "\n"; + + std::vector> field_estimate_input(rect_id_data_gpu.size()); + std::vector field_estimate_output(rect_id_data_gpu.size()); + std::vector> image_estimate_input(rect_val_data_gpu.size()); + std::vector image_estimate_output(rect_val_data_gpu.size()); + 
std::vector> subspace_input(colors.size()); + for (size_t i = 0; i < rect_id_data_gpu.size(); i++) { + field_estimate_input[i].location = rect_id_data_gpu[i].inst.get_location(); + field_estimate_input[i].space = rect_id_data_gpu[i].index_space; + } + for (size_t i = 0; i < rect_val_data_gpu.size(); i++) { + image_estimate_input[i].location = rect_val_data_gpu[i].inst.get_location(); + image_estimate_input[i].space = rect_val_data_gpu[i].index_space; + } + + is_rects.by_field_buffer_requirements(field_estimate_input, field_estimate_output); + std::vector byte_fields = {sizeof(char)}; + for (size_t i = 0; i < rect_id_data_gpu.size(); i++) { + IndexSpace<1> instance_index_space(Rect<1>(0, field_estimate_output[i].upper_bound-1)); + RegionInstance::create_instance(rect_id_data_gpu[i].scratch_buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); + } + + Event e001 = is_rects.create_subspaces_by_field(rect_id_data_gpu, + colors, + p_garbage_colors, + Realm::ProfilingRequestSet()); + if (wait_on_events) e001.wait(); + for (size_t i = 0; i < colors.size(); i++) { + subspace_input[i].space = p_garbage_colors[i]; + subspace_input[i].entries = p_garbage_colors[i].sparsity.impl()->get_entries().size(); + } + is_nodes.by_image_buffer_requirements(subspace_input, image_estimate_input, image_estimate_output); + for (size_t i = 0; i < rect_val_data_gpu.size(); i++) { + IndexSpace<1> instance_index_space(Rect<1>(0, (image_estimate_output[i].upper_bound)/12-1)); + RegionInstance::create_instance(rect_val_data_gpu[i].scratch_buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); + } + Event e002 = is_nodes.create_subspaces_by_image(rect_val_data_gpu, + p_garbage_colors, + p_garbage_rects, + Realm::ProfilingRequestSet(), + e001); + if(wait_on_events) e002.wait(); + + log_app.info() << "FINISHED WARMING UP " << "\n"; + log_app.info() << "starting GPU partitioning " << 
Clock::current_time_in_microseconds() << "\n"; + + log_app.info() << "STARTING GPU BY FIELD " << Clock::current_time_in_microseconds() << "\n"; + + Event e01 = is_rects.create_subspaces_by_field(rect_id_data_gpu, + colors, + p_colored_rects, + Realm::ProfilingRequestSet()); + if (wait_on_events) e01.wait(); + + log_app.info() << "FINISHED GPU BY FIELD " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "STARTING GPU BY IMAGE " << Clock::current_time_in_microseconds() << "\n"; + Event e02 = is_nodes.create_subspaces_by_image(rect_val_data_gpu, + p_colored_rects, + p_rects, + Realm::ProfilingRequestSet(), + e01); + if(wait_on_events) e02.wait(); + + log_app.info() << "FINISHED GPU BY IMAGE " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "STARTING CPU partitioning " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "STARTING CPU BY FIELD " << Clock::current_time_in_microseconds() << "\n"; + Event e1 = is_rects.create_subspaces_by_field(rect_id_field_data, + colors, + p_colored_rects_cpu, + Realm::ProfilingRequestSet()); + if (wait_on_events) e1.wait(); + log_app.info() << "FINISHED CPU BY FIELD " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "STARTING CPU BY IMAGE " << Clock::current_time_in_microseconds() << "\n"; + Event e2 = is_nodes.create_subspaces_by_image(rect_val_field_data, + p_colored_rects_cpu, + p_rects_cpu, + Realm::ProfilingRequestSet(), + e1); + if(wait_on_events) e2.wait(); + log_app.info() << "FINISHED CPU BY IMAGE " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "CPU Partitioning complete " << Clock::current_time_in_microseconds() << "\n"; + return e2; + } + + + + virtual int perform_dynamic_checks(void) + { + return 0; + } + + virtual int check_partitioning(void) + { + log_app.info() << "Checking correctness of partitioning " << "\n"; + int errors = 0; + + for (int i = 0; i < num_pieces; i++) { + for (IndexSpaceIterator<1> 
it(p_colored_rects[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_colored_rects_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU has extra colored rect point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_colored_rects_cpu[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if(!p_colored_rects[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing colored rect point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_rects[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_rects_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU has extra rect point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_rects_cpu[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if(!p_rects[i].contains(point.p)) { + log_app.error() << "Mismatch! 
GPU is missing rect point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + } + return errors; + } +}; + +class Range2DTest : public TestInterface { +public: + // graph config parameters + int num_nodes = 1000; + int num_rects = 1000; + int max_rect_size = 10; + int num_pieces = 4; + + Range2DTest(int argc, const char *argv[]) + { + for(int i = 1; i < argc; i++) { + + if(!strcmp(argv[i], "-p")) { + num_pieces = atoi(argv[++i]); + continue; + } + + if(!strcmp(argv[i], "-n")) { + num_nodes = atoi(argv[++i]); + continue; + } + + if (!strcmp(argv[i], "-r")) { + num_rects = atoi(argv[++i]); + continue; + } + + if (!strcmp(argv[i], "-m")) { + max_rect_size = atoi(argv[++i]); + continue; + } + } + + if (num_nodes <= 0 || num_rects <= 0) { + log_app.error() << "Invalid graph dimensions in input file: rects=" << num_rects << " nodes=" << num_nodes; + exit(1); + } + + } + + + + struct InitDataArgs { + int index; + RegionInstance ri_nodes; + RegionInstance ri_rects; + }; + + enum PRNGStreams { + NODE_SUBGRAPH_STREAM, + }; + + void random_rect_data(int idx, int& subgraph) + { + if(random_colors) + subgraph = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_pieces); + else + subgraph = idx * num_pieces / num_rects; + } + + void random_node_data(int idx, int& subgraph) + { + if(true) + subgraph = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_pieces); + else + subgraph = idx * num_pieces / num_nodes; + } + + void initialize_rect_data(int idx, Rect<2> &rect, int max_rect_size = 10) + { + + int x = Philox_2x32<>::rand_int(random_seed, idx, NODE_SUBGRAPH_STREAM, num_nodes); + int y = Philox_2x32<>::rand_int(random_seed, idx + 1, NODE_SUBGRAPH_STREAM, num_nodes); + int length = Philox_2x32<>::rand_int(random_seed, idx + 2, NODE_SUBGRAPH_STREAM, max_rect_size); + int height = Philox_2x32<>::rand_int(random_seed, idx + 3, NODE_SUBGRAPH_STREAM, max_rect_size); + rect.lo[0] = x; + rect.hi[0] = x + length; + rect.lo[1] = y; + 
rect.hi[1] = y + height; + } + + + static void init_data_task_wrapper(const void *args, size_t arglen, + const void *userdata, size_t userlen, Processor p) + { + Range2DTest *me = (Range2DTest *)testcfg; + me->init_data_task(args, arglen, p); + } + + void init_data_task(const void *args, size_t arglen, Processor p) + { + const InitDataArgs& i_args = *(const InitDataArgs *)args; + + log_app.info() << "init task #" << i_args.index << " (ri_nodes=" << i_args.ri_nodes << ", ri_rects=" << i_args.ri_rects << ")"; + + i_args.ri_nodes.fetch_metadata(p).wait(); + i_args.ri_rects.fetch_metadata(p).wait(); + + IndexSpace<2> is_nodes = i_args.ri_nodes.get_indexspace<2>(); + IndexSpace<1> is_rects = i_args.ri_rects.get_indexspace<1>(); + + log_app.debug() << "N: " << is_nodes; + log_app.debug() << "E: " << is_rects; + + { + AffineAccessor a_piece_id(i_args.ri_rects, 0 /* offset */); + + for(int i = is_rects.bounds.lo; i <= is_rects.bounds.hi; i++) { + int subgraph; + random_rect_data(i, subgraph); + a_piece_id.write(i, subgraph); + } + } + { + AffineAccessor a_piece_id(i_args.ri_nodes, 0 /* offset */); + + for(int i = is_nodes.bounds.lo[0]; i <= is_nodes.bounds.hi[0]; i++) { + for (int j = is_nodes.bounds.lo[1]; j <= is_nodes.bounds.hi[1]; j++) { + int idx = i * (is_nodes.bounds.hi[1] - is_nodes.bounds.lo[1] + 1) + j; + int subgraph; + random_node_data(idx, subgraph); + a_piece_id.write(Point<2>(i, j), subgraph); + } + } + } + + + { + + AffineAccessor, 1> a_rect(i_args.ri_rects, 1 * sizeof(int) /* offset */); + + // Read edges line by line + for(int i = is_rects.bounds.lo; i <= is_rects.bounds.hi; i++) { + Rect<2> rect; + initialize_rect_data(i, rect, max_rect_size); + a_rect.write(i, rect); + } + } + + if(show_graph) { + AffineAccessor a_piece_id(i_args.ri_nodes, 0 /* offset */); + + for(int i = is_nodes.bounds.lo[0]; i <= is_nodes.bounds.hi[1]; i++) { + for (int j = is_nodes.bounds.lo[1]; j <= is_nodes.bounds.hi[1]; j++) { + Point<2> p(i, j); + log_app.info() << "node_id[" << 
p << "] = " << a_piece_id.read(p) << "\n"; + } + } + + AffineAccessor a_rect_id(i_args.ri_rects, 0 * sizeof(Point<1>) /* offset */); + + for(int i = is_rects.bounds.lo; i <= is_rects.bounds.hi; i++) + log_app.info() << "rect_id[" << i << "] = " << a_rect_id.read(i) << "\n"; + + AffineAccessor,1> a_rect_val(i_args.ri_rects, 1 * sizeof(int) /* offset */); + + for(int i = is_rects.bounds.lo; i <= is_rects.bounds.hi; i++) + log_app.info() << "rect_val[" << i << "] = " << a_rect_val.read(i) << "\n"; + } + } + + IndexSpace<1> is_rects; + IndexSpace<2> is_nodes; + std::vector ri_nodes; + std::vector, int> > node_id_field_data; + std::vector ri_rects; + std::vector, int> > rect_id_field_data; + std::vector, Rect<2> > > rect_val_field_data; + + virtual void print_info(void) + { + printf("Realm dependent partitioning test - 2D ranges: %d nodes, %d rects, %d pieces\n", + (int)num_nodes, (int)num_rects, (int)num_pieces); + } + + virtual Event initialize_data(const std::vector& memories, + const std::vector& procs) + { + // now create index spaces for nodes and edges + is_nodes = Rect<2>(Point<2>(0, 0), Point<2>(num_nodes - 1, num_nodes - 1)); + is_rects = Rect<1>(0, num_rects - 1); + + // equal partition is used to do initial population of edges and nodes + std::vector > ss_nodes_eq; + std::vector > ss_rects_eq; + + log_app.info() << "Creating equal subspaces" << "\n"; + + is_nodes.create_equal_subspaces(num_pieces, 1, ss_nodes_eq, Realm::ProfilingRequestSet()).wait(); + is_rects.create_equal_subspaces(num_pieces, 1, ss_rects_eq, Realm::ProfilingRequestSet()).wait(); + + log_app.debug() << "Initial partitions:\n"; + for(size_t i = 0; i < ss_nodes_eq.size(); i++) + log_app.debug() << " Nodes #" << i << ": " << ss_nodes_eq[i]; + for(size_t i = 0; i < ss_rects_eq.size(); i++) + log_app.debug() << " Rects #" << i << ": " << ss_rects_eq[i]; + + // create instances for each of these subspaces + std::vector node_fields, rect_fields; + node_fields.push_back(sizeof(int)); // piece_id + 
rect_fields.push_back(sizeof(int)); // src_node + rect_fields.push_back(sizeof(Rect<2>)); // dst_node + + ri_nodes.resize(num_pieces); + node_id_field_data.resize(num_pieces); + + for(size_t i = 0; i < ss_nodes_eq.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, + memories[i % memories.size()], + ss_nodes_eq[i], + node_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + ri_nodes[i] = ri; + + node_id_field_data[i].index_space = ss_nodes_eq[i]; + node_id_field_data[i].inst = ri_nodes[i]; + node_id_field_data[i].field_offset = 0; + } + + ri_rects.resize(num_pieces); + rect_id_field_data.resize(num_pieces); + rect_val_field_data.resize(num_pieces); + + for(size_t i = 0; i < ss_rects_eq.size(); i++) { + RegionInstance ri; + RegionInstance::create_instance(ri, + memories[i % memories.size()], + ss_rects_eq[i], + rect_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + ri_rects[i] = ri; + + rect_id_field_data[i].index_space = ss_rects_eq[i]; + rect_id_field_data[i].inst = ri_rects[i]; + rect_id_field_data[i].field_offset = 0; + + rect_val_field_data[i].index_space = ss_rects_eq[i]; + rect_val_field_data[i].inst = ri_rects[i]; + rect_val_field_data[i].field_offset = 1 * sizeof(int); + } + + // fire off tasks to initialize data + std::set events; + for(int i = 0; i < num_pieces; i++) { + Processor p = procs[i % procs.size()]; + InitDataArgs args; + args.index = i; + args.ri_nodes = ri_nodes[i]; + args.ri_rects = ri_rects[i]; + Event e = p.spawn(INIT_RANGE2D_DATA_TASK, &args, sizeof(args)); + events.insert(e); + } + + return Event::merge_events(events); + } + + // the outputs of our partitioning will be: + // is_private, is_shared - subsets of is_nodes based on private/shared + // p_rd, p_wr, p_ghost - subsets of the above split by subckt + // p_edges - subsets of is_edges for each subckt + + std::vector > p_colored_rects; + std::vector> p_rects, p_intersect, p_diff; + std::vector> p_colored_rects_cpu; + std::vector> 
p_rects_cpu, p_intersect_cpu, p_diff_cpu; + + IndexSpace<2> cpu_union, gpu_union, garbage_union; + + virtual Event perform_partitioning(void) + { + // first partition nodes by subckt id (this is the independent partition, + // but not actually used by the app) + + std::vector colors(num_pieces); + for(int i = 0; i < num_pieces; i++) + colors[i] = i; + + Memory gpu_memory; + bool found_gpu_memory = false; + Machine machine = Machine::get_machine(); + std::set all_memories; + machine.get_all_memories(all_memories); + for(auto& memory : all_memories) { + if(memory.kind() == Memory::GPU_FB_MEM) { + gpu_memory = memory; + found_gpu_memory = true; + break; + } + } + assert(found_gpu_memory); + std::vector rect_fields; + rect_fields.push_back(sizeof(int)); + rect_fields.push_back(sizeof(Rect<2>)); + std::vector node_fields; + node_fields.push_back(sizeof(int)); + + std::vector, int > > node_id_data_gpu; + std::vector, int > > rect_id_data_gpu; + std::vector, Rect<2>>> rect_val_data_gpu; + node_id_data_gpu.resize(num_pieces); + rect_id_data_gpu.resize(num_pieces); + rect_val_data_gpu.resize(num_pieces); + for (int i = 0; i < num_pieces; i++) { + RegionInstance node_id_instance; + RegionInstance rect_id_instance; + RegionInstance rect_val_instance; + RegionInstance::create_instance(node_id_instance, + gpu_memory, + node_id_field_data[i].index_space, + node_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + RegionInstance::create_instance(rect_id_instance, + gpu_memory, + rect_id_field_data[i].index_space, + rect_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + RegionInstance::create_instance(rect_val_instance, + gpu_memory, + rect_val_field_data[i].index_space, + rect_fields, + 0 /*SOA*/, + Realm::ProfilingRequestSet()).wait(); + CopySrcDstField node_id_gpu_field, node_id_cpu_field, rect_id_gpu_field, rect_id_cpu_field, rect_val_gpu_field, rect_val_cpu_field; + node_id_gpu_field.inst = node_id_instance; + node_id_gpu_field.size = sizeof(int); + 
node_id_gpu_field.field_id = 0; + node_id_cpu_field.inst = node_id_field_data[i].inst; + node_id_cpu_field.size = sizeof(int); + node_id_cpu_field.field_id = 0; + rect_id_gpu_field.inst = rect_id_instance; + rect_id_gpu_field.size = sizeof(int); + rect_id_gpu_field.field_id = 0; + rect_id_cpu_field.inst = rect_id_field_data[i].inst; + rect_id_cpu_field.size = sizeof(int); + rect_id_cpu_field.field_id = 0; + rect_val_gpu_field.inst = rect_val_instance; + rect_val_gpu_field.size = sizeof(Rect<2>); + rect_val_gpu_field.field_id = sizeof(int); + rect_val_cpu_field.inst = rect_val_field_data[i].inst; + rect_val_cpu_field.size = sizeof(Rect<2>); + rect_val_cpu_field.field_id = sizeof(int); + std::vector node_id_gpu_data, node_id_cpu_data, rect_id_gpu_data, rect_id_cpu_data, rect_val_gpu_data, rect_val_cpu_data; + node_id_gpu_data.push_back(node_id_gpu_field); + node_id_cpu_data.push_back(node_id_cpu_field); + rect_id_gpu_data.push_back(rect_id_gpu_field); + rect_id_cpu_data.push_back(rect_id_cpu_field); + rect_val_gpu_data.push_back(rect_val_gpu_field); + rect_val_cpu_data.push_back(rect_val_cpu_field); + Event copy_event = node_id_field_data[i].index_space.copy(node_id_cpu_data, node_id_gpu_data, Realm::ProfilingRequestSet()); + copy_event.wait(); + Event second_copy_event = rect_id_field_data[i].index_space.copy(rect_id_cpu_data, rect_id_gpu_data, Realm::ProfilingRequestSet()); + second_copy_event.wait(); + Event third_copy_event = rect_val_field_data[i].index_space.copy(rect_val_cpu_data, rect_val_gpu_data, Realm::ProfilingRequestSet()); + third_copy_event.wait(); + node_id_data_gpu[i].inst = node_id_instance; + node_id_data_gpu[i].index_space = node_id_field_data[i].index_space; + node_id_data_gpu[i].field_offset = 0; + rect_id_data_gpu[i].inst = rect_id_instance; + rect_id_data_gpu[i].index_space = rect_id_field_data[i].index_space; + rect_id_data_gpu[i].field_offset = 0; + rect_val_data_gpu[i].inst = rect_val_instance; + rect_val_data_gpu[i].index_space = 
rect_val_field_data[i].index_space; + rect_val_data_gpu[i].field_offset = sizeof(int); + } + wait_on_events = true; + std::vector> p_garbage_colors; + std::vector> p_garbage_rects; + log_app.info() << "WARMING UP " << "\n"; + + std::vector> field_estimate_input(rect_id_data_gpu.size()); + std::vector field_estimate_output(rect_id_data_gpu.size()); + std::vector> image_estimate_input(rect_val_data_gpu.size()); + std::vector image_estimate_output(rect_val_data_gpu.size()); + std::vector> subspace_input(colors.size()); + for (size_t i = 0; i < rect_id_data_gpu.size(); i++) { + field_estimate_input[i].location = rect_id_data_gpu[i].inst.get_location(); + field_estimate_input[i].space = rect_id_data_gpu[i].index_space; + } + for (size_t i = 0; i < rect_val_data_gpu.size(); i++) { + image_estimate_input[i].location = rect_val_data_gpu[i].inst.get_location(); + image_estimate_input[i].space = rect_val_data_gpu[i].index_space; + } + + is_rects.by_field_buffer_requirements(field_estimate_input, field_estimate_output); + std::vector byte_fields = {sizeof(char)}; + for (size_t i = 0; i < rect_id_data_gpu.size(); i++) { + IndexSpace<1> instance_index_space(Rect<1>(0, field_estimate_output[i].upper_bound-1)); + RegionInstance::create_instance(rect_id_data_gpu[i].scratch_buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); + } + + Event e001 = is_rects.create_subspaces_by_field(rect_id_data_gpu, + colors, + p_garbage_colors, + Realm::ProfilingRequestSet()); + if (wait_on_events) e001.wait(); + for (size_t i = 0; i < colors.size(); i++) { + subspace_input[i].space = p_garbage_colors[i]; + subspace_input[i].entries = p_garbage_colors[i].sparsity.impl()->get_entries().size(); + } + is_nodes.by_image_buffer_requirements(subspace_input, image_estimate_input, image_estimate_output); + for (size_t i = 0; i < rect_val_data_gpu.size(); i++) { + IndexSpace<1> instance_index_space(Rect<1>(0, (image_estimate_output[i].upper_bound*5)-1)); + 
RegionInstance::create_instance(rect_val_data_gpu[i].scratch_buffer, gpu_memory, instance_index_space, byte_fields, 0, Realm::ProfilingRequestSet()).wait(); + } + Event e002 = is_nodes.create_subspaces_by_image(rect_val_data_gpu, + p_garbage_colors, + p_garbage_rects, + Realm::ProfilingRequestSet(), + e001); + if(wait_on_events) e002.wait(); + + log_app.info() << "FINISHED WARMING UP " << "\n"; + log_app.info() << "starting GPU partitioning " << Clock::current_time_in_microseconds() << "\n"; + + log_app.info() << "STARTING GPU BY FIELD " << Clock::current_time_in_microseconds() << "\n"; + + Event e01 = is_rects.create_subspaces_by_field(rect_id_data_gpu, + colors, + p_colored_rects, + Realm::ProfilingRequestSet()); + if (wait_on_events) e01.wait(); + + log_app.info() << "FINISHED GPU BY FIELD " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "STARTING GPU BY IMAGE " << Clock::current_time_in_microseconds() << "\n"; + Event e02 = is_nodes.create_subspaces_by_image(rect_val_data_gpu, + p_colored_rects, + p_rects, + Realm::ProfilingRequestSet(), + e01); + if(wait_on_events) e02.wait(); + log_app.info() << "FINISHED GPU BY IMAGE " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "GPU Partitioning complete " << Clock::current_time_in_microseconds() << "\n"; + + log_app.info() << "STARTING CPU partitioning " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "STARTING CPU BY FIELD " << Clock::current_time_in_microseconds() << "\n"; + Event e1 = is_rects.create_subspaces_by_field(rect_id_field_data, + colors, + p_colored_rects_cpu, + Realm::ProfilingRequestSet()); + if (wait_on_events) e1.wait(); + log_app.info() << "FINISHED CPU BY FIELD " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "STARTING CPU BY IMAGE " << Clock::current_time_in_microseconds() << "\n"; + Event e2 = is_nodes.create_subspaces_by_image(rect_val_field_data, + p_colored_rects_cpu, + p_rects_cpu, + 
Realm::ProfilingRequestSet(), + e1); + if(wait_on_events) e2.wait(); + log_app.info() << "FINISHED CPU BY IMAGE " << Clock::current_time_in_microseconds() << "\n"; + log_app.info() << "CPU Partitioning complete " << Clock::current_time_in_microseconds() << "\n"; + return e2; + } + + + + virtual int perform_dynamic_checks(void) + { + return 0; + } + + virtual int check_partitioning(void) + { + log_app.info() << "Checking correctness of partitioning " << "\n"; + int errors = 0; + + for (int i = 0; i < num_pieces; i++) { + for (IndexSpaceIterator<1> it(p_colored_rects[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if(!p_colored_rects_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU has extra colored rect point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<1> it(p_colored_rects_cpu[i]); it.valid; it.step()) { + for (PointInRectIterator<1> point(it.rect); point.valid; point.step()) { + if (!p_colored_rects[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU is missing colored rect point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<2> it(p_rects[i]); it.valid; it.step()) { + for (PointInRectIterator<2> point(it.rect); point.valid; point.step()) { + if (!p_rects_cpu[i].contains(point.p)) { + log_app.error() << "Mismatch! GPU has extra rect point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + for (IndexSpaceIterator<2> it(p_rects_cpu[i]); it.valid; it.step()) { + for (PointInRectIterator<2> point(it.rect); point.valid; point.step()) { + if (!p_rects[i].contains(point.p)) { + log_app.error() << "Mismatch! 
GPU is missing rect point " << point.p + << " on piece " << i << "\n"; + errors++; + } + } + } + } + return errors; + } +}; + class MiniAeroTest : public TestInterface { public: enum ProblemType @@ -625,7 +2732,7 @@ class MiniAeroTest : public TestInterface { AffineAccessor a_cell_blockid(i_args.ri_cells, 0 /* offset */); for(int i = is_cells.bounds.lo[0]; i <= is_cells.bounds.hi[0]; i++) - std::cout << "Z[" << i << "]: blockid=" << a_cell_blockid.read(i) << std::endl; + std::cout << "Z[" << i << "]: blockid=" << a_cell_blockid.read(i) << "\n"; AffineAccessor, 1> a_face_left(i_args.ri_faces, 0 * sizeof(Point<1>) /* offset */); @@ -637,7 +2744,7 @@ class MiniAeroTest : public TestInterface { for(int i = is_faces.bounds.lo[0]; i <= is_faces.bounds.hi[0]; i++) std::cout << "S[" << i << "]:" << " left=" << a_face_left.read(i) << " right=" << a_face_right.read(i) - << " type=" << a_face_type.read(i) << std::endl; + << " type=" << a_face_type.read(i) << "\n"; } } @@ -1006,7 +3113,6 @@ class CircuitTest : public TestInterface { { AffineAccessor a_subckt_id(i_args.ri_nodes, 0 /* offset */); - // std::cout << "a_subckt_id = " << a_subckt_id << "\n"; for(int i = is_nodes.bounds.lo; i <= is_nodes.bounds.hi; i++) { int subckt; @@ -1021,9 +3127,6 @@ class CircuitTest : public TestInterface { AffineAccessor, 1> a_out_node(i_args.ri_edges, 1 * sizeof(Point<1>) /* offset */); - // std::cout << "a_in_node = " << a_in_node << "\n"; - // std::cout << "a_out_node = " << a_out_node << "\n"; - for(int i = is_edges.bounds.lo; i <= is_edges.bounds.hi; i++) { Point<1> in_node, out_node; random_edge_data(i, in_node, out_node); @@ -1036,19 +3139,19 @@ class CircuitTest : public TestInterface { AffineAccessor a_subckt_id(i_args.ri_nodes, 0 /* offset */); for(int i = is_nodes.bounds.lo; i <= is_nodes.bounds.hi; i++) - std::cout << "subckt_id[" << i << "] = " << a_subckt_id.read(i) << std::endl; + std::cout << "subckt_id[" << i << "] = " << a_subckt_id.read(i) << "\n"; AffineAccessor, 1> 
a_in_node(i_args.ri_edges, 0 * sizeof(Point<1>) /* offset */); for(int i = is_edges.bounds.lo; i <= is_edges.bounds.hi; i++) - std::cout << "in_node[" << i << "] = " << a_in_node.read(i) << std::endl; + std::cout << "in_node[" << i << "] = " << a_in_node.read(i) << "\n"; AffineAccessor, 1> a_out_node(i_args.ri_edges, 1 * sizeof(Point<1>) /* offset */); for(int i = is_edges.bounds.lo; i <= is_edges.bounds.hi; i++) - std::cout << "out_node[" << i << "] = " << a_out_node.read(i) << std::endl; + std::cout << "out_node[" << i << "] = " << a_out_node.read(i) << "\n"; } } @@ -1761,7 +3864,7 @@ class PennantTest : public TestInterface { AffineAccessor a_zone_color(i_args.ri_zones, 0 /* offset */); for(int i = is_zones.bounds.lo; i <= is_zones.bounds.hi; i++) - std::cout << "Z[" << i << "]: color=" << a_zone_color.read(i) << std::endl; + std::cout << "Z[" << i << "]: color=" << a_zone_color.read(i) << "\n"; AffineAccessor, 1> a_side_mapsz(i_args.ri_sides, 0 * sizeof(Point<1>) /* offset */); @@ -1777,7 +3880,7 @@ class PennantTest : public TestInterface { << " mapsz=" << a_side_mapsz.read(i) << " mapss3=" << a_side_mapss3.read(i) << " mapsp1=" << a_side_mapsp1.read(i) << " ok=" << a_side_ok.read(i) - << std::endl; + << "\n"; } } @@ -2831,6 +4934,26 @@ int main(int argc, char **argv) break; } + if(!strcmp(argv[i], "basic")) { + testcfg = new BasicTest(argc - i, const_cast(argv + i)); + break; + } + + if(!strcmp(argv[i], "tile")) { + testcfg = new TileTest(argc - i, const_cast(argv + i)); + break; + } + + if (!strcmp(argv[i], "range")) { + testcfg = new RangeTest(argc - i, const_cast(argv + i)); + break; + } + + if (!strcmp(argv[i], "multi")) { + testcfg = new Range2DTest(argc - i, const_cast(argv + i)); + break; + } + if(!strcmp(argv[i], "pennant")) { testcfg = new PennantTest(argc - i, const_cast(argv + i)); break; @@ -2867,6 +4990,10 @@ int main(int argc, char **argv) rt.register_task(TOP_LEVEL_TASK, top_level_task); rt.register_task(INIT_CIRCUIT_DATA_TASK, 
CircuitTest::init_data_task_wrapper); rt.register_task(INIT_PENNANT_DATA_TASK, PennantTest::init_data_task_wrapper); + rt.register_task(INIT_BASIC_DATA_TASK, BasicTest::init_data_task_wrapper); + rt.register_task(INIT_TILE_DATA_TASK, TileTest::init_data_task_wrapper); + rt.register_task(INIT_RANGE_DATA_TASK, RangeTest::init_data_task_wrapper); + rt.register_task(INIT_RANGE2D_DATA_TASK, Range2DTest::init_data_task_wrapper); rt.register_task(INIT_MINIAERO_DATA_TASK, MiniAeroTest::init_data_task_wrapper); signal(SIGALRM, sigalrm_handler); diff --git a/tests/gpu_deppart_1d.cc b/tests/gpu_deppart_1d.cc new file mode 100644 index 0000000000..250a63f2df --- /dev/null +++ b/tests/gpu_deppart_1d.cc @@ -0,0 +1,327 @@ +/* + * Copyright 2025 Stanford University, NVIDIA + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include +#include +#include +#include +#include "realm.h" +#include "realm/id.h" +#include "realm/machine.h" +#include "realm/cmdline.h" +#include "philox.h" + +using namespace Realm; + +#ifdef REALM_USE_CUDA +#include "realm/cuda/cuda_memcpy.h" +#include "realm/cuda/cuda_module.h" +#endif +#ifdef REALM_USE_HIP +#include "hip_cuda_compat/hip_cuda.h" +#include "realm/hip/hip_module.h" +#endif + +#ifdef REALM_USE_CUDA +using namespace Realm::Cuda; +#endif +#ifdef REALM_USE_HIP +using namespace Realm::Hip; +#endif + +Logger log_app("app"); + +// ---------------- Config (matches transpose_test style) ---------------- +namespace TestConfig { + int num_nodes = 1000; + int num_edges = 5000; + int num_pieces = 4; + int random = 0; // 0 deterministic, 1 random + unsigned long long seed = 123456789ULL; + int show = 0; // print assigned ids + int verify = 1; // do correctness check +}; +static const FieldID FID_SUBGRAPH = 0; +static const FieldID FID_SRC = 0; +static const FieldID FID_DST = sizeof(Point<1, int>); + +// ---------------- Small helpers (same idioms as transpose_test) -------- +template +static void fill_index_space(RegionInstance inst, + FieldID fid, 
+ const IndexSpace& is, + Fn gen) +{ + AffineAccessor acc(inst, fid); + for (IndexSpaceIterator it(is); it.valid; it.step()) { + for (PointInRectIterator p(it.rect); p.valid; p.step()) + acc[p.p] = gen(p.p); + } +} + +template +static void copy_field(const IndexSpace& is, + RegionInstance src, RegionInstance dst, FieldID fid) +{ + std::vector srcs(1), dsts(1); + srcs[0].set_field(src, fid, sizeof(DT)); + dsts[0].set_field(dst, fid, sizeof(DT)); + is.copy(srcs, dsts, ProfilingRequestSet()).wait(); +} + +static void choose_cpu_and_gpu_mems(Memory& cpu_mem, Memory& gpu_mem, bool& have_gpu) +{ + have_gpu = false; + for (auto mem : Machine::MemoryQuery(Machine::get_machine())) { + if (!cpu_mem.exists() && (mem.kind() == Memory::SYSTEM_MEM)) + cpu_mem = mem; + if (!gpu_mem.exists() && (mem.kind() == Memory::GPU_FB_MEM)) { + gpu_mem = mem; + have_gpu = true; + } + } +} + +// For brevity, we use the simple vector layout helper (as in many Realm tests) +static Event make_instance(RegionInstance& ri, + Memory mem, + const IndexSpace<1,int>& is, + std::vector fields) +{ + return RegionInstance::create_instance(ri, mem, is, fields, + /*soa=*/0, ProfilingRequestSet()); +} + +// Compare two partitions index-space-by-index-space +static int compare_partitions(const std::vector>& A, + const std::vector>& B) +{ + int errors = 0; + if (A.size() != B.size()) return 1; + for (size_t i = 0; i < A.size(); i++) { + // Check A minus B + for (IndexSpaceIterator<1,int> it(A[i]); it.valid; it.step()) + for (PointInRectIterator<1,int> p(it.rect); p.valid; p.step()) + if (!B[i].contains(p.p)) { errors++; } + // Check B minus A + for (IndexSpaceIterator<1,int> it(B[i]); it.valid; it.step()) + for (PointInRectIterator<1,int> p(it.rect); p.valid; p.step()) + if (!A[i].contains(p.p)) { errors++; } + } + return errors; +} + +// ---------------- Top-level task (like transpose_test_gpu) -------------- +enum { + TOP_LEVEL_TASK = Processor::TASK_ID_FIRST_AVAILABLE + 300, +}; + +static void 
top_level_task(const void*, size_t, const void*, size_t, Processor) +{ + log_app.print() << "deppart_byfield_itest starting"; + + // Build the 1D node space [0 .. N-1] + IndexSpace<1,int> is_nodes(Rect<1,int>(0, TestConfig::num_nodes - 1)); + IndexSpace<1,int> is_edges(Rect<1, int>(0, TestConfig::num_edges - 1)); + + // Choose memories + Memory cpu_mem, gpu_mem; + bool have_gpu = false; + choose_cpu_and_gpu_mems(cpu_mem, gpu_mem, have_gpu); + if (!cpu_mem.exists()) { + log_app.fatal() << "No SYSTEM_MEM found"; + assert(0); + return; + } + if (!have_gpu) { + log_app.warning() << "No GPU_FB_MEM found; running CPU-only check."; + } + + // Create CPU instance holding subgraph ids + RegionInstance cpu_inst_nodes; + make_instance(cpu_inst_nodes, cpu_mem, is_nodes, {sizeof(int)}).wait(); + + RegionInstance cpu_inst_edges; + make_instance(cpu_inst_edges, cpu_mem, is_edges, {sizeof(Point<1, int>), sizeof(Point<1, int>)}).wait(); + + // Fill ids (deterministic or random) + auto gen_id = [&](Point<1,int> p)->int { + if (TestConfig::random) { + return Philox_2x32<>::rand_int(TestConfig::seed, + /*counter=*/p[0], + /*stream=*/0, + /*bound=*/TestConfig::num_pieces); + } else { + // even split + return int((long long)p[0] * TestConfig::num_pieces / TestConfig::num_nodes); + } + }; + fill_index_space<1,int,int>(cpu_inst_nodes, FID_SUBGRAPH, is_nodes, gen_id); + + auto gen_src = [&](Point<1,int> p)->Point<1, int> { + if (TestConfig::random) { + return Point<1, int>(Philox_2x32<>::rand_int(TestConfig::seed, + /*counter=*/p[0], + /*stream=*/0, + /*bound=*/TestConfig::num_nodes)); + } else { + return Point<1, int>(p[0] % TestConfig::num_nodes); + } + }; + + fill_index_space<1,int,Point<1,int>>(cpu_inst_edges, FID_SRC, is_edges, gen_src); + + auto gen_dst = [&](Point<1,int> p)->Point<1, int> { + if (TestConfig::random) { + return Point<1, int>(Philox_2x32<>::rand_int(TestConfig::seed, + /*counter=*/p[0]+TestConfig::num_edges, + /*stream=*/0, + /*bound=*/TestConfig::num_nodes)); + } 
else { + return Point<1, int>((p[0]+1) % TestConfig::num_nodes); + } + }; + + fill_index_space<1,int,Point<1,int>>(cpu_inst_edges, FID_DST, is_edges, gen_dst); + + if (TestConfig::show) { + AffineAccessor acc(cpu_inst_nodes, FID_SUBGRAPH); + for (IndexSpaceIterator<1,int> it(is_nodes); it.valid; it.step()) + for (PointInRectIterator<1,int> p(it.rect); p.valid; p.step()) + log_app.print() << "id[" << p.p << "]=" << acc[p.p]; + + AffineAccessor,1,int> acc_src(cpu_inst_edges, FID_SRC); + AffineAccessor,1,int> acc_dst(cpu_inst_edges, FID_DST); + for (IndexSpaceIterator<1,int> it(is_edges); it.valid; it.step()) + for (PointInRectIterator<1,int> p(it.rect); p.valid; p.step()) + log_app.print() << "edge[" << p.p << "]=" << acc_src[p.p] << "->" << acc_dst[p.p]; + } + + // Describe the field data (CPU) + FieldDataDescriptor, int> cpu_field_nodes; + cpu_field_nodes.index_space = is_nodes; + cpu_field_nodes.inst = cpu_inst_nodes; + cpu_field_nodes.field_offset = 0; + + FieldDataDescriptor, Point<1, int>> cpu_field_src; + cpu_field_src.index_space = is_edges; + cpu_field_src.inst = cpu_inst_edges; + cpu_field_src.field_offset = 0; + + FieldDataDescriptor, Point<1, int>> cpu_field_dst; + cpu_field_dst.index_space = is_edges; + cpu_field_dst.inst = cpu_inst_edges; + cpu_field_dst.field_offset = sizeof(Point<1,int>); + + std::vector, int>> cpu_nodes(1, cpu_field_nodes); + std::vector, Point<1, int>>> cpu_src(1, cpu_field_src); + std::vector, Point<1, int>>> cpu_dst(1, cpu_field_dst); + + + // Colors 0..num_pieces-1 + std::vector colors(TestConfig::num_pieces); + for (int i = 0; i < TestConfig::num_pieces; i++) colors[i] = i; + + // CPU partitioning + std::vector> p_cpu_nodes, p_cpu_edges, p_cpu_rd; + Event e_cpu_byfield = is_nodes.create_subspaces_by_field(cpu_nodes, colors, p_cpu_nodes, ProfilingRequestSet()); + Event e_cpu_bypreimage = is_edges.create_subspaces_by_preimage(cpu_dst, p_cpu_nodes, p_cpu_edges, ProfilingRequestSet(), e_cpu_byfield); + Event e_cpu_image = 
is_nodes.create_subspaces_by_image(cpu_src, p_cpu_edges, p_cpu_rd, ProfilingRequestSet(), e_cpu_bypreimage); + + // GPU path (optional if GPU exists) + std::vector> p_gpu_nodes, p_gpu_edges, p_gpu_rd; + if (have_gpu) { + RegionInstance gpu_inst_nodes, gpu_inst_edges; + make_instance(gpu_inst_nodes, gpu_mem, is_nodes, {sizeof(int)}).wait(); + make_instance(gpu_inst_edges, gpu_mem, is_edges, {sizeof(Point<1, int>), sizeof(Point<1, int>)}).wait(); + + // Copy field data CPU -> GPU + copy_field<1,int,int>(is_nodes, cpu_inst_nodes, gpu_inst_nodes, FID_SUBGRAPH); + copy_field<1,int,Point<1,int>>(is_edges, cpu_inst_edges, gpu_inst_edges, FID_SRC); + copy_field<1,int,Point<1,int>>(is_edges, cpu_inst_edges, gpu_inst_edges, FID_DST); + + // Describe the field data (GPU) + FieldDataDescriptor, int> gpu_field_nodes; + gpu_field_nodes.index_space = is_nodes; + gpu_field_nodes.inst = gpu_inst_nodes; + gpu_field_nodes.field_offset = 0; + + FieldDataDescriptor, Point<1, int>> gpu_field_src; + gpu_field_src.index_space = is_edges; + gpu_field_src.inst = gpu_inst_edges; + gpu_field_src.field_offset = 0; + + FieldDataDescriptor, Point<1, int>> gpu_field_dst; + gpu_field_dst.index_space = is_edges; + gpu_field_dst.inst = gpu_inst_edges; + gpu_field_dst.field_offset = sizeof(Point<1,int>); + + std::vector, int>> gpu_nodes(1, gpu_field_nodes); + std::vector, Point<1, int>>> gpu_src(1, gpu_field_src); + std::vector, Point<1, int>>> gpu_dst(1, gpu_field_dst); + + Event e_gpu_byfield = is_nodes.create_subspaces_by_field(gpu_nodes, colors, p_gpu_nodes, + ProfilingRequestSet()); + Event e_gpu_bypreimage = is_edges.create_subspaces_by_preimage(gpu_dst, p_gpu_nodes, p_gpu_edges, ProfilingRequestSet(), e_gpu_byfield); + Event e_gpu_image = is_nodes.create_subspaces_by_image(gpu_src, p_gpu_edges, p_gpu_rd, ProfilingRequestSet(), e_gpu_bypreimage); + + e_cpu_image.wait(); + e_gpu_image.wait(); + // Compare CPU vs GPU partitions + if
(TestConfig::verify) { + int errs = compare_partitions(p_cpu_nodes, p_gpu_nodes) + + compare_partitions(p_cpu_edges, p_gpu_edges) + + compare_partitions(p_cpu_rd, p_gpu_rd); + if (errs) { + log_app.fatal() << "Mismatch between CPU and GPU partitions, errors=" << errs; + assert(0); + } + } + gpu_inst_nodes.destroy(); + gpu_inst_edges.destroy(); + } else { + e_cpu_image.wait(); + } + + // Cleanup + cpu_inst_nodes.destroy(); + cpu_inst_edges.destroy(); + is_nodes.destroy(); + is_edges.destroy(); + + log_app.print() << "deppart_byfield_itest: PASS"; +} + +// ---------------- Main (same as transpose_test pattern) ----------------- +int main(int argc, char** argv) +{ + Runtime rt; + rt.init(&argc, &argv); + + // Parse simple flags similar to the example + CommandLineParser cp; + cp.add_option_int("-n", TestConfig::num_nodes) + .add_option_int("-e", TestConfig::num_edges) + .add_option_int("-p", TestConfig::num_pieces) + .add_option_int("-random", TestConfig::random) + .add_option_int("-seed", TestConfig::seed) + .add_option_int("-show", TestConfig::show) + .add_option_int("-verify", TestConfig::verify); + bool ok = cp.parse_command_line(argc, const_cast(argv)); + assert(ok); + + rt.register_task(TOP_LEVEL_TASK, top_level_task); + + Processor p = Machine::ProcessorQuery(Machine::get_machine()) + .only_kind(Processor::LOC_PROC) + .first(); + assert(p.exists()); + + Event e = rt.collective_spawn(p, TOP_LEVEL_TASK, nullptr, 0); + rt.shutdown(e); + rt.wait_for_shutdown(); + return 0; +} \ No newline at end of file diff --git a/tests/unit_tests/sparsity_map_test.cc b/tests/unit_tests/sparsity_map_test.cc index ab673f7b27..a0fafbf834 100644 --- a/tests/unit_tests/sparsity_map_test.cc +++ b/tests/unit_tests/sparsity_map_test.cc @@ -284,7 +284,7 @@ void run_contribute_dense_case(const ContributeDenseRectTestData &test_case) impl->set_contributor_count(1); impl->contribute_dense_rect_list(test_case.rects, test_case.disjoint); - std::vector> entries = public_impl->get_entries(); + span> entries = public_impl->get_entries();
ASSERT_TRUE(public_impl->is_valid()); ASSERT_EQ(entries.size(), test_case.expected.size()); for(size_t i = 0; i < entries.size(); i++) {