From 648cbe76e3852de2b7e81175e83831c75cbf8ed6 Mon Sep 17 00:00:00 2001
From: Oleksandr Zinenko
Date: Wed, 25 Jul 2018 16:33:13 +0200
Subject: [PATCH 1/9] promoteToSharedBelow: do not capture an unnecessary variable in a lambda

The captured variable has not been used inside the lambda since it was
introduced in the prehistory.
---
 tc/core/polyhedral/cuda/memory_promotion_heuristic.cc | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc b/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc
index b1bca6923..f74b9a56b 100644
--- a/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc
+++ b/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc
@@ -512,9 +512,8 @@ void promoteToSharedBelow(
     std::sort(
         tensorGroups.second.begin(),
         tensorGroups.second.end(),
-        [refsCount](
-            const std::unique_ptr& group1,
-            const std::unique_ptr& group2) {
+        [](const std::unique_ptr& group1,
+           const std::unique_ptr& group2) {
           return group1->referenceIds().size() > group2->referenceIds().size();
         });

From 0f7c5624ff02e618fde9bc19725016b569382c11 Mon Sep 17 00:00:00 2001
From: Oleksandr Zinenko
Date: Wed, 25 Jul 2018 16:55:27 +0200
Subject: [PATCH 2/9] promoteToSharedBelow: extract out sortTensorGroupMap

This function will be reused in an upcoming commit to sort groups before
register promotion.
---
 .../cuda/memory_promotion_heuristic.cc | 61 ++++++++++---------
 1 file changed, 33 insertions(+), 28 deletions(-)

diff --git a/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc b/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc
index f74b9a56b..dd104c8b6 100644
--- a/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc
+++ b/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc
@@ -446,6 +446,37 @@ bool isInThreadMappedScope(
   return false;
 }
 
+static std::vector> sortTensorGroupMap(
+    TensorGroups&& groupMap) {
+  // Prepare groups for sorting, to have specified order necessary for
+  // reproducibility and tests.
+  using TensorGroupList = std::pair;
+  std::vector groupLists(
+      std::make_move_iterator(groupMap.begin()),
+      std::make_move_iterator(groupMap.end()));
+
+  // Computes the total number of references in all groups.
+  auto refsCount = [](const TensorGroupsInfo& info) {
+    size_t refs = 0;
+    for (auto const& group : info) {
+      refs += group->referenceIds().size();
+    }
+    return refs;
+  };
+
+  // Sort by the total number of references, then by name. Because names are
+  // guaranteed to be unique, the order is total.
+  std::sort(
+      groupLists.begin(),
+      groupLists.end(),
+      [refsCount](const TensorGroupList& l1, const TensorGroupList& l2) {
+        auto r1 = refsCount(l1.second);
+        auto r2 = refsCount(l2.second);
+        return r1 == r2 ? l1.first.get_name() < l2.first.get_name() : r1 < r2;
+      });
+  return groupLists;
+}
+
 /*
  * Promote to shared memory in "scop" below "node". Use at most
  * "remainingMemory" bytes, and update the variable to reflect the amount of
@@ -474,37 +505,11 @@ void promoteToSharedBelow(
   auto partialSched = partialSchedule(root, node);
   auto mapping = collectMappingsTo(scop);
 
-  auto groupMap = TensorReferenceGroup::accessedWithin(
-      partialSched.intersect_domain(mapping), scop.body);
+  auto groupLists = sortTensorGroupMap(TensorReferenceGroup::accessedWithin(
+      partialSched.intersect_domain(mapping), scop.body));
 
   // Pure affine schedule without (mapping) filters.
   auto partialSchedMupa = partialScheduleMupa(root, node);
-  // Prepare groups for sorting, to have specified order necessary for
-  // reproducibility and tests.
-  using TensorGroupList = std::pair;
-  std::vector groupLists(
-      std::make_move_iterator(groupMap.begin()),
-      std::make_move_iterator(groupMap.end()));
-
-  // Computes the total number of references in all groups.
-  auto refsCount = [](const TensorGroupsInfo& info) {
-    size_t refs = 0;
-    for (auto const& group : info) {
-      refs += group->referenceIds().size();
-    }
-    return refs;
-  };
-
-  // Sort by the total number of references, then by name. Because names are
-  // guarenteed to be unique, the order is total.
-  std::sort(
-      groupLists.begin(),
-      groupLists.end(),
-      [refsCount](const TensorGroupList& l1, const TensorGroupList& l2) {
-        auto r1 = refsCount(l1.second);
-        auto r2 = refsCount(l2.second);
-        return r1 == r2 ? l1.first.get_name() < l2.first.get_name() : r1 < r2;
-      });
 
   for (auto& tensorGroups : groupLists) {
     auto tensorId = tensorGroups.first;
     // Sort the reference groups to prioritize groups with more references as

From 4aa377d26eb6f6f35ee2ade1730cab164d8d5ee8 Mon Sep 17 00:00:00 2001
From: Oleksandr Zinenko
Date: Wed, 25 Jul 2018 17:35:09 +0200
Subject: [PATCH 3/9] promoteToSharedBelow: extract out sortTensorGroups

This function will be reused in an upcoming commit.
---
 .../cuda/memory_promotion_heuristic.cc | 24 ++++++++++++-------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc b/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc
index dd104c8b6..7c858551d 100644
--- a/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc
+++ b/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc
@@ -477,6 +477,20 @@ static std::vector> sortTensorGroupMap(
   return groupLists;
 }
 
+/* Sorts the given vector of tensor groups in place following the number of
+ * references in the group in decreasing order. This prioritizes groups with
+ * more references as they are more likely to benefit from promotion.
+ */
+static void sortTensorGroups(TensorGroupsInfo& tensorGroups) {
+  std::sort(
+      tensorGroups.begin(),
+      tensorGroups.end(),
+      [](const std::unique_ptr& group1,
+         const std::unique_ptr& group2) {
+        return group1->referenceIds().size() > group2->referenceIds().size();
+      });
+}
+
 /*
  * Promote to shared memory in "scop" below "node". Use at most
  * "remainingMemory" bytes, and update the variable to reflect the amount of
@@ -512,15 +526,7 @@ void promoteToSharedBelow(
   for (auto& tensorGroups : groupLists) {
     auto tensorId = tensorGroups.first;
-    // Sort the reference groups to prioritize groups with more references as
-    // they are more likely to benefit from promotion.
-    std::sort(
-        tensorGroups.second.begin(),
-        tensorGroups.second.end(),
-        [](const std::unique_ptr& group1,
-           const std::unique_ptr& group2) {
-          return group1->referenceIds().size() > group2->referenceIds().size();
-        });
+    sortTensorGroups(tensorGroups.second);
 
     for (auto& group : tensorGroups.second) {
       auto sizes = group->approximationSizes();

From 1e5ad912b951da818d2f01e8f5d8eb89aad010b5 Mon Sep 17 00:00:00 2001
From: Oleksandr Zinenko
Date: Thu, 26 Jul 2018 10:32:56 +0200
Subject: [PATCH 4/9] promoteToRegistersBelow: sort tensor reference groups

Follow the same strategy as with shared memory promotion: first, sort tensors
in decreasing order of the total number of references; then, for each tensor,
sort groups based on the number of references in this group. Tensor groups
with more references are expected to benefit more from promotion as more
global memory accesses may be avoided thanks to explicit caching in faster
layers of the memory hierarchy.
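(Illustration only, not part of the change: a condensed, self-contained C++
sketch of this two-level ordering. Group, TensorEntry and orderForPromotion
are made-up stand-ins rather than the actual TC types; the comparators mirror
sortTensorGroupMap and sortTensorGroups introduced earlier in this series.)

    #include <algorithm>
    #include <cstddef>
    #include <string>
    #include <vector>

    struct Group {
      std::size_t nRefs; // stands in for referenceIds().size() of one group
    };
    struct TensorEntry { // stands in for one (tensor id, groups) entry
      std::string name;
      std::vector<Group> groups;
      std::size_t totalRefs() const {
        std::size_t n = 0;
        for (const auto& g : groups) {
          n += g.nRefs;
        }
        return n;
      }
    };

    void orderForPromotion(std::vector<TensorEntry>& tensors) {
      // Order tensors by their total reference count, breaking ties by name
      // so that the order is deterministic and reproducible in tests.
      std::sort(
          tensors.begin(),
          tensors.end(),
          [](const TensorEntry& a, const TensorEntry& b) {
            auto ra = a.totalRefs();
            auto rb = b.totalRefs();
            return ra == rb ? a.name < b.name : ra < rb;
          });
      // Within each tensor, consider groups with more references first.
      for (auto& t : tensors) {
        std::sort(
            t.groups.begin(),
            t.groups.end(),
            [](const Group& a, const Group& b) { return a.nRefs > b.nRefs; });
      }
    }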
Note that since there is no limit on the number of registers to use, all
groups that can be promoted into registers are promoted, and the sorting has
no effect on the outcome. Such a limit will be introduced next.
---
 tc/core/polyhedral/cuda/memory_promotion_heuristic.cc | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc b/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc
index 7c858551d..d45028420 100644
--- a/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc
+++ b/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc
@@ -656,8 +656,8 @@ void promoteToRegistersBelow(MappedScop& mscop, detail::ScheduleTree* scope) {
   auto mapping = collectMappingsTo(scop).intersect(blockMapping);
   auto schedule = partialSchedule(scop.scheduleRoot(), scope);
-  auto groupMap = TensorReferenceGroup::accessedWithin(
-      schedule.intersect_domain(mapping), scop.body);
+  auto groupLists = sortTensorGroupMap(TensorReferenceGroup::accessedWithin(
+      schedule.intersect_domain(mapping), scop.body));
 
   auto threadSchedule = mscop.threadMappingSchedule(mscop.schedule());
   auto blockSchedule = mscop.blockMappingSchedule(mscop.schedule());
@@ -673,10 +673,10 @@ void promoteToRegistersBelow(MappedScop& mscop, detail::ScheduleTree* scope) {
   // identical dimensions without affecting the result of the checks.
   partialSchedMupa = partialSchedMupa.flat_range_product(blockSchedule);
 
-  for (auto& tensorGroups : groupMap) {
+  for (auto& tensorGroups : groupLists) {
     auto tensorId = tensorGroups.first;
-
-    // TODO: sorting of groups and counting the number of promoted elements
+    sortTensorGroups(tensorGroups.second);
+    // TODO: counting the number of promoted elements
 
     for (auto& group : tensorGroups.second) {
       auto sizes = group->approximationSizes();

From 9155a07556da23f7426eb1588660f6e202da4fe5 Mon Sep 17 00:00:00 2001
From: Oleksandr Zinenko
Date: Thu, 26 Jul 2018 11:12:07 +0200
Subject: [PATCH 5/9] promoteToRegistersBelow: limit the number of registers to use

Introduce the per-thread limit on the total number of registers to use during
promotion. This limit does not differentiate between the data types because
we cannot control the register allocation at the CUDA level anyway. It rather
serves as a controllable input to the promotion heuristic.
---
 .../cuda/memory_promotion_heuristic.cc | 21 +++++++++++++++++--
 .../cuda/memory_promotion_heuristic.h  |  5 ++++-
 2 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc b/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc
index d45028420..a718fe974 100644
--- a/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc
+++ b/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc
@@ -630,8 +630,17 @@ void promoteToSharedAtDepth(
  * of "mscop". Throw if promotion would violate the well-formedness of the
  * schedule tree, in particular in cases of promotion immediately below
  * a set/sequence node or immediately above a thread-specific marker node.
+ * Promote at most "maxElements" elements per thread and return the difference
+ * between "maxElements" and the number of actually promoted elements. Note
+ * that this function does not differentiate types and sizes of the promoted
+ * elements because register allocation cannot be controlled at the CUDA level
+ * anyway. Instead, the "maxElements" value controls how much register
+ * promotion is performed overall.
*/ -void promoteToRegistersBelow(MappedScop& mscop, detail::ScheduleTree* scope) { +size_t promoteToRegistersBelow( + MappedScop& mscop, + detail::ScheduleTree* scope, + size_t maxElements) { // Cannot promote below a sequence or a set node. Promotion may insert an // extension node, but sequence/set must be followed by filters. if (scope->as() || @@ -684,6 +693,12 @@ void promoteToRegistersBelow(MappedScop& mscop, detail::ScheduleTree* scope) { if (sizes.size() == 0) { continue; } + // Do not promote if requires more registers than remaining. + auto nElements = std::accumulate( + sizes.begin(), sizes.end(), 1u, std::multiplies()); + if (nElements > maxElements) { + continue; + } if (!isPromotableToRegistersBelow( *group, root, scope, partialSchedMupa, threadSchedule)) { continue; @@ -703,13 +718,14 @@ void promoteToRegistersBelow(MappedScop& mscop, detail::ScheduleTree* scope) { std::move(group), scope, partialSched); + maxElements -= nElements; } } // Return immediately if nothing was promoted. if (scope->numChildren() == 0 || !matchOne(extension(sequence(any())), scope->child({0}))) { - return; + return maxElements; } // If promoting above thread mapping, insert synchronizations. @@ -725,6 +741,7 @@ void promoteToRegistersBelow(MappedScop& mscop, detail::ScheduleTree* scope) { if (functional::Filter(isMappingTo, ancestors).empty()) { scop.insertSyncsAroundSeqChildren(scope->child({0, 0})); } + return maxElements; } /* diff --git a/tc/core/polyhedral/cuda/memory_promotion_heuristic.h b/tc/core/polyhedral/cuda/memory_promotion_heuristic.h index 2dc264949..1ceea0d30 100644 --- a/tc/core/polyhedral/cuda/memory_promotion_heuristic.h +++ b/tc/core/polyhedral/cuda/memory_promotion_heuristic.h @@ -41,7 +41,10 @@ void promoteToSharedAtDepth( std::size_t sharedMemorySize, bool unrollCopies); -void promoteToRegistersBelow(MappedScop& mscop, detail::ScheduleTree* scope); +size_t promoteToRegistersBelow( + MappedScop& mscop, + detail::ScheduleTree* scope, + std::size_t maxElements = SIZE_MAX); void promoteToRegistersAtDepth(MappedScop& scop, std::size_t depth); From c6730e35fdfd9885b8fe7c9b472ba64d17a26037 Mon Sep 17 00:00:00 2001 From: Oleksandr Zinenko Date: Thu, 26 Jul 2018 11:27:29 +0200 Subject: [PATCH 6/9] promoteToRegistersAtDepth: limit the total number of elements promoted The limit applies per thread and is cumulated for all subtrees where promotion is performed. By default, it is set to SIZE_MAX, which ensures backwards-compatible behavior for all sensible cases (if something had required more than SIZE_MAX registers, it would have been spilled to global memory and still would not have fit). This limit will be exposed as a mapping option in an upcoming commit. --- tc/core/polyhedral/cuda/memory_promotion_heuristic.cc | 9 ++++++--- tc/core/polyhedral/cuda/memory_promotion_heuristic.h | 5 ++++- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc b/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc index a718fe974..e2dd89a8d 100644 --- a/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc +++ b/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc @@ -748,9 +748,12 @@ size_t promoteToRegistersBelow( * Promote to registers below "depth" schedule dimensions. Split bands if * necessary to create promotion scopes. Do not promote if it would require * splitting the band mapped to threads as we assume only one band can be - * mapped. + * mapped. Use at most "maxElements" per thread in all promoted subtrees. 
*/ -void promoteToRegistersAtDepth(MappedScop& mscop, size_t depth) { +void promoteToRegistersAtDepth( + MappedScop& mscop, + size_t depth, + size_t maxElements) { using namespace detail; auto root = mscop.scop().scheduleRoot(); @@ -784,7 +787,7 @@ void promoteToRegistersAtDepth(MappedScop& mscop, size_t depth) { auto scopes = functional::Map(findScope, bands); for (auto scope : scopes) { - promoteToRegistersBelow(mscop, scope); + maxElements = promoteToRegistersBelow(mscop, scope, maxElements); } } diff --git a/tc/core/polyhedral/cuda/memory_promotion_heuristic.h b/tc/core/polyhedral/cuda/memory_promotion_heuristic.h index 1ceea0d30..fcc6dfdb6 100644 --- a/tc/core/polyhedral/cuda/memory_promotion_heuristic.h +++ b/tc/core/polyhedral/cuda/memory_promotion_heuristic.h @@ -46,7 +46,10 @@ size_t promoteToRegistersBelow( detail::ScheduleTree* scope, std::size_t maxElements = SIZE_MAX); -void promoteToRegistersAtDepth(MappedScop& scop, std::size_t depth); +void promoteToRegistersAtDepth( + MappedScop& scop, + std::size_t depth, + std::size_t maxElements = SIZE_MAX); } // namespace cuda } // namespace polyhedral From 1d9d6e3bb3646823452a5f4ec3158a74d8a3bd02 Mon Sep 17 00:00:00 2001 From: Oleksandr Zinenko Date: Thu, 26 Jul 2018 12:01:33 +0200 Subject: [PATCH 7/9] CudaGPUInfo: record the number of threads per block This will be used in computation of the default number of elements to promote to private. --- tc/core/cuda/cuda.cc | 18 ++++++++++++++---- tc/core/cuda/cuda.h | 9 +++++++-- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/tc/core/cuda/cuda.cc b/tc/core/cuda/cuda.cc index 108e058cd..57baa2ed7 100644 --- a/tc/core/cuda/cuda.cc +++ b/tc/core/cuda/cuda.cc @@ -30,7 +30,8 @@ DEFINE_bool(use_nvprof, false, "Start / stop nvprof"); namespace { -std::tuple, std::vector> init() { +std::tuple, std::vector, std::vector> +init() { int deviceCount = 0; auto err_id = cudaGetDeviceCount(&deviceCount); if (err_id == 35 or err_id == 30) { @@ -44,14 +45,16 @@ std::tuple, std::vector> init() { } std::vector gpuNames; std::vector sharedMemSizes; + std::vector registersPerBlock; gpuNames.reserve(deviceCount); for (int i = 0; i < deviceCount; ++i) { cudaDeviceProp deviceProp; TC_CUDA_RUNTIMEAPI_ENFORCE(cudaGetDeviceProperties(&deviceProp, i)); gpuNames.emplace_back(deviceProp.name); sharedMemSizes.emplace_back(deviceProp.sharedMemPerBlock); + registersPerBlock.emplace_back(deviceProp.regsPerBlock); } - return std::make_tuple(gpuNames, sharedMemSizes); + return std::make_tuple(gpuNames, sharedMemSizes, registersPerBlock); } } // namespace @@ -61,8 +64,8 @@ CudaGPUInfo& CudaGPUInfo::GPUInfo() { static thread_local bool inited = false; if (!inited) { auto infos = init(); - pInfo = std::unique_ptr( - new CudaGPUInfo(std::get<0>(infos), std::get<1>(infos))); + pInfo = std::unique_ptr(new CudaGPUInfo( + std::get<0>(infos), std::get<1>(infos), std::get<2>(infos))); inited = true; } return *pInfo; @@ -102,4 +105,11 @@ size_t CudaGPUInfo::SharedMemorySize() const { } return sharedMemSizes_.at(CurrentGPUId()); } + +size_t CudaGPUInfo::RegistersPerBlock() const { + if (NumberGPUs() == 0) { + return 0; // no registers if no GPUs + } + return registersPerBlock_.at(CurrentGPUId()); +} } // namespace tc diff --git a/tc/core/cuda/cuda.h b/tc/core/cuda/cuda.h index a9fe1383a..fa5e68b98 100644 --- a/tc/core/cuda/cuda.h +++ b/tc/core/cuda/cuda.h @@ -98,8 +98,11 @@ struct WithCudaDevice { class CudaGPUInfo { CudaGPUInfo( const std::vector& gpuNames, - const std::vector& sharedMemSizes) - : gpuNames_(gpuNames), 
sharedMemSizes_(sharedMemSizes) {}
+      const std::vector& sharedMemSizes,
+      const std::vector& registersPerBlock)
+      : gpuNames_(gpuNames),
+        sharedMemSizes_(sharedMemSizes),
+        registersPerBlock_(registersPerBlock) {}
 
  public:
   static CudaGPUInfo& GPUInfo();
@@ -112,9 +115,11 @@ class CudaGPUInfo {
   std::string GetGPUName(int id = -1) const;
   std::string getCudaDeviceStr() const;
   size_t SharedMemorySize() const;
+  size_t RegistersPerBlock() const;
 
   std::vector gpuNames_;
   std::vector sharedMemSizes_;
+  std::vector registersPerBlock_;
 };
 
 struct CudaProfiler {

From c200a4e7c8e5af60367c3da166d24dc7084de321 Mon Sep 17 00:00:00 2001
From: Oleksandr Zinenko
Date: Thu, 26 Jul 2018 12:05:10 +0200
Subject: [PATCH 8/9] gpu.h: add queryRegistersPerBlock

This platform-neutral function to query the number of registers will be used
in an upcoming commit.
---
 tc/core/gpu.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/tc/core/gpu.h b/tc/core/gpu.h
index 6846304fd..06c8d680d 100644
--- a/tc/core/gpu.h
+++ b/tc/core/gpu.h
@@ -36,4 +36,15 @@ inline size_t querySharedMemorySize() {
 #endif
 }
 
+/// Get the maximum number of registers per block provided by the GPU device
+/// active in the current thread. The call is forwarded to the GPU driver.
+/// If the thread has no associated GPU, return 0.
+inline size_t queryRegistersPerBlock() {
+#if TC_WITH_CUDA && !defined(NO_CUDA_SDK)
+  return CudaGPUInfo::GPUInfo().RegistersPerBlock();
+#else
+  return 0;
+#endif
+}
+
 } // namespace tc

From 74d3e8512b2139527c0fdb36b8cf27a3e378cf8a Mon Sep 17 00:00:00 2001
From: Oleksandr Zinenko
Date: Thu, 26 Jul 2018 12:13:27 +0200
Subject: [PATCH 9/9] cuda::MappedScop: introduce maxPrivateElements mapping option

This mapping option controls the maximum number of elements per thread that
are promoted into private memory (hopefully registers, but we cannot
guarantee this at the CUDA level). The value is optional in the protocol
buffers. When not provided, query the maximum number of registers per block
from the CUDA device properties and divide it by the number of threads in the
block to obtain the per-thread limitation.

Note that using all registers in a single block will likely limit the
occupancy of SMs, potentially degrading performance. Introducing the limiting
factor is primarily motivated by this effect, and it lets the caller require
the mapper to use fewer registers, potentially increasing the occupancy.
Since register allocation is performed by the downstream compiler, this
option is a mere recommendation and is expressed in terms of (untyped)
elements rather than actual registers. It would be impossible to account, at
the CUDA level, for all registers required by the main computation (that is,
those necessary to store the data loaded from memory during operations),
which also contribute to the register pressure of the kernel. Although
limiting the number of promoted elements to the number of registers available
per thread may seem too constraining for occupancy, it is strictly better
than the current approach where we may promote even more elements, which then
get spilled into the slow local memory.
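(As a rough sketch of the default computation described above; the helper
name defaultMaxPrivateElements is hypothetical, and only the idea of dividing
the block-wide register budget by the block size comes from this patch.)

    #include <cstddef>
    #include <functional>
    #include <numeric>
    #include <vector>

    // Derive the per-thread element budget used when max_private_elements is
    // not set: spread the block-wide register budget evenly over the threads
    // of one block.
    std::size_t defaultMaxPrivateElements(
        const std::vector<std::size_t>& blockSizes, // e.g. {32, 8, 1}
        std::size_t registersPerBlock) {            // e.g. from the GPU query
      std::size_t threadsPerBlock = std::accumulate(
          blockSizes.begin(),
          blockSizes.end(),
          std::size_t(1),
          std::multiplies<std::size_t>());
      return registersPerBlock / threadsPerBlock;
    }

For example, 65536 registers per block and a 32x8 thread block would give a
default budget of 256 elements per thread.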
--- tc/core/cuda/cuda_mapping_options.cc | 5 +++ tc/core/cuda/cuda_mapping_options.h | 1 + .../cuda/cuda_mapping_options_cpp_printer.cc | 4 +++ tc/core/polyhedral/cuda/mapped_scop.cc | 9 ++++- .../cuda/memory_promotion_heuristic.cc | 1 - tc/proto/mapping_options.proto | 3 ++ tensor_comprehensions/pybinds/tclib.cc | 5 +++ test/test_cuda_mapper_memory_promotion.cc | 33 ++++++++++++++++--- 8 files changed, 54 insertions(+), 7 deletions(-) diff --git a/tc/core/cuda/cuda_mapping_options.cc b/tc/core/cuda/cuda_mapping_options.cc index 09d7edf8c..ba911aa90 100644 --- a/tc/core/cuda/cuda_mapping_options.cc +++ b/tc/core/cuda/cuda_mapping_options.cc @@ -299,6 +299,11 @@ CudaMappingOptions& CudaMappingOptions::sharedDepth(uint32_t depth) { return *this; } +CudaMappingOptions& CudaMappingOptions::maxPrivateElements(uint64_t nElements) { + ownedProto_.set_max_private_elements(nElements); + return *this; +} + CudaMappingOptions& CudaMappingOptions::mapToThreads( const std::string& commaSeparatedSizes) { auto sizes = parseCommaSeparatedIntegers(commaSeparatedSizes); diff --git a/tc/core/cuda/cuda_mapping_options.h b/tc/core/cuda/cuda_mapping_options.h index aa8530307..ab6ce5e11 100644 --- a/tc/core/cuda/cuda_mapping_options.h +++ b/tc/core/cuda/cuda_mapping_options.h @@ -197,6 +197,7 @@ class CudaMappingOptions { CudaMappingOptions& useReadOnlyCache(bool b); CudaMappingOptions& privateDepth(uint32_t depth); CudaMappingOptions& sharedDepth(uint32_t depth); + CudaMappingOptions& maxPrivateElements(uint64_t nElements); ///@} /// Static constructors for predefined strategies. diff --git a/tc/core/cuda/cuda_mapping_options_cpp_printer.cc b/tc/core/cuda/cuda_mapping_options_cpp_printer.cc index 9ffa95bcc..a223fcb80 100644 --- a/tc/core/cuda/cuda_mapping_options_cpp_printer.cc +++ b/tc/core/cuda/cuda_mapping_options_cpp_printer.cc @@ -40,6 +40,10 @@ CudaMappingOptionsCppPrinter& operator<<( } prn.printValueOption("privateDepth", cudaOptions.proto().private_depth()); prn.printValueOption("sharedDepth", cudaOptions.proto().shared_depth()); + if (cudaOptions.proto().has_max_private_elements()) { + prn.printValueOption( + "maxPrivateElements", cudaOptions.proto().max_private_elements()); + } prn.endStmt(); return prn; } diff --git a/tc/core/polyhedral/cuda/mapped_scop.cc b/tc/core/polyhedral/cuda/mapped_scop.cc index b0b129cbc..faa7d9361 100644 --- a/tc/core/polyhedral/cuda/mapped_scop.cc +++ b/tc/core/polyhedral/cuda/mapped_scop.cc @@ -1086,7 +1086,14 @@ std::unique_ptr MappedScop::makeWithOuterBlockInnerThreadStrategy( // 9. Promote to registers below the loops mapped to threads. if (cudaOptions.proto().use_private_memory()) { - promoteToRegistersAtDepth(*mappedScop, cudaOptions.proto().private_depth()); + auto blockSizes = cudaOptions.block.extractVector(); + auto nThreadsPerBlock = std::accumulate( + blockSizes.begin(), blockSizes.end(), 1, std::multiplies()); + auto nElementsPerThread = cudaOptions.proto().has_max_private_elements() + ? 
cudaOptions.proto().max_private_elements() + : queryRegistersPerBlock() / nThreadsPerBlock; + promoteToRegistersAtDepth( + *mappedScop, cudaOptions.proto().private_depth(), nElementsPerThread); } LOG_IF(INFO, FLAGS_debug_tc_mapper) diff --git a/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc b/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc index e2dd89a8d..f73246666 100644 --- a/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc +++ b/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc @@ -685,7 +685,6 @@ size_t promoteToRegistersBelow( for (auto& tensorGroups : groupLists) { auto tensorId = tensorGroups.first; sortTensorGroups(tensorGroups.second); - // TODO: counting the number of promoted elements for (auto& group : tensorGroups.second) { auto sizes = group->approximationSizes(); diff --git a/tc/proto/mapping_options.proto b/tc/proto/mapping_options.proto index 8beaf46dc..52301f5a2 100644 --- a/tc/proto/mapping_options.proto +++ b/tc/proto/mapping_options.proto @@ -74,6 +74,9 @@ message CudaMappingOptionsProto { optional uint32 private_depth = 9; // Depth of promotion to shared memory, ignored if use_shared_memory is false. optional uint32 shared_depth = 10; + // Maximum number of elements to promote to registers per thread. If not + // provided, the number 32-bit registers per thread will be used. + optional uint64 max_private_elements = 11; } message CpuMappingOptionsProto { diff --git a/tensor_comprehensions/pybinds/tclib.cc b/tensor_comprehensions/pybinds/tclib.cc index a18fb4ca7..3062c21e4 100644 --- a/tensor_comprehensions/pybinds/tclib.cc +++ b/tensor_comprehensions/pybinds/tclib.cc @@ -672,6 +672,11 @@ PYBIND11_MODULE(tclib, m) { "usePrivateMemory", &tc::CudaMappingOptions::usePrivateMemory, "Create thread-local copies of data in private memory") + .def( + "maxPrivateElements", + &tc::CudaMappingOptions::maxPrivateElements, + "The maximum number of elements per thread for which thread-local " + "copies are created") .def( "unrollCopyShared", &tc::CudaMappingOptions::unrollCopyShared, diff --git a/test/test_cuda_mapper_memory_promotion.cc b/test/test_cuda_mapper_memory_promotion.cc index 0fb7405b8..51a2fa057 100644 --- a/test/test_cuda_mapper_memory_promotion.cc +++ b/test/test_cuda_mapper_memory_promotion.cc @@ -539,7 +539,8 @@ TEST_F(MatMulBias, RegisterPromotion) { .tile(32, 32, 32) .privateDepth(5) .useSharedMemory(false) - .usePrivateMemory(true); + .usePrivateMemory(true) + .maxPrivateElements(100); auto code = emitCode({{"N", 42}, {"M", 56}, {"K", 37}}, mappingOptions); auto declPos = code.find("float _O_0"); @@ -567,7 +568,8 @@ TEST_F(MatMulBias, RegisterPromotionSharedPreference) { .tile(32, 32, 32) .maxSharedMemory(32768) .useSharedMemory(true) - .usePrivateMemory(true); + .usePrivateMemory(true) + .maxPrivateElements(100); auto code = emitCode({{"N", 42}, {"M", 56}, {"K", 37}}, mappingOptions); @@ -587,7 +589,7 @@ TEST_F(MatMulBias, RegistersAtRoot) { .usePrivateMemory(false); auto mscop = prepare({{"N", 42}, {"M", 56}, {"K", 37}}, mappingOptions); - promoteToRegistersBelow(*mscop, mscop->scop().scheduleRoot()); + promoteToRegistersBelow(*mscop, mscop->scop().scheduleRoot(), 4); auto code = emitCode(mscop); // Expecting 4 elements because we map the loop i in O[i][j] to 8 threads @@ -595,6 +597,27 @@ TEST_F(MatMulBias, RegistersAtRoot) { expectFourOElementsPromoted(code); } +TEST_F(MatMulBias, RegistersAtRootNotEnoughAvailable) { + // Disable automatic promotion to registers because we are going to call it + // manually. 
Require sufficient unrolling to actually hit registers. + auto mappingOptions = CudaMappingOptions::makeNaiveMappingOptions() + .unroll(512) + .useSharedMemory(false) + .usePrivateMemory(false); + + auto mscop = prepare({{"N", 42}, {"M", 56}, {"K", 37}}, mappingOptions); + promoteToRegistersBelow(*mscop, mscop->scop().scheduleRoot(), 3); + auto code = emitCode(mscop); + + // Not expecting O to be promoted because 4 elements must be promoted and + // only 3 were indicated as available in promoteToRegistersBelow. + auto oDeclPos = code.find("float _O_0;"); + EXPECT_TRUE(oDeclPos == std::string::npos) + << "not expected O to be promoted to registers"; + + expectNoABCPromotion(code); +} + TEST_F(MatMulBias, RegistersAtRootNotEnoughUnroll) { // Disable automatic promotion to registers because we are going to call it // manually. Require no unrolling so as to make promotion to registers @@ -605,7 +628,7 @@ TEST_F(MatMulBias, RegistersAtRootNotEnoughUnroll) { .usePrivateMemory(false); auto mscop = prepare({{"N", 42}, {"M", 56}, {"K", 37}}, mappingOptions); - promoteToRegistersBelow(*mscop, mscop->scop().scheduleRoot()); + promoteToRegistersBelow(*mscop, mscop->scop().scheduleRoot(), 100); auto code = emitCode(mscop); auto oDeclPos = code.find("float _O_0;"); @@ -631,7 +654,7 @@ TEST_F(MatMulBias, RegistersBelowFirstBand) { mscop->scop().scheduleRoot(), ScheduleTreeType::Band); ASSERT_GT(nodes.size(), 0u); auto node = nodes[0]; - promoteToRegistersBelow(*mscop, node); + promoteToRegistersBelow(*mscop, node, 100); auto code = emitCode(mscop); // Expecting 4 elements because we map the loop i in O[i][j] to 8 threads