From 648cbe76e3852de2b7e81175e83831c75cbf8ed6 Mon Sep 17 00:00:00 2001
From: Oleksandr Zinenko
Date: Wed, 25 Jul 2018 16:33:13 +0200
Subject: [PATCH 1/9] promoteToSharedBelow: do not capture an unnecessary variable in a lambda

The captured variable has not been used inside the lambda since it was
introduced in the prehistory.
---
 tc/core/polyhedral/cuda/memory_promotion_heuristic.cc | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc b/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc
index b1bca6923..f74b9a56b 100644
--- a/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc
+++ b/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc
@@ -512,9 +512,8 @@ void promoteToSharedBelow(
     std::sort(
         tensorGroups.second.begin(),
         tensorGroups.second.end(),
-        [refsCount](
-            const std::unique_ptr& group1,
-            const std::unique_ptr& group2) {
+        [](const std::unique_ptr& group1,
+           const std::unique_ptr& group2) {
           return group1->referenceIds().size() > group2->referenceIds().size();
         });

From 0f7c5624ff02e618fde9bc19725016b569382c11 Mon Sep 17 00:00:00 2001
From: Oleksandr Zinenko
Date: Wed, 25 Jul 2018 16:55:27 +0200
Subject: [PATCH 2/9] promoteToSharedBelow: extract out sortTensorGroupMap

This function will be reused in an upcoming commit to sort groups before
register promotion.
---
 .../cuda/memory_promotion_heuristic.cc | 61 ++++++++++---------
 1 file changed, 33 insertions(+), 28 deletions(-)

diff --git a/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc b/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc
index f74b9a56b..dd104c8b6 100644
--- a/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc
+++ b/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc
@@ -446,6 +446,37 @@ bool isInThreadMappedScope(
   return false;
 }
 
+static std::vector> sortTensorGroupMap(
+    TensorGroups&& groupMap) {
+  // Prepare groups for sorting, to have specified order necessary for
+  // reproducibility and tests.
+  using TensorGroupList = std::pair;
+  std::vector groupLists(
+      std::make_move_iterator(groupMap.begin()),
+      std::make_move_iterator(groupMap.end()));
+
+  // Computes the total number of references in all groups.
+  auto refsCount = [](const TensorGroupsInfo& info) {
+    size_t refs = 0;
+    for (auto const& group : info) {
+      refs += group->referenceIds().size();
+    }
+    return refs;
+  };
+
+  // Sort by the total number of references, then by name. Because names are
+  // guaranteed to be unique, the order is total.
+  std::sort(
+      groupLists.begin(),
+      groupLists.end(),
+      [refsCount](const TensorGroupList& l1, const TensorGroupList& l2) {
+        auto r1 = refsCount(l1.second);
+        auto r2 = refsCount(l2.second);
+        return r1 == r2 ? l1.first.get_name() < l2.first.get_name() : r1 < r2;
+      });
+  return groupLists;
+}
+
 /*
  * Promote to shared memory in "scop" below "node". Use at most
  * "remainingMemory" bytes, and update the variable to reflect the amount of
@@ -474,37 +505,11 @@ void promoteToSharedBelow(
   auto partialSched = partialSchedule(root, node);
   auto mapping = collectMappingsTo(scop);
 
-  auto groupMap = TensorReferenceGroup::accessedWithin(
-      partialSched.intersect_domain(mapping), scop.body);
+  auto groupLists = sortTensorGroupMap(TensorReferenceGroup::accessedWithin(
+      partialSched.intersect_domain(mapping), scop.body));
 
   // Pure affine schedule without (mapping) filters.
   auto partialSchedMupa = partialScheduleMupa(root, node);
-  // Prepare groups for sorting, to have specified order necessary for
-  // reproducibility and tests.
-  using TensorGroupList = std::pair;
-  std::vector groupLists(
-      std::make_move_iterator(groupMap.begin()),
-      std::make_move_iterator(groupMap.end()));
-
-  // Computes the total number of references in all groups.
-  auto refsCount = [](const TensorGroupsInfo& info) {
-    size_t refs = 0;
-    for (auto const& group : info) {
-      refs += group->referenceIds().size();
-    }
-    return refs;
-  };
-
-  // Sort by the total number of references, then by name. Because names are
-  // guarenteed to be unique, the order is total.
-  std::sort(
-      groupLists.begin(),
-      groupLists.end(),
-      [refsCount](const TensorGroupList& l1, const TensorGroupList& l2) {
-        auto r1 = refsCount(l1.second);
-        auto r2 = refsCount(l2.second);
-        return r1 == r2 ? l1.first.get_name() < l2.first.get_name() : r1 < r2;
-      });
 
   for (auto& tensorGroups : groupLists) {
     auto tensorId = tensorGroups.first;
     // Sort the reference groups to prioritize groups with more references as

From 4aa377d26eb6f6f35ee2ade1730cab164d8d5ee8 Mon Sep 17 00:00:00 2001
From: Oleksandr Zinenko
Date: Wed, 25 Jul 2018 17:35:09 +0200
Subject: [PATCH 3/9] promoteToSharedBelow: extract out sortTensorGroups

This function will be reused in an upcoming commit.
---
 .../cuda/memory_promotion_heuristic.cc | 24 ++++++++++++-------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc b/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc
index dd104c8b6..7c858551d 100644
--- a/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc
+++ b/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc
@@ -477,6 +477,20 @@ static std::vector> sortTensorGroupMap(
   return groupLists;
 }
 
+/* Sorts the given vector of tensor groups in place following the number of
+ * references in the group in decreasing order. This prioritizes groups with
+ * more references as they are more likely to benefit from promotion.
+ */
+static void sortTensorGroups(TensorGroupsInfo& tensorGroups) {
+  std::sort(
+      tensorGroups.begin(),
+      tensorGroups.end(),
+      [](const std::unique_ptr& group1,
+         const std::unique_ptr& group2) {
+        return group1->referenceIds().size() > group2->referenceIds().size();
+      });
+}
+
 /*
  * Promote to shared memory in "scop" below "node". Use at most
  * "remainingMemory" bytes, and update the variable to reflect the amount of
@@ -512,15 +526,7 @@ void promoteToSharedBelow(
   for (auto& tensorGroups : groupLists) {
     auto tensorId = tensorGroups.first;
-    // Sort the reference groups to prioritize groups with more references as
-    // they are more likely to benefit from promotion.
-    std::sort(
-        tensorGroups.second.begin(),
-        tensorGroups.second.end(),
-        [](const std::unique_ptr& group1,
-           const std::unique_ptr& group2) {
-          return group1->referenceIds().size() > group2->referenceIds().size();
-        });
+    sortTensorGroups(tensorGroups.second);
 
     for (auto& group : tensorGroups.second) {
       auto sizes = group->approximationSizes();

From 1e5ad912b951da818d2f01e8f5d8eb89aad010b5 Mon Sep 17 00:00:00 2001
From: Oleksandr Zinenko
Date: Thu, 26 Jul 2018 10:32:56 +0200
Subject: [PATCH 4/9] promoteToRegistersBelow: sort tensor reference groups

Follow the same strategy as with shared memory promotion: first, sort tensors
in decreasing order of the total number of references; then, for each tensor,
sort groups based on the number of references in this group. Tensor groups
with more references are expected to benefit more from promotion as more
global memory accesses may be avoided thanks to explicit caching in faster
layers of the memory hierarchy.
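(Illustration only, not part of the change: a condensed, self-contained C++
sketch of this two-level ordering. Group, TensorEntry and orderForPromotion
are made-up stand-ins rather than the actual TC types; the comparators mirror
sortTensorGroupMap and sortTensorGroups introduced earlier in this series.)

    #include <algorithm>
    #include <cstddef>
    #include <string>
    #include <vector>

    struct Group {
      std::size_t nRefs; // stands in for referenceIds().size() of one group
    };
    struct TensorEntry { // stands in for one (tensor id, groups) entry
      std::string name;
      std::vector<Group> groups;
      std::size_t totalRefs() const {
        std::size_t n = 0;
        for (const auto& g : groups) {
          n += g.nRefs;
        }
        return n;
      }
    };

    void orderForPromotion(std::vector<TensorEntry>& tensors) {
      // Order tensors by their total reference count, breaking ties by name
      // so that the order is deterministic and reproducible in tests.
      std::sort(
          tensors.begin(),
          tensors.end(),
          [](const TensorEntry& a, const TensorEntry& b) {
            auto ra = a.totalRefs();
            auto rb = b.totalRefs();
            return ra == rb ? a.name < b.name : ra < rb;
          });
      // Within each tensor, consider groups with more references first.
      for (auto& t : tensors) {
        std::sort(
            t.groups.begin(),
            t.groups.end(),
            [](const Group& a, const Group& b) { return a.nRefs > b.nRefs; });
      }
    }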
Note that since there is no limit on the number of registers to use, all
groups that can be promoted into registers are promoted, and the sorting has
no effect on the outcome. Such a limit will be introduced next.
---
 tc/core/polyhedral/cuda/memory_promotion_heuristic.cc | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc b/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc
index 7c858551d..d45028420 100644
--- a/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc
+++ b/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc
@@ -656,8 +656,8 @@ void promoteToRegistersBelow(MappedScop& mscop, detail::ScheduleTree* scope) {
   auto mapping = collectMappingsTo(scop).intersect(blockMapping);
   auto schedule = partialSchedule(scop.scheduleRoot(), scope);
-  auto groupMap = TensorReferenceGroup::accessedWithin(
-      schedule.intersect_domain(mapping), scop.body);
+  auto groupLists = sortTensorGroupMap(TensorReferenceGroup::accessedWithin(
+      schedule.intersect_domain(mapping), scop.body));
 
   auto threadSchedule = mscop.threadMappingSchedule(mscop.schedule());
   auto blockSchedule = mscop.blockMappingSchedule(mscop.schedule());
@@ -673,10 +673,10 @@ void promoteToRegistersBelow(MappedScop& mscop, detail::ScheduleTree* scope) {
   // identical dimensions without affecting the result of the checks.
   partialSchedMupa = partialSchedMupa.flat_range_product(blockSchedule);
 
-  for (auto& tensorGroups : groupMap) {
+  for (auto& tensorGroups : groupLists) {
     auto tensorId = tensorGroups.first;
-
-    // TODO: sorting of groups and counting the number of promoted elements
+    sortTensorGroups(tensorGroups.second);
+    // TODO: counting the number of promoted elements
 
     for (auto& group : tensorGroups.second) {
       auto sizes = group->approximationSizes();

From 9155a07556da23f7426eb1588660f6e202da4fe5 Mon Sep 17 00:00:00 2001
From: Oleksandr Zinenko
Date: Thu, 26 Jul 2018 11:12:07 +0200
Subject: [PATCH 5/9] promoteToRegistersBelow: limit the number of registers to use

Introduce the per-thread limit on the total number of registers to use during
promotion. This limit does not differentiate between the data types because
we cannot control the register allocation at the CUDA level anyway. It rather
serves as a controllable input to the promotion heuristic.
---
 .../cuda/memory_promotion_heuristic.cc | 21 +++++++++++++++++--
 .../cuda/memory_promotion_heuristic.h  |  5 ++++-
 2 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc b/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc
index d45028420..a718fe974 100644
--- a/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc
+++ b/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc
@@ -630,8 +630,17 @@ void promoteToSharedAtDepth(
  * of "mscop". Throw if promotion would violate the well-formedness of the
  * schedule tree, in particular in cases of promotion immediately below
  * a set/sequence node or immediately above a thread-specific marker node.
+ * Promote at most "maxElements" elements per thread and return the difference
+ * between "maxElements" and the number of actually promoted elements. Note
+ * that this function does not differentiate types and sizes of the promoted
+ * elements because register allocation cannot be controlled at the CUDA level
+ * anyway. Instead, the "maxElements" value controls how much register
+ * promotion is performed overall.
*/ -void promoteToRegistersBelow(MappedScop& mscop, detail::ScheduleTree* scope) { +size_t promoteToRegistersBelow( + MappedScop& mscop, + detail::ScheduleTree* scope, + size_t maxElements) { // Cannot promote below a sequence or a set node. Promotion may insert an // extension node, but sequence/set must be followed by filters. if (scope->as() || @@ -684,6 +693,12 @@ void promoteToRegistersBelow(MappedScop& mscop, detail::ScheduleTree* scope) { if (sizes.size() == 0) { continue; } + // Do not promote if requires more registers than remaining. + auto nElements = std::accumulate( + sizes.begin(), sizes.end(), 1u, std::multiplies()); + if (nElements > maxElements) { + continue; + } if (!isPromotableToRegistersBelow( *group, root, scope, partialSchedMupa, threadSchedule)) { continue; @@ -703,13 +718,14 @@ void promoteToRegistersBelow(MappedScop& mscop, detail::ScheduleTree* scope) { std::move(group), scope, partialSched); + maxElements -= nElements; } } // Return immediately if nothing was promoted. if (scope->numChildren() == 0 || !matchOne(extension(sequence(any())), scope->child({0}))) { - return; + return maxElements; } // If promoting above thread mapping, insert synchronizations. @@ -725,6 +741,7 @@ void promoteToRegistersBelow(MappedScop& mscop, detail::ScheduleTree* scope) { if (functional::Filter(isMappingTo, ancestors).empty()) { scop.insertSyncsAroundSeqChildren(scope->child({0, 0})); } + return maxElements; } /* diff --git a/tc/core/polyhedral/cuda/memory_promotion_heuristic.h b/tc/core/polyhedral/cuda/memory_promotion_heuristic.h index 2dc264949..1ceea0d30 100644 --- a/tc/core/polyhedral/cuda/memory_promotion_heuristic.h +++ b/tc/core/polyhedral/cuda/memory_promotion_heuristic.h @@ -41,7 +41,10 @@ void promoteToSharedAtDepth( std::size_t sharedMemorySize, bool unrollCopies); -void promoteToRegistersBelow(MappedScop& mscop, detail::ScheduleTree* scope); +size_t promoteToRegistersBelow( + MappedScop& mscop, + detail::ScheduleTree* scope, + std::size_t maxElements = SIZE_MAX); void promoteToRegistersAtDepth(MappedScop& scop, std::size_t depth); From c6730e35fdfd9885b8fe7c9b472ba64d17a26037 Mon Sep 17 00:00:00 2001 From: Oleksandr Zinenko Date: Thu, 26 Jul 2018 11:27:29 +0200 Subject: [PATCH 6/9] promoteToRegistersAtDepth: limit the total number of elements promoted The limit applies per thread and is cumulated for all subtrees where promotion is performed. By default, it is set to SIZE_MAX, which ensures backwards-compatible behavior for all sensible cases (if something had required more than SIZE_MAX registers, it would have been spilled to global memory and still would not have fit). This limit will be exposed as a mapping option in an upcoming commit. --- tc/core/polyhedral/cuda/memory_promotion_heuristic.cc | 9 ++++++--- tc/core/polyhedral/cuda/memory_promotion_heuristic.h | 5 ++++- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc b/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc index a718fe974..e2dd89a8d 100644 --- a/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc +++ b/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc @@ -748,9 +748,12 @@ size_t promoteToRegistersBelow( * Promote to registers below "depth" schedule dimensions. Split bands if * necessary to create promotion scopes. Do not promote if it would require * splitting the band mapped to threads as we assume only one band can be - * mapped. + * mapped. Use at most "maxElements" per thread in all promoted subtrees. 
*/ -void promoteToRegistersAtDepth(MappedScop& mscop, size_t depth) { +void promoteToRegistersAtDepth( + MappedScop& mscop, + size_t depth, + size_t maxElements) { using namespace detail; auto root = mscop.scop().scheduleRoot(); @@ -784,7 +787,7 @@ void promoteToRegistersAtDepth(MappedScop& mscop, size_t depth) { auto scopes = functional::Map(findScope, bands); for (auto scope : scopes) { - promoteToRegistersBelow(mscop, scope); + maxElements = promoteToRegistersBelow(mscop, scope, maxElements); } } diff --git a/tc/core/polyhedral/cuda/memory_promotion_heuristic.h b/tc/core/polyhedral/cuda/memory_promotion_heuristic.h index 1ceea0d30..fcc6dfdb6 100644 --- a/tc/core/polyhedral/cuda/memory_promotion_heuristic.h +++ b/tc/core/polyhedral/cuda/memory_promotion_heuristic.h @@ -46,7 +46,10 @@ size_t promoteToRegistersBelow( detail::ScheduleTree* scope, std::size_t maxElements = SIZE_MAX); -void promoteToRegistersAtDepth(MappedScop& scop, std::size_t depth); +void promoteToRegistersAtDepth( + MappedScop& scop, + std::size_t depth, + std::size_t maxElements = SIZE_MAX); } // namespace cuda } // namespace polyhedral From 1d9d6e3bb3646823452a5f4ec3158a74d8a3bd02 Mon Sep 17 00:00:00 2001 From: Oleksandr Zinenko Date: Thu, 26 Jul 2018 12:01:33 +0200 Subject: [PATCH 7/9] CudaGPUInfo: record the number of threads per block This will be used in computation of the default number of elements to promote to private. --- tc/core/cuda/cuda.cc | 18 ++++++++++++++---- tc/core/cuda/cuda.h | 9 +++++++-- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/tc/core/cuda/cuda.cc b/tc/core/cuda/cuda.cc index 108e058cd..57baa2ed7 100644 --- a/tc/core/cuda/cuda.cc +++ b/tc/core/cuda/cuda.cc @@ -30,7 +30,8 @@ DEFINE_bool(use_nvprof, false, "Start / stop nvprof"); namespace { -std::tuple, std::vector> init() { +std::tuple, std::vector, std::vector> +init() { int deviceCount = 0; auto err_id = cudaGetDeviceCount(&deviceCount); if (err_id == 35 or err_id == 30) { @@ -44,14 +45,16 @@ std::tuple, std::vector> init() { } std::vector gpuNames; std::vector sharedMemSizes; + std::vector registersPerBlock; gpuNames.reserve(deviceCount); for (int i = 0; i < deviceCount; ++i) { cudaDeviceProp deviceProp; TC_CUDA_RUNTIMEAPI_ENFORCE(cudaGetDeviceProperties(&deviceProp, i)); gpuNames.emplace_back(deviceProp.name); sharedMemSizes.emplace_back(deviceProp.sharedMemPerBlock); + registersPerBlock.emplace_back(deviceProp.regsPerBlock); } - return std::make_tuple(gpuNames, sharedMemSizes); + return std::make_tuple(gpuNames, sharedMemSizes, registersPerBlock); } } // namespace @@ -61,8 +64,8 @@ CudaGPUInfo& CudaGPUInfo::GPUInfo() { static thread_local bool inited = false; if (!inited) { auto infos = init(); - pInfo = std::unique_ptr( - new CudaGPUInfo(std::get<0>(infos), std::get<1>(infos))); + pInfo = std::unique_ptr(new CudaGPUInfo( + std::get<0>(infos), std::get<1>(infos), std::get<2>(infos))); inited = true; } return *pInfo; @@ -102,4 +105,11 @@ size_t CudaGPUInfo::SharedMemorySize() const { } return sharedMemSizes_.at(CurrentGPUId()); } + +size_t CudaGPUInfo::RegistersPerBlock() const { + if (NumberGPUs() == 0) { + return 0; // no registers if no GPUs + } + return registersPerBlock_.at(CurrentGPUId()); +} } // namespace tc diff --git a/tc/core/cuda/cuda.h b/tc/core/cuda/cuda.h index a9fe1383a..fa5e68b98 100644 --- a/tc/core/cuda/cuda.h +++ b/tc/core/cuda/cuda.h @@ -98,8 +98,11 @@ struct WithCudaDevice { class CudaGPUInfo { CudaGPUInfo( const std::vector& gpuNames, - const std::vector& sharedMemSizes) - : gpuNames_(gpuNames), 
sharedMemSizes_(sharedMemSizes) {}
+      const std::vector& sharedMemSizes,
+      const std::vector& registersPerBlock)
+      : gpuNames_(gpuNames),
+        sharedMemSizes_(sharedMemSizes),
+        registersPerBlock_(registersPerBlock) {}
 
  public:
   static CudaGPUInfo& GPUInfo();
@@ -112,9 +115,11 @@ class CudaGPUInfo {
   std::string GetGPUName(int id = -1) const;
   std::string getCudaDeviceStr() const;
   size_t SharedMemorySize() const;
+  size_t RegistersPerBlock() const;
 
   std::vector gpuNames_;
   std::vector sharedMemSizes_;
+  std::vector registersPerBlock_;
 };
 
 struct CudaProfiler {

From c200a4e7c8e5af60367c3da166d24dc7084de321 Mon Sep 17 00:00:00 2001
From: Oleksandr Zinenko
Date: Thu, 26 Jul 2018 12:05:10 +0200
Subject: [PATCH 8/9] gpu.h: add queryRegistersPerBlock

This platform-neutral function to query the number of registers will be used
in an upcoming commit.
---
 tc/core/gpu.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/tc/core/gpu.h b/tc/core/gpu.h
index 6846304fd..06c8d680d 100644
--- a/tc/core/gpu.h
+++ b/tc/core/gpu.h
@@ -36,4 +36,15 @@ inline size_t querySharedMemorySize() {
 #endif
 }
 
+/// Get the maximum number of registers per block provided by the GPU device
+/// active in the current thread. The call is forwarded to the GPU driver.
+/// If the thread has no associated GPU, return 0.
+inline size_t queryRegistersPerBlock() {
+#if TC_WITH_CUDA && !defined(NO_CUDA_SDK)
+  return CudaGPUInfo::GPUInfo().RegistersPerBlock();
+#else
+  return 0;
+#endif
+}
+
 } // namespace tc

From 74d3e8512b2139527c0fdb36b8cf27a3e378cf8a Mon Sep 17 00:00:00 2001
From: Oleksandr Zinenko
Date: Thu, 26 Jul 2018 12:13:27 +0200
Subject: [PATCH 9/9] cuda::MappedScop: introduce maxPrivateElements mapping option

This mapping option controls the maximum number of elements per thread that
are promoted into private memory (hopefully registers, but we cannot
guarantee this at the CUDA level). The value is optional in the protocol
buffers. When not provided, query the maximum number of registers per block
from the CUDA device properties and divide it by the number of threads in the
block to obtain the per-thread limitation.

Note that using all registers in a single block will likely limit the
occupancy of SMs, potentially degrading performance. Introducing the limiting
factor is primarily motivated by this effect, and it lets the caller require
the mapper to use fewer registers, potentially increasing the occupancy.
Since register allocation is performed by the downstream compiler, this
option is a mere recommendation and is expressed in terms of (untyped)
elements rather than actual registers. It would be impossible to account, at
the CUDA level, for all registers required by the main computation (that is,
those necessary to store the data loaded from memory during operations),
which also contribute to the register pressure of the kernel. Although
limiting the number of promoted elements to the number of registers available
per thread may seem too constraining for occupancy, it is strictly better
than the current approach where we may promote even more elements, which then
get spilled into the slow local memory.
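(As a rough sketch of the default computation described above; the helper
name defaultMaxPrivateElements is hypothetical, and only the idea of dividing
the block-wide register budget by the block size comes from this patch.)

    #include <cstddef>
    #include <functional>
    #include <numeric>
    #include <vector>

    // Derive the per-thread element budget used when max_private_elements is
    // not set: spread the block-wide register budget evenly over the threads
    // of one block.
    std::size_t defaultMaxPrivateElements(
        const std::vector<std::size_t>& blockSizes, // e.g. {32, 8, 1}
        std::size_t registersPerBlock) {            // e.g. from the GPU query
      std::size_t threadsPerBlock = std::accumulate(
          blockSizes.begin(),
          blockSizes.end(),
          std::size_t(1),
          std::multiplies<std::size_t>());
      return registersPerBlock / threadsPerBlock;
    }

For example, 65536 registers per block and a 32x8 thread block would give a
default budget of 256 elements per thread.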
--- tc/core/cuda/cuda_mapping_options.cc | 5 +++ tc/core/cuda/cuda_mapping_options.h | 1 + .../cuda/cuda_mapping_options_cpp_printer.cc | 4 +++ tc/core/polyhedral/cuda/mapped_scop.cc | 9 ++++- .../cuda/memory_promotion_heuristic.cc | 1 - tc/proto/mapping_options.proto | 3 ++ tensor_comprehensions/pybinds/tclib.cc | 5 +++ test/test_cuda_mapper_memory_promotion.cc | 33 ++++++++++++++++--- 8 files changed, 54 insertions(+), 7 deletions(-) diff --git a/tc/core/cuda/cuda_mapping_options.cc b/tc/core/cuda/cuda_mapping_options.cc index 09d7edf8c..ba911aa90 100644 --- a/tc/core/cuda/cuda_mapping_options.cc +++ b/tc/core/cuda/cuda_mapping_options.cc @@ -299,6 +299,11 @@ CudaMappingOptions& CudaMappingOptions::sharedDepth(uint32_t depth) { return *this; } +CudaMappingOptions& CudaMappingOptions::maxPrivateElements(uint64_t nElements) { + ownedProto_.set_max_private_elements(nElements); + return *this; +} + CudaMappingOptions& CudaMappingOptions::mapToThreads( const std::string& commaSeparatedSizes) { auto sizes = parseCommaSeparatedIntegers(commaSeparatedSizes); diff --git a/tc/core/cuda/cuda_mapping_options.h b/tc/core/cuda/cuda_mapping_options.h index aa8530307..ab6ce5e11 100644 --- a/tc/core/cuda/cuda_mapping_options.h +++ b/tc/core/cuda/cuda_mapping_options.h @@ -197,6 +197,7 @@ class CudaMappingOptions { CudaMappingOptions& useReadOnlyCache(bool b); CudaMappingOptions& privateDepth(uint32_t depth); CudaMappingOptions& sharedDepth(uint32_t depth); + CudaMappingOptions& maxPrivateElements(uint64_t nElements); ///@} /// Static constructors for predefined strategies. diff --git a/tc/core/cuda/cuda_mapping_options_cpp_printer.cc b/tc/core/cuda/cuda_mapping_options_cpp_printer.cc index 9ffa95bcc..a223fcb80 100644 --- a/tc/core/cuda/cuda_mapping_options_cpp_printer.cc +++ b/tc/core/cuda/cuda_mapping_options_cpp_printer.cc @@ -40,6 +40,10 @@ CudaMappingOptionsCppPrinter& operator<<( } prn.printValueOption("privateDepth", cudaOptions.proto().private_depth()); prn.printValueOption("sharedDepth", cudaOptions.proto().shared_depth()); + if (cudaOptions.proto().has_max_private_elements()) { + prn.printValueOption( + "maxPrivateElements", cudaOptions.proto().max_private_elements()); + } prn.endStmt(); return prn; } diff --git a/tc/core/polyhedral/cuda/mapped_scop.cc b/tc/core/polyhedral/cuda/mapped_scop.cc index b0b129cbc..faa7d9361 100644 --- a/tc/core/polyhedral/cuda/mapped_scop.cc +++ b/tc/core/polyhedral/cuda/mapped_scop.cc @@ -1086,7 +1086,14 @@ std::unique_ptr MappedScop::makeWithOuterBlockInnerThreadStrategy( // 9. Promote to registers below the loops mapped to threads. if (cudaOptions.proto().use_private_memory()) { - promoteToRegistersAtDepth(*mappedScop, cudaOptions.proto().private_depth()); + auto blockSizes = cudaOptions.block.extractVector(); + auto nThreadsPerBlock = std::accumulate( + blockSizes.begin(), blockSizes.end(), 1, std::multiplies()); + auto nElementsPerThread = cudaOptions.proto().has_max_private_elements() + ? 
cudaOptions.proto().max_private_elements() + : queryRegistersPerBlock() / nThreadsPerBlock; + promoteToRegistersAtDepth( + *mappedScop, cudaOptions.proto().private_depth(), nElementsPerThread); } LOG_IF(INFO, FLAGS_debug_tc_mapper) diff --git a/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc b/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc index e2dd89a8d..f73246666 100644 --- a/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc +++ b/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc @@ -685,7 +685,6 @@ size_t promoteToRegistersBelow( for (auto& tensorGroups : groupLists) { auto tensorId = tensorGroups.first; sortTensorGroups(tensorGroups.second); - // TODO: counting the number of promoted elements for (auto& group : tensorGroups.second) { auto sizes = group->approximationSizes(); diff --git a/tc/proto/mapping_options.proto b/tc/proto/mapping_options.proto index 8beaf46dc..52301f5a2 100644 --- a/tc/proto/mapping_options.proto +++ b/tc/proto/mapping_options.proto @@ -74,6 +74,9 @@ message CudaMappingOptionsProto { optional uint32 private_depth = 9; // Depth of promotion to shared memory, ignored if use_shared_memory is false. optional uint32 shared_depth = 10; + // Maximum number of elements to promote to registers per thread. If not + // provided, the number 32-bit registers per thread will be used. + optional uint64 max_private_elements = 11; } message CpuMappingOptionsProto { diff --git a/tensor_comprehensions/pybinds/tclib.cc b/tensor_comprehensions/pybinds/tclib.cc index a18fb4ca7..3062c21e4 100644 --- a/tensor_comprehensions/pybinds/tclib.cc +++ b/tensor_comprehensions/pybinds/tclib.cc @@ -672,6 +672,11 @@ PYBIND11_MODULE(tclib, m) { "usePrivateMemory", &tc::CudaMappingOptions::usePrivateMemory, "Create thread-local copies of data in private memory") + .def( + "maxPrivateElements", + &tc::CudaMappingOptions::maxPrivateElements, + "The maximum number of elements per thread for which thread-local " + "copies are created") .def( "unrollCopyShared", &tc::CudaMappingOptions::unrollCopyShared, diff --git a/test/test_cuda_mapper_memory_promotion.cc b/test/test_cuda_mapper_memory_promotion.cc index 0fb7405b8..51a2fa057 100644 --- a/test/test_cuda_mapper_memory_promotion.cc +++ b/test/test_cuda_mapper_memory_promotion.cc @@ -539,7 +539,8 @@ TEST_F(MatMulBias, RegisterPromotion) { .tile(32, 32, 32) .privateDepth(5) .useSharedMemory(false) - .usePrivateMemory(true); + .usePrivateMemory(true) + .maxPrivateElements(100); auto code = emitCode({{"N", 42}, {"M", 56}, {"K", 37}}, mappingOptions); auto declPos = code.find("float _O_0"); @@ -567,7 +568,8 @@ TEST_F(MatMulBias, RegisterPromotionSharedPreference) { .tile(32, 32, 32) .maxSharedMemory(32768) .useSharedMemory(true) - .usePrivateMemory(true); + .usePrivateMemory(true) + .maxPrivateElements(100); auto code = emitCode({{"N", 42}, {"M", 56}, {"K", 37}}, mappingOptions); @@ -587,7 +589,7 @@ TEST_F(MatMulBias, RegistersAtRoot) { .usePrivateMemory(false); auto mscop = prepare({{"N", 42}, {"M", 56}, {"K", 37}}, mappingOptions); - promoteToRegistersBelow(*mscop, mscop->scop().scheduleRoot()); + promoteToRegistersBelow(*mscop, mscop->scop().scheduleRoot(), 4); auto code = emitCode(mscop); // Expecting 4 elements because we map the loop i in O[i][j] to 8 threads @@ -595,6 +597,27 @@ TEST_F(MatMulBias, RegistersAtRoot) { expectFourOElementsPromoted(code); } +TEST_F(MatMulBias, RegistersAtRootNotEnoughAvailable) { + // Disable automatic promotion to registers because we are going to call it + // manually. 
Require sufficient unrolling to actually hit registers. + auto mappingOptions = CudaMappingOptions::makeNaiveMappingOptions() + .unroll(512) + .useSharedMemory(false) + .usePrivateMemory(false); + + auto mscop = prepare({{"N", 42}, {"M", 56}, {"K", 37}}, mappingOptions); + promoteToRegistersBelow(*mscop, mscop->scop().scheduleRoot(), 3); + auto code = emitCode(mscop); + + // Not expecting O to be promoted because 4 elements must be promoted and + // only 3 were indicated as available in promoteToRegistersBelow. + auto oDeclPos = code.find("float _O_0;"); + EXPECT_TRUE(oDeclPos == std::string::npos) + << "not expected O to be promoted to registers"; + + expectNoABCPromotion(code); +} + TEST_F(MatMulBias, RegistersAtRootNotEnoughUnroll) { // Disable automatic promotion to registers because we are going to call it // manually. Require no unrolling so as to make promotion to registers @@ -605,7 +628,7 @@ TEST_F(MatMulBias, RegistersAtRootNotEnoughUnroll) { .usePrivateMemory(false); auto mscop = prepare({{"N", 42}, {"M", 56}, {"K", 37}}, mappingOptions); - promoteToRegistersBelow(*mscop, mscop->scop().scheduleRoot()); + promoteToRegistersBelow(*mscop, mscop->scop().scheduleRoot(), 100); auto code = emitCode(mscop); auto oDeclPos = code.find("float _O_0;"); @@ -631,7 +654,7 @@ TEST_F(MatMulBias, RegistersBelowFirstBand) { mscop->scop().scheduleRoot(), ScheduleTreeType::Band); ASSERT_GT(nodes.size(), 0u); auto node = nodes[0]; - promoteToRegistersBelow(*mscop, node); + promoteToRegistersBelow(*mscop, node, 100); auto code = emitCode(mscop); // Expecting 4 elements because we map the loop i in O[i][j] to 8 threads