This repository was archived by the owner on Apr 28, 2023. It is now read-only.

Register promotion: limit the number of elements promoted per thread #587

Open · wants to merge 9 commits into master
18 changes: 14 additions & 4 deletions tc/core/cuda/cuda.cc
@@ -30,7 +30,8 @@ DEFINE_bool(use_nvprof, false, "Start / stop nvprof");

namespace {

std::tuple<std::vector<std::string>, std::vector<size_t>> init() {
std::tuple<std::vector<std::string>, std::vector<size_t>, std::vector<size_t>>
init() {
int deviceCount = 0;
auto err_id = cudaGetDeviceCount(&deviceCount);
if (err_id == 35 or err_id == 30) {
@@ -44,14 +45,16 @@ std::tuple<std::vector<std::string>, std::vector<size_t>> init() {
}
std::vector<std::string> gpuNames;
std::vector<size_t> sharedMemSizes;
std::vector<size_t> registersPerBlock;
gpuNames.reserve(deviceCount);
for (int i = 0; i < deviceCount; ++i) {
cudaDeviceProp deviceProp;
TC_CUDA_RUNTIMEAPI_ENFORCE(cudaGetDeviceProperties(&deviceProp, i));
gpuNames.emplace_back(deviceProp.name);
sharedMemSizes.emplace_back(deviceProp.sharedMemPerBlock);
registersPerBlock.emplace_back(deviceProp.regsPerBlock);
}
return std::make_tuple(gpuNames, sharedMemSizes);
return std::make_tuple(gpuNames, sharedMemSizes, registersPerBlock);
}

} // namespace
@@ -61,8 +64,8 @@ CudaGPUInfo& CudaGPUInfo::GPUInfo() {
static thread_local bool inited = false;
if (!inited) {
auto infos = init();
pInfo = std::unique_ptr<CudaGPUInfo>(
new CudaGPUInfo(std::get<0>(infos), std::get<1>(infos)));
pInfo = std::unique_ptr<CudaGPUInfo>(new CudaGPUInfo(
std::get<0>(infos), std::get<1>(infos), std::get<2>(infos)));
inited = true;
}
return *pInfo;
@@ -102,4 +105,11 @@ size_t CudaGPUInfo::SharedMemorySize() const {
}
return sharedMemSizes_.at(CurrentGPUId());
}

size_t CudaGPUInfo::RegistersPerBlock() const {
if (NumberGPUs() == 0) {
return 0; // no registers if no GPUs
}
return registersPerBlock_.at(CurrentGPUId());
}
} // namespace tc
9 changes: 7 additions & 2 deletions tc/core/cuda/cuda.h
@@ -98,8 +98,11 @@ struct WithCudaDevice {
class CudaGPUInfo {
CudaGPUInfo(
const std::vector<std::string>& gpuNames,
const std::vector<size_t>& sharedMemSizes)
: gpuNames_(gpuNames), sharedMemSizes_(sharedMemSizes) {}
const std::vector<size_t>& sharedMemSizes,
const std::vector<size_t>& registersPerBlock)
: gpuNames_(gpuNames),
sharedMemSizes_(sharedMemSizes),
registersPerBlock_(registersPerBlock) {}

public:
static CudaGPUInfo& GPUInfo();
@@ -112,9 +115,11 @@ class CudaGPUInfo {
std::string GetGPUName(int id = -1) const;
std::string getCudaDeviceStr() const;
size_t SharedMemorySize() const;
size_t RegistersPerBlock() const;

std::vector<std::string> gpuNames_;
std::vector<size_t> sharedMemSizes_;
std::vector<size_t> registersPerBlock_;
};

struct CudaProfiler {
5 changes: 5 additions & 0 deletions tc/core/cuda/cuda_mapping_options.cc
@@ -299,6 +299,11 @@ CudaMappingOptions& CudaMappingOptions::sharedDepth(uint32_t depth) {
return *this;
}

CudaMappingOptions& CudaMappingOptions::maxPrivateElements(uint64_t nElements) {
ownedProto_.set_max_private_elements(nElements);
return *this;
}

CudaMappingOptions& CudaMappingOptions::mapToThreads(
const std::string& commaSeparatedSizes) {
auto sizes = parseCommaSeparatedIntegers<uint64_t>(commaSeparatedSizes);
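For reference, a minimal usage sketch of the new setter together with the existing private-memory options. The makeNaiveMappingOptions factory and the concrete values (depth 5, a cap of 64 elements) are illustrative assumptions, not part of this change:

```cpp
#include "tc/core/cuda/cuda_mapping_options.h"

int main() {
  // Cap register promotion at 64 elements per thread (hypothetical values).
  auto options = tc::CudaMappingOptions::makeNaiveMappingOptions()
                     .usePrivateMemory(true)
                     .privateDepth(5)
                     .maxPrivateElements(64);
  (void)options; // would normally be passed on to the mapper/autotuner
  return 0;
}
```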
1 change: 1 addition & 0 deletions tc/core/cuda/cuda_mapping_options.h
@@ -197,6 +197,7 @@ class CudaMappingOptions {
CudaMappingOptions& useReadOnlyCache(bool b);
CudaMappingOptions& privateDepth(uint32_t depth);
CudaMappingOptions& sharedDepth(uint32_t depth);
CudaMappingOptions& maxPrivateElements(uint64_t nElements);
///@}

/// Static constructors for predefined strategies.
4 changes: 4 additions & 0 deletions tc/core/cuda/cuda_mapping_options_cpp_printer.cc
@@ -40,6 +40,10 @@ CudaMappingOptionsCppPrinter& operator<<(
}
prn.printValueOption("privateDepth", cudaOptions.proto().private_depth());
prn.printValueOption("sharedDepth", cudaOptions.proto().shared_depth());
if (cudaOptions.proto().has_max_private_elements()) {
prn.printValueOption(
"maxPrivateElements", cudaOptions.proto().max_private_elements());
}
prn.endStmt();
return prn;
}
11 changes: 11 additions & 0 deletions tc/core/gpu.h
@@ -36,4 +36,15 @@ inline size_t querySharedMemorySize() {
#endif
}

/// Get the maximum number of registers per block provided by the GPU device
/// active in the current thread. The call is forwarded to the GPU driver.
/// If the thread has no associated GPU, return 0.
inline size_t queryRegistersPerBlock() {
#if TC_WITH_CUDA && !defined(NO_CUDA_SDK)
return CudaGPUInfo::GPUInfo().RegistersPerBlock();
#else
return 0;
#endif
}

} // namespace tc
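A short usage sketch of the new query, assuming a translation unit linked against the core library; it prints the per-block register count of the GPU bound to the current thread (0 when built without CUDA or when no GPU is present):

```cpp
#include <iostream>

#include "tc/core/gpu.h"

int main() {
  // Returns deviceProp.regsPerBlock for the current GPU, or 0 without a GPU.
  std::cout << "registers per block: " << tc::queryRegistersPerBlock() << "\n";
  return 0;
}
```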
9 changes: 8 additions & 1 deletion tc/core/polyhedral/cuda/mapped_scop.cc
@@ -1086,7 +1086,14 @@ std::unique_ptr<MappedScop> MappedScop::makeWithOuterBlockInnerThreadStrategy(

// 9. Promote to registers below the loops mapped to threads.
if (cudaOptions.proto().use_private_memory()) {
promoteToRegistersAtDepth(*mappedScop, cudaOptions.proto().private_depth());
auto blockSizes = cudaOptions.block.extractVector();
auto nThreadsPerBlock = std::accumulate(
blockSizes.begin(), blockSizes.end(), 1, std::multiplies<size_t>());
auto nElementsPerThread = cudaOptions.proto().has_max_private_elements()
? cudaOptions.proto().max_private_elements()
: queryRegistersPerBlock() / nThreadsPerBlock;
promoteToRegistersAtDepth(
*mappedScop, cudaOptions.proto().private_depth(), nElementsPerThread);
}

LOG_IF(INFO, FLAGS_debug_tc_mapper)
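For intuition, here is a standalone sketch of the fallback computed above when max_private_elements is unset: divide the device's registers-per-block limit by the block's thread count. The 65536-register limit and the 32x4x1 block are made-up example values, not numbers taken from this change:

```cpp
#include <cstddef>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

int main() {
  size_t registersPerBlock = 65536;            // e.g. a typical CUDA device limit
  std::vector<size_t> blockSizes = {32, 4, 1}; // 128 threads per block
  size_t nThreadsPerBlock = std::accumulate(
      blockSizes.begin(), blockSizes.end(), size_t(1),
      std::multiplies<size_t>());
  // 65536 / 128 = 512 elements per thread by default.
  std::cout << registersPerBlock / nThreadsPerBlock << "\n";
  return 0;
}
```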
125 changes: 77 additions & 48 deletions tc/core/polyhedral/cuda/memory_promotion_heuristic.cc
@@ -446,6 +446,51 @@ bool isInThreadMappedScope(
return false;
}

static std::vector<std::pair<isl::id, TensorGroupsInfo>> sortTensorGroupMap(
TensorGroups&& groupMap) {
// Prepare groups for sorting, to have a specified order necessary for
// reproducibility and tests.
using TensorGroupList = std::pair<isl::id, TensorGroupsInfo>;
std::vector<TensorGroupList> groupLists(
std::make_move_iterator(groupMap.begin()),
std::make_move_iterator(groupMap.end()));

// Computes the total number of references in all groups.
auto refsCount = [](const TensorGroupsInfo& info) {
size_t refs = 0;
for (auto const& group : info) {
refs += group->referenceIds().size();
}
return refs;
};

// Sort by the total number of references, then by name. Because names are
// guaranteed to be unique, the order is total.
std::sort(
groupLists.begin(),
groupLists.end(),
[refsCount](const TensorGroupList& l1, const TensorGroupList& l2) {
auto r1 = refsCount(l1.second);
auto r2 = refsCount(l2.second);
return r1 == r2 ? l1.first.get_name() < l2.first.get_name() : r1 < r2;
});
return groupLists;
}

/* Sorts the given vector of tensor groups in place by the number of
* references in the group, in decreasing order. This prioritizes groups with
* more references, as they are more likely to benefit from promotion.
*/
static void sortTensorGroups(TensorGroupsInfo& tensorGroups) {
std::sort(
tensorGroups.begin(),
tensorGroups.end(),
[](const std::unique_ptr<TensorReferenceGroup>& group1,
const std::unique_ptr<TensorReferenceGroup>& group2) {
return group1->referenceIds().size() > group2->referenceIds().size();
});
}

/*
* Promote to shared memory in "scop" below "node". Use at most
* "remainingMemory" bytes, and update the variable to reflect the amount of
@@ -474,49 +519,14 @@ void promoteToSharedBelow(
auto partialSched = partialSchedule(root, node);
auto mapping = collectMappingsTo<mapping::BlockId>(scop);

auto groupMap = TensorReferenceGroup::accessedWithin(
partialSched.intersect_domain(mapping), scop.body);
auto groupLists = sortTensorGroupMap(TensorReferenceGroup::accessedWithin(
partialSched.intersect_domain(mapping), scop.body));
// Pure affine schedule without (mapping) filters.
auto partialSchedMupa = partialScheduleMupa(root, node);

// Prepare groups for sorting, to have specified order necessary for
// reproducibility and tests.
using TensorGroupList = std::pair<isl::id, TensorGroupsInfo>;
std::vector<TensorGroupList> groupLists(
std::make_move_iterator(groupMap.begin()),
std::make_move_iterator(groupMap.end()));

// Computes the total number of references in all groups.
auto refsCount = [](const TensorGroupsInfo& info) {
size_t refs = 0;
for (auto const& group : info) {
refs += group->referenceIds().size();
}
return refs;
};

// Sort by the total number of references, then by name. Because names are
// guarenteed to be unique, the order is total.
std::sort(
groupLists.begin(),
groupLists.end(),
[refsCount](const TensorGroupList& l1, const TensorGroupList& l2) {
auto r1 = refsCount(l1.second);
auto r2 = refsCount(l2.second);
return r1 == r2 ? l1.first.get_name() < l2.first.get_name() : r1 < r2;
});
for (auto& tensorGroups : groupLists) {
auto tensorId = tensorGroups.first;
// Sort the reference groups to prioritize groups with more references as
// they are more likely to benefit from promotion.
std::sort(
tensorGroups.second.begin(),
tensorGroups.second.end(),
[refsCount](
const std::unique_ptr<TensorReferenceGroup>& group1,
const std::unique_ptr<TensorReferenceGroup>& group2) {
return group1->referenceIds().size() > group2->referenceIds().size();
});
sortTensorGroups(tensorGroups.second);

for (auto& group : tensorGroups.second) {
auto sizes = group->approximationSizes();
@@ -620,8 +630,17 @@ void promoteToSharedAtDepth(
* of "mscop". Throw if promotion would violate the well-formedness of the
* schedule tree, in particular in cases of promotion immediately below
* a set/sequence node or immediately above a thread-specific marker node.
* Promote at most "maxElements" elements per thread and return the difference
* between "maxElements" and the number of actually promoted elements. Note
* that this function does not differentiate between the types and sizes of the
* promoted elements because register allocation cannot be controlled at the
* CUDA level anyway. Instead, the "maxElements" value controls how much
* register promotion is performed overall.
*/
void promoteToRegistersBelow(MappedScop& mscop, detail::ScheduleTree* scope) {
size_t promoteToRegistersBelow(
MappedScop& mscop,
detail::ScheduleTree* scope,
size_t maxElements) {
// Cannot promote below a sequence or a set node. Promotion may insert an
// extension node, but sequence/set must be followed by filters.
if (scope->as<detail::ScheduleTreeSequence>() ||
@@ -646,8 +665,8 @@ void promoteToRegistersBelow(MappedScop& mscop, detail::ScheduleTree* scope) {
auto mapping =
collectMappingsTo<mapping::ThreadId>(scop).intersect(blockMapping);
auto schedule = partialSchedule(scop.scheduleRoot(), scope);
auto groupMap = TensorReferenceGroup::accessedWithin(
schedule.intersect_domain(mapping), scop.body);
auto groupLists = sortTensorGroupMap(TensorReferenceGroup::accessedWithin(
schedule.intersect_domain(mapping), scop.body));

auto threadSchedule = mscop.threadMappingSchedule(mscop.schedule());
auto blockSchedule = mscop.blockMappingSchedule(mscop.schedule());
@@ -663,17 +682,22 @@ void promoteToRegistersBelow(MappedScop& mscop, detail::ScheduleTree* scope) {
// identical dimensions without affecting the result of the checks.
partialSchedMupa = partialSchedMupa.flat_range_product(blockSchedule);

for (auto& tensorGroups : groupMap) {
for (auto& tensorGroups : groupLists) {
auto tensorId = tensorGroups.first;

// TODO: sorting of groups and counting the number of promoted elements
sortTensorGroups(tensorGroups.second);

for (auto& group : tensorGroups.second) {
auto sizes = group->approximationSizes();
// No point in promoting a scalar that will go to a register anyway.
if (sizes.size() == 0) {
continue;
}
// Do not promote the group if it requires more elements than remain in the budget.
auto nElements = std::accumulate(
sizes.begin(), sizes.end(), 1u, std::multiplies<size_t>());
if (nElements > maxElements) {
continue;
}
if (!isPromotableToRegistersBelow(
*group, root, scope, partialSchedMupa, threadSchedule)) {
continue;
@@ -693,13 +717,14 @@ void promoteToRegistersBelow(MappedScop& mscop, detail::ScheduleTree* scope) {
std::move(group),
scope,
partialSched);
maxElements -= nElements;
}
}

// Return immediately if nothing was promoted.
if (scope->numChildren() == 0 ||
!matchOne(extension(sequence(any())), scope->child({0}))) {
return;
return maxElements;
}

// If promoting above thread mapping, insert synchronizations.
@@ -715,15 +740,19 @@ void promoteToRegistersBelow(MappedScop& mscop, detail::ScheduleTree* scope) {
if (functional::Filter(isMappingTo<mapping::ThreadId>, ancestors).empty()) {
scop.insertSyncsAroundSeqChildren(scope->child({0, 0}));
}
return maxElements;
}

/*
* Promote to registers below "depth" schedule dimensions. Split bands if
* necessary to create promotion scopes. Do not promote if it would require
* splitting the band mapped to threads as we assume only one band can be
* mapped.
* mapped. Use at most "maxElements" elements per thread across all promoted
* subtrees.
*/
void promoteToRegistersAtDepth(MappedScop& mscop, size_t depth) {
void promoteToRegistersAtDepth(
MappedScop& mscop,
size_t depth,
size_t maxElements) {
using namespace detail;

auto root = mscop.scop().scheduleRoot();
@@ -757,7 +786,7 @@ void promoteToRegistersAtDepth(MappedScop& mscop, size_t depth) {
auto scopes = functional::Map(findScope, bands);

for (auto scope : scopes) {
promoteToRegistersBelow(mscop, scope);
maxElements = promoteToRegistersBelow(mscop, scope, maxElements);
}
}

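To illustrate the budgeting performed by the promotion loop in isolation, here is a self-contained toy that walks a list of candidate group sizes and greedily consumes a per-thread element budget, skipping groups that no longer fit. The sizes and the budget are invented for the example:

```cpp
#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  size_t maxElements = 64;                         // per-thread element budget
  const std::vector<size_t> groupSizes = {32, 48, 16, 8};
  for (size_t nElements : groupSizes) {
    if (nElements > maxElements) {
      std::cout << "skip group of " << nElements << " elements\n";
      continue;
    }
    maxElements -= nElements;                      // promote and shrink budget
    std::cout << "promote group of " << nElements << " elements, "
              << maxElements << " remaining\n";
  }
  return 0;
}
```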
10 changes: 8 additions & 2 deletions tc/core/polyhedral/cuda/memory_promotion_heuristic.h
@@ -41,9 +41,15 @@ void promoteToSharedAtDepth(
std::size_t sharedMemorySize,
bool unrollCopies);

void promoteToRegistersBelow(MappedScop& mscop, detail::ScheduleTree* scope);
size_t promoteToRegistersBelow(
MappedScop& mscop,
detail::ScheduleTree* scope,
std::size_t maxElements = SIZE_MAX);

void promoteToRegistersAtDepth(MappedScop& scop, std::size_t depth);
void promoteToRegistersAtDepth(
MappedScop& scop,
std::size_t depth,
std::size_t maxElements = SIZE_MAX);

} // namespace cuda
} // namespace polyhedral
3 changes: 3 additions & 0 deletions tc/proto/mapping_options.proto
@@ -74,6 +74,9 @@ message CudaMappingOptionsProto {
optional uint32 private_depth = 9;
// Depth of promotion to shared memory, ignored if use_shared_memory is false.
optional uint32 shared_depth = 10;
// Maximum number of elements to promote to registers per thread. If not
// provided, the number of 32-bit registers available per thread is used.
optional uint64 max_private_elements = 11;
}

message CpuMappingOptionsProto {
5 changes: 5 additions & 0 deletions tensor_comprehensions/pybinds/tclib.cc
@@ -672,6 +672,11 @@ PYBIND11_MODULE(tclib, m) {
"usePrivateMemory",
&tc::CudaMappingOptions::usePrivateMemory,
"Create thread-local copies of data in private memory")
.def(
"maxPrivateElements",
&tc::CudaMappingOptions::maxPrivateElements,
"The maximum number of elements per thread for which thread-local "
"copies are created")
.def(
"unrollCopyShared",
&tc::CudaMappingOptions::unrollCopyShared,