Skip to content
Closed
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
110 changes: 106 additions & 4 deletions llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -588,13 +588,14 @@ class AMDGPULowerModuleLDS {
return OrderedKernels;
}

static void partitionVariablesIntoIndirectStrategies(
void partitionVariablesIntoIndirectStrategies(
Module &M, LDSUsesInfoTy const &LDSUsesInfo,
VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly,
DenseSet<GlobalVariable *> &ModuleScopeVariables,
DenseSet<GlobalVariable *> &TableLookupVariables,
DenseSet<GlobalVariable *> &KernelAccessVariables,
DenseSet<GlobalVariable *> &DynamicVariables) {
DenseSet<GlobalVariable *> &DynamicVariables,
uint64_t MaybeModuleScopeStructSimSize = 0) {

GlobalVariable *HybridModuleRoot =
LoweringKindLoc != LoweringKind::hybrid
Expand Down Expand Up @@ -648,7 +649,19 @@ class AMDGPULowerModuleLDS {
} else if (K.second.size() == 1) {
KernelAccessVariables.insert(GV);
} else if (set_is_subset(K.second, HybridModuleRootKernels)) {
ModuleScopeVariables.insert(GV);
uint64_t LocalMemLimit = 0;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The comment at the top of the file claims the hybrid strategy offers precise allocation, so is there just a bug somewhere?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think there is a bug. Below is the access pattern of the test.

+; This test has the following kernels with following GV access pattern
+; EN32 kernels
+; EN32_compress_wrapperIhm - GV's 1, 2, 3, 4, 5, 6, 7
+; EN32_compress_wrapperItm - GV's 8, 9, 10, 11, 12, 13, 7
+; EN32_compress_wrapperIjm - GV's 15, 16, 17, 18, 19, 20, 7
+; EN32_compress_wrapperImm - GV's 21, 22, 23, 24, 25, 26, 27, 7
+; EN64 kernels
+; EN64_compress_wrapperIhm - GV's 1, 2, 3, 4, 5, 6, 7
+; EN64_compress_wrapperItm - GV's 8, 9, 10, 11, 12, 13, 7
+; EN64_compress_wrapperIjm - GV's 15, 16, 17, 18, 19, 20, 7
+; EN64_compress_wrapperImm - GV's 21, 22, 23, 24, 25, 26, 27, 7

for (Function &F : M) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Comment what this is doing

if (!F.isDeclaration()) {
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
LocalMemLimit = ST.getAddressableLocalMemorySize();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The limit should really come from the entry point kernel, not just the first function you happen to find

break;
}
}
if (MaybeModuleScopeStructSimSize <= LocalMemLimit)
ModuleScopeVariables.insert(GV);
else {
TableLookupVariables.insert(GV);
}
} else {
TableLookupVariables.insert(GV);
}
Expand Down Expand Up @@ -1070,6 +1083,20 @@ class AMDGPULowerModuleLDS {
M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly);
}

// Partition variables accessed indirectly into the different strategies
DenseSet<GlobalVariable *> ModuleScopeVariablesSim;
DenseSet<GlobalVariable *> TableLookupVariablesSim;
DenseSet<GlobalVariable *> KernelAccessVariablesSim;
DenseSet<GlobalVariable *> DynamicVariablesSim;
partitionVariablesIntoIndirectStrategies(
M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly,
ModuleScopeVariablesSim, TableLookupVariablesSim,
KernelAccessVariablesSim, DynamicVariablesSim);
uint64_t MaybeModuleScopeStructSimSize = 0;
if (!ModuleScopeVariablesSim.empty())
MaybeModuleScopeStructSimSize = getLDSStructSize(
M, "llvm.amdgcn.module.lds.sim", ModuleScopeVariablesSim);

// Partition variables accessed indirectly into the different strategies
DenseSet<GlobalVariable *> ModuleScopeVariables;
DenseSet<GlobalVariable *> TableLookupVariables;
Expand All @@ -1078,7 +1105,7 @@ class AMDGPULowerModuleLDS {
partitionVariablesIntoIndirectStrategies(
M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly,
ModuleScopeVariables, TableLookupVariables, KernelAccessVariables,
DynamicVariables);
DynamicVariables, MaybeModuleScopeStructSimSize);

// If the kernel accesses a variable that is going to be stored in the
// module instance through a call then that kernel needs to allocate the
Expand Down Expand Up @@ -1183,18 +1210,22 @@ class AMDGPULowerModuleLDS {
KernelToCreatedDynamicLDS.contains(&Func);

uint32_t Offset = 0;
LLVM_DEBUG(dbgs() << "Function - " << Func.getName()
<< " - amdgpu-lds-size" << '\n');

if (AllocateModuleScopeStruct) {
// Allocated at zero, recorded once on construction, not once per
// kernel
Offset += DL.getTypeAllocSize(MaybeModuleScopeStruct->getValueType());
LLVM_DEBUG(dbgs() << "after ModuleScopeStruct - " << Offset << '\n');
}

if (AllocateKernelScopeStruct) {
GlobalVariable *KernelStruct = Replacement->second.SGV;
Offset = alignTo(Offset, AMDGPU::getAlign(DL, KernelStruct));
recordLDSAbsoluteAddress(&M, KernelStruct, Offset);
Offset += DL.getTypeAllocSize(KernelStruct->getValueType());
LLVM_DEBUG(dbgs() << "after KernelScopeStruct - " << Offset << '\n');
}

// If there is dynamic allocation, the alignment needed is included in
Expand All @@ -1205,6 +1236,7 @@ class AMDGPULowerModuleLDS {
GlobalVariable *DynamicVariable = KernelToCreatedDynamicLDS[&Func];
Offset = alignTo(Offset, AMDGPU::getAlign(DL, DynamicVariable));
recordLDSAbsoluteAddress(&M, DynamicVariable, Offset);
LLVM_DEBUG(dbgs() << "after DynamicVariable - " << Offset << '\n');
}

if (Offset != 0) {
Expand Down Expand Up @@ -1288,6 +1320,76 @@ class AMDGPULowerModuleLDS {
return Changed;
}

/// Compute the total allocation size, in bytes, of the packed LDS struct
/// that would be created from \p LDSVarsToTransform, without materializing
/// any IR.
///
/// The previous implementation built real padding GlobalVariables (via
/// `new GlobalVariable(M, ...)`) and a named StructType purely to measure
/// the size, leaking those IR objects into the module/context on every
/// call. Since this function is used as a what-if size query, it must be
/// a pure computation: we run the same optimized struct layout and then
/// accumulate field sizes and alignment padding arithmetically.
///
/// \param M                  Module providing the DataLayout (not mutated).
/// \param VarName            Unused; retained for interface compatibility
///                           with createLDSVariableReplacement-style callers.
/// \param LDSVarsToTransform Non-empty set of LDS globals to lay out.
/// \param F                  Unused; retained for interface compatibility.
/// \return The alloc size of the laid-out struct, rounded up to the
///         alignment of the first laid-out member (matching how the real
///         module-scope struct is aligned when it is actually created).
static uint64_t
getLDSStructSize(Module &M, std::string VarName,
                 DenseSet<GlobalVariable *> const &LDSVarsToTransform,
                 Function *F = nullptr) {
  (void)VarName;
  (void)F;
  const DataLayout &DL = M.getDataLayout();
  assert(!LDSVarsToTransform.empty());

  // Sort by name first so the layout (and therefore the reported size) is
  // deterministic and identical to what createLDSVariableReplacement does.
  SmallVector<OptimizedStructLayoutField, 8> LayoutFields;
  LayoutFields.reserve(LDSVarsToTransform.size());
  {
    auto Sorted = sortByName(std::vector<GlobalVariable *>(
        LDSVarsToTransform.begin(), LDSVarsToTransform.end()));

    for (GlobalVariable *GV : Sorted) {
      OptimizedStructLayoutField Field(
          GV, DL.getTypeAllocSize(GV->getValueType()), AMDGPU::getAlign(DL, GV));
      LayoutFields.emplace_back(Field);
    }
  }

  performOptimizedStructLayout(LayoutFields);

  // Walk the laid-out fields, inserting (virtual) padding to honour each
  // field's alignment, exactly as the real lowering inserts i8-array
  // padding members — but without creating any globals or types.
  uint64_t CurrentOffset = 0;
  Align MaxFieldAlign = Align(1);
  for (const auto &Field : LayoutFields) {
    CurrentOffset = alignTo(CurrentOffset, Field.Alignment);
    CurrentOffset += Field.Size;
    MaxFieldAlign = std::max(MaxFieldAlign, Field.Alignment);
  }

  // A non-packed struct's alloc size is rounded up to its natural
  // alignment (the maximum member alignment); mirror that here.
  uint64_t AllocSize = alignTo(CurrentOffset, MaxFieldAlign);

  // The created struct is then aligned to its first member's alignment.
  GlobalVariable *FirstGV = static_cast<GlobalVariable *>(
      const_cast<void *>(LayoutFields.front().Id));
  Align StructAlign = AMDGPU::getAlign(DL, FirstGV);
  return alignTo(AllocSize, StructAlign);
}

static LDSVariableReplacement createLDSVariableReplacement(
Module &M, std::string VarName,
DenseSet<GlobalVariable *> const &LDSVarsToTransform) {
Expand Down
Loading