diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index f01d5f6726822..32d20a8ccbefd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -588,13 +588,14 @@ class AMDGPULowerModuleLDS {
     return OrderedKernels;
   }
 
-  static void partitionVariablesIntoIndirectStrategies(
+  void partitionVariablesIntoIndirectStrategies(
       Module &M, LDSUsesInfoTy const &LDSUsesInfo,
       VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly,
       DenseSet<GlobalVariable *> &ModuleScopeVariables,
       DenseSet<GlobalVariable *> &TableLookupVariables,
       DenseSet<GlobalVariable *> &KernelAccessVariables,
-      DenseSet<GlobalVariable *> &DynamicVariables) {
+      DenseSet<GlobalVariable *> &DynamicVariables,
+      uint64_t MaybeModuleScopeStructSimSize = 0) {
 
     GlobalVariable *HybridModuleRoot =
         LoweringKindLoc != LoweringKind::hybrid
@@ -648,7 +649,22 @@ class AMDGPULowerModuleLDS {
         } else if (K.second.size() == 1) {
           KernelAccessVariables.insert(GV);
         } else if (set_is_subset(K.second, HybridModuleRootKernels)) {
-          ModuleScopeVariables.insert(GV);
+          // If the struct holding the module scope variables would exceed
+          // the maximum number of bytes of LDS that can be allocated to a
+          // single workgroup, switch to the table lookup strategy instead.
+          uint64_t LocalMemLimit = 0;
+          for (Function *F : K.second) {
+            if (!F->isDeclaration()) {
+              const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(*F);
+              LocalMemLimit = ST.getAddressableLocalMemorySize();
+              break;
+            }
+          }
+          if (MaybeModuleScopeStructSimSize <= LocalMemLimit)
+            ModuleScopeVariables.insert(GV);
+          else {
+            TableLookupVariables.insert(GV);
+          }
         } else {
           TableLookupVariables.insert(GV);
         }
@@ -1070,6 +1086,20 @@ class AMDGPULowerModuleLDS {
           M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly);
     }
 
+    // Simulate the partitioning to size the prospective module scope struct
+    DenseSet<GlobalVariable *> ModuleScopeVariablesSim;
+    DenseSet<GlobalVariable *> TableLookupVariablesSim;
+    DenseSet<GlobalVariable *> KernelAccessVariablesSim;
+    DenseSet<GlobalVariable *> DynamicVariablesSim;
+    partitionVariablesIntoIndirectStrategies(
+        M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly,
+        ModuleScopeVariablesSim, TableLookupVariablesSim,
+        KernelAccessVariablesSim, DynamicVariablesSim);
+    uint64_t MaybeModuleScopeStructSimSize = 0;
+    if (!ModuleScopeVariablesSim.empty())
+      MaybeModuleScopeStructSimSize = getLDSStructSize(
+          M, "llvm.amdgcn.module.lds.sim", ModuleScopeVariablesSim);
+
     // Partition variables accessed indirectly into the different strategies
     DenseSet<GlobalVariable *> ModuleScopeVariables;
     DenseSet<GlobalVariable *> TableLookupVariables;
@@ -1078,7 +1108,7 @@ class AMDGPULowerModuleLDS {
     partitionVariablesIntoIndirectStrategies(
         M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly,
         ModuleScopeVariables, TableLookupVariables, KernelAccessVariables,
-        DynamicVariables);
+        DynamicVariables, MaybeModuleScopeStructSimSize);
 
     // If the kernel accesses a variable that is going to be stored in the
     // module instance through a call then that kernel needs to allocate the
@@ -1183,11 +1213,14 @@ class AMDGPULowerModuleLDS {
             KernelToCreatedDynamicLDS.contains(&Func);
 
         uint32_t Offset = 0;
+        LLVM_DEBUG(dbgs() << "Function - " << Func.getName()
+                          << " - amdgpu-lds-size" << '\n');
 
         if (AllocateModuleScopeStruct) {
           // Allocated at zero, recorded once on construction, not once per
           // kernel
           Offset += DL.getTypeAllocSize(MaybeModuleScopeStruct->getValueType());
+          LLVM_DEBUG(dbgs() << "after ModuleScopeStruct - " << Offset << '\n');
         }
 
         if (AllocateKernelScopeStruct) {
@@ -1195,6 +1228,7 @@ class AMDGPULowerModuleLDS {
           Offset = alignTo(Offset, AMDGPU::getAlign(DL, KernelStruct));
           recordLDSAbsoluteAddress(&M, KernelStruct, Offset);
           Offset += DL.getTypeAllocSize(KernelStruct->getValueType());
+          LLVM_DEBUG(dbgs() << "after KernelScopeStruct - " << Offset << '\n');
         }
 
         // If there is dynamic allocation, the alignment needed is included in
@@ -1205,6 +1239,7 @@ class AMDGPULowerModuleLDS {
           GlobalVariable *DynamicVariable = KernelToCreatedDynamicLDS[&Func];
           Offset = alignTo(Offset, AMDGPU::getAlign(DL, DynamicVariable));
           recordLDSAbsoluteAddress(&M, DynamicVariable, Offset);
+          LLVM_DEBUG(dbgs() << "after DynamicVariable - " << Offset << '\n');
         }
 
         if (Offset != 0) {
@@ -1288,6 +1323,76 @@ class AMDGPULowerModuleLDS {
     return Changed;
   }
 
+  static uint64_t
+  getLDSStructSize(Module &M, std::string VarName,
+                   DenseSet<GlobalVariable *> const &LDSVarsToTransform,
+                   Function *F = nullptr) {
+
+    LLVMContext &Ctx = M.getContext();
+    const DataLayout &DL = M.getDataLayout();
+    assert(!LDSVarsToTransform.empty());
+
+    SmallVector<OptimizedStructLayoutField, 8> LayoutFields;
+    LayoutFields.reserve(LDSVarsToTransform.size());
+    {
+      auto Sorted = sortByName(std::vector<GlobalVariable *>(
+          LDSVarsToTransform.begin(), LDSVarsToTransform.end()));
+
+      for (GlobalVariable *GV : Sorted) {
+        OptimizedStructLayoutField F(GV,
+                                     DL.getTypeAllocSize(GV->getValueType()),
+                                     AMDGPU::getAlign(DL, GV));
+        LayoutFields.emplace_back(F);
+      }
+    }
+
+    performOptimizedStructLayout(LayoutFields);
+
+    std::vector<GlobalVariable *> LocalVars;
+    BitVector IsPaddingField;
+    LocalVars.reserve(LDSVarsToTransform.size()); // will be at least this large
+    IsPaddingField.reserve(LDSVarsToTransform.size());
+    {
+      uint64_t CurrentOffset = 0;
+      for (auto &F : LayoutFields) {
+        GlobalVariable *FGV =
+            static_cast<GlobalVariable *>(const_cast<void *>(F.Id));
+        Align DataAlign = F.Alignment;
+
+        uint64_t DataAlignV = DataAlign.value();
+        if (uint64_t Rem = CurrentOffset % DataAlignV) {
+          uint64_t Padding = DataAlignV - Rem;
+
+          // Append an array of padding bytes to meet alignment requested
+          // Note (o +      (a - (o % a)) ) % a == 0
+          //      (offset + Padding       ) % align == 0
+          Type *ATy = ArrayType::get(Type::getInt8Ty(Ctx), Padding);
+          LocalVars.push_back(new GlobalVariable(
+              M, ATy, false, GlobalValue::InternalLinkage,
+              PoisonValue::get(ATy), "", nullptr, GlobalValue::NotThreadLocal,
+              AMDGPUAS::LOCAL_ADDRESS, false));
+          IsPaddingField.push_back(true);
+          CurrentOffset += Padding;
+        }
+
+        LocalVars.push_back(FGV);
+        IsPaddingField.push_back(false);
+        CurrentOffset += F.Size;
+      }
+    }
+
+    std::vector<Type *> LocalVarTypes;
+    LocalVarTypes.reserve(LocalVars.size());
+    std::transform(
+        LocalVars.cbegin(), LocalVars.cend(), std::back_inserter(LocalVarTypes),
+        [](const GlobalVariable *V) -> Type * { return V->getValueType(); });
+
+    StructType *LDSTy = StructType::create(Ctx, LocalVarTypes, VarName + ".t");
+    Align StructAlign = AMDGPU::getAlign(DL, LocalVars[0]);
+    uint64_t AllocSize = DL.getTypeAllocSize(LDSTy);
+    return alignTo(AllocSize, StructAlign);
+  }
+
   static LDSVariableReplacement createLDSVariableReplacement(
       Module &M, std::string VarName,
       DenseSet<GlobalVariable *> const &LDSVarsToTransform) {
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-force-table.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-force-table.ll
new file mode 100644
index 0000000000000..6c749a1a4e3e7
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-force-table.ll
@@ -0,0 +1,264 @@
+; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx942 -amdgpu-lower-module-lds < %s 2>&1 | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx942 -passes=amdgpu-lower-module-lds < %s 2>&1 | FileCheck %s
+
+; This test has the following kernels with the following GV access patterns:
+; EN32 kernels
+;   EN32_compress_wrapperIhm - GVs 1, 2, 3, 4, 5, 6, 7
+;   EN32_compress_wrapperItm - GVs 8, 9, 10, 11, 12, 13, 7
+;   EN32_compress_wrapperIjm - GVs 15, 16, 17, 18, 19, 20, 7
+;   EN32_compress_wrapperImm - GVs 21, 22, 23, 24, 25, 26, 27, 7
+; EN64 kernels
+;   EN64_compress_wrapperIhm - GVs 1, 2, 3, 4, 5, 6, 7
+;   EN64_compress_wrapperItm - GVs 8, 9, 10, 11, 12, 13, 7
+;   EN64_compress_wrapperIjm - GVs 15, 16, 17, 18, 19, 20, 7
+;   EN64_compress_wrapperImm - GVs 21, 22, 23, 24, 25, 26, 27, 7
+
+; CHECK: define amdgpu_kernel void @EN32_compress_wrapperIhm() #0
+; CHECK: define amdgpu_kernel void @EN32_compress_wrapperItm() #2
+; CHECK: define amdgpu_kernel void @EN32_compress_wrapperIjm() #3
+; CHECK: define amdgpu_kernel void @EN32_compress_wrapperImm() #4
+; CHECK: define amdgpu_kernel void @EN64_compress_wrapperIhm() #0
+; CHECK: define amdgpu_kernel void @EN64_compress_wrapperItm() #2
+; CHECK: define amdgpu_kernel void @EN64_compress_wrapperIjm() #3
+; CHECK: define amdgpu_kernel void @EN64_compress_wrapperImm() #4
+
+; CHECK: attributes #0 = { "amdgpu-lds-size"="25760" "target-cpu"="gfx942" }
+; CHECK: attributes #2 = { "amdgpu-lds-size"="17560" "target-cpu"="gfx942" }
+; CHECK: attributes #3 = { "amdgpu-lds-size"="13464" "target-cpu"="gfx942" }
+; CHECK: attributes #4 = { "amdgpu-lds-size"="11424" "target-cpu"="gfx942" }
+
+%RawStorage1 = type { [1056 x i8] }
+%RawStorage2 = type { [4 x i8] }
+%RawStorage3 = type { [16 x i8] }
+
+@one = addrspace(3) global [1026 x i32] poison
+@two = addrspace(3) global [1026 x i32] poison
+@three = external addrspace(3) global [2048 x i32]
+@four = addrspace(3) global [2050 x i32] poison
+@five = addrspace(3) global [16 x i32] poison
+@six = external addrspace(3) global %RawStorage1
+@seven = addrspace(3) global %RawStorage2 poison
+@eight = addrspace(3) global [1026 x i32] poison
+@nine = addrspace(3) global [1026 x i32] poison
+@ten = external addrspace(3) global [1024 x i32]
+@eleven = addrspace(3) global [1026 x i32] poison
+@twelve = external addrspace(3) global [16 x i32]
+@thirteen = external addrspace(3) global %RawStorage1
+@fourteen = external addrspace(3) global [1 x i32]
+@fifteen = addrspace(3) global [1026 x i32] poison
+@sixteen = addrspace(3) global [1026 x i32] poison
+@seventeen = external addrspace(3) global [512 x i32]
+@eighteen = addrspace(3) global [514 x i32] poison
+@nineteen = external addrspace(3) global [16 x i32]
+@twenty = external addrspace(3) global %RawStorage1
+@twentyone = external addrspace(3) global [514 x i64]
+@twentytwo = external addrspace(3) global [514 x i64]
+@twentythree = external addrspace(3) global [256 x i32]
+@twentyfour = external addrspace(3) global [258 x i32]
+@twentyfive = external addrspace(3) global [16 x i32]
+@twentysix = external addrspace(3) global %RawStorage1
+@twentyseven = external addrspace(3) global %RawStorage3
+
+define amdgpu_kernel void @EN32_compress_wrapperIhm() {
+entry:
+  %0 = call i32 @Ihm_one()
+  ret void
+}
+
+define i32 @Ihm_one() {
+entry:
+  %0 = call i32 @Ihm_chunk()
+  ret i32 %0
+}
+
+define i32 @Ihm_chunk() {
+entry:
+  %0 = call i32 @Ihm_CascadedOpts()
+  ret i32 %0
+}
+
+define i32 @Ihm_CascadedOpts() {
+entry:
+  store ptr addrspacecast (ptr addrspace(3) @one to ptr), ptr null, align 8
+  store ptr addrspacecast (ptr addrspace(3) @two to ptr), ptr null, align 8
+  %add.ptr = getelementptr i32, ptr getelementptr inbounds (i32, ptr addrspacecast (ptr addrspace(3) @five to ptr), i64 1), i64 0
+  call void @Ihm_PS1_PT1_PS4_S7()
+  %call69 = call i32 @foo(ptr addrspacecast (ptr addrspace(3) @three to ptr), ptr addrspacecast (ptr addrspace(3) @four to ptr))
+  ret i32 %call69
+}
+
+define void @Ihm_PS1_PT1_PS4_S7() {
+entry:
+  store ptr addrspacecast (ptr addrspace(3) @six to ptr), ptr null, align 8
+  ret void
+}
+
+define i32 @foo(ptr %input, ptr %temp_storage) {
+entry:
+  call void @Itm_PjPS4()
+  ret i32 0
+}
+
+define void @Itm_PjPS4() {
+entry:
+  call void @Itm_PS1_Pj()
+  ret void
+}
+
+define void @Itm_PS1_Pj() {
+entry:
+  store ptr addrspacecast (ptr addrspace(3) @seven to ptr), ptr null, align 8
+  ret void
+}
+
+define amdgpu_kernel void @EN32_compress_wrapperItm() {
+entry:
+  %0 = call i32 @Itm_one()
+  ret void
+}
+
+define i32 @Itm_one() {
+entry:
+  %0 = call i32 @Itm_chunk()
+  ret i32 %0
+}
+
+define i32 @Itm_chunk() {
+entry:
+  %0 = call i32 @Itm_CascadedOpts()
+  ret i32 %0
+}
+
+define i32 @Itm_CascadedOpts() {
+entry:
+  store ptr addrspacecast (ptr addrspace(3) @eight to ptr), ptr null, align 8
+  store ptr addrspacecast (ptr addrspace(3) @nine to ptr), ptr null, align 8
+  %add.ptr = getelementptr i32, ptr getelementptr inbounds (i32, ptr addrspacecast (ptr addrspace(3) @twelve to ptr), i64 1), i64 0
+  call void @Itm_PS1_PT1_PS4_S7()
+  %call69 = call i32 @foo(ptr addrspacecast (ptr addrspace(3) @ten to ptr), ptr addrspacecast (ptr addrspace(3) @eleven to ptr))
+  ret i32 %call69
+}
+
+define void @Itm_PS1_PT1_PS4_S7() {
+entry:
+  store ptr addrspacecast (ptr addrspace(3) @thirteen to ptr), ptr null, align 8
+  ret void
+}
+
+define amdgpu_kernel void @EN32_compress_wrapperIjm() {
+entry:
+  %arrayidx = getelementptr [1 x i32], ptr addrspacecast (ptr addrspace(3) @fourteen to ptr), i64 0, i64 0
+  %0 = call i32 @Ijm_one()
+  ret void
+}
+
+define i32 @Ijm_one() {
+entry:
+  %0 = call i32 @Ijm_chunk()
+  ret i32 %0
+}
+
+define i32 @Ijm_chunk() {
+entry:
+  %0 = call i32 @Ijm_CascadedOpts()
+  ret i32 %0
+}
+
+define i32 @Ijm_CascadedOpts() {
+entry:
+  store ptr addrspacecast (ptr addrspace(3) @fifteen to ptr), ptr null, align 8
+  store ptr addrspacecast (ptr addrspace(3) @sixteen to ptr), ptr null, align 8
+  %add.ptr = getelementptr i32, ptr getelementptr inbounds (i32, ptr addrspacecast (ptr addrspace(3) @nineteen to ptr), i64 1), i64 0
+  call void @Ijm_PS1_PT1_PS4_S7()
+  %call69 = call i32 @foo(ptr addrspacecast (ptr addrspace(3) @seventeen to ptr), ptr addrspacecast (ptr addrspace(3) @eighteen to ptr))
+  ret i32 %call69
+}
+
+define void @Ijm_PS1_PT1_PS4_S7() {
+entry:
+  store ptr addrspacecast (ptr addrspace(3) @twenty to ptr), ptr null, align 8
+  ret void
+}
+
+define amdgpu_kernel void @EN32_compress_wrapperImm() {
+entry:
+  %0 = call i32 @Imm_one()
+  ret void
+}
+
+define i32 @Imm_one() {
+entry:
+  %0 = call i32 @Imm_chunk()
+  ret i32 %0
+}
+
+define i32 @Imm_chunk() {
+entry:
+  %0 = call i32 @Imm_CascadedOpts()
+  ret i32 %0
+}
+
+define i32 @Imm_CascadedOpts() {
+entry:
+  store ptr addrspacecast (ptr addrspace(3) @twentyone to ptr), ptr null, align 8
+  store ptr addrspacecast (ptr addrspace(3) @twentytwo to ptr), ptr null, align 8
+  %add.ptr = getelementptr i32, ptr getelementptr inbounds (i32, ptr addrspacecast (ptr addrspace(3) @twentyfive to ptr), i64 1), i64 0
+  br i1 false, label %for.body65, label %for.end102
+
+for.body65:
+  call void @Imm_PS1_PT1_PS4_S7()
+  %call69 = call i32 @foo(ptr addrspacecast (ptr addrspace(3) @twentythree to ptr), ptr addrspacecast (ptr addrspace(3) @twentyfour to ptr))
+  ret i32 %call69
+
+for.end102:
+  %call106 = call i32 @Imm_PjPKjPS5_S6_b()
+  ret i32 0
+}
+
+define void @Imm_PS1_PT1_PS4_S7() {
+entry:
+  store ptr addrspacecast (ptr addrspace(3) @twentysix to ptr), ptr null, align 8
+  ret void
+}
+
+define i32 @Imm_PjPKjPS5_S6_b() {
+entry:
+  call void @Imm_PjPS4()
+  ret i32 0
+}
+
+define void @Imm_PjPS4() {
+entry:
+  call void @Imm_PS1_Pj()
+  ret void
+}
+
+define void @Imm_PS1_Pj() {
+entry:
+  store ptr addrspacecast (ptr addrspace(3) @twentyseven to ptr), ptr null, align 8
+  ret void
+}
+
+define amdgpu_kernel void @EN64_compress_wrapperIhm() {
+entry:
+  %0 = call i32 @Ihm_one()
+  ret void
+}
+
+define amdgpu_kernel void @EN64_compress_wrapperItm() {
+entry:
+  %0 = call i32 @Itm_one()
+  ret void
+}
+
+define amdgpu_kernel void @EN64_compress_wrapperIjm() {
+entry:
+  %0 = call i32 @Ijm_one()
+  ret void
+}
+
+define amdgpu_kernel void @EN64_compress_wrapperImm() {
+entry:
+  %0 = call i32 @Imm_one()
+  ret void
+}