[AMDGPU] Add a target option to disable aggressive FMA fusion

yiqian1 · yiqian1 · commit aab5e41bd837 · 2025-10-02T02:01:04.000Z
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1238,6 +1238,13 @@ def FeatureSetPrioIncWgInst : SubtargetFeature<"setprio-inc-wg-inst",
 // Subtarget Features (options and debugging)
 //===------------------------------------------------------------===//
 
+def FeatureDisableAggressiveFMAFusion : SubtargetFeature<
+  "disable-aggressive-fma-fusion",
+  "DisableAggressiveFMAFusion",
+  "true",
+  "Disable aggressive fusion of fmul and fadd/fsub into fma."
+>;
+
 // Ugly hack to accomodate assembling modules with mixed
 // wavesizes. Ideally we would have a mapping symbol in assembly which
 // would keep track of which sections of code should be treated as
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -76,6 +76,7 @@ class AMDGPUSubtarget {
   bool EnablePromoteAlloca = false;
   bool HasTrigReducedRange = false;
   bool FastFMAF32 = false;
+  bool DisableAggressiveFMAFusion = false;
   unsigned EUsPerCU = 4;
   unsigned MaxWavesPerEU = 10;
   unsigned LocalMemorySize = 0;
@@ -303,6 +304,10 @@ class AMDGPUSubtarget {
     return FastFMAF32;
   }
 
+  bool hasDisableAggressiveFMAFusion() const {
+    return DisableAggressiveFMAFusion;
+  }
+
   bool isPromoteAllocaEnabled() const {
     return EnablePromoteAlloca;
   }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -288,6 +288,7 @@ const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
     AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureFlatForGlobal,
     AMDGPU::FeaturePromoteAlloca, AMDGPU::FeatureUnalignedScratchAccess,
     AMDGPU::FeatureUnalignedAccessMode,
+    AMDGPU::FeatureDisableAggressiveFMAFusion,
 
     AMDGPU::FeatureAutoWaitcntBeforeBarrier,
 
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -85,6 +85,10 @@ GCNSubtarget &GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
       FullFS += "-wavefrontsize64,";
   }
 
+  // GFX9 defaults to fast-fmaf; the trailing ',' separates it from FS below.
+  if (GPU.contains_insensitive("gfx9") && !FS.contains_insensitive("fast-fmaf"))
+    FullFS += "+fast-fmaf,";
+
   FullFS += FS;
 
   ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6502,10 +6502,14 @@ bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
   // When fma is quarter rate, for f64 where add / sub are at best half rate,
   // most of these combines appear to be cycle neutral but save on instruction
   // count / code size.
-  return true;
+  return Subtarget->hasFastFMAF32() &&
+         !Subtarget->hasDisableAggressiveFMAFusion();
 }
 
-bool SITargetLowering::enableAggressiveFMAFusion(LLT Ty) const { return true; }
+bool SITargetLowering::enableAggressiveFMAFusion(LLT Ty) const {
+  return Subtarget->hasFastFMAF32() &&
+         !Subtarget->hasDisableAggressiveFMAFusion();
+}
 
 EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
                                          EVT VT) const {

Original file line number	Diff line number	Diff line change
`@@ -85,6 +85,10 @@ GCNSubtarget &GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,`
`85`	`85`	`FullFS += "-wavefrontsize64,";`
`86`	`86`	`}`
`87`	`87`
	`88`	`+ // GFX9 enables fast-fmaf by default`
	`89`	`+ if (GPU.contains_insensitive("gfx9") && !FS.contains_insensitive("fast-fmaf"))`
	`90`	`+ FullFS += "+fast-fmaf";`
	`91`	`+`
`88`	`92`	`FullFS += FS;`
`89`	`93`
`90`	`94`	`ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);`