From c629b06ac06a7f8596fa2ba9b7728d3fd8314f11 Mon Sep 17 00:00:00 2001
From: Artemiy Bulavin <artemiyb@graphcore.ai>
Date: Fri, 21 Mar 2025 11:59:45 -0400
Subject: [PATCH 1/2] Bump to llvm/llvm-project@0aa5ba4 (#6266)

Updating LLVM in order to pull in the following change:

- https://github.com/llvm/llvm-project/pull/128566

For context, crash reproduction generation in MLIR will run the
`PassManager`'s passes in a child thread. The above PR fixes crashes for
when passes such as `add_di_scope` add `DistinctAttr` to the IR and
their storage is then accessed later once the child thread joins.
Pulling this in improves QoL for out-of-tree projects and makes the pass
manager more robust to the use of `DistinctAttr`.

This pin update has also introduced the deprecation of a
`llvm::TargetMachine::createTargetMachine` overload. I've updated the
callsites to use the non-deprecated overloads.

- [x] I am not making a trivial change, such as fixing a typo in a
comment.
- [x] I have written a PR description following these
  [rules](https://cbea.ms/git-commit/#why-not-how).
- [x] I have run `pre-commit run --from-ref origin/main --to-ref HEAD`.

- Select one of the following.
  - [ ] I have added tests.
    - `/test` for `lit` tests
    - `/unittest` for C++ tests
    - `/python/test` for end-to-end tests
- [x] This PR does not need a test because `this PR only updates the
LLVM pin, so CI is sufficient`.

- Select one of the following.
  - [x] I have not added any `lit` tests.
- [ ] The `lit` tests I have added follow these [best
practices](https://mlir.llvm.org/getting_started/TestingGuide/#filecheck-best-practices),
including the "tests should be minimal" section. (Usually running Python
code
    and using the instructions it generates is not minimal.)
---
 python/src/llvm.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/python/src/llvm.cc b/python/src/llvm.cc
index 222ff3f8f9fc..f1d976ed5425 100644
--- a/python/src/llvm.cc
+++ b/python/src/llvm.cc
@@ -59,7 +59,7 @@ createTargetMachine(llvm::Module *module, std::string proc,
   opt.MCOptions.AsmVerbose = true;
   opt.MCOptions.PreserveAsmComments = true;
   std::unique_ptr<llvm::TargetMachine> machine{target->createTargetMachine(
-      module->getTargetTriple(), proc, features, opt, llvm::Reloc::PIC_,
+      module->getTargetTriple().str(), proc, features, opt, llvm::Reloc::PIC_,
       std::nullopt,
       disableLLVMOpt ? llvm::CodeGenOptLevel::None
                      : llvm::CodeGenOptLevel::Aggressive)};

From a6f553ca14d32db8f5d8c8a310372cad36ce70ae Mon Sep 17 00:00:00 2001
From: Gary Geng <ggeng@nvidia.com>
Date: Thu, 20 Mar 2025 18:07:27 +0000
Subject: [PATCH 2/2] Add shmem swizzling heuristic for LL

---
 lib/Dialect/TritonGPU/Transforms/Utility.cpp | 64 +++++++++++++++++---
 1 file changed, 57 insertions(+), 7 deletions(-)

diff --git a/lib/Dialect/TritonGPU/Transforms/Utility.cpp b/lib/Dialect/TritonGPU/Transforms/Utility.cpp
index aa946f2c967d..7fe9f4d2725c 100644
--- a/lib/Dialect/TritonGPU/Transforms/Utility.cpp
+++ b/lib/Dialect/TritonGPU/Transforms/Utility.cpp
@@ -1048,6 +1048,46 @@ StringRef getAMDArch(Operation *module) {
   return ref.drop_front(4); // drop the "hip:"
 }
 
+// Rough utility for obtaining a SharedEnc for a LinearEncoding,
+// as we've replaced DotOpEnc with Linear in some cases
+// (specifically, fp4ToFp and similar unpack-upcast thru join)
+std::optional<ttg::SwizzledSharedEncodingAttr>
+getSharedForLinear(ttg::LinearEncodingAttr enc,
+                   ArrayRef<unsigned int> globalOrder, ArrayRef<int64_t> shape,
+                   unsigned elemBitWidth, ttg::CTALayoutAttr ctaLayout) {
+  auto ctx = enc.getContext();
+  auto ll = enc.getLinearLayout();
+  auto rank = shape.size();
+
+  if (rank != 2)
+    return std::nullopt;
+
+  auto order = enc.getOrder();
+  assert(globalOrder.size() == rank);
+  // TODO add memdesc_trans support for dot(trans(cvt(src) #linear) #dot_op)
+  if (order != globalOrder)
+    return std::nullopt;
+
+  auto innerDim = order[0];
+  auto outerDim = order[1];
+  auto contigPerWarp = enc.getContigPerWarp();
+
+  constexpr unsigned BANK_SIZE{128};
+  auto elemBytes = elemBitWidth / 8;
+
+  auto vec = contigPerWarp[innerDim];
+  auto rowSize = elemBytes * (unsigned)shape[innerDim];
+  auto perPhase = std::max(BANK_SIZE / rowSize, 1u);
+  auto maxPhase = std::max(contigPerWarp[outerDim] / perPhase, 1u);
+
+  // cp.async does not support transfer size < 4B
+  if (vec * elemBytes < 4 && perPhase < maxPhase)
+    return std::nullopt;
+
+  return ttg::SwizzledSharedEncodingAttr::get(ctx, vec, perPhase, maxPhase,
+                                              order, ctaLayout);
+}
+
 // If all the transitive uses of the given value have are used by a convert to
 // the same dot operand encoding, return the shared encoding that needs to be
 // used to be compatible with users' layouts. If there are incompatible shared
@@ -1074,18 +1114,28 @@ getSharedEncIfAllUsersAreDotEnc(Value val, bool &incompatible) {
     } else {
       if (!isa<ttg::LocalLoadOp, ttg::ConvertLayoutOp>(user))
         return std::nullopt;
-      auto dotOpEnc = dyn_cast<ttg::DotOperandEncodingAttr>(
+      auto enc =
           cast<triton::gpu::TensorOrMemDesc>(user->getResult(0).getType())
-              .getEncoding());
-      if (!dotOpEnc)
-        return std::nullopt;
+              .getEncoding();
       auto srcTy = cast<triton::gpu::TensorOrMemDesc>(val.getType());
       auto CTALayout = ttg::getCTALayout(srcTy.getEncoding());
       auto order = getOrderForMemory(srcTy);
       unsigned bitWidth = srcTy.getElementType().getIntOrFloatBitWidth();
-      tempAttr = ttg::SwizzledSharedEncodingAttr::get(
-          val.getContext(), dotOpEnc, srcTy.getShape(), order, CTALayout,
-          bitWidth, /*needTrans=*/false);
+
+      if (auto dotOpEnc = dyn_cast<ttg::DotOperandEncodingAttr>(enc)) {
+        tempAttr = ttg::SwizzledSharedEncodingAttr::get(
+            val.getContext(), dotOpEnc, srcTy.getShape(), order, CTALayout,
+            bitWidth, /*needTrans=*/false);
+      } else if (auto linearEnc = dyn_cast<ttg::LinearEncodingAttr>(enc)) {
+
+        auto attrOpt = getSharedForLinear(linearEnc, order, srcTy.getShape(),
+                                          bitWidth, CTALayout);
+        if (!attrOpt)
+          return std::nullopt;
+        tempAttr = *attrOpt;
+      } else {
+        return std::nullopt;
+      }
     }
     // Check that the shared encodings needed by the users are compatible.
     if (attr != nullptr && attr != tempAttr) {