From c629b06ac06a7f8596fa2ba9b7728d3fd8314f11 Mon Sep 17 00:00:00 2001
From: Artemiy Bulavin <artemiyb@graphcore.ai>
Date: Fri, 21 Mar 2025 11:59:45 -0400
Subject: [PATCH 1/2] Bump to llvm/llvm-project@0aa5ba4 (#6266)

Updating LLVM in order to pull in the following change:

- https://github.com/llvm/llvm-project/pull/128566

For context, crash reproduction generation in MLIR will run the
`PassManager`'s passes in a child thread. The above PR fixes crashes that
occur when passes such as `add_di_scope` add `DistinctAttr` to the IR and
their storage is then accessed later, once the child thread has joined.
Pulling this in improves QoL for out-of-tree projects and makes the pass
manager more robust to the use of `DistinctAttr`.

This pin update has also introduced the deprecation of a
`llvm::TargetMachine::createTargetMachine` overload. I've updated the
callsites to use the non-deprecated overloads.

- [x] I am not making a trivial change, such as fixing a typo in a
comment.
- [x] I have written a PR description following these
  [rules](https://cbea.ms/git-commit/#why-not-how).
- [x] I have run `pre-commit run --from-ref origin/main --to-ref HEAD`.

- Select one of the following.
  - [ ] I have added tests.
    - `/test` for `lit` tests
    - `/unittest` for C++ tests
    - `/python/test` for end-to-end tests
- [x] This PR does not need a test because `this PR only updates the
LLVM pin, so CI is sufficient`.

- Select one of the following.
  - [x] I have not added any `lit` tests.
- [ ] The `lit` tests I have added follow these [best
practices](https://mlir.llvm.org/getting_started/TestingGuide/#filecheck-best-practices),
including the "tests should be minimal" section. (Usually running Python
code
    and using the instructions it generates is not minimal.)
---
 python/src/llvm.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/python/src/llvm.cc b/python/src/llvm.cc
index 222ff3f8f9fc..f1d976ed5425 100644
--- a/python/src/llvm.cc
+++ b/python/src/llvm.cc
@@ -59,7 +59,7 @@ createTargetMachine(llvm::Module *module, std::string proc,
   opt.MCOptions.AsmVerbose = true;
   opt.MCOptions.PreserveAsmComments = true;
   std::unique_ptr<llvm::TargetMachine> machine{target->createTargetMachine(
-      module->getTargetTriple(), proc, features, opt, llvm::Reloc::PIC_,
+      module->getTargetTriple().str(), proc, features, opt, llvm::Reloc::PIC_,
       std::nullopt,
       disableLLVMOpt ? llvm::CodeGenOptLevel::None
                      : llvm::CodeGenOptLevel::Aggressive)};

From db4111f9fd76fc5a9eefb4e2ce35b373b16f25f6 Mon Sep 17 00:00:00 2001
From: Gary Geng <ggeng@nvidia.com>
Date: Thu, 20 Mar 2025 22:15:29 +0000
Subject: [PATCH 2/2] Add heuristic for join to interleave every 2 elts

---
 lib/Conversion/TritonGPUToLLVM/ViewOpToLLVM.cpp | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/lib/Conversion/TritonGPUToLLVM/ViewOpToLLVM.cpp b/lib/Conversion/TritonGPUToLLVM/ViewOpToLLVM.cpp
index ee4534180b34..fa11745645ad 100644
--- a/lib/Conversion/TritonGPUToLLVM/ViewOpToLLVM.cpp
+++ b/lib/Conversion/TritonGPUToLLVM/ViewOpToLLVM.cpp
@@ -169,6 +169,19 @@ struct JoinOpConversion : public ConvertOpToLLVMPattern<JoinOp> {
     assert(lhsVals.size() == rhsVals.size());
     SmallVector<Value> joinedVals;
     joinedVals.resize(lhsVals.size() * 2);
+
+    // Specifically for packed upcasting from 4b to 16b dtypes:
+    // numContiguousValues cannot be too large, since the two outputs of
+    // inline_asm contain interleaved values. OTOH, if numContiguousValues *
+    // 16b < 32b, then we'll need to rearrange 16b values in 32b registers.
+    // Hence we set numContiguousValues to 2.
+    // Templated getDefiningOp is null-safe; lhs may be a block argument.
+    auto inlineOp = op.getLhs().getDefiningOp<ElementwiseInlineAsmOp>();
+    if (inlineOp && inlineOp.getPackedElement() == 4 &&
+        dstTy.getElementTypeBitWidth() == 16) {
+      numContiguousValues = 2;
+    }
+
     for (int i = 0; i < lhsVals.size(); i += numContiguousValues) {
       for (int j = 0; j < numContiguousValues; j++) {
         joinedVals[2 * i + j] = lhsVals[i + j];