From c629b06ac06a7f8596fa2ba9b7728d3fd8314f11 Mon Sep 17 00:00:00 2001 From: Artemiy Bulavin Date: Fri, 21 Mar 2025 11:59:45 -0400 Subject: [PATCH 1/2] Bump to llvm/llvm-project@0aa5ba4 (#6266) Updating LLVM in order to pull in the following change: - https://github.com/llvm/llvm-project/pull/128566 For context, crash reproduction generation in MLIR will run the `PassManager`'s passes in a child thread. The above PR fixes crashes for when passes such as `add_di_scope` add `DistinctAttr` to the IR and their storage is then accessed later once the child thread joins. Pulling this in improves QoL for out-of-tree projects and makes the pass manager more robust to the use of `DistinctAttr`. This pin update has also introduced the deprecation of a `llvm::TargetMachine::createTargetMachine` overload. I've updated the callsites to use the non-deprecated overloads. - [x] I am not making a trivial change, such as fixing a typo in a comment. - [x] I have written a PR description following these [rules](https://cbea.ms/git-commit/#why-not-how). - [x] I have run `pre-commit run --from-ref origin/main --to-ref HEAD`. - Select one of the following. - [ ] I have added tests. - `/test` for `lit` tests - `/unittest` for C++ tests - `/python/test` for end-to-end tests - [x] This PR does not need a test because `this PR only updates the LLVM pin, so CI is sufficient`. - Select one of the following. - [x] I have not added any `lit` tests. - [ ] The `lit` tests I have added follow these [best practices](https://mlir.llvm.org/getting_started/TestingGuide/#filecheck-best-practices), including the "tests should be minimal" section. (Usually running Python code and using the instructions it generates is not minimal.) 
--- python/src/llvm.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/src/llvm.cc b/python/src/llvm.cc index 222ff3f8f9fc..f1d976ed5425 100644 --- a/python/src/llvm.cc +++ b/python/src/llvm.cc @@ -59,7 +59,7 @@ createTargetMachine(llvm::Module *module, std::string proc, opt.MCOptions.AsmVerbose = true; opt.MCOptions.PreserveAsmComments = true; std::unique_ptr machine{target->createTargetMachine( - module->getTargetTriple(), proc, features, opt, llvm::Reloc::PIC_, + module->getTargetTriple().str(), proc, features, opt, llvm::Reloc::PIC_, std::nullopt, disableLLVMOpt ? llvm::CodeGenOptLevel::None : llvm::CodeGenOptLevel::Aggressive)}; From db4111f9fd76fc5a9eefb4e2ce35b373b16f25f6 Mon Sep 17 00:00:00 2001 From: Gary Geng Date: Thu, 20 Mar 2025 22:15:29 +0000 Subject: [PATCH 2/2] Add heuristic for join to interleave every 2 elts --- lib/Conversion/TritonGPUToLLVM/ViewOpToLLVM.cpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/lib/Conversion/TritonGPUToLLVM/ViewOpToLLVM.cpp b/lib/Conversion/TritonGPUToLLVM/ViewOpToLLVM.cpp index ee4534180b34..fa11745645ad 100644 --- a/lib/Conversion/TritonGPUToLLVM/ViewOpToLLVM.cpp +++ b/lib/Conversion/TritonGPUToLLVM/ViewOpToLLVM.cpp @@ -169,6 +169,19 @@ struct JoinOpConversion : public ConvertOpToLLVMPattern { assert(lhsVals.size() == rhsVals.size()); SmallVector joinedVals; joinedVals.resize(lhsVals.size() * 2); + + // Specifically for packed upcasting from 4b to 16b dtypes + // numContiguousValues cannot be too large, since the two outputs of + // inline_asm contain interleaved values. OTOH, if numContiguousValues * 16b + // < 32b, then we'll need to rearrange 16b values in 32b registers. 
Hence we + // set numContiguousValues to 2 + auto inlineOp = + dyn_cast(op.getLhs().getDefiningOp()); + if (inlineOp && inlineOp.getPackedElement() == 4 && + dstTy.getElementTypeBitWidth() == 16) { + numContiguousValues = 2; + } + for (int i = 0; i < lhsVals.size(); i += numContiguousValues) { for (int j = 0; j < numContiguousValues; j++) { joinedVals[2 * i + j] = lhsVals[i + j];