From c629b06ac06a7f8596fa2ba9b7728d3fd8314f11 Mon Sep 17 00:00:00 2001 From: Artemiy Bulavin Date: Fri, 21 Mar 2025 11:59:45 -0400 Subject: [PATCH 1/2] Bump to llvm/llvm-project@0aa5ba4 (#6266) Updating LLVM in order to pull in the following change: - https://github.com/llvm/llvm-project/pull/128566 For context, crash reproduction generation in MLIR will run the `PassManager`'s passes in a child thread. The above PR fixes crashes for when passes such as `add_di_scope` add `DistinctAttr` to the IR and their storage is then accessed later once the child thread joins. Pulling this in improves QoL for out-of-tree projects and makes the pass manager more robust to the use of `DistinctAttr`. This pin update has also introduced the deprecation of a `llvm::TargetMachine::createTargetMachine` overload. I've updated the callsites to use the non-deprecated overloads. - [x] I am not making a trivial change, such as fixing a typo in a comment. - [x] I have written a PR description following these [rules](https://cbea.ms/git-commit/#why-not-how). - [x] I have run `pre-commit run --from-ref origin/main --to-ref HEAD`. - Select one of the following. - [ ] I have added tests. - `/test` for `lit` tests - `/unittest` for C++ tests - `/python/test` for end-to-end tests - [x] This PR does not need a test because `this PR only updates the LLVM pin, so CI is sufficient`. - Select one of the following. - [x] I have not added any `lit` tests. - [ ] The `lit` tests I have added follow these [best practices](https://mlir.llvm.org/getting_started/TestingGuide/#filecheck-best-practices), including the "tests should be minimal" section. (Usually running Python code and using the instructions it generates is not minimal.) --- python/src/llvm.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/src/llvm.cc b/python/src/llvm.cc index 222ff3f8f9fc..f1d976ed5425 100644 --- a/python/src/llvm.cc +++ b/python/src/llvm.cc @@ -59,7 +59,7 @@ createTargetMachine(llvm::Module *module, std::string proc, opt.MCOptions.AsmVerbose = true; opt.MCOptions.PreserveAsmComments = true; std::unique_ptr machine{target->createTargetMachine( - module->getTargetTriple(), proc, features, opt, llvm::Reloc::PIC_, + module->getTargetTriple().str(), proc, features, opt, llvm::Reloc::PIC_, std::nullopt, disableLLVMOpt ? llvm::CodeGenOptLevel::None : llvm::CodeGenOptLevel::Aggressive)}; From a6f553ca14d32db8f5d8c8a310372cad36ce70ae Mon Sep 17 00:00:00 2001 From: Gary Geng Date: Thu, 20 Mar 2025 18:07:27 +0000 Subject: [PATCH 2/2] Add shmem swizzling heuristic for LL --- lib/Dialect/TritonGPU/Transforms/Utility.cpp | 64 +++++++++++++++++--- 1 file changed, 57 insertions(+), 7 deletions(-) diff --git a/lib/Dialect/TritonGPU/Transforms/Utility.cpp b/lib/Dialect/TritonGPU/Transforms/Utility.cpp index aa946f2c967d..7fe9f4d2725c 100644 --- a/lib/Dialect/TritonGPU/Transforms/Utility.cpp +++ b/lib/Dialect/TritonGPU/Transforms/Utility.cpp @@ -1048,6 +1048,46 @@ StringRef getAMDArch(Operation *module) { return ref.drop_front(4); // drop the "hip:" } +// Rough utility for obtaining a SharedEnc for a LinearEncoding, +// as we've replaced DotOpEnc with Linear in some cases +// (specifically, fp4ToFp and similar unpack-upcast thru join) +std::optional +getSharedForLinear(ttg::LinearEncodingAttr enc, + ArrayRef globalOrder, ArrayRef shape, + unsigned elemBitWidth, ttg::CTALayoutAttr ctaLayout) { + auto ctx = enc.getContext(); + auto ll = enc.getLinearLayout(); + auto rank = shape.size(); + + if (rank != 2) + return std::nullopt; + + auto order = enc.getOrder(); + assert(globalOrder.size() == rank); + // TODO add memdesc_trans support for dot(trans(cvt(src) #linear) #dot_op) + if (order != globalOrder) + return std::nullopt; + + auto innerDim = order[0]; + auto outerDim = order[1]; + auto contigPerWarp = enc.getContigPerWarp(); + + constexpr unsigned BANK_SIZE{128}; + auto elemBytes = elemBitWidth / 8; + + auto vec = contigPerWarp[innerDim]; + auto rowSize = elemBytes * (unsigned)shape[innerDim]; + auto perPhase = std::max(BANK_SIZE / rowSize, 1u); + auto maxPhase = std::max(contigPerWarp[outerDim] / perPhase, 1u); + + // cp.async does not support transfer size < 4B + if (vec * elemBytes < 4 && perPhase < maxPhase) + return std::nullopt; + + return ttg::SwizzledSharedEncodingAttr::get(ctx, vec, perPhase, maxPhase, + order, ctaLayout); +} + // If all the transitive uses of the given value have are used by a convert to // the same dot operand encoding, return the shared encoding that needs to be // used to be compatible with users' layouts. If there are incompatible shared @@ -1074,18 +1114,28 @@ getSharedEncIfAllUsersAreDotEnc(Value val, bool &incompatible) { } else { if (!isa(user)) return std::nullopt; - auto dotOpEnc = dyn_cast( + auto enc = cast(user->getResult(0).getType()) - .getEncoding()); - if (!dotOpEnc) - return std::nullopt; + .getEncoding(); auto srcTy = cast(val.getType()); auto CTALayout = ttg::getCTALayout(srcTy.getEncoding()); auto order = getOrderForMemory(srcTy); unsigned bitWidth = srcTy.getElementType().getIntOrFloatBitWidth(); - tempAttr = ttg::SwizzledSharedEncodingAttr::get( - val.getContext(), dotOpEnc, srcTy.getShape(), order, CTALayout, - bitWidth, /*needTrans=*/false); + + if (auto dotOpEnc = dyn_cast(enc)) { + tempAttr = ttg::SwizzledSharedEncodingAttr::get( + val.getContext(), dotOpEnc, srcTy.getShape(), order, CTALayout, + bitWidth, /*needTrans=*/false); + } else if (auto linearEnc = dyn_cast(enc)) { + + auto attrOpt = getSharedForLinear(linearEnc, order, srcTy.getShape(), + bitWidth, CTALayout); + if (!attrOpt) + return std::nullopt; + tempAttr = *attrOpt; + } else { + return std::nullopt; + } } // Check that the shared encodings needed by the users are compatible. if (attr != nullptr && attr != tempAttr) {