diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h index 84b4ad7c1d5a9..ff050a5c99224 100644 --- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h +++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h @@ -896,7 +896,8 @@ getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const DenseMap &StridesMap = DenseMap(), - bool Assume = false, bool ShouldCheckWrap = true); + bool Assume = false, bool ShouldCheckWrap = true, + DominatorTree *DT = nullptr); /// Returns the distance between the pointers \p PtrA and \p PtrB iff they are /// compatible and it is possible to calculate the distance between them. This diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index 47dccde45337b..e49b6b198ead8 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -800,11 +800,11 @@ class AccessAnalysis { typedef SmallVector MemAccessInfoList; AccessAnalysis(const Loop *TheLoop, AAResults *AA, const LoopInfo *LI, - MemoryDepChecker::DepCandidates &DA, + DominatorTree &DT, MemoryDepChecker::DepCandidates &DA, PredicatedScalarEvolution &PSE, SmallPtrSetImpl &LoopAliasScopes) - : TheLoop(TheLoop), BAA(*AA), AST(BAA), LI(LI), DepCands(DA), PSE(PSE), - LoopAliasScopes(LoopAliasScopes) { + : TheLoop(TheLoop), BAA(*AA), AST(BAA), LI(LI), DT(DT), DepCands(DA), + PSE(PSE), LoopAliasScopes(LoopAliasScopes) { // We're analyzing dependences across loop iterations. BAA.enableCrossIterationMode(); } @@ -928,6 +928,9 @@ class AccessAnalysis { /// The LoopInfo of the loop being checked. const LoopInfo *LI; + /// The dominator tree of the function. + DominatorTree &DT; + /// Sets of potentially dependent accesses - members of one set share an /// underlying pointer. The set "CheckDeps" identfies which sets really need a /// dependence check. @@ -1009,7 +1012,8 @@ getStrideFromAddRec(const SCEVAddRecExpr *AR, const Loop *Lp, Type *AccessTy, /// informating from the IR pointer value to determine no-wrap. static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR, Value *Ptr, Type *AccessTy, const Loop *L, bool Assume, - std::optional Stride = std::nullopt) { + std::optional Stride = std::nullopt, + DominatorTree *DT = nullptr) { // FIXME: This should probably only return true for NUW. if (AR->getNoWrapFlags(SCEV::NoWrapMask)) return true; @@ -1023,8 +1027,22 @@ static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR, // case, the GEP would be poison and any memory access dependent on it would // be immediate UB when executed. if (auto *GEP = dyn_cast_if_present(Ptr); - GEP && GEP->hasNoUnsignedSignedWrap()) - return true; + GEP && GEP->hasNoUnsignedSignedWrap()) { + // For the above reasoning to apply, the pointer must be dereferenced in + // every iteration. + if (L->getHeader() == L->getLoopLatch() || + any_of(GEP->users(), [L, DT](User *U) { + if (!isa(U)) + return false; + BasicBlock *UserBB = cast(U)->getParent(); + if (DT && !LoopAccessInfo::blockNeedsPredication(UserBB, L, DT)) + return true; + return UserBB == L->getHeader() || + (L->getExitingBlock() == L->getLoopLatch() && + UserBB == L->getLoopLatch()); + })) + return true; + } if (!Stride) Stride = getStrideFromAddRec(AR, L, AccessTy, Ptr, PSE); @@ -1287,7 +1305,7 @@ bool AccessAnalysis::createCheckForAccess( } if (!isNoWrap(PSE, AR, RTCheckPtrs.size() == 1 ? Ptr : nullptr, AccessTy, - TheLoop, Assume)) + TheLoop, Assume, std::nullopt, &DT)) return false; } @@ -1602,7 +1620,7 @@ std::optional llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const DenseMap &StridesMap, - bool Assume, bool ShouldCheckWrap) { + bool Assume, bool ShouldCheckWrap, DominatorTree *DT) { const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr); if (PSE.getSE()->isLoopInvariant(PtrScev, Lp)) return 0; @@ -1624,7 +1642,7 @@ llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, if (!ShouldCheckWrap || !Stride) return Stride; - if (isNoWrap(PSE, AR, Ptr, AccessTy, Lp, Assume, Stride)) + if (isNoWrap(PSE, AR, Ptr, AccessTy, Lp, Assume, Stride, DT)) return Stride; LLVM_DEBUG( @@ -2041,10 +2059,10 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize( BPtr->getType()->getPointerAddressSpace()) return MemoryDepChecker::Dependence::Unknown; - std::optional StrideAPtr = - getPtrStride(PSE, ATy, APtr, InnermostLoop, SymbolicStrides, true, true); - std::optional StrideBPtr = - getPtrStride(PSE, BTy, BPtr, InnermostLoop, SymbolicStrides, true, true); + std::optional StrideAPtr = getPtrStride( + PSE, ATy, APtr, InnermostLoop, SymbolicStrides, true, true, DT); + std::optional StrideBPtr = getPtrStride( + PSE, BTy, BPtr, InnermostLoop, SymbolicStrides, true, true, DT); const SCEV *Src = PSE.getSCEV(APtr); const SCEV *Sink = PSE.getSCEV(BPtr); @@ -2621,7 +2639,8 @@ bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI, } MemoryDepChecker::DepCandidates DepCands; - AccessAnalysis Accesses(TheLoop, AA, LI, DepCands, *PSE, LoopAliasScopes); + AccessAnalysis Accesses(TheLoop, AA, LI, *DT, DepCands, *PSE, + LoopAliasScopes); // Holds the analyzed pointers. We don't want to call getUnderlyingObjects // multiple times on the same object. If the ptr is accessed twice, once @@ -2685,7 +2704,8 @@ bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI, bool IsReadOnlyPtr = false; Type *AccessTy = getLoadStoreType(LD); if (Seen.insert({Ptr, AccessTy}).second || - !getPtrStride(*PSE, AccessTy, Ptr, TheLoop, SymbolicStrides)) { + !getPtrStride(*PSE, AccessTy, Ptr, TheLoop, SymbolicStrides, false, + true, DT)) { ++NumReads; IsReadOnlyPtr = true; } diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp index 091d94843698c..f5948ac624779 100644 --- a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -1388,8 +1388,9 @@ void InterleavedAccessInfo::collectConstStrideAccesses( // even without the transformation. The wrapping checks are therefore // deferred until after we've formed the interleaved groups. int64_t Stride = - getPtrStride(PSE, ElementTy, Ptr, TheLoop, Strides, - /*Assume=*/true, /*ShouldCheckWrap=*/false).value_or(0); + getPtrStride(PSE, ElementTy, Ptr, TheLoop, Strides, + /*Assume=*/true, /*ShouldCheckWrap=*/false, DT) + .value_or(0); const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr); AccessStrideInfo[&I] = StrideDescriptor(Stride, Scev, Size, @@ -1644,7 +1645,8 @@ void InterleavedAccessInfo::analyzeInterleaving( Value *MemberPtr = getLoadStorePointerOperand(Member); Type *AccessTy = getLoadStoreType(Member); if (getPtrStride(PSE, AccessTy, MemberPtr, TheLoop, Strides, - /*Assume=*/false, /*ShouldCheckWrap=*/true).value_or(0)) + /*Assume=*/false, /*ShouldCheckWrap=*/true, DT) + .value_or(0)) return false; LLVM_DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to " << FirstOrLast diff --git a/llvm/test/Analysis/LoopAccessAnalysis/inbounds-gep-in-predicated-blocks.ll b/llvm/test/Analysis/LoopAccessAnalysis/inbounds-gep-in-predicated-blocks.ll index 6eed0ec864820..d873b1c7605f4 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/inbounds-gep-in-predicated-blocks.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/inbounds-gep-in-predicated-blocks.ll @@ -19,9 +19,14 @@ define void @test_inbounds_gep_used_in_predicated_block(ptr %A, i64 %n) { ; CHECK-NEXT: Dependences: ; CHECK-NEXT: Run-time memory checks: ; CHECK-NEXT: Grouped accesses: +; CHECK-NEXT: Group GRP0: +; CHECK-NEXT: (Low: %A High: (-4611686018427387705 + %A)) +; CHECK-NEXT: Member: {%A,+,4611686018427387906}<%loop.header> +; CHECK-NEXT: Member: {%A,+,4611686018427387905}<%loop.header> ; CHECK-EMPTY: ; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. ; CHECK-NEXT: SCEV assumptions: +; CHECK-NEXT: {%A,+,4611686018427387906}<%loop.header> Added Flags: ; CHECK-EMPTY: ; CHECK-NEXT: Expressions re-written: ; diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll index 89819f2be4967..1cbec47d72203 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll @@ -17,18 +17,33 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; RV32-LABEL: @foo4( ; RV32-NEXT: entry: ; RV32-NEXT: br label [[VECTOR_MEMCHECK:%.*]] +; RV32: vector.scevcheck: +; RV32-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 128, i32 624) +; RV32-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0 +; RV32-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1 +; RV32-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A:%.*]], i32 [[MUL_RESULT]] +; RV32-NEXT: [[TMP1:%.*]] = icmp ult ptr [[TMP0]], [[A]] +; RV32-NEXT: [[TMP2:%.*]] = or i1 [[TMP1]], [[MUL_OVERFLOW]] +; RV32-NEXT: [[MUL1:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 256, i32 624) +; RV32-NEXT: [[MUL_RESULT2:%.*]] = extractvalue { i32, i1 } [[MUL1]], 0 +; RV32-NEXT: [[MUL_OVERFLOW3:%.*]] = extractvalue { i32, i1 } [[MUL1]], 1 +; RV32-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[B:%.*]], i32 [[MUL_RESULT2]] +; RV32-NEXT: [[TMP4:%.*]] = icmp ult ptr [[TMP3]], [[B]] +; RV32-NEXT: [[TMP5:%.*]] = or i1 [[TMP4]], [[MUL_OVERFLOW3]] +; RV32-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], [[TMP5]] +; RV32-NEXT: br i1 [[TMP6]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK1:%.*]] ; RV32: vector.memcheck: -; RV32-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i32 79880 ; RV32-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i32 39940 -; RV32-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i32 159752 -; RV32-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]] -; RV32-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[SCEVGEP]] +; RV32-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A]], i32 79880 +; RV32-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[B]], i32 159752 +; RV32-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[TRIGGER]], [[SCEVGEP]] +; RV32-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]] ; RV32-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; RV32-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[A]], [[SCEVGEP2]] ; RV32-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[B]], [[SCEVGEP]] ; RV32-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] ; RV32-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] -; RV32-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; RV32-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; RV32: vector.ph: ; RV32-NEXT: [[TMP7:%.*]] = call @llvm.stepvector.nxv2i64() ; RV32-NEXT: [[TMP9:%.*]] = mul [[TMP7]], splat (i64 16) @@ -43,25 +58,26 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; RV32-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP11]], i64 0 ; RV32-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; RV32-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], [[VEC_IND]] -; RV32-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.vp.gather.nxv2i32.nxv2p0( align 4 [[TMP13]], splat (i1 true), i32 [[TMP10]]), !alias.scope [[META0:![0-9]+]] +; RV32-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.vp.gather.nxv2i32.nxv2p0( align 4 [[TMP13]], splat (i1 true), i32 [[TMP10]]), !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]] ; RV32-NEXT: [[TMP14:%.*]] = icmp slt [[WIDE_MASKED_GATHER]], splat (i32 100) ; RV32-NEXT: [[TMP15:%.*]] = shl nuw nsw [[VEC_IND]], splat (i64 1) ; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, ptr [[B]], [[TMP15]] -; RV32-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call @llvm.vp.gather.nxv2f64.nxv2p0( align 8 [[TMP16]], [[TMP14]], i32 [[TMP10]]), !alias.scope [[META3:![0-9]+]] +; RV32-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call @llvm.vp.gather.nxv2f64.nxv2p0( align 8 [[TMP16]], [[TMP14]], i32 [[TMP10]]), !alias.scope [[META5:![0-9]+]] ; RV32-NEXT: [[TMP17:%.*]] = sitofp [[WIDE_MASKED_GATHER]] to ; RV32-NEXT: [[TMP18:%.*]] = fadd [[WIDE_MASKED_GATHER6]], [[TMP17]] ; RV32-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, ptr [[A]], [[VEC_IND]] -; RV32-NEXT: call void @llvm.vp.scatter.nxv2f64.nxv2p0( [[TMP18]], align 8 [[TMP19]], [[TMP14]], i32 [[TMP10]]), !alias.scope [[META5:![0-9]+]], !noalias [[META7:![0-9]+]] +; RV32-NEXT: call void @llvm.vp.scatter.nxv2f64.nxv2p0( [[TMP18]], align 8 [[TMP19]], [[TMP14]], i32 [[TMP10]]), !alias.scope [[META3]], !noalias [[META5]] ; RV32-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]] ; RV32-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; RV32-NEXT: [[TMP24:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; RV32-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; RV32-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; RV32: middle.block: ; RV32-NEXT: br label [[FOR_END:%.*]] ; RV32: scalar.ph: +; RV32-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[VECTOR_MEMCHECK1]] ] ; RV32-NEXT: br label [[FOR_BODY:%.*]] ; RV32: for.body: -; RV32-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; RV32-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; RV32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]] ; RV32-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; RV32-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP21]], 100 @@ -78,7 +94,7 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; RV32: for.inc: ; RV32-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16 ; RV32-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 10000 -; RV32-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP12:![0-9]+]] +; RV32-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP10:![0-9]+]] ; RV32: for.end: ; RV32-NEXT: ret void ; @@ -146,7 +162,7 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; RV64: for.inc: ; RV64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16 ; RV64-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 10000 -; RV64-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP12:![0-9]+]] +; RV64-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP11:![0-9]+]] ; RV64: for.end: ; RV64-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll index f29428c51c636..25cba55f5a57b 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll @@ -711,17 +711,92 @@ define dso_local void @masked_strided3_optsize_unknown_tc(ptr noalias nocapture ; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT1]], <8 x i32> poison, <8 x i32> zeroinitializer ; ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] ; ENABLED_MASKED_STRIDED: vector.body: -; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE16:%.*]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE16]] ] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = select <8 x i1> [[TMP0]], <8 x i1> [[TMP1]], <8 x i1> zeroinitializer -; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = mul i32 [[INDEX]], 3 -; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 [[TMP2]] -; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> poison, <24 x i32> -; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = and <24 x i1> [[INTERLEAVED_MASK]], -; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <24 x i8> @llvm.masked.load.v24i8.p0(ptr [[TMP3]], i32 1, <24 x i1> [[TMP5]], <24 x i8> poison) -; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <24 x i8> [[WIDE_MASKED_VEC]], <24 x i8> poison, <8 x i32> +; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = mul nsw <8 x i32> [[VEC_IND]], splat (i32 3) +; ENABLED_MASKED_STRIDED-NEXT: [[TMP51:%.*]] = extractelement <8 x i1> [[TMP4]], i64 0 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP51]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; ENABLED_MASKED_STRIDED: pred.load.if: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i64 0 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP52:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i32 [[TMP5]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP53:%.*]] = load i8, ptr [[TMP52]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = insertelement <8 x i8> poison, i8 [[TMP53]], i64 0 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE]] +; ENABLED_MASKED_STRIDED: pred.load.continue: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = phi <8 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_LOAD_IF]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = extractelement <8 x i1> [[TMP4]], i64 1 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP10]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] +; ENABLED_MASKED_STRIDED: pred.load.if3: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i64 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP11]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP13:%.*]] = load i8, ptr [[TMP12]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP14:%.*]] = insertelement <8 x i8> [[TMP9]], i8 [[TMP13]], i64 1 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE4]] +; ENABLED_MASKED_STRIDED: pred.load.continue4: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP15:%.*]] = phi <8 x i8> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF3]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP4]], i64 2 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP16]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]] +; ENABLED_MASKED_STRIDED: pred.load.if5: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[TMP3]], i64 2 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP17]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP20:%.*]] = insertelement <8 x i8> [[TMP15]], i8 [[TMP19]], i64 2 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE6]] +; ENABLED_MASKED_STRIDED: pred.load.continue6: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP21:%.*]] = phi <8 x i8> [ [[TMP15]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP20]], [[PRED_LOAD_IF5]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP22:%.*]] = extractelement <8 x i1> [[TMP4]], i64 3 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP22]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]] +; ENABLED_MASKED_STRIDED: pred.load.if7: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP23:%.*]] = extractelement <8 x i32> [[TMP3]], i64 3 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP23]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP25:%.*]] = load i8, ptr [[TMP24]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP26:%.*]] = insertelement <8 x i8> [[TMP21]], i8 [[TMP25]], i64 3 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE8]] +; ENABLED_MASKED_STRIDED: pred.load.continue8: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP27:%.*]] = phi <8 x i8> [ [[TMP21]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP26]], [[PRED_LOAD_IF7]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP28:%.*]] = extractelement <8 x i1> [[TMP4]], i64 4 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP28]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]] +; ENABLED_MASKED_STRIDED: pred.load.if9: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP29:%.*]] = extractelement <8 x i32> [[TMP3]], i64 4 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP29]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP32:%.*]] = insertelement <8 x i8> [[TMP27]], i8 [[TMP31]], i64 4 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE10]] +; ENABLED_MASKED_STRIDED: pred.load.continue10: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP33:%.*]] = phi <8 x i8> [ [[TMP27]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP32]], [[PRED_LOAD_IF9]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP34:%.*]] = extractelement <8 x i1> [[TMP4]], i64 5 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP34]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]] +; ENABLED_MASKED_STRIDED: pred.load.if11: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP35:%.*]] = extractelement <8 x i32> [[TMP3]], i64 5 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP35]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP37:%.*]] = load i8, ptr [[TMP36]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP38:%.*]] = insertelement <8 x i8> [[TMP33]], i8 [[TMP37]], i64 5 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE12]] +; ENABLED_MASKED_STRIDED: pred.load.continue12: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP39:%.*]] = phi <8 x i8> [ [[TMP33]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP38]], [[PRED_LOAD_IF11]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP40:%.*]] = extractelement <8 x i1> [[TMP4]], i64 6 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP40]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]] +; ENABLED_MASKED_STRIDED: pred.load.if13: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP41:%.*]] = extractelement <8 x i32> [[TMP3]], i64 6 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP41]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP43:%.*]] = load i8, ptr [[TMP42]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP44:%.*]] = insertelement <8 x i8> [[TMP39]], i8 [[TMP43]], i64 6 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE14]] +; ENABLED_MASKED_STRIDED: pred.load.continue14: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP45:%.*]] = phi <8 x i8> [ [[TMP39]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP44]], [[PRED_LOAD_IF13]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP46:%.*]] = extractelement <8 x i1> [[TMP4]], i64 7 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP46]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16]] +; ENABLED_MASKED_STRIDED: pred.load.if15: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP47:%.*]] = extractelement <8 x i32> [[TMP3]], i64 7 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP48:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP47]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP49:%.*]] = load i8, ptr [[TMP48]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP50:%.*]] = insertelement <8 x i8> [[TMP45]], i8 [[TMP49]], i64 7 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE16]] +; ENABLED_MASKED_STRIDED: pred.load.continue16: +; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = phi <8 x i8> [ [[TMP45]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP50]], [[PRED_LOAD_IF15]] ] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[Q:%.*]], i32 [[INDEX]] ; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[STRIDED_VEC]], ptr [[TMP6]], i32 1, <8 x i1> [[TMP4]]) ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8