3 changes: 2 additions & 1 deletion llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -896,7 +896,8 @@ getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr,
const Loop *Lp,
const DenseMap<Value *, const SCEV *> &StridesMap =
DenseMap<Value *, const SCEV *>(),
bool Assume = false, bool ShouldCheckWrap = true);
bool Assume = false, bool ShouldCheckWrap = true,
DominatorTree *DT = nullptr);

/// Returns the distance between the pointers \p PtrA and \p PtrB iff they are
/// compatible and it is possible to calculate the distance between them. This
50 changes: 35 additions & 15 deletions llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -800,11 +800,11 @@ class AccessAnalysis {
typedef SmallVector<MemAccessInfo, 8> MemAccessInfoList;

AccessAnalysis(const Loop *TheLoop, AAResults *AA, const LoopInfo *LI,
MemoryDepChecker::DepCandidates &DA,
DominatorTree &DT, MemoryDepChecker::DepCandidates &DA,
PredicatedScalarEvolution &PSE,
SmallPtrSetImpl<MDNode *> &LoopAliasScopes)
: TheLoop(TheLoop), BAA(*AA), AST(BAA), LI(LI), DepCands(DA), PSE(PSE),
LoopAliasScopes(LoopAliasScopes) {
: TheLoop(TheLoop), BAA(*AA), AST(BAA), LI(LI), DT(DT), DepCands(DA),
PSE(PSE), LoopAliasScopes(LoopAliasScopes) {
// We're analyzing dependences across loop iterations.
BAA.enableCrossIterationMode();
}
@@ -928,6 +928,9 @@ class AccessAnalysis {
/// The LoopInfo of the loop being checked.
const LoopInfo *LI;

/// The dominator tree of the function.
DominatorTree &DT;

/// Sets of potentially dependent accesses - members of one set share an
/// underlying pointer. The set "CheckDeps" identifies which sets really need a
/// dependence check.
@@ -1009,7 +1012,8 @@ getStrideFromAddRec(const SCEVAddRecExpr *AR, const Loop *Lp, Type *AccessTy,
/// information from the IR pointer value to determine no-wrap.
static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR,
Value *Ptr, Type *AccessTy, const Loop *L, bool Assume,
std::optional<int64_t> Stride = std::nullopt) {
std::optional<int64_t> Stride = std::nullopt,
DominatorTree *DT = nullptr) {
// FIXME: This should probably only return true for NUW.
if (AR->getNoWrapFlags(SCEV::NoWrapMask))
return true;
@@ -1023,8 +1027,22 @@ static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR,
// case, the GEP would be poison and any memory access dependent on it would
// be immediate UB when executed.
if (auto *GEP = dyn_cast_if_present<GetElementPtrInst>(Ptr);
GEP && GEP->hasNoUnsignedSignedWrap())
return true;
GEP && GEP->hasNoUnsignedSignedWrap()) {
// For the above reasoning to apply, the pointer must be dereferenced in
// every iteration.
if (L->getHeader() == L->getLoopLatch() ||
any_of(GEP->users(), [L, DT](User *U) {
if (!isa<LoadInst, StoreInst>(U))
return false;
Contributor commented on lines +1035 to +1036:
What if the GEP is used by another GEP, which is then used by a load/store, as opposed to an icmp and then never used by a load/store? Is it safe to bail out like this?
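To make the first scenario concrete, a hypothetical IR sketch (identifiers invented, not from the patch) in which the load only reaches the nusw GEP through a second GEP, so the `isa<LoadInst, StoreInst>` filter over `GEP->users()` never matches:

```llvm
loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  %gep = getelementptr nusw i32, ptr %A, i64 %iv        ; nusw GEP, no direct load/store user
  %gep.inner = getelementptr nusw i32, ptr %gep, i64 1  ; second GEP derived from it
  %v = load i32, ptr %gep.inner, align 4                ; only the derived pointer is dereferenced
  %iv.next = add nuw i64 %iv, 1
  %ec = icmp eq i64 %iv.next, %n
  br i1 %ec, label %exit, label %loop
```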

BasicBlock *UserBB = cast<Instruction>(U)->getParent();
if (DT && !LoopAccessInfo::blockNeedsPredication(UserBB, L, DT))
return true;
return UserBB == L->getHeader() ||
(L->getExitingBlock() == L->getLoopLatch() &&
UserBB == L->getLoopLatch());
}))
return true;
}

if (!Stride)
Stride = getStrideFromAddRec(AR, L, AccessTy, Ptr, PSE);
@@ -1287,7 +1305,7 @@ bool AccessAnalysis::createCheckForAccess(
}

if (!isNoWrap(PSE, AR, RTCheckPtrs.size() == 1 ? Ptr : nullptr, AccessTy,
TheLoop, Assume))
TheLoop, Assume, std::nullopt, &DT))
Contributor:

Suggested change:
-                TheLoop, Assume, std::nullopt, &DT))
+                TheLoop, Assume, {}, &DT))

return false;
}

@@ -1602,7 +1620,7 @@ std::optional<int64_t>
llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr,
const Loop *Lp,
const DenseMap<Value *, const SCEV *> &StridesMap,
bool Assume, bool ShouldCheckWrap) {
bool Assume, bool ShouldCheckWrap, DominatorTree *DT) {
Contributor:

Suggested change:
-                   bool Assume, bool ShouldCheckWrap, DominatorTree *DT) {
+                   bool Assume, bool ShouldCheckWrap, const DominatorTree &DT) {

Would be good to forbid nullptr here?

Contributor:
You cannot place a parameter without a default argument after the one that has a default argument, I think.
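A minimal sketch of that constraint, with simplified, illustrative signatures:

```cpp
class DominatorTree; // forward declaration, as in the LLVM headers

// Rejected: once a parameter has a default argument, every later
// parameter must have one too ([dcl.fct.default]):
//   void getPtrStride(bool Assume = false, bool ShouldCheckWrap = true,
//                     const DominatorTree &DT);  // error: missing default
//
// Accepted: a pointer parameter can carry a nullptr default, which is
// why the patch appends `DominatorTree *DT = nullptr` instead.
void getPtrStrideSketch(bool Assume = false, bool ShouldCheckWrap = true,
                        const DominatorTree *DT = nullptr);
```

So taking the tree by `const DominatorTree &` would mean dropping the existing defaults or adding an overload.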

const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr);
if (PSE.getSE()->isLoopInvariant(PtrScev, Lp))
return 0;
@@ -1624,7 +1642,7 @@ llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr,
if (!ShouldCheckWrap || !Stride)
return Stride;

if (isNoWrap(PSE, AR, Ptr, AccessTy, Lp, Assume, Stride))
if (isNoWrap(PSE, AR, Ptr, AccessTy, Lp, Assume, Stride, DT))
return Stride;

LLVM_DEBUG(
@@ -2041,10 +2059,10 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize(
BPtr->getType()->getPointerAddressSpace())
return MemoryDepChecker::Dependence::Unknown;

std::optional<int64_t> StrideAPtr =
getPtrStride(PSE, ATy, APtr, InnermostLoop, SymbolicStrides, true, true);
std::optional<int64_t> StrideBPtr =
getPtrStride(PSE, BTy, BPtr, InnermostLoop, SymbolicStrides, true, true);
std::optional<int64_t> StrideAPtr = getPtrStride(
PSE, ATy, APtr, InnermostLoop, SymbolicStrides, true, true, DT);
std::optional<int64_t> StrideBPtr = getPtrStride(
PSE, BTy, BPtr, InnermostLoop, SymbolicStrides, true, true, DT);

const SCEV *Src = PSE.getSCEV(APtr);
const SCEV *Sink = PSE.getSCEV(BPtr);
@@ -2621,7 +2639,8 @@ bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI,
}

MemoryDepChecker::DepCandidates DepCands;
AccessAnalysis Accesses(TheLoop, AA, LI, DepCands, *PSE, LoopAliasScopes);
AccessAnalysis Accesses(TheLoop, AA, LI, *DT, DepCands, *PSE,
LoopAliasScopes);

// Holds the analyzed pointers. We don't want to call getUnderlyingObjects
// multiple times on the same object. If the ptr is accessed twice, once
@@ -2685,7 +2704,8 @@ bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI,
bool IsReadOnlyPtr = false;
Type *AccessTy = getLoadStoreType(LD);
if (Seen.insert({Ptr, AccessTy}).second ||
!getPtrStride(*PSE, AccessTy, Ptr, TheLoop, SymbolicStrides)) {
!getPtrStride(*PSE, AccessTy, Ptr, TheLoop, SymbolicStrides, false,
true, DT)) {
++NumReads;
IsReadOnlyPtr = true;
}
8 changes: 5 additions & 3 deletions llvm/lib/Analysis/VectorUtils.cpp
@@ -1388,8 +1388,9 @@ void InterleavedAccessInfo::collectConstStrideAccesses(
// even without the transformation. The wrapping checks are therefore
// deferred until after we've formed the interleaved groups.
int64_t Stride =
getPtrStride(PSE, ElementTy, Ptr, TheLoop, Strides,
/*Assume=*/true, /*ShouldCheckWrap=*/false).value_or(0);
getPtrStride(PSE, ElementTy, Ptr, TheLoop, Strides,
/*Assume=*/true, /*ShouldCheckWrap=*/false, DT)
.value_or(0);

const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);
AccessStrideInfo[&I] = StrideDescriptor(Stride, Scev, Size,
@@ -1644,7 +1645,8 @@ void InterleavedAccessInfo::analyzeInterleaving(
Value *MemberPtr = getLoadStorePointerOperand(Member);
Type *AccessTy = getLoadStoreType(Member);
if (getPtrStride(PSE, AccessTy, MemberPtr, TheLoop, Strides,
/*Assume=*/false, /*ShouldCheckWrap=*/true).value_or(0))
/*Assume=*/false, /*ShouldCheckWrap=*/true, DT)
.value_or(0))
return false;
LLVM_DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to "
<< FirstOrLast
@@ -19,9 +19,14 @@ define void @test_inbounds_gep_used_in_predicated_block(ptr %A, i64 %n) {
; CHECK-NEXT: Dependences:
; CHECK-NEXT: Run-time memory checks:
; CHECK-NEXT: Grouped accesses:
; CHECK-NEXT: Group GRP0:
; CHECK-NEXT: (Low: %A High: (-4611686018427387705 + %A))
; CHECK-NEXT: Member: {%A,+,4611686018427387906}<%loop.header>
; CHECK-NEXT: Member: {%A,+,4611686018427387905}<%loop.header>
; CHECK-EMPTY:
; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
; CHECK-NEXT: SCEV assumptions:
; CHECK-NEXT: {%A,+,4611686018427387906}<%loop.header> Added Flags: <nusw>
; CHECK-EMPTY:
; CHECK-NEXT: Expressions re-written:
;
40 changes: 28 additions & 12 deletions llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll
@@ -17,18 +17,33 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; RV32-LABEL: @foo4(
; RV32-NEXT: entry:
; RV32-NEXT: br label [[VECTOR_MEMCHECK:%.*]]
; RV32: vector.scevcheck:
; RV32-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 128, i32 624)
Contributor:
Strange check: I wonder why it didn't get constant-folded by SCEV?
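(For reference, the call is fully constant-foldable: 128 × 624 = 79872, which fits in i32 with no overflow, so the check should reduce to { i32 79872, i1 false } and then to a dead branch.)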

; RV32-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
; RV32-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
; RV32-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A:%.*]], i32 [[MUL_RESULT]]
; RV32-NEXT: [[TMP1:%.*]] = icmp ult ptr [[TMP0]], [[A]]
; RV32-NEXT: [[TMP2:%.*]] = or i1 [[TMP1]], [[MUL_OVERFLOW]]
; RV32-NEXT: [[MUL1:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 256, i32 624)
; RV32-NEXT: [[MUL_RESULT2:%.*]] = extractvalue { i32, i1 } [[MUL1]], 0
; RV32-NEXT: [[MUL_OVERFLOW3:%.*]] = extractvalue { i32, i1 } [[MUL1]], 1
; RV32-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[B:%.*]], i32 [[MUL_RESULT2]]
; RV32-NEXT: [[TMP4:%.*]] = icmp ult ptr [[TMP3]], [[B]]
; RV32-NEXT: [[TMP5:%.*]] = or i1 [[TMP4]], [[MUL_OVERFLOW3]]
; RV32-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], [[TMP5]]
; RV32-NEXT: br i1 [[TMP6]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK1:%.*]]
; RV32: vector.memcheck:
; RV32-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i32 79880
; RV32-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i32 39940
; RV32-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i32 159752
; RV32-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]]
; RV32-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[SCEVGEP]]
; RV32-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A]], i32 79880
; RV32-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[B]], i32 159752
; RV32-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[TRIGGER]], [[SCEVGEP]]
; RV32-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]]
; RV32-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; RV32-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[A]], [[SCEVGEP2]]
; RV32-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[B]], [[SCEVGEP]]
; RV32-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]]
; RV32-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]]
; RV32-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; RV32-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; RV32: vector.ph:
; RV32-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
; RV32-NEXT: [[TMP9:%.*]] = mul <vscale x 2 x i64> [[TMP7]], splat (i64 16)
@@ -43,25 +58,26 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; RV32-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
; RV32-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; RV32-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], <vscale x 2 x i64> [[VEC_IND]]
; RV32-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i32> @llvm.vp.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> align 4 [[TMP13]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP10]]), !alias.scope [[META0:![0-9]+]]
; RV32-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i32> @llvm.vp.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> align 4 [[TMP13]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP10]]), !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]]
; RV32-NEXT: [[TMP14:%.*]] = icmp slt <vscale x 2 x i32> [[WIDE_MASKED_GATHER]], splat (i32 100)
; RV32-NEXT: [[TMP15:%.*]] = shl nuw nsw <vscale x 2 x i64> [[VEC_IND]], splat (i64 1)
; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, ptr [[B]], <vscale x 2 x i64> [[TMP15]]
; RV32-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <vscale x 2 x double> @llvm.vp.gather.nxv2f64.nxv2p0(<vscale x 2 x ptr> align 8 [[TMP16]], <vscale x 2 x i1> [[TMP14]], i32 [[TMP10]]), !alias.scope [[META3:![0-9]+]]
; RV32-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <vscale x 2 x double> @llvm.vp.gather.nxv2f64.nxv2p0(<vscale x 2 x ptr> align 8 [[TMP16]], <vscale x 2 x i1> [[TMP14]], i32 [[TMP10]]), !alias.scope [[META5:![0-9]+]]
; RV32-NEXT: [[TMP17:%.*]] = sitofp <vscale x 2 x i32> [[WIDE_MASKED_GATHER]] to <vscale x 2 x double>
; RV32-NEXT: [[TMP18:%.*]] = fadd <vscale x 2 x double> [[WIDE_MASKED_GATHER6]], [[TMP17]]
; RV32-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, ptr [[A]], <vscale x 2 x i64> [[VEC_IND]]
; RV32-NEXT: call void @llvm.vp.scatter.nxv2f64.nxv2p0(<vscale x 2 x double> [[TMP18]], <vscale x 2 x ptr> align 8 [[TMP19]], <vscale x 2 x i1> [[TMP14]], i32 [[TMP10]]), !alias.scope [[META5:![0-9]+]], !noalias [[META7:![0-9]+]]
; RV32-NEXT: call void @llvm.vp.scatter.nxv2f64.nxv2p0(<vscale x 2 x double> [[TMP18]], <vscale x 2 x ptr> align 8 [[TMP19]], <vscale x 2 x i1> [[TMP14]], i32 [[TMP10]]), !alias.scope [[META3]], !noalias [[META5]]
; RV32-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]]
; RV32-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
; RV32-NEXT: [[TMP24:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
; RV32-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; RV32-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; RV32: middle.block:
; RV32-NEXT: br label [[FOR_END:%.*]]
; RV32: scalar.ph:
; RV32-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[VECTOR_MEMCHECK1]] ]
; RV32-NEXT: br label [[FOR_BODY:%.*]]
; RV32: for.body:
; RV32-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; RV32-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; RV32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
; RV32-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; RV32-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP21]], 100
@@ -78,7 +94,7 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; RV32: for.inc:
; RV32-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16
; RV32-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 10000
; RV32-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP12:![0-9]+]]
; RV32-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP10:![0-9]+]]
; RV32: for.end:
; RV32-NEXT: ret void
;
@@ -146,7 +162,7 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; RV64: for.inc:
; RV64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16
; RV64-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 10000
; RV64-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP12:![0-9]+]]
; RV64-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP11:![0-9]+]]
; RV64: for.end:
; RV64-NEXT: ret void
;