aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/libs/llvm12/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
diff options
context:
space:
mode:
authorshadchin <shadchin@yandex-team.ru>2022-02-10 16:44:30 +0300
committerDaniil Cherednik <dcherednik@yandex-team.ru>2022-02-10 16:44:30 +0300
commit2598ef1d0aee359b4b6d5fdd1758916d5907d04f (patch)
tree012bb94d777798f1f56ac1cec429509766d05181 /contrib/libs/llvm12/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
parent6751af0b0c1b952fede40b19b71da8025b5d8bcf (diff)
downloadydb-2598ef1d0aee359b4b6d5fdd1758916d5907d04f.tar.gz
Restoring authorship annotation for <shadchin@yandex-team.ru>. Commit 1 of 2.
Diffstat (limited to 'contrib/libs/llvm12/lib/Transforms/Scalar/LoopIdiomRecognize.cpp')
-rw-r--r--contrib/libs/llvm12/lib/Transforms/Scalar/LoopIdiomRecognize.cpp916
1 files changed, 458 insertions, 458 deletions
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 8064c02e2b..e60c95b7be 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -47,7 +47,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/CmpInstAnalysis.h"
+#include "llvm/Analysis/CmpInstAnalysis.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
@@ -80,7 +80,7 @@
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
-#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
@@ -108,33 +108,33 @@ using namespace llvm;
STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");
-STATISTIC(
- NumShiftUntilBitTest,
- "Number of uncountable loops recognized as 'shift until bitttest' idiom");
-
-bool DisableLIRP::All;
-static cl::opt<bool, true>
- DisableLIRPAll("disable-" DEBUG_TYPE "-all",
- cl::desc("Options to disable Loop Idiom Recognize Pass."),
- cl::location(DisableLIRP::All), cl::init(false),
- cl::ReallyHidden);
-
-bool DisableLIRP::Memset;
-static cl::opt<bool, true>
- DisableLIRPMemset("disable-" DEBUG_TYPE "-memset",
- cl::desc("Proceed with loop idiom recognize pass, but do "
- "not convert loop(s) to memset."),
- cl::location(DisableLIRP::Memset), cl::init(false),
- cl::ReallyHidden);
-
-bool DisableLIRP::Memcpy;
-static cl::opt<bool, true>
- DisableLIRPMemcpy("disable-" DEBUG_TYPE "-memcpy",
- cl::desc("Proceed with loop idiom recognize pass, but do "
- "not convert loop(s) to memcpy."),
- cl::location(DisableLIRP::Memcpy), cl::init(false),
- cl::ReallyHidden);
-
+STATISTIC(
+ NumShiftUntilBitTest,
+ "Number of uncountable loops recognized as 'shift until bitttest' idiom");
+
+bool DisableLIRP::All;
+static cl::opt<bool, true>
+ DisableLIRPAll("disable-" DEBUG_TYPE "-all",
+ cl::desc("Options to disable Loop Idiom Recognize Pass."),
+ cl::location(DisableLIRP::All), cl::init(false),
+ cl::ReallyHidden);
+
+bool DisableLIRP::Memset;
+static cl::opt<bool, true>
+ DisableLIRPMemset("disable-" DEBUG_TYPE "-memset",
+ cl::desc("Proceed with loop idiom recognize pass, but do "
+ "not convert loop(s) to memset."),
+ cl::location(DisableLIRP::Memset), cl::init(false),
+ cl::ReallyHidden);
+
+bool DisableLIRP::Memcpy;
+static cl::opt<bool, true>
+ DisableLIRPMemcpy("disable-" DEBUG_TYPE "-memcpy",
+ cl::desc("Proceed with loop idiom recognize pass, but do "
+ "not convert loop(s) to memcpy."),
+ cl::location(DisableLIRP::Memcpy), cl::init(false),
+ cl::ReallyHidden);
+
static cl::opt<bool> UseLIRCodeSizeHeurs(
"use-lir-code-size-heurs",
cl::desc("Use loop idiom recognition code size heuristics when compiling"
@@ -232,8 +232,8 @@ private:
const DebugLoc &DL, bool ZeroCheck,
bool IsCntPhiUsedOutsideLoop);
- bool recognizeShiftUntilBitTest();
-
+ bool recognizeShiftUntilBitTest();
+
/// @}
};
@@ -247,9 +247,9 @@ public:
}
bool runOnLoop(Loop *L, LPPassManager &LPM) override {
- if (DisableLIRP::All)
- return false;
-
+ if (DisableLIRP::All)
+ return false;
+
if (skipLoop(L))
return false;
@@ -295,9 +295,9 @@ char LoopIdiomRecognizeLegacyPass::ID = 0;
PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
LoopStandardAnalysisResults &AR,
LPMUpdater &) {
- if (DisableLIRP::All)
- return PreservedAnalyses::all();
-
+ if (DisableLIRP::All)
+ return PreservedAnalyses::all();
+
const auto *DL = &L.getHeader()->getModule()->getDataLayout();
// For the new PM, we also can't use OptimizationRemarkEmitter as an analysis
@@ -469,17 +469,17 @@ LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
Value *StoredVal = SI->getValueOperand();
Value *StorePtr = SI->getPointerOperand();
- // Don't convert stores of non-integral pointer types to memsets (which stores
- // integers).
- if (DL->isNonIntegralPointerType(StoredVal->getType()->getScalarType()))
- return LegalStoreKind::None;
-
+ // Don't convert stores of non-integral pointer types to memsets (which stores
+ // integers).
+ if (DL->isNonIntegralPointerType(StoredVal->getType()->getScalarType()))
+ return LegalStoreKind::None;
+
// Reject stores that are so large that they overflow an unsigned.
- // When storing out scalable vectors we bail out for now, since the code
- // below currently only works for constant strides.
- TypeSize SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
- if (SizeInBits.isScalable() || (SizeInBits.getFixedSize() & 7) ||
- (SizeInBits.getFixedSize() >> 32) != 0)
+ // When storing out scalable vectors we bail out for now, since the code
+ // below currently only works for constant strides.
+ TypeSize SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
+ if (SizeInBits.isScalable() || (SizeInBits.getFixedSize() & 7) ||
+ (SizeInBits.getFixedSize() >> 32) != 0)
return LegalStoreKind::None;
// See if the pointer expression is an AddRec like {base,+,1} on the current
@@ -508,13 +508,13 @@ LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
// If we're allowed to form a memset, and the stored value would be
// acceptable for memset, use it.
- if (!UnorderedAtomic && HasMemset && SplatValue && !DisableLIRP::Memset &&
+ if (!UnorderedAtomic && HasMemset && SplatValue && !DisableLIRP::Memset &&
// Verify that the stored value is loop invariant. If not, we can't
// promote the memset.
CurLoop->isLoopInvariant(SplatValue)) {
// It looks like we can use SplatValue.
return LegalStoreKind::Memset;
- } else if (!UnorderedAtomic && HasMemsetPattern && !DisableLIRP::Memset &&
+ } else if (!UnorderedAtomic && HasMemsetPattern && !DisableLIRP::Memset &&
// Don't create memset_pattern16s with address spaces.
StorePtr->getType()->getPointerAddressSpace() == 0 &&
(PatternValue = getMemSetPatternValue(StoredVal, DL))) {
@@ -523,7 +523,7 @@ LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
}
// Otherwise, see if the store can be turned into a memcpy.
- if (HasMemcpy && !DisableLIRP::Memcpy) {
+ if (HasMemcpy && !DisableLIRP::Memcpy) {
// Check to see if the stride matches the size of the store. If so, then we
// know that every byte is touched in the loop.
APInt Stride = getStoreStride(StoreEv);
@@ -578,12 +578,12 @@ void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
break;
case LegalStoreKind::Memset: {
// Find the base pointer.
- Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
+ Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
StoreRefsForMemset[Ptr].push_back(SI);
} break;
case LegalStoreKind::MemsetPattern: {
// Find the base pointer.
- Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
+ Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
StoreRefsForMemsetPattern[Ptr].push_back(SI);
} break;
case LegalStoreKind::Memcpy:
@@ -851,7 +851,7 @@ mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
// Get the location that may be stored across the loop. Since the access is
// strided positively through memory, we say that the modified location starts
// at the pointer and has infinite size.
- LocationSize AccessSize = LocationSize::afterPointer();
+ LocationSize AccessSize = LocationSize::afterPointer();
// If the loop iterates a fixed number of times, we can refine the access size
// to be exactly the size of the memset, which is (BECount+1)*StoreSize
@@ -903,8 +903,8 @@ static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr,
// If we're going to need to zero extend the BE count, check if we can add
// one to it prior to zero extending without overflow. Provided this is safe,
// it allows better simplification of the +1.
- if (DL->getTypeSizeInBits(BECount->getType()).getFixedSize() <
- DL->getTypeSizeInBits(IntPtr).getFixedSize() &&
+ if (DL->getTypeSizeInBits(BECount->getType()).getFixedSize() <
+ DL->getTypeSizeInBits(IntPtr).getFixedSize() &&
SE->isLoopEntryGuardedByCond(
CurLoop, ICmpInst::ICMP_NE, BECount,
SE->getNegativeSCEV(SE->getOne(BECount->getType())))) {
@@ -947,12 +947,12 @@ bool LoopIdiomRecognize::processLoopStridedStore(
BasicBlock *Preheader = CurLoop->getLoopPreheader();
IRBuilder<> Builder(Preheader->getTerminator());
SCEVExpander Expander(*SE, *DL, "loop-idiom");
- SCEVExpanderCleaner ExpCleaner(Expander, *DT);
+ SCEVExpanderCleaner ExpCleaner(Expander, *DT);
Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS);
Type *IntIdxTy = DL->getIndexType(DestPtr->getType());
- bool Changed = false;
+ bool Changed = false;
const SCEV *Start = Ev->getStart();
// Handle negative strided loops.
if (NegStride)
@@ -961,7 +961,7 @@ bool LoopIdiomRecognize::processLoopStridedStore(
// TODO: ideally we should still be able to generate memset if SCEV expander
// is taught to generate the dependencies at the latest point.
if (!isSafeToExpand(Start, *SE))
- return Changed;
+ return Changed;
// Okay, we have a strided store "p[i]" of a splattable value. We can turn
// this into a memset in the loop preheader now if we want. However, this
@@ -970,22 +970,22 @@ bool LoopIdiomRecognize::processLoopStridedStore(
// base pointer and checking the region.
Value *BasePtr =
Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator());
-
- // From here on out, conservatively report to the pass manager that we've
- // changed the IR, even if we later clean up these added instructions. There
- // may be structural differences e.g. in the order of use lists not accounted
- // for in just a textual dump of the IR. This is written as a variable, even
- // though statically all the places this dominates could be replaced with
- // 'true', with the hope that anyone trying to be clever / "more precise" with
- // the return value will read this comment, and leave them alone.
- Changed = true;
-
+
+ // From here on out, conservatively report to the pass manager that we've
+ // changed the IR, even if we later clean up these added instructions. There
+ // may be structural differences e.g. in the order of use lists not accounted
+ // for in just a textual dump of the IR. This is written as a variable, even
+ // though statically all the places this dominates could be replaced with
+ // 'true', with the hope that anyone trying to be clever / "more precise" with
+ // the return value will read this comment, and leave them alone.
+ Changed = true;
+
if (mayLoopAccessLocation(BasePtr, ModRefInfo::ModRef, CurLoop, BECount,
- StoreSize, *AA, Stores))
- return Changed;
+ StoreSize, *AA, Stores))
+ return Changed;
if (avoidLIRForMultiBlockLoop(/*IsMemset=*/true, IsLoopMemset))
- return Changed;
+ return Changed;
// Okay, everything looks good, insert the memset.
@@ -995,7 +995,7 @@ bool LoopIdiomRecognize::processLoopStridedStore(
// TODO: ideally we should still be able to generate memset if SCEV expander
// is taught to generate the dependencies at the latest point.
if (!isSafeToExpand(NumBytesS, *SE))
- return Changed;
+ return Changed;
Value *NumBytes =
Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
@@ -1054,7 +1054,7 @@ bool LoopIdiomRecognize::processLoopStridedStore(
if (MSSAU && VerifyMemorySSA)
MSSAU->getMemorySSA()->verifyMemorySSA();
++NumMemSet;
- ExpCleaner.markResultUsed();
+ ExpCleaner.markResultUsed();
return true;
}
@@ -1088,9 +1088,9 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
IRBuilder<> Builder(Preheader->getTerminator());
SCEVExpander Expander(*SE, *DL, "loop-idiom");
- SCEVExpanderCleaner ExpCleaner(Expander, *DT);
+ SCEVExpanderCleaner ExpCleaner(Expander, *DT);
- bool Changed = false;
+ bool Changed = false;
const SCEV *StrStart = StoreEv->getStart();
unsigned StrAS = SI->getPointerAddressSpace();
Type *IntIdxTy = Builder.getIntNTy(DL->getIndexSizeInBits(StrAS));
@@ -1108,20 +1108,20 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
Value *StoreBasePtr = Expander.expandCodeFor(
StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator());
- // From here on out, conservatively report to the pass manager that we've
- // changed the IR, even if we later clean up these added instructions. There
- // may be structural differences e.g. in the order of use lists not accounted
- // for in just a textual dump of the IR. This is written as a variable, even
- // though statically all the places this dominates could be replaced with
- // 'true', with the hope that anyone trying to be clever / "more precise" with
- // the return value will read this comment, and leave them alone.
- Changed = true;
-
+ // From here on out, conservatively report to the pass manager that we've
+ // changed the IR, even if we later clean up these added instructions. There
+ // may be structural differences e.g. in the order of use lists not accounted
+ // for in just a textual dump of the IR. This is written as a variable, even
+ // though statically all the places this dominates could be replaced with
+ // 'true', with the hope that anyone trying to be clever / "more precise" with
+ // the return value will read this comment, and leave them alone.
+ Changed = true;
+
SmallPtrSet<Instruction *, 1> Stores;
Stores.insert(SI);
if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
StoreSize, *AA, Stores))
- return Changed;
+ return Changed;
const SCEV *LdStart = LoadEv->getStart();
unsigned LdAS = LI->getPointerAddressSpace();
@@ -1137,10 +1137,10 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
StoreSize, *AA, Stores))
- return Changed;
+ return Changed;
if (avoidLIRForMultiBlockLoop())
- return Changed;
+ return Changed;
// Okay, everything is safe, we can transform this!
@@ -1163,14 +1163,14 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
const Align StoreAlign = SI->getAlign();
const Align LoadAlign = LI->getAlign();
if (StoreAlign < StoreSize || LoadAlign < StoreSize)
- return Changed;
+ return Changed;
// If the element.atomic memcpy is not lowered into explicit
// loads/stores later, then it will be lowered into an element-size
// specific lib call. If the lib call doesn't exist for our store size, then
// we shouldn't generate the memcpy.
if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())
- return Changed;
+ return Changed;
// Create the call.
// Note that unordered atomic loads/stores are *required* by the spec to
@@ -1208,7 +1208,7 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
if (MSSAU && VerifyMemorySSA)
MSSAU->getMemorySSA()->verifyMemorySSA();
++NumMemCpy;
- ExpCleaner.markResultUsed();
+ ExpCleaner.markResultUsed();
return true;
}
@@ -1218,7 +1218,7 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
bool IsLoopMemset) {
if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) {
- if (CurLoop->isOutermost() && (!IsMemset || !IsLoopMemset)) {
+ if (CurLoop->isOutermost() && (!IsMemset || !IsLoopMemset)) {
LLVM_DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName()
<< " : LIR " << (IsMemset ? "Memset" : "Memcpy")
<< " avoided: multi-block top-level loop\n");
@@ -1235,8 +1235,8 @@ bool LoopIdiomRecognize::runOnNoncountableLoop() {
<< "] Noncountable Loop %"
<< CurLoop->getHeader()->getName() << "\n");
- return recognizePopcount() || recognizeAndInsertFFS() ||
- recognizeShiftUntilBitTest();
+ return recognizePopcount() || recognizeAndInsertFFS() ||
+ recognizeShiftUntilBitTest();
}
/// Check if the given conditional branch is based on the comparison between
@@ -1483,7 +1483,7 @@ static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL,
return false;
// step 4: Find the instruction which count the CTLZ: cnt.next = cnt + 1
- // or cnt.next = cnt + -1.
+ // or cnt.next = cnt + -1.
// TODO: We can skip the step. If loop trip count is known (CTLZ),
// then all uses of "cnt.next" could be optimized to the trip count
// plus "cnt0". Currently it is not optimized.
@@ -1497,7 +1497,7 @@ static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL,
continue;
ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
- if (!Inc || (!Inc->isOne() && !Inc->isMinusOne()))
+ if (!Inc || (!Inc->isOne() && !Inc->isMinusOne()))
continue;
PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
@@ -1728,11 +1728,11 @@ void LoopIdiomRecognize::transformLoopToCountable(
Builder.SetCurrentDebugLocation(DL);
// Count = BitWidth - CTLZ(InitX);
- // NewCount = Count;
+ // NewCount = Count;
// If there are uses of CntPhi create:
- // NewCount = BitWidth - CTLZ(InitX >> 1);
- // Count = NewCount + 1;
- Value *InitXNext;
+ // NewCount = BitWidth - CTLZ(InitX >> 1);
+ // Count = NewCount + 1;
+ Value *InitXNext;
if (IsCntPhiUsedOutsideLoop) {
if (DefX->getOpcode() == Instruction::AShr)
InitXNext =
@@ -1747,31 +1747,31 @@ void LoopIdiomRecognize::transformLoopToCountable(
llvm_unreachable("Unexpected opcode!");
} else
InitXNext = InitX;
- Value *FFS = createFFSIntrinsic(Builder, InitXNext, DL, ZeroCheck, IntrinID);
- Value *Count = Builder.CreateSub(
- ConstantInt::get(FFS->getType(), FFS->getType()->getIntegerBitWidth()),
+ Value *FFS = createFFSIntrinsic(Builder, InitXNext, DL, ZeroCheck, IntrinID);
+ Value *Count = Builder.CreateSub(
+ ConstantInt::get(FFS->getType(), FFS->getType()->getIntegerBitWidth()),
FFS);
- Value *NewCount = Count;
+ Value *NewCount = Count;
if (IsCntPhiUsedOutsideLoop) {
- NewCount = Count;
- Count = Builder.CreateAdd(Count, ConstantInt::get(Count->getType(), 1));
+ NewCount = Count;
+ Count = Builder.CreateAdd(Count, ConstantInt::get(Count->getType(), 1));
}
- NewCount = Builder.CreateZExtOrTrunc(NewCount,
- cast<IntegerType>(CntInst->getType()));
+ NewCount = Builder.CreateZExtOrTrunc(NewCount,
+ cast<IntegerType>(CntInst->getType()));
Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader);
- if (cast<ConstantInt>(CntInst->getOperand(1))->isOne()) {
- // If the counter was being incremented in the loop, add NewCount to the
- // counter's initial value, but only if the initial value is not zero.
- ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
- if (!InitConst || !InitConst->isZero())
- NewCount = Builder.CreateAdd(NewCount, CntInitVal);
- } else {
- // If the count was being decremented in the loop, subtract NewCount from
- // the counter's initial value.
- NewCount = Builder.CreateSub(CntInitVal, NewCount);
- }
+ if (cast<ConstantInt>(CntInst->getOperand(1))->isOne()) {
+ // If the counter was being incremented in the loop, add NewCount to the
+ // counter's initial value, but only if the initial value is not zero.
+ ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
+ if (!InitConst || !InitConst->isZero())
+ NewCount = Builder.CreateAdd(NewCount, CntInitVal);
+ } else {
+ // If the count was being decremented in the loop, subtract NewCount from
+ // the counter's initial value.
+ NewCount = Builder.CreateSub(CntInitVal, NewCount);
+ }
// Step 2: Insert new IV and loop condition:
// loop:
@@ -1919,343 +1919,343 @@ void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
// loop. The loop would otherwise not be deleted even if it becomes empty.
SE->forgetLoop(CurLoop);
}
-
-/// Match loop-invariant value.
-template <typename SubPattern_t> struct match_LoopInvariant {
- SubPattern_t SubPattern;
- const Loop *L;
-
- match_LoopInvariant(const SubPattern_t &SP, const Loop *L)
- : SubPattern(SP), L(L) {}
-
- template <typename ITy> bool match(ITy *V) {
- return L->isLoopInvariant(V) && SubPattern.match(V);
- }
-};
-
-/// Matches if the value is loop-invariant.
-template <typename Ty>
-inline match_LoopInvariant<Ty> m_LoopInvariant(const Ty &M, const Loop *L) {
- return match_LoopInvariant<Ty>(M, L);
-}
-
-/// Return true if the idiom is detected in the loop.
-///
-/// The core idiom we are trying to detect is:
-/// \code
-/// entry:
-/// <...>
-/// %bitmask = shl i32 1, %bitpos
-/// br label %loop
-///
-/// loop:
-/// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
-/// %x.curr.bitmasked = and i32 %x.curr, %bitmask
-/// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
-/// %x.next = shl i32 %x.curr, 1
-/// <...>
-/// br i1 %x.curr.isbitunset, label %loop, label %end
-///
-/// end:
-/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
-/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
-/// <...>
-/// \endcode
-static bool detectShiftUntilBitTestIdiom(Loop *CurLoop, Value *&BaseX,
- Value *&BitMask, Value *&BitPos,
- Value *&CurrX, Instruction *&NextX) {
- LLVM_DEBUG(dbgs() << DEBUG_TYPE
- " Performing shift-until-bittest idiom detection.\n");
-
- // Give up if the loop has multiple blocks or multiple backedges.
- if (CurLoop->getNumBlocks() != 1 || CurLoop->getNumBackEdges() != 1) {
- LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n");
- return false;
- }
-
- BasicBlock *LoopHeaderBB = CurLoop->getHeader();
- BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
- assert(LoopPreheaderBB && "There is always a loop preheader.");
-
- using namespace PatternMatch;
-
- // Step 1: Check if the loop backedge is in desirable form.
-
- ICmpInst::Predicate Pred;
- Value *CmpLHS, *CmpRHS;
- BasicBlock *TrueBB, *FalseBB;
- if (!match(LoopHeaderBB->getTerminator(),
- m_Br(m_ICmp(Pred, m_Value(CmpLHS), m_Value(CmpRHS)),
- m_BasicBlock(TrueBB), m_BasicBlock(FalseBB)))) {
- LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n");
- return false;
- }
-
- // Step 2: Check if the backedge's condition is in desirable form.
-
- auto MatchVariableBitMask = [&]() {
- return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) &&
- match(CmpLHS,
- m_c_And(m_Value(CurrX),
- m_CombineAnd(
- m_Value(BitMask),
- m_LoopInvariant(m_Shl(m_One(), m_Value(BitPos)),
- CurLoop))));
- };
- auto MatchConstantBitMask = [&]() {
- return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) &&
- match(CmpLHS, m_And(m_Value(CurrX),
- m_CombineAnd(m_Value(BitMask), m_Power2()))) &&
- (BitPos = ConstantExpr::getExactLogBase2(cast<Constant>(BitMask)));
- };
- auto MatchDecomposableConstantBitMask = [&]() {
- APInt Mask;
- return llvm::decomposeBitTestICmp(CmpLHS, CmpRHS, Pred, CurrX, Mask) &&
- ICmpInst::isEquality(Pred) && Mask.isPowerOf2() &&
- (BitMask = ConstantInt::get(CurrX->getType(), Mask)) &&
- (BitPos = ConstantInt::get(CurrX->getType(), Mask.logBase2()));
- };
-
- if (!MatchVariableBitMask() && !MatchConstantBitMask() &&
- !MatchDecomposableConstantBitMask()) {
- LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge comparison.\n");
- return false;
- }
-
- // Step 3: Check if the recurrence is in desirable form.
- auto *CurrXPN = dyn_cast<PHINode>(CurrX);
- if (!CurrXPN || CurrXPN->getParent() != LoopHeaderBB) {
- LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n");
- return false;
- }
-
- BaseX = CurrXPN->getIncomingValueForBlock(LoopPreheaderBB);
- NextX =
- dyn_cast<Instruction>(CurrXPN->getIncomingValueForBlock(LoopHeaderBB));
-
- if (!NextX || !match(NextX, m_Shl(m_Specific(CurrX), m_One()))) {
- // FIXME: support right-shift?
- LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n");
- return false;
- }
-
- // Step 4: Check if the backedge's destinations are in desirable form.
-
- assert(ICmpInst::isEquality(Pred) &&
- "Should only get equality predicates here.");
-
- // cmp-br is commutative, so canonicalize to a single variant.
- if (Pred != ICmpInst::Predicate::ICMP_EQ) {
- Pred = ICmpInst::getInversePredicate(Pred);
- std::swap(TrueBB, FalseBB);
- }
-
- // We expect to exit loop when comparison yields false,
- // so when it yields true we should branch back to loop header.
- if (TrueBB != LoopHeaderBB) {
- LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n");
- return false;
- }
-
- // Okay, idiom checks out.
- return true;
-}
-
-/// Look for the following loop:
-/// \code
-/// entry:
-/// <...>
-/// %bitmask = shl i32 1, %bitpos
-/// br label %loop
-///
-/// loop:
-/// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
-/// %x.curr.bitmasked = and i32 %x.curr, %bitmask
-/// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
-/// %x.next = shl i32 %x.curr, 1
-/// <...>
-/// br i1 %x.curr.isbitunset, label %loop, label %end
-///
-/// end:
-/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
-/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
-/// <...>
-/// \endcode
-///
-/// And transform it into:
-/// \code
-/// entry:
-/// %bitmask = shl i32 1, %bitpos
-/// %lowbitmask = add i32 %bitmask, -1
-/// %mask = or i32 %lowbitmask, %bitmask
-/// %x.masked = and i32 %x, %mask
-/// %x.masked.numleadingzeros = call i32 @llvm.ctlz.i32(i32 %x.masked,
-/// i1 true)
-/// %x.masked.numactivebits = sub i32 32, %x.masked.numleadingzeros
-/// %x.masked.leadingonepos = add i32 %x.masked.numactivebits, -1
-/// %backedgetakencount = sub i32 %bitpos, %x.masked.leadingonepos
-/// %tripcount = add i32 %backedgetakencount, 1
-/// %x.curr = shl i32 %x, %backedgetakencount
-/// %x.next = shl i32 %x, %tripcount
-/// br label %loop
-///
-/// loop:
-/// %loop.iv = phi i32 [ 0, %entry ], [ %loop.iv.next, %loop ]
-/// %loop.iv.next = add nuw i32 %loop.iv, 1
-/// %loop.ivcheck = icmp eq i32 %loop.iv.next, %tripcount
-/// <...>
-/// br i1 %loop.ivcheck, label %end, label %loop
-///
-/// end:
-/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
-/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
-/// <...>
-/// \endcode
-bool LoopIdiomRecognize::recognizeShiftUntilBitTest() {
- bool MadeChange = false;
-
- Value *X, *BitMask, *BitPos, *XCurr;
- Instruction *XNext;
- if (!detectShiftUntilBitTestIdiom(CurLoop, X, BitMask, BitPos, XCurr,
- XNext)) {
- LLVM_DEBUG(dbgs() << DEBUG_TYPE
- " shift-until-bittest idiom detection failed.\n");
- return MadeChange;
- }
- LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom detected!\n");
-
- // Ok, it is the idiom we were looking for, we *could* transform this loop,
- // but is it profitable to transform?
-
- BasicBlock *LoopHeaderBB = CurLoop->getHeader();
- BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
- assert(LoopPreheaderBB && "There is always a loop preheader.");
-
- BasicBlock *SuccessorBB = CurLoop->getExitBlock();
- assert(LoopPreheaderBB && "There is only a single successor.");
-
- IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
- Builder.SetCurrentDebugLocation(cast<Instruction>(XCurr)->getDebugLoc());
-
- Intrinsic::ID IntrID = Intrinsic::ctlz;
- Type *Ty = X->getType();
-
- TargetTransformInfo::TargetCostKind CostKind =
- TargetTransformInfo::TCK_SizeAndLatency;
-
- // The rewrite is considered to be unprofitable iff and only iff the
- // intrinsic/shift we'll use are not cheap. Note that we are okay with *just*
- // making the loop countable, even if nothing else changes.
- IntrinsicCostAttributes Attrs(
- IntrID, Ty, {UndefValue::get(Ty), /*is_zero_undef=*/Builder.getTrue()});
- int Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind);
- if (Cost > TargetTransformInfo::TCC_Basic) {
- LLVM_DEBUG(dbgs() << DEBUG_TYPE
- " Intrinsic is too costly, not beneficial\n");
- return MadeChange;
- }
- if (TTI->getArithmeticInstrCost(Instruction::Shl, Ty, CostKind) >
- TargetTransformInfo::TCC_Basic) {
- LLVM_DEBUG(dbgs() << DEBUG_TYPE " Shift is too costly, not beneficial\n");
- return MadeChange;
- }
-
- // Ok, transform appears worthwhile.
- MadeChange = true;
-
- // Step 1: Compute the loop trip count.
-
- Value *LowBitMask = Builder.CreateAdd(BitMask, Constant::getAllOnesValue(Ty),
- BitPos->getName() + ".lowbitmask");
- Value *Mask =
- Builder.CreateOr(LowBitMask, BitMask, BitPos->getName() + ".mask");
- Value *XMasked = Builder.CreateAnd(X, Mask, X->getName() + ".masked");
- CallInst *XMaskedNumLeadingZeros = Builder.CreateIntrinsic(
- IntrID, Ty, {XMasked, /*is_zero_undef=*/Builder.getTrue()},
- /*FMFSource=*/nullptr, XMasked->getName() + ".numleadingzeros");
- Value *XMaskedNumActiveBits = Builder.CreateSub(
- ConstantInt::get(Ty, Ty->getScalarSizeInBits()), XMaskedNumLeadingZeros,
- XMasked->getName() + ".numactivebits");
- Value *XMaskedLeadingOnePos =
- Builder.CreateAdd(XMaskedNumActiveBits, Constant::getAllOnesValue(Ty),
- XMasked->getName() + ".leadingonepos");
-
- Value *LoopBackedgeTakenCount = Builder.CreateSub(
- BitPos, XMaskedLeadingOnePos, CurLoop->getName() + ".backedgetakencount");
- // We know loop's backedge-taken count, but what's loop's trip count?
- // Note that while NUW is always safe, while NSW is only for bitwidths != 2.
- Value *LoopTripCount =
- Builder.CreateNUWAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1),
- CurLoop->getName() + ".tripcount");
-
- // Step 2: Compute the recurrence's final value without a loop.
-
- // NewX is always safe to compute, because `LoopBackedgeTakenCount`
- // will always be smaller than `bitwidth(X)`, i.e. we never get poison.
- Value *NewX = Builder.CreateShl(X, LoopBackedgeTakenCount);
- NewX->takeName(XCurr);
- if (auto *I = dyn_cast<Instruction>(NewX))
- I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true);
-
- Value *NewXNext;
- // Rewriting XNext is more complicated, however, because `X << LoopTripCount`
- // will be poison iff `LoopTripCount == bitwidth(X)` (which will happen
- // iff `BitPos` is `bitwidth(x) - 1` and `X` is `1`). So unless we know
- // that isn't the case, we'll need to emit an alternative, safe IR.
- if (XNext->hasNoSignedWrap() || XNext->hasNoUnsignedWrap() ||
- PatternMatch::match(
- BitPos, PatternMatch::m_SpecificInt_ICMP(
- ICmpInst::ICMP_NE, APInt(Ty->getScalarSizeInBits(),
- Ty->getScalarSizeInBits() - 1))))
- NewXNext = Builder.CreateShl(X, LoopTripCount);
- else {
- // Otherwise, just additionally shift by one. It's the smallest solution,
- // alternatively, we could check that NewX is INT_MIN (or BitPos is )
- // and select 0 instead.
- NewXNext = Builder.CreateShl(NewX, ConstantInt::get(Ty, 1));
- }
-
- NewXNext->takeName(XNext);
- if (auto *I = dyn_cast<Instruction>(NewXNext))
- I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true);
-
- // Step 3: Adjust the successor basic block to recieve the computed
- // recurrence's final value instead of the recurrence itself.
-
- XCurr->replaceUsesOutsideBlock(NewX, LoopHeaderBB);
- XNext->replaceUsesOutsideBlock(NewXNext, LoopHeaderBB);
-
- // Step 4: Rewrite the loop into a countable form, with canonical IV.
-
- // The new canonical induction variable.
- Builder.SetInsertPoint(&LoopHeaderBB->front());
- auto *IV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv");
-
- // The induction itself.
- // Note that while NUW is always safe, while NSW is only for bitwidths != 2.
- Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
- auto *IVNext = Builder.CreateNUWAdd(IV, ConstantInt::get(Ty, 1),
- IV->getName() + ".next");
-
- // The loop trip count check.
- auto *IVCheck = Builder.CreateICmpEQ(IVNext, LoopTripCount,
- CurLoop->getName() + ".ivcheck");
- Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB);
- LoopHeaderBB->getTerminator()->eraseFromParent();
-
- // Populate the IV PHI.
- IV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB);
- IV->addIncoming(IVNext, LoopHeaderBB);
-
- // Step 5: Forget the "non-computable" trip-count SCEV associated with the
- // loop. The loop would otherwise not be deleted even if it becomes empty.
-
- SE->forgetLoop(CurLoop);
-
- // Other passes will take care of actually deleting the loop if possible.
-
- LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom optimized!\n");
-
- ++NumShiftUntilBitTest;
- return MadeChange;
-}
+
+/// Match loop-invariant value.
+template <typename SubPattern_t> struct match_LoopInvariant {
+ SubPattern_t SubPattern;
+ const Loop *L;
+
+ match_LoopInvariant(const SubPattern_t &SP, const Loop *L)
+ : SubPattern(SP), L(L) {}
+
+ template <typename ITy> bool match(ITy *V) {
+ return L->isLoopInvariant(V) && SubPattern.match(V);
+ }
+};
+
+/// Matches if the value is loop-invariant.
+template <typename Ty>
+inline match_LoopInvariant<Ty> m_LoopInvariant(const Ty &M, const Loop *L) {
+ return match_LoopInvariant<Ty>(M, L);
+}
+
/// Return true if the shift-until-bittest idiom is detected in \p CurLoop.
///
/// The core idiom we are trying to detect is:
/// \code
///   entry:
///     <...>
///     %bitmask = shl i32 1, %bitpos
///     br label %loop
///
///   loop:
///     %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
///     %x.curr.bitmasked = and i32 %x.curr, %bitmask
///     %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
///     %x.next = shl i32 %x.curr, 1
///     <...>
///     br i1 %x.curr.isbitunset, label %loop, label %end
///
///   end:
///     %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
///     %x.next.res = phi i32 [ %x.next, %loop ] <...>
///     <...>
/// \endcode
///
/// On success the out-parameters describe the idiom:
///   \p BaseX   - the value of %x incoming from the loop preheader.
///   \p BitMask - the single-bit mask being tested, i.e. %bitmask.
///   \p BitPos  - the position of the tested bit, i.e. %bitpos.
///   \p CurrX   - the recurrence PHI, i.e. %x.curr.
///   \p NextX   - the instruction computing the next value, i.e. %x.next.
static bool detectShiftUntilBitTestIdiom(Loop *CurLoop, Value *&BaseX,
                                         Value *&BitMask, Value *&BitPos,
                                         Value *&CurrX, Instruction *&NextX) {
  LLVM_DEBUG(dbgs() << DEBUG_TYPE
             " Performing shift-until-bittest idiom detection.\n");

  // Give up if the loop has multiple blocks or multiple backedges.
  if (CurLoop->getNumBlocks() != 1 || CurLoop->getNumBackEdges() != 1) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n");
    return false;
  }

  BasicBlock *LoopHeaderBB = CurLoop->getHeader();
  BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
  assert(LoopPreheaderBB && "There is always a loop preheader.");

  using namespace PatternMatch;

  // Step 1: Check if the loop backedge is in desirable form.

  // The single block must end in a conditional branch on an icmp.
  ICmpInst::Predicate Pred;
  Value *CmpLHS, *CmpRHS;
  BasicBlock *TrueBB, *FalseBB;
  if (!match(LoopHeaderBB->getTerminator(),
             m_Br(m_ICmp(Pred, m_Value(CmpLHS), m_Value(CmpRHS)),
                  m_BasicBlock(TrueBB), m_BasicBlock(FalseBB)))) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n");
    return false;
  }

  // Step 2: Check if the backedge's condition is in desirable form.
  // Three equivalent shapes of the bit test are recognized; each matcher
  // both checks the shape and, on success, populates the out-parameters.

  // Shape 1: the mask is (1 << %bitpos), and the shift is loop-invariant.
  auto MatchVariableBitMask = [&]() {
    return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) &&
           match(CmpLHS,
                 m_c_And(m_Value(CurrX),
                         m_CombineAnd(
                             m_Value(BitMask),
                             m_LoopInvariant(m_Shl(m_One(), m_Value(BitPos)),
                                             CurLoop))));
  };
  // Shape 2: the mask is a power-of-two constant; recover %bitpos via exact
  // log2. Note the assignment-as-condition: it succeeds iff
  // getExactLogBase2() returns a non-null constant.
  auto MatchConstantBitMask = [&]() {
    return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) &&
           match(CmpLHS, m_And(m_Value(CurrX),
                               m_CombineAnd(m_Value(BitMask), m_Power2()))) &&
           (BitPos = ConstantExpr::getExactLogBase2(cast<Constant>(BitMask)));
  };
  // Shape 3: the compare was canonicalized into some other form that
  // decomposeBitTestICmp() can undo. That call may rewrite Pred and CurrX,
  // which is why equality is (re-)checked afterwards, not before.
  auto MatchDecomposableConstantBitMask = [&]() {
    APInt Mask;
    return llvm::decomposeBitTestICmp(CmpLHS, CmpRHS, Pred, CurrX, Mask) &&
           ICmpInst::isEquality(Pred) && Mask.isPowerOf2() &&
           (BitMask = ConstantInt::get(CurrX->getType(), Mask)) &&
           (BitPos = ConstantInt::get(CurrX->getType(), Mask.logBase2()));
  };

  if (!MatchVariableBitMask() && !MatchConstantBitMask() &&
      !MatchDecomposableConstantBitMask()) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge comparison.\n");
    return false;
  }

  // Step 3: Check if the recurrence is in desirable form.
  // The value being bit-tested must be a PHI in the (single) loop block.
  auto *CurrXPN = dyn_cast<PHINode>(CurrX);
  if (!CurrXPN || CurrXPN->getParent() != LoopHeaderBB) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n");
    return false;
  }

  BaseX = CurrXPN->getIncomingValueForBlock(LoopPreheaderBB);
  NextX =
      dyn_cast<Instruction>(CurrXPN->getIncomingValueForBlock(LoopHeaderBB));

  // The backedge value must be exactly (%x.curr << 1).
  if (!NextX || !match(NextX, m_Shl(m_Specific(CurrX), m_One()))) {
    // FIXME: support right-shift?
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n");
    return false;
  }

  // Step 4: Check if the backedge's destinations are in desirable form.

  assert(ICmpInst::isEquality(Pred) &&
         "Should only get equality predicates here.");

  // cmp-br is commutative, so canonicalize to a single variant.
  if (Pred != ICmpInst::Predicate::ICMP_EQ) {
    Pred = ICmpInst::getInversePredicate(Pred);
    std::swap(TrueBB, FalseBB);
  }

  // We expect to exit loop when comparison yields false,
  // so when it yields true we should branch back to loop header.
  if (TrueBB != LoopHeaderBB) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n");
    return false;
  }

  // Okay, idiom checks out.
  return true;
}
+
+/// Look for the following loop:
+/// \code
+/// entry:
+/// <...>
+/// %bitmask = shl i32 1, %bitpos
+/// br label %loop
+///
+/// loop:
+/// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
+/// %x.curr.bitmasked = and i32 %x.curr, %bitmask
+/// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
+/// %x.next = shl i32 %x.curr, 1
+/// <...>
+/// br i1 %x.curr.isbitunset, label %loop, label %end
+///
+/// end:
+/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
+/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
+/// <...>
+/// \endcode
+///
+/// And transform it into:
+/// \code
+/// entry:
+/// %bitmask = shl i32 1, %bitpos
+/// %lowbitmask = add i32 %bitmask, -1
+/// %mask = or i32 %lowbitmask, %bitmask
+/// %x.masked = and i32 %x, %mask
+/// %x.masked.numleadingzeros = call i32 @llvm.ctlz.i32(i32 %x.masked,
+/// i1 true)
+/// %x.masked.numactivebits = sub i32 32, %x.masked.numleadingzeros
+/// %x.masked.leadingonepos = add i32 %x.masked.numactivebits, -1
+/// %backedgetakencount = sub i32 %bitpos, %x.masked.leadingonepos
+/// %tripcount = add i32 %backedgetakencount, 1
+/// %x.curr = shl i32 %x, %backedgetakencount
+/// %x.next = shl i32 %x, %tripcount
+/// br label %loop
+///
+/// loop:
+/// %loop.iv = phi i32 [ 0, %entry ], [ %loop.iv.next, %loop ]
+/// %loop.iv.next = add nuw i32 %loop.iv, 1
+/// %loop.ivcheck = icmp eq i32 %loop.iv.next, %tripcount
+/// <...>
+/// br i1 %loop.ivcheck, label %end, label %loop
+///
+/// end:
+/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
+/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
+/// <...>
+/// \endcode
+bool LoopIdiomRecognize::recognizeShiftUntilBitTest() {
+ bool MadeChange = false;
+
+ Value *X, *BitMask, *BitPos, *XCurr;
+ Instruction *XNext;
+ if (!detectShiftUntilBitTestIdiom(CurLoop, X, BitMask, BitPos, XCurr,
+ XNext)) {
+ LLVM_DEBUG(dbgs() << DEBUG_TYPE
+ " shift-until-bittest idiom detection failed.\n");
+ return MadeChange;
+ }
+ LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom detected!\n");
+
+ // Ok, it is the idiom we were looking for, we *could* transform this loop,
+ // but is it profitable to transform?
+
+ BasicBlock *LoopHeaderBB = CurLoop->getHeader();
+ BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
+ assert(LoopPreheaderBB && "There is always a loop preheader.");
+
+ BasicBlock *SuccessorBB = CurLoop->getExitBlock();
+ assert(LoopPreheaderBB && "There is only a single successor.");
+
+ IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
+ Builder.SetCurrentDebugLocation(cast<Instruction>(XCurr)->getDebugLoc());
+
+ Intrinsic::ID IntrID = Intrinsic::ctlz;
+ Type *Ty = X->getType();
+
+ TargetTransformInfo::TargetCostKind CostKind =
+ TargetTransformInfo::TCK_SizeAndLatency;
+
+ // The rewrite is considered to be unprofitable iff and only iff the
+ // intrinsic/shift we'll use are not cheap. Note that we are okay with *just*
+ // making the loop countable, even if nothing else changes.
+ IntrinsicCostAttributes Attrs(
+ IntrID, Ty, {UndefValue::get(Ty), /*is_zero_undef=*/Builder.getTrue()});
+ int Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind);
+ if (Cost > TargetTransformInfo::TCC_Basic) {
+ LLVM_DEBUG(dbgs() << DEBUG_TYPE
+ " Intrinsic is too costly, not beneficial\n");
+ return MadeChange;
+ }
+ if (TTI->getArithmeticInstrCost(Instruction::Shl, Ty, CostKind) >
+ TargetTransformInfo::TCC_Basic) {
+ LLVM_DEBUG(dbgs() << DEBUG_TYPE " Shift is too costly, not beneficial\n");
+ return MadeChange;
+ }
+
+ // Ok, transform appears worthwhile.
+ MadeChange = true;
+
+ // Step 1: Compute the loop trip count.
+
+ Value *LowBitMask = Builder.CreateAdd(BitMask, Constant::getAllOnesValue(Ty),
+ BitPos->getName() + ".lowbitmask");
+ Value *Mask =
+ Builder.CreateOr(LowBitMask, BitMask, BitPos->getName() + ".mask");
+ Value *XMasked = Builder.CreateAnd(X, Mask, X->getName() + ".masked");
+ CallInst *XMaskedNumLeadingZeros = Builder.CreateIntrinsic(
+ IntrID, Ty, {XMasked, /*is_zero_undef=*/Builder.getTrue()},
+ /*FMFSource=*/nullptr, XMasked->getName() + ".numleadingzeros");
+ Value *XMaskedNumActiveBits = Builder.CreateSub(
+ ConstantInt::get(Ty, Ty->getScalarSizeInBits()), XMaskedNumLeadingZeros,
+ XMasked->getName() + ".numactivebits");
+ Value *XMaskedLeadingOnePos =
+ Builder.CreateAdd(XMaskedNumActiveBits, Constant::getAllOnesValue(Ty),
+ XMasked->getName() + ".leadingonepos");
+
+ Value *LoopBackedgeTakenCount = Builder.CreateSub(
+ BitPos, XMaskedLeadingOnePos, CurLoop->getName() + ".backedgetakencount");
+ // We know loop's backedge-taken count, but what's loop's trip count?
+ // Note that while NUW is always safe, while NSW is only for bitwidths != 2.
+ Value *LoopTripCount =
+ Builder.CreateNUWAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1),
+ CurLoop->getName() + ".tripcount");
+
+ // Step 2: Compute the recurrence's final value without a loop.
+
+ // NewX is always safe to compute, because `LoopBackedgeTakenCount`
+ // will always be smaller than `bitwidth(X)`, i.e. we never get poison.
+ Value *NewX = Builder.CreateShl(X, LoopBackedgeTakenCount);
+ NewX->takeName(XCurr);
+ if (auto *I = dyn_cast<Instruction>(NewX))
+ I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true);
+
+ Value *NewXNext;
+ // Rewriting XNext is more complicated, however, because `X << LoopTripCount`
+ // will be poison iff `LoopTripCount == bitwidth(X)` (which will happen
+ // iff `BitPos` is `bitwidth(x) - 1` and `X` is `1`). So unless we know
+ // that isn't the case, we'll need to emit an alternative, safe IR.
+ if (XNext->hasNoSignedWrap() || XNext->hasNoUnsignedWrap() ||
+ PatternMatch::match(
+ BitPos, PatternMatch::m_SpecificInt_ICMP(
+ ICmpInst::ICMP_NE, APInt(Ty->getScalarSizeInBits(),
+ Ty->getScalarSizeInBits() - 1))))
+ NewXNext = Builder.CreateShl(X, LoopTripCount);
+ else {
+ // Otherwise, just additionally shift by one. It's the smallest solution,
+ // alternatively, we could check that NewX is INT_MIN (or BitPos is )
+ // and select 0 instead.
+ NewXNext = Builder.CreateShl(NewX, ConstantInt::get(Ty, 1));
+ }
+
+ NewXNext->takeName(XNext);
+ if (auto *I = dyn_cast<Instruction>(NewXNext))
+ I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true);
+
+ // Step 3: Adjust the successor basic block to recieve the computed
+ // recurrence's final value instead of the recurrence itself.
+
+ XCurr->replaceUsesOutsideBlock(NewX, LoopHeaderBB);
+ XNext->replaceUsesOutsideBlock(NewXNext, LoopHeaderBB);
+
+ // Step 4: Rewrite the loop into a countable form, with canonical IV.
+
+ // The new canonical induction variable.
+ Builder.SetInsertPoint(&LoopHeaderBB->front());
+ auto *IV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv");
+
+ // The induction itself.
+ // Note that while NUW is always safe, while NSW is only for bitwidths != 2.
+ Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
+ auto *IVNext = Builder.CreateNUWAdd(IV, ConstantInt::get(Ty, 1),
+ IV->getName() + ".next");
+
+ // The loop trip count check.
+ auto *IVCheck = Builder.CreateICmpEQ(IVNext, LoopTripCount,
+ CurLoop->getName() + ".ivcheck");
+ Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB);
+ LoopHeaderBB->getTerminator()->eraseFromParent();
+
+ // Populate the IV PHI.
+ IV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB);
+ IV->addIncoming(IVNext, LoopHeaderBB);
+
+ // Step 5: Forget the "non-computable" trip-count SCEV associated with the
+ // loop. The loop would otherwise not be deleted even if it becomes empty.
+
+ SE->forgetLoop(CurLoop);
+
+ // Other passes will take care of actually deleting the loop if possible.
+
+ LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom optimized!\n");
+
+ ++NumShiftUntilBitTest;
+ return MadeChange;
+}