diff options
author | shadchin <shadchin@yandex-team.ru> | 2022-02-10 16:44:30 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:44:30 +0300 |
commit | 2598ef1d0aee359b4b6d5fdd1758916d5907d04f (patch) | |
tree | 012bb94d777798f1f56ac1cec429509766d05181 /contrib/libs/llvm12/lib/Transforms/Scalar/LoopIdiomRecognize.cpp | |
parent | 6751af0b0c1b952fede40b19b71da8025b5d8bcf (diff) | |
download | ydb-2598ef1d0aee359b4b6d5fdd1758916d5907d04f.tar.gz |
Restoring authorship annotation for <shadchin@yandex-team.ru>. Commit 1 of 2.
Diffstat (limited to 'contrib/libs/llvm12/lib/Transforms/Scalar/LoopIdiomRecognize.cpp')
-rw-r--r-- | contrib/libs/llvm12/lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 916 |
1 files changed, 458 insertions, 458 deletions
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 8064c02e2b..e60c95b7be 100644 --- a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -47,7 +47,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/CmpInstAnalysis.h" +#include "llvm/Analysis/CmpInstAnalysis.h" #include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" @@ -80,7 +80,7 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" -#include "llvm/IR/PatternMatch.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" @@ -108,33 +108,33 @@ using namespace llvm; STATISTIC(NumMemSet, "Number of memset's formed from loop stores"); STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores"); -STATISTIC( - NumShiftUntilBitTest, - "Number of uncountable loops recognized as 'shift until bitttest' idiom"); - -bool DisableLIRP::All; -static cl::opt<bool, true> - DisableLIRPAll("disable-" DEBUG_TYPE "-all", - cl::desc("Options to disable Loop Idiom Recognize Pass."), - cl::location(DisableLIRP::All), cl::init(false), - cl::ReallyHidden); - -bool DisableLIRP::Memset; -static cl::opt<bool, true> - DisableLIRPMemset("disable-" DEBUG_TYPE "-memset", - cl::desc("Proceed with loop idiom recognize pass, but do " - "not convert loop(s) to memset."), - cl::location(DisableLIRP::Memset), cl::init(false), - cl::ReallyHidden); - -bool DisableLIRP::Memcpy; -static cl::opt<bool, true> - DisableLIRPMemcpy("disable-" DEBUG_TYPE "-memcpy", - cl::desc("Proceed with loop idiom recognize pass, but do " - "not convert loop(s) to memcpy."), - cl::location(DisableLIRP::Memcpy), cl::init(false), - cl::ReallyHidden); - +STATISTIC( + NumShiftUntilBitTest, + "Number of uncountable loops recognized as 'shift until bitttest' idiom"); + +bool DisableLIRP::All; +static cl::opt<bool, true> + DisableLIRPAll("disable-" DEBUG_TYPE "-all", + cl::desc("Options to disable Loop Idiom Recognize Pass."), + cl::location(DisableLIRP::All), cl::init(false), + cl::ReallyHidden); + +bool DisableLIRP::Memset; +static cl::opt<bool, true> + DisableLIRPMemset("disable-" DEBUG_TYPE "-memset", + cl::desc("Proceed with loop idiom recognize pass, but do " + "not convert loop(s) to memset."), + cl::location(DisableLIRP::Memset), cl::init(false), + cl::ReallyHidden); + +bool DisableLIRP::Memcpy; +static cl::opt<bool, true> + DisableLIRPMemcpy("disable-" DEBUG_TYPE "-memcpy", + cl::desc("Proceed with loop idiom recognize pass, but do " + "not convert loop(s) to memcpy."), + cl::location(DisableLIRP::Memcpy), cl::init(false), + cl::ReallyHidden); + static cl::opt<bool> UseLIRCodeSizeHeurs( "use-lir-code-size-heurs", cl::desc("Use loop idiom recognition code size heuristics when compiling" @@ -232,8 +232,8 @@ private: const DebugLoc &DL, bool ZeroCheck, bool IsCntPhiUsedOutsideLoop); - bool recognizeShiftUntilBitTest(); - + bool recognizeShiftUntilBitTest(); + /// @} }; @@ -247,9 +247,9 @@ public: } bool runOnLoop(Loop *L, LPPassManager &LPM) override { - if (DisableLIRP::All) - return false; - + if (DisableLIRP::All) + return false; + if (skipLoop(L)) return false; @@ -295,9 +295,9 @@ char LoopIdiomRecognizeLegacyPass::ID = 0; PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &) { - if (DisableLIRP::All) - return PreservedAnalyses::all(); - + if (DisableLIRP::All) + return PreservedAnalyses::all(); + const auto *DL = &L.getHeader()->getModule()->getDataLayout(); // For the new PM, we also can't use OptimizationRemarkEmitter as an analysis @@ -469,17 +469,17 @@ LoopIdiomRecognize::isLegalStore(StoreInst *SI) { Value *StoredVal = SI->getValueOperand(); Value *StorePtr = SI->getPointerOperand(); - // Don't convert stores of non-integral pointer types to memsets (which stores - // integers). - if (DL->isNonIntegralPointerType(StoredVal->getType()->getScalarType())) - return LegalStoreKind::None; - + // Don't convert stores of non-integral pointer types to memsets (which stores + // integers). + if (DL->isNonIntegralPointerType(StoredVal->getType()->getScalarType())) + return LegalStoreKind::None; + // Reject stores that are so large that they overflow an unsigned. - // When storing out scalable vectors we bail out for now, since the code - // below currently only works for constant strides. - TypeSize SizeInBits = DL->getTypeSizeInBits(StoredVal->getType()); - if (SizeInBits.isScalable() || (SizeInBits.getFixedSize() & 7) || - (SizeInBits.getFixedSize() >> 32) != 0) + // When storing out scalable vectors we bail out for now, since the code + // below currently only works for constant strides. + TypeSize SizeInBits = DL->getTypeSizeInBits(StoredVal->getType()); + if (SizeInBits.isScalable() || (SizeInBits.getFixedSize() & 7) || + (SizeInBits.getFixedSize() >> 32) != 0) return LegalStoreKind::None; // See if the pointer expression is an AddRec like {base,+,1} on the current @@ -508,13 +508,13 @@ LoopIdiomRecognize::isLegalStore(StoreInst *SI) { // If we're allowed to form a memset, and the stored value would be // acceptable for memset, use it. - if (!UnorderedAtomic && HasMemset && SplatValue && !DisableLIRP::Memset && + if (!UnorderedAtomic && HasMemset && SplatValue && !DisableLIRP::Memset && // Verify that the stored value is loop invariant. If not, we can't // promote the memset. CurLoop->isLoopInvariant(SplatValue)) { // It looks like we can use SplatValue. return LegalStoreKind::Memset; - } else if (!UnorderedAtomic && HasMemsetPattern && !DisableLIRP::Memset && + } else if (!UnorderedAtomic && HasMemsetPattern && !DisableLIRP::Memset && // Don't create memset_pattern16s with address spaces. StorePtr->getType()->getPointerAddressSpace() == 0 && (PatternValue = getMemSetPatternValue(StoredVal, DL))) { @@ -523,7 +523,7 @@ LoopIdiomRecognize::isLegalStore(StoreInst *SI) { } // Otherwise, see if the store can be turned into a memcpy. - if (HasMemcpy && !DisableLIRP::Memcpy) { + if (HasMemcpy && !DisableLIRP::Memcpy) { // Check to see if the stride matches the size of the store. If so, then we // know that every byte is touched in the loop. APInt Stride = getStoreStride(StoreEv); @@ -578,12 +578,12 @@ void LoopIdiomRecognize::collectStores(BasicBlock *BB) { break; case LegalStoreKind::Memset: { // Find the base pointer. - Value *Ptr = getUnderlyingObject(SI->getPointerOperand()); + Value *Ptr = getUnderlyingObject(SI->getPointerOperand()); StoreRefsForMemset[Ptr].push_back(SI); } break; case LegalStoreKind::MemsetPattern: { // Find the base pointer. - Value *Ptr = getUnderlyingObject(SI->getPointerOperand()); + Value *Ptr = getUnderlyingObject(SI->getPointerOperand()); StoreRefsForMemsetPattern[Ptr].push_back(SI); } break; case LegalStoreKind::Memcpy: @@ -851,7 +851,7 @@ mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L, // Get the location that may be stored across the loop. Since the access is // strided positively through memory, we say that the modified location starts // at the pointer and has infinite size. - LocationSize AccessSize = LocationSize::afterPointer(); + LocationSize AccessSize = LocationSize::afterPointer(); // If the loop iterates a fixed number of times, we can refine the access size // to be exactly the size of the memset, which is (BECount+1)*StoreSize @@ -903,8 +903,8 @@ static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr, // If we're going to need to zero extend the BE count, check if we can add // one to it prior to zero extending without overflow. Provided this is safe, // it allows better simplification of the +1. - if (DL->getTypeSizeInBits(BECount->getType()).getFixedSize() < - DL->getTypeSizeInBits(IntPtr).getFixedSize() && + if (DL->getTypeSizeInBits(BECount->getType()).getFixedSize() < + DL->getTypeSizeInBits(IntPtr).getFixedSize() && SE->isLoopEntryGuardedByCond( CurLoop, ICmpInst::ICMP_NE, BECount, SE->getNegativeSCEV(SE->getOne(BECount->getType())))) { @@ -947,12 +947,12 @@ bool LoopIdiomRecognize::processLoopStridedStore( BasicBlock *Preheader = CurLoop->getLoopPreheader(); IRBuilder<> Builder(Preheader->getTerminator()); SCEVExpander Expander(*SE, *DL, "loop-idiom"); - SCEVExpanderCleaner ExpCleaner(Expander, *DT); + SCEVExpanderCleaner ExpCleaner(Expander, *DT); Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS); Type *IntIdxTy = DL->getIndexType(DestPtr->getType()); - bool Changed = false; + bool Changed = false; const SCEV *Start = Ev->getStart(); // Handle negative strided loops. if (NegStride) @@ -961,7 +961,7 @@ bool LoopIdiomRecognize::processLoopStridedStore( // TODO: ideally we should still be able to generate memset if SCEV expander // is taught to generate the dependencies at the latest point. if (!isSafeToExpand(Start, *SE)) - return Changed; + return Changed; // Okay, we have a strided store "p[i]" of a splattable value. We can turn // this into a memset in the loop preheader now if we want. However, this @@ -970,22 +970,22 @@ bool LoopIdiomRecognize::processLoopStridedStore( // base pointer and checking the region. Value *BasePtr = Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator()); - - // From here on out, conservatively report to the pass manager that we've - // changed the IR, even if we later clean up these added instructions. There - // may be structural differences e.g. in the order of use lists not accounted - // for in just a textual dump of the IR. This is written as a variable, even - // though statically all the places this dominates could be replaced with - // 'true', with the hope that anyone trying to be clever / "more precise" with - // the return value will read this comment, and leave them alone. - Changed = true; - + + // From here on out, conservatively report to the pass manager that we've + // changed the IR, even if we later clean up these added instructions. There + // may be structural differences e.g. in the order of use lists not accounted + // for in just a textual dump of the IR. This is written as a variable, even + // though statically all the places this dominates could be replaced with + // 'true', with the hope that anyone trying to be clever / "more precise" with + // the return value will read this comment, and leave them alone. + Changed = true; + if (mayLoopAccessLocation(BasePtr, ModRefInfo::ModRef, CurLoop, BECount, - StoreSize, *AA, Stores)) - return Changed; + StoreSize, *AA, Stores)) + return Changed; if (avoidLIRForMultiBlockLoop(/*IsMemset=*/true, IsLoopMemset)) - return Changed; + return Changed; // Okay, everything looks good, insert the memset. @@ -995,7 +995,7 @@ bool LoopIdiomRecognize::processLoopStridedStore( // TODO: ideally we should still be able to generate memset if SCEV expander // is taught to generate the dependencies at the latest point. if (!isSafeToExpand(NumBytesS, *SE)) - return Changed; + return Changed; Value *NumBytes = Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator()); @@ -1054,7 +1054,7 @@ bool LoopIdiomRecognize::processLoopStridedStore( if (MSSAU && VerifyMemorySSA) MSSAU->getMemorySSA()->verifyMemorySSA(); ++NumMemSet; - ExpCleaner.markResultUsed(); + ExpCleaner.markResultUsed(); return true; } @@ -1088,9 +1088,9 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI, IRBuilder<> Builder(Preheader->getTerminator()); SCEVExpander Expander(*SE, *DL, "loop-idiom"); - SCEVExpanderCleaner ExpCleaner(Expander, *DT); + SCEVExpanderCleaner ExpCleaner(Expander, *DT); - bool Changed = false; + bool Changed = false; const SCEV *StrStart = StoreEv->getStart(); unsigned StrAS = SI->getPointerAddressSpace(); Type *IntIdxTy = Builder.getIntNTy(DL->getIndexSizeInBits(StrAS)); @@ -1108,20 +1108,20 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI, Value *StoreBasePtr = Expander.expandCodeFor( StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator()); - // From here on out, conservatively report to the pass manager that we've - // changed the IR, even if we later clean up these added instructions. There - // may be structural differences e.g. in the order of use lists not accounted - // for in just a textual dump of the IR. This is written as a variable, even - // though statically all the places this dominates could be replaced with - // 'true', with the hope that anyone trying to be clever / "more precise" with - // the return value will read this comment, and leave them alone. - Changed = true; - + // From here on out, conservatively report to the pass manager that we've + // changed the IR, even if we later clean up these added instructions. There + // may be structural differences e.g. in the order of use lists not accounted + // for in just a textual dump of the IR. This is written as a variable, even + // though statically all the places this dominates could be replaced with + // 'true', with the hope that anyone trying to be clever / "more precise" with + // the return value will read this comment, and leave them alone. + Changed = true; + SmallPtrSet<Instruction *, 1> Stores; Stores.insert(SI); if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount, StoreSize, *AA, Stores)) - return Changed; + return Changed; const SCEV *LdStart = LoadEv->getStart(); unsigned LdAS = LI->getPointerAddressSpace(); @@ -1137,10 +1137,10 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI, if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount, StoreSize, *AA, Stores)) - return Changed; + return Changed; if (avoidLIRForMultiBlockLoop()) - return Changed; + return Changed; // Okay, everything is safe, we can transform this! @@ -1163,14 +1163,14 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI, const Align StoreAlign = SI->getAlign(); const Align LoadAlign = LI->getAlign(); if (StoreAlign < StoreSize || LoadAlign < StoreSize) - return Changed; + return Changed; // If the element.atomic memcpy is not lowered into explicit // loads/stores later, then it will be lowered into an element-size // specific lib call. If the lib call doesn't exist for our store size, then // we shouldn't generate the memcpy. if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize()) - return Changed; + return Changed; // Create the call. // Note that unordered atomic loads/stores are *required* by the spec to @@ -1208,7 +1208,7 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI, if (MSSAU && VerifyMemorySSA) MSSAU->getMemorySSA()->verifyMemorySSA(); ++NumMemCpy; - ExpCleaner.markResultUsed(); + ExpCleaner.markResultUsed(); return true; } @@ -1218,7 +1218,7 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI, bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset, bool IsLoopMemset) { if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) { - if (CurLoop->isOutermost() && (!IsMemset || !IsLoopMemset)) { + if (CurLoop->isOutermost() && (!IsMemset || !IsLoopMemset)) { LLVM_DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName() << " : LIR " << (IsMemset ? "Memset" : "Memcpy") << " avoided: multi-block top-level loop\n"); @@ -1235,8 +1235,8 @@ bool LoopIdiomRecognize::runOnNoncountableLoop() { << "] Noncountable Loop %" << CurLoop->getHeader()->getName() << "\n"); - return recognizePopcount() || recognizeAndInsertFFS() || - recognizeShiftUntilBitTest(); + return recognizePopcount() || recognizeAndInsertFFS() || + recognizeShiftUntilBitTest(); } /// Check if the given conditional branch is based on the comparison between @@ -1483,7 +1483,7 @@ static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL, return false; // step 4: Find the instruction which count the CTLZ: cnt.next = cnt + 1 - // or cnt.next = cnt + -1. + // or cnt.next = cnt + -1. // TODO: We can skip the step. If loop trip count is known (CTLZ), // then all uses of "cnt.next" could be optimized to the trip count // plus "cnt0". Currently it is not optimized. @@ -1497,7 +1497,7 @@ static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL, continue; ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1)); - if (!Inc || (!Inc->isOne() && !Inc->isMinusOne())) + if (!Inc || (!Inc->isOne() && !Inc->isMinusOne())) continue; PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry); @@ -1728,11 +1728,11 @@ void LoopIdiomRecognize::transformLoopToCountable( Builder.SetCurrentDebugLocation(DL); // Count = BitWidth - CTLZ(InitX); - // NewCount = Count; + // NewCount = Count; // If there are uses of CntPhi create: - // NewCount = BitWidth - CTLZ(InitX >> 1); - // Count = NewCount + 1; - Value *InitXNext; + // NewCount = BitWidth - CTLZ(InitX >> 1); + // Count = NewCount + 1; + Value *InitXNext; if (IsCntPhiUsedOutsideLoop) { if (DefX->getOpcode() == Instruction::AShr) InitXNext = @@ -1747,31 +1747,31 @@ void LoopIdiomRecognize::transformLoopToCountable( llvm_unreachable("Unexpected opcode!"); } else InitXNext = InitX; - Value *FFS = createFFSIntrinsic(Builder, InitXNext, DL, ZeroCheck, IntrinID); - Value *Count = Builder.CreateSub( - ConstantInt::get(FFS->getType(), FFS->getType()->getIntegerBitWidth()), + Value *FFS = createFFSIntrinsic(Builder, InitXNext, DL, ZeroCheck, IntrinID); + Value *Count = Builder.CreateSub( + ConstantInt::get(FFS->getType(), FFS->getType()->getIntegerBitWidth()), FFS); - Value *NewCount = Count; + Value *NewCount = Count; if (IsCntPhiUsedOutsideLoop) { - NewCount = Count; - Count = Builder.CreateAdd(Count, ConstantInt::get(Count->getType(), 1)); + NewCount = Count; + Count = Builder.CreateAdd(Count, ConstantInt::get(Count->getType(), 1)); } - NewCount = Builder.CreateZExtOrTrunc(NewCount, - cast<IntegerType>(CntInst->getType())); + NewCount = Builder.CreateZExtOrTrunc(NewCount, + cast<IntegerType>(CntInst->getType())); Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader); - if (cast<ConstantInt>(CntInst->getOperand(1))->isOne()) { - // If the counter was being incremented in the loop, add NewCount to the - // counter's initial value, but only if the initial value is not zero. - ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal); - if (!InitConst || !InitConst->isZero()) - NewCount = Builder.CreateAdd(NewCount, CntInitVal); - } else { - // If the count was being decremented in the loop, subtract NewCount from - // the counter's initial value. - NewCount = Builder.CreateSub(CntInitVal, NewCount); - } + if (cast<ConstantInt>(CntInst->getOperand(1))->isOne()) { + // If the counter was being incremented in the loop, add NewCount to the + // counter's initial value, but only if the initial value is not zero. + ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal); + if (!InitConst || !InitConst->isZero()) + NewCount = Builder.CreateAdd(NewCount, CntInitVal); + } else { + // If the count was being decremented in the loop, subtract NewCount from + // the counter's initial value. + NewCount = Builder.CreateSub(CntInitVal, NewCount); + } // Step 2: Insert new IV and loop condition: // loop: @@ -1919,343 +1919,343 @@ void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB, // loop. The loop would otherwise not be deleted even if it becomes empty. SE->forgetLoop(CurLoop); } - -/// Match loop-invariant value. -template <typename SubPattern_t> struct match_LoopInvariant { - SubPattern_t SubPattern; - const Loop *L; - - match_LoopInvariant(const SubPattern_t &SP, const Loop *L) - : SubPattern(SP), L(L) {} - - template <typename ITy> bool match(ITy *V) { - return L->isLoopInvariant(V) && SubPattern.match(V); - } -}; - -/// Matches if the value is loop-invariant. -template <typename Ty> -inline match_LoopInvariant<Ty> m_LoopInvariant(const Ty &M, const Loop *L) { - return match_LoopInvariant<Ty>(M, L); -} - -/// Return true if the idiom is detected in the loop. -/// -/// The core idiom we are trying to detect is: -/// \code -/// entry: -/// <...> -/// %bitmask = shl i32 1, %bitpos -/// br label %loop -/// -/// loop: -/// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ] -/// %x.curr.bitmasked = and i32 %x.curr, %bitmask -/// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0 -/// %x.next = shl i32 %x.curr, 1 -/// <...> -/// br i1 %x.curr.isbitunset, label %loop, label %end -/// -/// end: -/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...> -/// %x.next.res = phi i32 [ %x.next, %loop ] <...> -/// <...> -/// \endcode -static bool detectShiftUntilBitTestIdiom(Loop *CurLoop, Value *&BaseX, - Value *&BitMask, Value *&BitPos, - Value *&CurrX, Instruction *&NextX) { - LLVM_DEBUG(dbgs() << DEBUG_TYPE - " Performing shift-until-bittest idiom detection.\n"); - - // Give up if the loop has multiple blocks or multiple backedges. - if (CurLoop->getNumBlocks() != 1 || CurLoop->getNumBackEdges() != 1) { - LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n"); - return false; - } - - BasicBlock *LoopHeaderBB = CurLoop->getHeader(); - BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader(); - assert(LoopPreheaderBB && "There is always a loop preheader."); - - using namespace PatternMatch; - - // Step 1: Check if the loop backedge is in desirable form. - - ICmpInst::Predicate Pred; - Value *CmpLHS, *CmpRHS; - BasicBlock *TrueBB, *FalseBB; - if (!match(LoopHeaderBB->getTerminator(), - m_Br(m_ICmp(Pred, m_Value(CmpLHS), m_Value(CmpRHS)), - m_BasicBlock(TrueBB), m_BasicBlock(FalseBB)))) { - LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n"); - return false; - } - - // Step 2: Check if the backedge's condition is in desirable form. - - auto MatchVariableBitMask = [&]() { - return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) && - match(CmpLHS, - m_c_And(m_Value(CurrX), - m_CombineAnd( - m_Value(BitMask), - m_LoopInvariant(m_Shl(m_One(), m_Value(BitPos)), - CurLoop)))); - }; - auto MatchConstantBitMask = [&]() { - return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) && - match(CmpLHS, m_And(m_Value(CurrX), - m_CombineAnd(m_Value(BitMask), m_Power2()))) && - (BitPos = ConstantExpr::getExactLogBase2(cast<Constant>(BitMask))); - }; - auto MatchDecomposableConstantBitMask = [&]() { - APInt Mask; - return llvm::decomposeBitTestICmp(CmpLHS, CmpRHS, Pred, CurrX, Mask) && - ICmpInst::isEquality(Pred) && Mask.isPowerOf2() && - (BitMask = ConstantInt::get(CurrX->getType(), Mask)) && - (BitPos = ConstantInt::get(CurrX->getType(), Mask.logBase2())); - }; - - if (!MatchVariableBitMask() && !MatchConstantBitMask() && - !MatchDecomposableConstantBitMask()) { - LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge comparison.\n"); - return false; - } - - // Step 3: Check if the recurrence is in desirable form. - auto *CurrXPN = dyn_cast<PHINode>(CurrX); - if (!CurrXPN || CurrXPN->getParent() != LoopHeaderBB) { - LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n"); - return false; - } - - BaseX = CurrXPN->getIncomingValueForBlock(LoopPreheaderBB); - NextX = - dyn_cast<Instruction>(CurrXPN->getIncomingValueForBlock(LoopHeaderBB)); - - if (!NextX || !match(NextX, m_Shl(m_Specific(CurrX), m_One()))) { - // FIXME: support right-shift? - LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n"); - return false; - } - - // Step 4: Check if the backedge's destinations are in desirable form. - - assert(ICmpInst::isEquality(Pred) && - "Should only get equality predicates here."); - - // cmp-br is commutative, so canonicalize to a single variant. - if (Pred != ICmpInst::Predicate::ICMP_EQ) { - Pred = ICmpInst::getInversePredicate(Pred); - std::swap(TrueBB, FalseBB); - } - - // We expect to exit loop when comparison yields false, - // so when it yields true we should branch back to loop header. - if (TrueBB != LoopHeaderBB) { - LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n"); - return false; - } - - // Okay, idiom checks out. - return true; -} - -/// Look for the following loop: -/// \code -/// entry: -/// <...> -/// %bitmask = shl i32 1, %bitpos -/// br label %loop -/// -/// loop: -/// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ] -/// %x.curr.bitmasked = and i32 %x.curr, %bitmask -/// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0 -/// %x.next = shl i32 %x.curr, 1 -/// <...> -/// br i1 %x.curr.isbitunset, label %loop, label %end -/// -/// end: -/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...> -/// %x.next.res = phi i32 [ %x.next, %loop ] <...> -/// <...> -/// \endcode -/// -/// And transform it into: -/// \code -/// entry: -/// %bitmask = shl i32 1, %bitpos -/// %lowbitmask = add i32 %bitmask, -1 -/// %mask = or i32 %lowbitmask, %bitmask -/// %x.masked = and i32 %x, %mask -/// %x.masked.numleadingzeros = call i32 @llvm.ctlz.i32(i32 %x.masked, -/// i1 true) -/// %x.masked.numactivebits = sub i32 32, %x.masked.numleadingzeros -/// %x.masked.leadingonepos = add i32 %x.masked.numactivebits, -1 -/// %backedgetakencount = sub i32 %bitpos, %x.masked.leadingonepos -/// %tripcount = add i32 %backedgetakencount, 1 -/// %x.curr = shl i32 %x, %backedgetakencount -/// %x.next = shl i32 %x, %tripcount -/// br label %loop -/// -/// loop: -/// %loop.iv = phi i32 [ 0, %entry ], [ %loop.iv.next, %loop ] -/// %loop.iv.next = add nuw i32 %loop.iv, 1 -/// %loop.ivcheck = icmp eq i32 %loop.iv.next, %tripcount -/// <...> -/// br i1 %loop.ivcheck, label %end, label %loop -/// -/// end: -/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...> -/// %x.next.res = phi i32 [ %x.next, %loop ] <...> -/// <...> -/// \endcode -bool LoopIdiomRecognize::recognizeShiftUntilBitTest() { - bool MadeChange = false; - - Value *X, *BitMask, *BitPos, *XCurr; - Instruction *XNext; - if (!detectShiftUntilBitTestIdiom(CurLoop, X, BitMask, BitPos, XCurr, - XNext)) { - LLVM_DEBUG(dbgs() << DEBUG_TYPE - " shift-until-bittest idiom detection failed.\n"); - return MadeChange; - } - LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom detected!\n"); - - // Ok, it is the idiom we were looking for, we *could* transform this loop, - // but is it profitable to transform? - - BasicBlock *LoopHeaderBB = CurLoop->getHeader(); - BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader(); - assert(LoopPreheaderBB && "There is always a loop preheader."); - - BasicBlock *SuccessorBB = CurLoop->getExitBlock(); - assert(LoopPreheaderBB && "There is only a single successor."); - - IRBuilder<> Builder(LoopPreheaderBB->getTerminator()); - Builder.SetCurrentDebugLocation(cast<Instruction>(XCurr)->getDebugLoc()); - - Intrinsic::ID IntrID = Intrinsic::ctlz; - Type *Ty = X->getType(); - - TargetTransformInfo::TargetCostKind CostKind = - TargetTransformInfo::TCK_SizeAndLatency; - - // The rewrite is considered to be unprofitable iff and only iff the - // intrinsic/shift we'll use are not cheap. Note that we are okay with *just* - // making the loop countable, even if nothing else changes. - IntrinsicCostAttributes Attrs( - IntrID, Ty, {UndefValue::get(Ty), /*is_zero_undef=*/Builder.getTrue()}); - int Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind); - if (Cost > TargetTransformInfo::TCC_Basic) { - LLVM_DEBUG(dbgs() << DEBUG_TYPE - " Intrinsic is too costly, not beneficial\n"); - return MadeChange; - } - if (TTI->getArithmeticInstrCost(Instruction::Shl, Ty, CostKind) > - TargetTransformInfo::TCC_Basic) { - LLVM_DEBUG(dbgs() << DEBUG_TYPE " Shift is too costly, not beneficial\n"); - return MadeChange; - } - - // Ok, transform appears worthwhile. - MadeChange = true; - - // Step 1: Compute the loop trip count. - - Value *LowBitMask = Builder.CreateAdd(BitMask, Constant::getAllOnesValue(Ty), - BitPos->getName() + ".lowbitmask"); - Value *Mask = - Builder.CreateOr(LowBitMask, BitMask, BitPos->getName() + ".mask"); - Value *XMasked = Builder.CreateAnd(X, Mask, X->getName() + ".masked"); - CallInst *XMaskedNumLeadingZeros = Builder.CreateIntrinsic( - IntrID, Ty, {XMasked, /*is_zero_undef=*/Builder.getTrue()}, - /*FMFSource=*/nullptr, XMasked->getName() + ".numleadingzeros"); - Value *XMaskedNumActiveBits = Builder.CreateSub( - ConstantInt::get(Ty, Ty->getScalarSizeInBits()), XMaskedNumLeadingZeros, - XMasked->getName() + ".numactivebits"); - Value *XMaskedLeadingOnePos = - Builder.CreateAdd(XMaskedNumActiveBits, Constant::getAllOnesValue(Ty), - XMasked->getName() + ".leadingonepos"); - - Value *LoopBackedgeTakenCount = Builder.CreateSub( - BitPos, XMaskedLeadingOnePos, CurLoop->getName() + ".backedgetakencount"); - // We know loop's backedge-taken count, but what's loop's trip count? - // Note that while NUW is always safe, while NSW is only for bitwidths != 2. - Value *LoopTripCount = - Builder.CreateNUWAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1), - CurLoop->getName() + ".tripcount"); - - // Step 2: Compute the recurrence's final value without a loop. - - // NewX is always safe to compute, because `LoopBackedgeTakenCount` - // will always be smaller than `bitwidth(X)`, i.e. we never get poison. - Value *NewX = Builder.CreateShl(X, LoopBackedgeTakenCount); - NewX->takeName(XCurr); - if (auto *I = dyn_cast<Instruction>(NewX)) - I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true); - - Value *NewXNext; - // Rewriting XNext is more complicated, however, because `X << LoopTripCount` - // will be poison iff `LoopTripCount == bitwidth(X)` (which will happen - // iff `BitPos` is `bitwidth(x) - 1` and `X` is `1`). So unless we know - // that isn't the case, we'll need to emit an alternative, safe IR. - if (XNext->hasNoSignedWrap() || XNext->hasNoUnsignedWrap() || - PatternMatch::match( - BitPos, PatternMatch::m_SpecificInt_ICMP( - ICmpInst::ICMP_NE, APInt(Ty->getScalarSizeInBits(), - Ty->getScalarSizeInBits() - 1)))) - NewXNext = Builder.CreateShl(X, LoopTripCount); - else { - // Otherwise, just additionally shift by one. It's the smallest solution, - // alternatively, we could check that NewX is INT_MIN (or BitPos is ) - // and select 0 instead. - NewXNext = Builder.CreateShl(NewX, ConstantInt::get(Ty, 1)); - } - - NewXNext->takeName(XNext); - if (auto *I = dyn_cast<Instruction>(NewXNext)) - I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true); - - // Step 3: Adjust the successor basic block to recieve the computed - // recurrence's final value instead of the recurrence itself. - - XCurr->replaceUsesOutsideBlock(NewX, LoopHeaderBB); - XNext->replaceUsesOutsideBlock(NewXNext, LoopHeaderBB); - - // Step 4: Rewrite the loop into a countable form, with canonical IV. - - // The new canonical induction variable. - Builder.SetInsertPoint(&LoopHeaderBB->front()); - auto *IV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv"); - - // The induction itself. - // Note that while NUW is always safe, while NSW is only for bitwidths != 2. - Builder.SetInsertPoint(LoopHeaderBB->getTerminator()); - auto *IVNext = Builder.CreateNUWAdd(IV, ConstantInt::get(Ty, 1), - IV->getName() + ".next"); - - // The loop trip count check. - auto *IVCheck = Builder.CreateICmpEQ(IVNext, LoopTripCount, - CurLoop->getName() + ".ivcheck"); - Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB); - LoopHeaderBB->getTerminator()->eraseFromParent(); - - // Populate the IV PHI. - IV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB); - IV->addIncoming(IVNext, LoopHeaderBB); - - // Step 5: Forget the "non-computable" trip-count SCEV associated with the - // loop. The loop would otherwise not be deleted even if it becomes empty. - - SE->forgetLoop(CurLoop); - - // Other passes will take care of actually deleting the loop if possible. - - LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom optimized!\n"); - - ++NumShiftUntilBitTest; - return MadeChange; -} + +/// Match loop-invariant value. +template <typename SubPattern_t> struct match_LoopInvariant { + SubPattern_t SubPattern; + const Loop *L; + + match_LoopInvariant(const SubPattern_t &SP, const Loop *L) + : SubPattern(SP), L(L) {} + + template <typename ITy> bool match(ITy *V) { + return L->isLoopInvariant(V) && SubPattern.match(V); + } +}; + +/// Matches if the value is loop-invariant. +template <typename Ty> +inline match_LoopInvariant<Ty> m_LoopInvariant(const Ty &M, const Loop *L) { + return match_LoopInvariant<Ty>(M, L); +} + +/// Return true if the idiom is detected in the loop. +/// +/// The core idiom we are trying to detect is: +/// \code +/// entry: +/// <...> +/// %bitmask = shl i32 1, %bitpos +/// br label %loop +/// +/// loop: +/// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ] +/// %x.curr.bitmasked = and i32 %x.curr, %bitmask +/// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0 +/// %x.next = shl i32 %x.curr, 1 +/// <...> +/// br i1 %x.curr.isbitunset, label %loop, label %end +/// +/// end: +/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...> +/// %x.next.res = phi i32 [ %x.next, %loop ] <...> +/// <...> +/// \endcode +static bool detectShiftUntilBitTestIdiom(Loop *CurLoop, Value *&BaseX, + Value *&BitMask, Value *&BitPos, + Value *&CurrX, Instruction *&NextX) { + LLVM_DEBUG(dbgs() << DEBUG_TYPE + " Performing shift-until-bittest idiom detection.\n"); + + // Give up if the loop has multiple blocks or multiple backedges. + if (CurLoop->getNumBlocks() != 1 || CurLoop->getNumBackEdges() != 1) { + LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n"); + return false; + } + + BasicBlock *LoopHeaderBB = CurLoop->getHeader(); + BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader(); + assert(LoopPreheaderBB && "There is always a loop preheader."); + + using namespace PatternMatch; + + // Step 1: Check if the loop backedge is in desirable form. + + ICmpInst::Predicate Pred; + Value *CmpLHS, *CmpRHS; + BasicBlock *TrueBB, *FalseBB; + if (!match(LoopHeaderBB->getTerminator(), + m_Br(m_ICmp(Pred, m_Value(CmpLHS), m_Value(CmpRHS)), + m_BasicBlock(TrueBB), m_BasicBlock(FalseBB)))) { + LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n"); + return false; + } + + // Step 2: Check if the backedge's condition is in desirable form. + + auto MatchVariableBitMask = [&]() { + return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) && + match(CmpLHS, + m_c_And(m_Value(CurrX), + m_CombineAnd( + m_Value(BitMask), + m_LoopInvariant(m_Shl(m_One(), m_Value(BitPos)), + CurLoop)))); + }; + auto MatchConstantBitMask = [&]() { + return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) && + match(CmpLHS, m_And(m_Value(CurrX), + m_CombineAnd(m_Value(BitMask), m_Power2()))) && + (BitPos = ConstantExpr::getExactLogBase2(cast<Constant>(BitMask))); + }; + auto MatchDecomposableConstantBitMask = [&]() { + APInt Mask; + return llvm::decomposeBitTestICmp(CmpLHS, CmpRHS, Pred, CurrX, Mask) && + ICmpInst::isEquality(Pred) && Mask.isPowerOf2() && + (BitMask = ConstantInt::get(CurrX->getType(), Mask)) && + (BitPos = ConstantInt::get(CurrX->getType(), Mask.logBase2())); + }; + + if (!MatchVariableBitMask() && !MatchConstantBitMask() && + !MatchDecomposableConstantBitMask()) { + LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge comparison.\n"); + return false; + } + + // Step 3: Check if the recurrence is in desirable form. + auto *CurrXPN = dyn_cast<PHINode>(CurrX); + if (!CurrXPN || CurrXPN->getParent() != LoopHeaderBB) { + LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n"); + return false; + } + + BaseX = CurrXPN->getIncomingValueForBlock(LoopPreheaderBB); + NextX = + dyn_cast<Instruction>(CurrXPN->getIncomingValueForBlock(LoopHeaderBB)); + + if (!NextX || !match(NextX, m_Shl(m_Specific(CurrX), m_One()))) { + // FIXME: support right-shift? + LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n"); + return false; + } + + // Step 4: Check if the backedge's destinations are in desirable form. + + assert(ICmpInst::isEquality(Pred) && + "Should only get equality predicates here."); + + // cmp-br is commutative, so canonicalize to a single variant. + if (Pred != ICmpInst::Predicate::ICMP_EQ) { + Pred = ICmpInst::getInversePredicate(Pred); + std::swap(TrueBB, FalseBB); + } + + // We expect to exit loop when comparison yields false, + // so when it yields true we should branch back to loop header. + if (TrueBB != LoopHeaderBB) { + LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n"); + return false; + } + + // Okay, idiom checks out. + return true; +} + +/// Look for the following loop: +/// \code +/// entry: +/// <...> +/// %bitmask = shl i32 1, %bitpos +/// br label %loop +/// +/// loop: +/// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ] +/// %x.curr.bitmasked = and i32 %x.curr, %bitmask +/// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0 +/// %x.next = shl i32 %x.curr, 1 +/// <...> +/// br i1 %x.curr.isbitunset, label %loop, label %end +/// +/// end: +/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...> +/// %x.next.res = phi i32 [ %x.next, %loop ] <...> +/// <...> +/// \endcode +/// +/// And transform it into: +/// \code +/// entry: +/// %bitmask = shl i32 1, %bitpos +/// %lowbitmask = add i32 %bitmask, -1 +/// %mask = or i32 %lowbitmask, %bitmask +/// %x.masked = and i32 %x, %mask +/// %x.masked.numleadingzeros = call i32 @llvm.ctlz.i32(i32 %x.masked, +/// i1 true) +/// %x.masked.numactivebits = sub i32 32, %x.masked.numleadingzeros +/// %x.masked.leadingonepos = add i32 %x.masked.numactivebits, -1 +/// %backedgetakencount = sub i32 %bitpos, %x.masked.leadingonepos +/// %tripcount = add i32 %backedgetakencount, 1 +/// %x.curr = shl i32 %x, %backedgetakencount +/// %x.next = shl i32 %x, %tripcount +/// br label %loop +/// +/// loop: +/// %loop.iv = phi i32 [ 0, %entry ], [ %loop.iv.next, %loop ] +/// %loop.iv.next = add nuw i32 %loop.iv, 1 +/// %loop.ivcheck = icmp eq i32 %loop.iv.next, %tripcount +/// <...> +/// br i1 %loop.ivcheck, label %end, label %loop +/// +/// end: +/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...> +/// %x.next.res = phi i32 [ %x.next, %loop ] <...> +/// <...> +/// \endcode +bool LoopIdiomRecognize::recognizeShiftUntilBitTest() { + bool MadeChange = false; + + Value *X, *BitMask, *BitPos, *XCurr; + Instruction *XNext; + if (!detectShiftUntilBitTestIdiom(CurLoop, X, BitMask, BitPos, XCurr, + XNext)) { + LLVM_DEBUG(dbgs() << DEBUG_TYPE + " shift-until-bittest idiom detection failed.\n"); + return MadeChange; + } + LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom detected!\n"); + + // Ok, it is the idiom we were looking for, we *could* transform this loop, + // but is it profitable to transform? + + BasicBlock *LoopHeaderBB = CurLoop->getHeader(); + BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader(); + assert(LoopPreheaderBB && "There is always a loop preheader."); + + BasicBlock *SuccessorBB = CurLoop->getExitBlock(); + assert(LoopPreheaderBB && "There is only a single successor."); + + IRBuilder<> Builder(LoopPreheaderBB->getTerminator()); + Builder.SetCurrentDebugLocation(cast<Instruction>(XCurr)->getDebugLoc()); + + Intrinsic::ID IntrID = Intrinsic::ctlz; + Type *Ty = X->getType(); + + TargetTransformInfo::TargetCostKind CostKind = + TargetTransformInfo::TCK_SizeAndLatency; + + // The rewrite is considered to be unprofitable iff and only iff the + // intrinsic/shift we'll use are not cheap. Note that we are okay with *just* + // making the loop countable, even if nothing else changes. + IntrinsicCostAttributes Attrs( + IntrID, Ty, {UndefValue::get(Ty), /*is_zero_undef=*/Builder.getTrue()}); + int Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind); + if (Cost > TargetTransformInfo::TCC_Basic) { + LLVM_DEBUG(dbgs() << DEBUG_TYPE + " Intrinsic is too costly, not beneficial\n"); + return MadeChange; + } + if (TTI->getArithmeticInstrCost(Instruction::Shl, Ty, CostKind) > + TargetTransformInfo::TCC_Basic) { + LLVM_DEBUG(dbgs() << DEBUG_TYPE " Shift is too costly, not beneficial\n"); + return MadeChange; + } + + // Ok, transform appears worthwhile. + MadeChange = true; + + // Step 1: Compute the loop trip count. + + Value *LowBitMask = Builder.CreateAdd(BitMask, Constant::getAllOnesValue(Ty), + BitPos->getName() + ".lowbitmask"); + Value *Mask = + Builder.CreateOr(LowBitMask, BitMask, BitPos->getName() + ".mask"); + Value *XMasked = Builder.CreateAnd(X, Mask, X->getName() + ".masked"); + CallInst *XMaskedNumLeadingZeros = Builder.CreateIntrinsic( + IntrID, Ty, {XMasked, /*is_zero_undef=*/Builder.getTrue()}, + /*FMFSource=*/nullptr, XMasked->getName() + ".numleadingzeros"); + Value *XMaskedNumActiveBits = Builder.CreateSub( + ConstantInt::get(Ty, Ty->getScalarSizeInBits()), XMaskedNumLeadingZeros, + XMasked->getName() + ".numactivebits"); + Value *XMaskedLeadingOnePos = + Builder.CreateAdd(XMaskedNumActiveBits, Constant::getAllOnesValue(Ty), + XMasked->getName() + ".leadingonepos"); + + Value *LoopBackedgeTakenCount = Builder.CreateSub( + BitPos, XMaskedLeadingOnePos, CurLoop->getName() + ".backedgetakencount"); + // We know loop's backedge-taken count, but what's loop's trip count? + // Note that while NUW is always safe, while NSW is only for bitwidths != 2. + Value *LoopTripCount = + Builder.CreateNUWAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1), + CurLoop->getName() + ".tripcount"); + + // Step 2: Compute the recurrence's final value without a loop. + + // NewX is always safe to compute, because `LoopBackedgeTakenCount` + // will always be smaller than `bitwidth(X)`, i.e. we never get poison. + Value *NewX = Builder.CreateShl(X, LoopBackedgeTakenCount); + NewX->takeName(XCurr); + if (auto *I = dyn_cast<Instruction>(NewX)) + I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true); + + Value *NewXNext; + // Rewriting XNext is more complicated, however, because `X << LoopTripCount` + // will be poison iff `LoopTripCount == bitwidth(X)` (which will happen + // iff `BitPos` is `bitwidth(x) - 1` and `X` is `1`). So unless we know + // that isn't the case, we'll need to emit an alternative, safe IR. + if (XNext->hasNoSignedWrap() || XNext->hasNoUnsignedWrap() || + PatternMatch::match( + BitPos, PatternMatch::m_SpecificInt_ICMP( + ICmpInst::ICMP_NE, APInt(Ty->getScalarSizeInBits(), + Ty->getScalarSizeInBits() - 1)))) + NewXNext = Builder.CreateShl(X, LoopTripCount); + else { + // Otherwise, just additionally shift by one. It's the smallest solution, + // alternatively, we could check that NewX is INT_MIN (or BitPos is ) + // and select 0 instead. + NewXNext = Builder.CreateShl(NewX, ConstantInt::get(Ty, 1)); + } + + NewXNext->takeName(XNext); + if (auto *I = dyn_cast<Instruction>(NewXNext)) + I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true); + + // Step 3: Adjust the successor basic block to recieve the computed + // recurrence's final value instead of the recurrence itself. + + XCurr->replaceUsesOutsideBlock(NewX, LoopHeaderBB); + XNext->replaceUsesOutsideBlock(NewXNext, LoopHeaderBB); + + // Step 4: Rewrite the loop into a countable form, with canonical IV. + + // The new canonical induction variable. + Builder.SetInsertPoint(&LoopHeaderBB->front()); + auto *IV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv"); + + // The induction itself. + // Note that while NUW is always safe, while NSW is only for bitwidths != 2. + Builder.SetInsertPoint(LoopHeaderBB->getTerminator()); + auto *IVNext = Builder.CreateNUWAdd(IV, ConstantInt::get(Ty, 1), + IV->getName() + ".next"); + + // The loop trip count check. + auto *IVCheck = Builder.CreateICmpEQ(IVNext, LoopTripCount, + CurLoop->getName() + ".ivcheck"); + Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB); + LoopHeaderBB->getTerminator()->eraseFromParent(); + + // Populate the IV PHI. + IV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB); + IV->addIncoming(IVNext, LoopHeaderBB); + + // Step 5: Forget the "non-computable" trip-count SCEV associated with the + // loop. The loop would otherwise not be deleted even if it becomes empty. + + SE->forgetLoop(CurLoop); + + // Other passes will take care of actually deleting the loop if possible. + + LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom optimized!\n"); + + ++NumShiftUntilBitTest; + return MadeChange; +} |