Restoring authorship annotation for <shadchin@yandex-team.ru>. Commit 1 of 2.

author: shadchin <shadchin@yandex-team.ru> 2022-02-10 16:44:30 +0300
committer: Daniil Cherednik <dcherednik@yandex-team.ru> 2022-02-10 16:44:30 +0300
commit: 2598ef1d0aee359b4b6d5fdd1758916d5907d04f (patch)
tree: 012bb94d777798f1f56ac1cec429509766d05181 /contrib/libs/llvm12/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
parent: 6751af0b0c1b952fede40b19b71da8025b5d8bcf (diff)
download: ydb-2598ef1d0aee359b4b6d5fdd1758916d5907d04f.tar.gz
1 files changed, 458 insertions, 458 deletions
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 8064c02e2b..e60c95b7be 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -47,7 +47,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/CmpInstAnalysis.h"
+#include "llvm/Analysis/CmpInstAnalysis.h" 
 #include "llvm/Analysis/LoopAccessAnalysis.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopPass.h"
@@ -80,7 +80,7 @@
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/PassManager.h"
-#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/PatternMatch.h" 
 #include "llvm/IR/Type.h"
 #include "llvm/IR/User.h"
 #include "llvm/IR/Value.h"
@@ -108,33 +108,33 @@ using namespace llvm;
 
 STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
 STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");
-STATISTIC(
-    NumShiftUntilBitTest,
-    "Number of uncountable loops recognized as 'shift until bitttest' idiom");
-
-bool DisableLIRP::All;
-static cl::opt<bool, true>
-    DisableLIRPAll("disable-" DEBUG_TYPE "-all",
-                   cl::desc("Options to disable Loop Idiom Recognize Pass."),
-                   cl::location(DisableLIRP::All), cl::init(false),
-                   cl::ReallyHidden);
-
-bool DisableLIRP::Memset;
-static cl::opt<bool, true>
-    DisableLIRPMemset("disable-" DEBUG_TYPE "-memset",
-                      cl::desc("Proceed with loop idiom recognize pass, but do "
-                               "not convert loop(s) to memset."),
-                      cl::location(DisableLIRP::Memset), cl::init(false),
-                      cl::ReallyHidden);
-
-bool DisableLIRP::Memcpy;
-static cl::opt<bool, true>
-    DisableLIRPMemcpy("disable-" DEBUG_TYPE "-memcpy",
-                      cl::desc("Proceed with loop idiom recognize pass, but do "
-                               "not convert loop(s) to memcpy."),
-                      cl::location(DisableLIRP::Memcpy), cl::init(false),
-                      cl::ReallyHidden);
-
+STATISTIC( 
+    NumShiftUntilBitTest, 
+    "Number of uncountable loops recognized as 'shift until bitttest' idiom"); 
+
+bool DisableLIRP::All; 
+static cl::opt<bool, true> 
+    DisableLIRPAll("disable-" DEBUG_TYPE "-all", 
+                   cl::desc("Options to disable Loop Idiom Recognize Pass."), 
+                   cl::location(DisableLIRP::All), cl::init(false), 
+                   cl::ReallyHidden); 
+ 
+bool DisableLIRP::Memset; 
+static cl::opt<bool, true> 
+    DisableLIRPMemset("disable-" DEBUG_TYPE "-memset", 
+                      cl::desc("Proceed with loop idiom recognize pass, but do " 
+                               "not convert loop(s) to memset."), 
+                      cl::location(DisableLIRP::Memset), cl::init(false), 
+                      cl::ReallyHidden); 
+ 
+bool DisableLIRP::Memcpy; 
+static cl::opt<bool, true> 
+    DisableLIRPMemcpy("disable-" DEBUG_TYPE "-memcpy", 
+                      cl::desc("Proceed with loop idiom recognize pass, but do " 
+                               "not convert loop(s) to memcpy."), 
+                      cl::location(DisableLIRP::Memcpy), cl::init(false), 
+                      cl::ReallyHidden); 
+ 
 static cl::opt<bool> UseLIRCodeSizeHeurs(
     "use-lir-code-size-heurs",
     cl::desc("Use loop idiom recognition code size heuristics when compiling"
@@ -232,8 +232,8 @@ private:
                                 const DebugLoc &DL, bool ZeroCheck,
                                 bool IsCntPhiUsedOutsideLoop);
 
-  bool recognizeShiftUntilBitTest();
-
+  bool recognizeShiftUntilBitTest(); 
+ 
   /// @}
 };
 
@@ -247,9 +247,9 @@ public:
   }
 
   bool runOnLoop(Loop *L, LPPassManager &LPM) override {
-    if (DisableLIRP::All)
-      return false;
-
+    if (DisableLIRP::All) 
+      return false; 
+ 
     if (skipLoop(L))
       return false;
 
@@ -295,9 +295,9 @@ char LoopIdiomRecognizeLegacyPass::ID = 0;
 PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
                                               LoopStandardAnalysisResults &AR,
                                               LPMUpdater &) {
-  if (DisableLIRP::All)
-    return PreservedAnalyses::all();
-
+  if (DisableLIRP::All) 
+    return PreservedAnalyses::all(); 
+ 
   const auto *DL = &L.getHeader()->getModule()->getDataLayout();
 
   // For the new PM, we also can't use OptimizationRemarkEmitter as an analysis
@@ -469,17 +469,17 @@ LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
   Value *StoredVal = SI->getValueOperand();
   Value *StorePtr = SI->getPointerOperand();
 
-  // Don't convert stores of non-integral pointer types to memsets (which stores
-  // integers).
-  if (DL->isNonIntegralPointerType(StoredVal->getType()->getScalarType()))
-    return LegalStoreKind::None;
-
+  // Don't convert stores of non-integral pointer types to memsets (which stores 
+  // integers). 
+  if (DL->isNonIntegralPointerType(StoredVal->getType()->getScalarType())) 
+    return LegalStoreKind::None; 
+ 
   // Reject stores that are so large that they overflow an unsigned.
-  // When storing out scalable vectors we bail out for now, since the code
-  // below currently only works for constant strides.
-  TypeSize SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
-  if (SizeInBits.isScalable() || (SizeInBits.getFixedSize() & 7) ||
-      (SizeInBits.getFixedSize() >> 32) != 0)
+  // When storing out scalable vectors we bail out for now, since the code 
+  // below currently only works for constant strides. 
+  TypeSize SizeInBits = DL->getTypeSizeInBits(StoredVal->getType()); 
+  if (SizeInBits.isScalable() || (SizeInBits.getFixedSize() & 7) || 
+      (SizeInBits.getFixedSize() >> 32) != 0) 
     return LegalStoreKind::None;
 
   // See if the pointer expression is an AddRec like {base,+,1} on the current
@@ -508,13 +508,13 @@ LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
 
   // If we're allowed to form a memset, and the stored value would be
   // acceptable for memset, use it.
-  if (!UnorderedAtomic && HasMemset && SplatValue && !DisableLIRP::Memset &&
+  if (!UnorderedAtomic && HasMemset && SplatValue && !DisableLIRP::Memset && 
       // Verify that the stored value is loop invariant.  If not, we can't
       // promote the memset.
       CurLoop->isLoopInvariant(SplatValue)) {
     // It looks like we can use SplatValue.
     return LegalStoreKind::Memset;
-  } else if (!UnorderedAtomic && HasMemsetPattern && !DisableLIRP::Memset &&
+  } else if (!UnorderedAtomic && HasMemsetPattern && !DisableLIRP::Memset && 
              // Don't create memset_pattern16s with address spaces.
              StorePtr->getType()->getPointerAddressSpace() == 0 &&
              (PatternValue = getMemSetPatternValue(StoredVal, DL))) {
@@ -523,7 +523,7 @@ LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
   }
 
   // Otherwise, see if the store can be turned into a memcpy.
-  if (HasMemcpy && !DisableLIRP::Memcpy) {
+  if (HasMemcpy && !DisableLIRP::Memcpy) { 
     // Check to see if the stride matches the size of the store.  If so, then we
     // know that every byte is touched in the loop.
     APInt Stride = getStoreStride(StoreEv);
@@ -578,12 +578,12 @@ void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
       break;
     case LegalStoreKind::Memset: {
       // Find the base pointer.
-      Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
+      Value *Ptr = getUnderlyingObject(SI->getPointerOperand()); 
       StoreRefsForMemset[Ptr].push_back(SI);
     } break;
     case LegalStoreKind::MemsetPattern: {
       // Find the base pointer.
-      Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
+      Value *Ptr = getUnderlyingObject(SI->getPointerOperand()); 
       StoreRefsForMemsetPattern[Ptr].push_back(SI);
     } break;
     case LegalStoreKind::Memcpy:
@@ -851,7 +851,7 @@ mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
   // Get the location that may be stored across the loop.  Since the access is
   // strided positively through memory, we say that the modified location starts
   // at the pointer and has infinite size.
-  LocationSize AccessSize = LocationSize::afterPointer();
+  LocationSize AccessSize = LocationSize::afterPointer(); 
 
   // If the loop iterates a fixed number of times, we can refine the access size
   // to be exactly the size of the memset, which is (BECount+1)*StoreSize
@@ -903,8 +903,8 @@ static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr,
   // If we're going to need to zero extend the BE count, check if we can add
   // one to it prior to zero extending without overflow. Provided this is safe,
   // it allows better simplification of the +1.
-  if (DL->getTypeSizeInBits(BECount->getType()).getFixedSize() <
-          DL->getTypeSizeInBits(IntPtr).getFixedSize() &&
+  if (DL->getTypeSizeInBits(BECount->getType()).getFixedSize() < 
+          DL->getTypeSizeInBits(IntPtr).getFixedSize() && 
       SE->isLoopEntryGuardedByCond(
           CurLoop, ICmpInst::ICMP_NE, BECount,
           SE->getNegativeSCEV(SE->getOne(BECount->getType())))) {
@@ -947,12 +947,12 @@ bool LoopIdiomRecognize::processLoopStridedStore(
   BasicBlock *Preheader = CurLoop->getLoopPreheader();
   IRBuilder<> Builder(Preheader->getTerminator());
   SCEVExpander Expander(*SE, *DL, "loop-idiom");
-  SCEVExpanderCleaner ExpCleaner(Expander, *DT);
+  SCEVExpanderCleaner ExpCleaner(Expander, *DT); 
 
   Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS);
   Type *IntIdxTy = DL->getIndexType(DestPtr->getType());
 
-  bool Changed = false;
+  bool Changed = false; 
   const SCEV *Start = Ev->getStart();
   // Handle negative strided loops.
   if (NegStride)
@@ -961,7 +961,7 @@ bool LoopIdiomRecognize::processLoopStridedStore(
   // TODO: ideally we should still be able to generate memset if SCEV expander
   // is taught to generate the dependencies at the latest point.
   if (!isSafeToExpand(Start, *SE))
-    return Changed;
+    return Changed; 
 
   // Okay, we have a strided store "p[i]" of a splattable value.  We can turn
   // this into a memset in the loop preheader now if we want.  However, this
@@ -970,22 +970,22 @@ bool LoopIdiomRecognize::processLoopStridedStore(
   // base pointer and checking the region.
   Value *BasePtr =
       Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator());
-
-  // From here on out, conservatively report to the pass manager that we've
-  // changed the IR, even if we later clean up these added instructions. There
-  // may be structural differences e.g. in the order of use lists not accounted
-  // for in just a textual dump of the IR. This is written as a variable, even
-  // though statically all the places this dominates could be replaced with
-  // 'true', with the hope that anyone trying to be clever / "more precise" with
-  // the return value will read this comment, and leave them alone.
-  Changed = true;
-
+ 
+  // From here on out, conservatively report to the pass manager that we've 
+  // changed the IR, even if we later clean up these added instructions. There 
+  // may be structural differences e.g. in the order of use lists not accounted 
+  // for in just a textual dump of the IR. This is written as a variable, even 
+  // though statically all the places this dominates could be replaced with 
+  // 'true', with the hope that anyone trying to be clever / "more precise" with 
+  // the return value will read this comment, and leave them alone. 
+  Changed = true; 
+ 
   if (mayLoopAccessLocation(BasePtr, ModRefInfo::ModRef, CurLoop, BECount,
-                            StoreSize, *AA, Stores))
-    return Changed;
+                            StoreSize, *AA, Stores)) 
+    return Changed; 
 
   if (avoidLIRForMultiBlockLoop(/*IsMemset=*/true, IsLoopMemset))
-    return Changed;
+    return Changed; 
 
   // Okay, everything looks good, insert the memset.
 
@@ -995,7 +995,7 @@ bool LoopIdiomRecognize::processLoopStridedStore(
   // TODO: ideally we should still be able to generate memset if SCEV expander
   // is taught to generate the dependencies at the latest point.
   if (!isSafeToExpand(NumBytesS, *SE))
-    return Changed;
+    return Changed; 
 
   Value *NumBytes =
       Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
@@ -1054,7 +1054,7 @@ bool LoopIdiomRecognize::processLoopStridedStore(
   if (MSSAU && VerifyMemorySSA)
     MSSAU->getMemorySSA()->verifyMemorySSA();
   ++NumMemSet;
-  ExpCleaner.markResultUsed();
+  ExpCleaner.markResultUsed(); 
   return true;
 }
 
@@ -1088,9 +1088,9 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
   IRBuilder<> Builder(Preheader->getTerminator());
   SCEVExpander Expander(*SE, *DL, "loop-idiom");
 
-  SCEVExpanderCleaner ExpCleaner(Expander, *DT);
+  SCEVExpanderCleaner ExpCleaner(Expander, *DT); 
 
-  bool Changed = false;
+  bool Changed = false; 
   const SCEV *StrStart = StoreEv->getStart();
   unsigned StrAS = SI->getPointerAddressSpace();
   Type *IntIdxTy = Builder.getIntNTy(DL->getIndexSizeInBits(StrAS));
@@ -1108,20 +1108,20 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
   Value *StoreBasePtr = Expander.expandCodeFor(
       StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator());
 
-  // From here on out, conservatively report to the pass manager that we've
-  // changed the IR, even if we later clean up these added instructions. There
-  // may be structural differences e.g. in the order of use lists not accounted
-  // for in just a textual dump of the IR. This is written as a variable, even
-  // though statically all the places this dominates could be replaced with
-  // 'true', with the hope that anyone trying to be clever / "more precise" with
-  // the return value will read this comment, and leave them alone.
-  Changed = true;
-
+  // From here on out, conservatively report to the pass manager that we've 
+  // changed the IR, even if we later clean up these added instructions. There 
+  // may be structural differences e.g. in the order of use lists not accounted 
+  // for in just a textual dump of the IR. This is written as a variable, even 
+  // though statically all the places this dominates could be replaced with 
+  // 'true', with the hope that anyone trying to be clever / "more precise" with 
+  // the return value will read this comment, and leave them alone. 
+  Changed = true; 
+ 
   SmallPtrSet<Instruction *, 1> Stores;
   Stores.insert(SI);
   if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
                             StoreSize, *AA, Stores))
-    return Changed;
+    return Changed; 
 
   const SCEV *LdStart = LoadEv->getStart();
   unsigned LdAS = LI->getPointerAddressSpace();
@@ -1137,10 +1137,10 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
 
   if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
                             StoreSize, *AA, Stores))
-    return Changed;
+    return Changed; 
 
   if (avoidLIRForMultiBlockLoop())
-    return Changed;
+    return Changed; 
 
   // Okay, everything is safe, we can transform this!
 
@@ -1163,14 +1163,14 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
     const Align StoreAlign = SI->getAlign();
     const Align LoadAlign = LI->getAlign();
     if (StoreAlign < StoreSize || LoadAlign < StoreSize)
-      return Changed;
+      return Changed; 
 
     // If the element.atomic memcpy is not lowered into explicit
     // loads/stores later, then it will be lowered into an element-size
     // specific lib call. If the lib call doesn't exist for our store size, then
     // we shouldn't generate the memcpy.
     if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())
-      return Changed;
+      return Changed; 
 
     // Create the call.
     // Note that unordered atomic loads/stores are *required* by the spec to
@@ -1208,7 +1208,7 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
   if (MSSAU && VerifyMemorySSA)
     MSSAU->getMemorySSA()->verifyMemorySSA();
   ++NumMemCpy;
-  ExpCleaner.markResultUsed();
+  ExpCleaner.markResultUsed(); 
   return true;
 }
 
@@ -1218,7 +1218,7 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
 bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
                                                    bool IsLoopMemset) {
   if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) {
-    if (CurLoop->isOutermost() && (!IsMemset || !IsLoopMemset)) {
+    if (CurLoop->isOutermost() && (!IsMemset || !IsLoopMemset)) { 
       LLVM_DEBUG(dbgs() << "  " << CurLoop->getHeader()->getParent()->getName()
                         << " : LIR " << (IsMemset ? "Memset" : "Memcpy")
                         << " avoided: multi-block top-level loop\n");
@@ -1235,8 +1235,8 @@ bool LoopIdiomRecognize::runOnNoncountableLoop() {
                     << "] Noncountable Loop %"
                     << CurLoop->getHeader()->getName() << "\n");
 
-  return recognizePopcount() || recognizeAndInsertFFS() ||
-         recognizeShiftUntilBitTest();
+  return recognizePopcount() || recognizeAndInsertFFS() || 
+         recognizeShiftUntilBitTest(); 
 }
 
 /// Check if the given conditional branch is based on the comparison between
@@ -1483,7 +1483,7 @@ static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL,
     return false;
 
   // step 4: Find the instruction which count the CTLZ: cnt.next = cnt + 1
-  //         or cnt.next = cnt + -1.
+  //         or cnt.next = cnt + -1. 
   // TODO: We can skip the step. If loop trip count is known (CTLZ),
   //       then all uses of "cnt.next" could be optimized to the trip count
   //       plus "cnt0". Currently it is not optimized.
@@ -1497,7 +1497,7 @@ static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL,
       continue;
 
     ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
-    if (!Inc || (!Inc->isOne() && !Inc->isMinusOne()))
+    if (!Inc || (!Inc->isOne() && !Inc->isMinusOne())) 
       continue;
 
     PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
@@ -1728,11 +1728,11 @@ void LoopIdiomRecognize::transformLoopToCountable(
   Builder.SetCurrentDebugLocation(DL);
 
   //   Count = BitWidth - CTLZ(InitX);
-  //   NewCount = Count;
+  //   NewCount = Count; 
   // If there are uses of CntPhi create:
-  //   NewCount = BitWidth - CTLZ(InitX >> 1);
-  //   Count = NewCount + 1;
-  Value *InitXNext;
+  //   NewCount = BitWidth - CTLZ(InitX >> 1); 
+  //   Count = NewCount + 1; 
+  Value *InitXNext; 
   if (IsCntPhiUsedOutsideLoop) {
     if (DefX->getOpcode() == Instruction::AShr)
       InitXNext =
@@ -1747,31 +1747,31 @@ void LoopIdiomRecognize::transformLoopToCountable(
       llvm_unreachable("Unexpected opcode!");
   } else
     InitXNext = InitX;
-  Value *FFS = createFFSIntrinsic(Builder, InitXNext, DL, ZeroCheck, IntrinID);
-  Value *Count = Builder.CreateSub(
-      ConstantInt::get(FFS->getType(), FFS->getType()->getIntegerBitWidth()),
+  Value *FFS = createFFSIntrinsic(Builder, InitXNext, DL, ZeroCheck, IntrinID); 
+  Value *Count = Builder.CreateSub( 
+      ConstantInt::get(FFS->getType(), FFS->getType()->getIntegerBitWidth()), 
       FFS);
-  Value *NewCount = Count;
+  Value *NewCount = Count; 
   if (IsCntPhiUsedOutsideLoop) {
-    NewCount = Count;
-    Count = Builder.CreateAdd(Count, ConstantInt::get(Count->getType(), 1));
+    NewCount = Count; 
+    Count = Builder.CreateAdd(Count, ConstantInt::get(Count->getType(), 1)); 
   }
 
-  NewCount = Builder.CreateZExtOrTrunc(NewCount,
-                                       cast<IntegerType>(CntInst->getType()));
+  NewCount = Builder.CreateZExtOrTrunc(NewCount, 
+                                       cast<IntegerType>(CntInst->getType())); 
 
   Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader);
-  if (cast<ConstantInt>(CntInst->getOperand(1))->isOne()) {
-    // If the counter was being incremented in the loop, add NewCount to the
-    // counter's initial value, but only if the initial value is not zero.
-    ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
-    if (!InitConst || !InitConst->isZero())
-      NewCount = Builder.CreateAdd(NewCount, CntInitVal);
-  } else {
-    // If the count was being decremented in the loop, subtract NewCount from
-    // the counter's initial value.
-    NewCount = Builder.CreateSub(CntInitVal, NewCount);
-  }
+  if (cast<ConstantInt>(CntInst->getOperand(1))->isOne()) { 
+    // If the counter was being incremented in the loop, add NewCount to the 
+    // counter's initial value, but only if the initial value is not zero. 
+    ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal); 
+    if (!InitConst || !InitConst->isZero()) 
+      NewCount = Builder.CreateAdd(NewCount, CntInitVal); 
+  } else { 
+    // If the count was being decremented in the loop, subtract NewCount from 
+    // the counter's initial value. 
+    NewCount = Builder.CreateSub(CntInitVal, NewCount); 
+  } 
 
   // Step 2: Insert new IV and loop condition:
   // loop:
@@ -1919,343 +1919,343 @@ void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
   //   loop. The loop would otherwise not be deleted even if it becomes empty.
   SE->forgetLoop(CurLoop);
 }
-
-/// Match loop-invariant value.
-template <typename SubPattern_t> struct match_LoopInvariant {
-  SubPattern_t SubPattern;
-  const Loop *L;
-
-  match_LoopInvariant(const SubPattern_t &SP, const Loop *L)
-      : SubPattern(SP), L(L) {}
-
-  template <typename ITy> bool match(ITy *V) {
-    return L->isLoopInvariant(V) && SubPattern.match(V);
-  }
-};
-
-/// Matches if the value is loop-invariant.
-template <typename Ty>
-inline match_LoopInvariant<Ty> m_LoopInvariant(const Ty &M, const Loop *L) {
-  return match_LoopInvariant<Ty>(M, L);
-}
-
-/// Return true if the idiom is detected in the loop.
-///
-/// The core idiom we are trying to detect is:
-/// \code
-///   entry:
-///     <...>
-///     %bitmask = shl i32 1, %bitpos
-///     br label %loop
-///
-///   loop:
-///     %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
-///     %x.curr.bitmasked = and i32 %x.curr, %bitmask
-///     %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
-///     %x.next = shl i32 %x.curr, 1
-///     <...>
-///     br i1 %x.curr.isbitunset, label %loop, label %end
-///
-///   end:
-///     %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
-///     %x.next.res = phi i32 [ %x.next, %loop ] <...>
-///     <...>
-/// \endcode
-static bool detectShiftUntilBitTestIdiom(Loop *CurLoop, Value *&BaseX,
-                                         Value *&BitMask, Value *&BitPos,
-                                         Value *&CurrX, Instruction *&NextX) {
-  LLVM_DEBUG(dbgs() << DEBUG_TYPE
-             " Performing shift-until-bittest idiom detection.\n");
-
-  // Give up if the loop has multiple blocks or multiple backedges.
-  if (CurLoop->getNumBlocks() != 1 || CurLoop->getNumBackEdges() != 1) {
-    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n");
-    return false;
-  }
-
-  BasicBlock *LoopHeaderBB = CurLoop->getHeader();
-  BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
-  assert(LoopPreheaderBB && "There is always a loop preheader.");
-
-  using namespace PatternMatch;
-
-  // Step 1: Check if the loop backedge is in desirable form.
-
-  ICmpInst::Predicate Pred;
-  Value *CmpLHS, *CmpRHS;
-  BasicBlock *TrueBB, *FalseBB;
-  if (!match(LoopHeaderBB->getTerminator(),
-             m_Br(m_ICmp(Pred, m_Value(CmpLHS), m_Value(CmpRHS)),
-                  m_BasicBlock(TrueBB), m_BasicBlock(FalseBB)))) {
-    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n");
-    return false;
-  }
-
-  // Step 2: Check if the backedge's condition is in desirable form.
-
-  auto MatchVariableBitMask = [&]() {
-    return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) &&
-           match(CmpLHS,
-                 m_c_And(m_Value(CurrX),
-                         m_CombineAnd(
-                             m_Value(BitMask),
-                             m_LoopInvariant(m_Shl(m_One(), m_Value(BitPos)),
-                                             CurLoop))));
-  };
-  auto MatchConstantBitMask = [&]() {
-    return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) &&
-           match(CmpLHS, m_And(m_Value(CurrX),
-                               m_CombineAnd(m_Value(BitMask), m_Power2()))) &&
-           (BitPos = ConstantExpr::getExactLogBase2(cast<Constant>(BitMask)));
-  };
-  auto MatchDecomposableConstantBitMask = [&]() {
-    APInt Mask;
-    return llvm::decomposeBitTestICmp(CmpLHS, CmpRHS, Pred, CurrX, Mask) &&
-           ICmpInst::isEquality(Pred) && Mask.isPowerOf2() &&
-           (BitMask = ConstantInt::get(CurrX->getType(), Mask)) &&
-           (BitPos = ConstantInt::get(CurrX->getType(), Mask.logBase2()));
-  };
-
-  if (!MatchVariableBitMask() && !MatchConstantBitMask() &&
-      !MatchDecomposableConstantBitMask()) {
-    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge comparison.\n");
-    return false;
-  }
-
-  // Step 3: Check if the recurrence is in desirable form.
-  auto *CurrXPN = dyn_cast<PHINode>(CurrX);
-  if (!CurrXPN || CurrXPN->getParent() != LoopHeaderBB) {
-    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n");
-    return false;
-  }
-
-  BaseX = CurrXPN->getIncomingValueForBlock(LoopPreheaderBB);
-  NextX =
-      dyn_cast<Instruction>(CurrXPN->getIncomingValueForBlock(LoopHeaderBB));
-
-  if (!NextX || !match(NextX, m_Shl(m_Specific(CurrX), m_One()))) {
-    // FIXME: support right-shift?
-    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n");
-    return false;
-  }
-
-  // Step 4: Check if the backedge's destinations are in desirable form.
-
-  assert(ICmpInst::isEquality(Pred) &&
-         "Should only get equality predicates here.");
-
-  // cmp-br is commutative, so canonicalize to a single variant.
-  if (Pred != ICmpInst::Predicate::ICMP_EQ) {
-    Pred = ICmpInst::getInversePredicate(Pred);
-    std::swap(TrueBB, FalseBB);
-  }
-
-  // We expect to exit loop when comparison yields false,
-  // so when it yields true we should branch back to loop header.
-  if (TrueBB != LoopHeaderBB) {
-    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n");
-    return false;
-  }
-
-  // Okay, idiom checks out.
-  return true;
-}
-
-/// Look for the following loop:
-/// \code
-///   entry:
-///     <...>
-///     %bitmask = shl i32 1, %bitpos
-///     br label %loop
-///
-///   loop:
-///     %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
-///     %x.curr.bitmasked = and i32 %x.curr, %bitmask
-///     %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
-///     %x.next = shl i32 %x.curr, 1
-///     <...>
-///     br i1 %x.curr.isbitunset, label %loop, label %end
-///
-///   end:
-///     %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
-///     %x.next.res = phi i32 [ %x.next, %loop ] <...>
-///     <...>
-/// \endcode
-///
-/// And transform it into:
-/// \code
-///   entry:
-///     %bitmask = shl i32 1, %bitpos
-///     %lowbitmask = add i32 %bitmask, -1
-///     %mask = or i32 %lowbitmask, %bitmask
-///     %x.masked = and i32 %x, %mask
-///     %x.masked.numleadingzeros = call i32 @llvm.ctlz.i32(i32 %x.masked,
-///                                                         i1 true)
-///     %x.masked.numactivebits = sub i32 32, %x.masked.numleadingzeros
-///     %x.masked.leadingonepos = add i32 %x.masked.numactivebits, -1
-///     %backedgetakencount = sub i32 %bitpos, %x.masked.leadingonepos
-///     %tripcount = add i32 %backedgetakencount, 1
-///     %x.curr = shl i32 %x, %backedgetakencount
-///     %x.next = shl i32 %x, %tripcount
-///     br label %loop
-///
-///   loop:
-///     %loop.iv = phi i32 [ 0, %entry ], [ %loop.iv.next, %loop ]
-///     %loop.iv.next = add nuw i32 %loop.iv, 1
-///     %loop.ivcheck = icmp eq i32 %loop.iv.next, %tripcount
-///     <...>
-///     br i1 %loop.ivcheck, label %end, label %loop
-///
-///   end:
-///     %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
-///     %x.next.res = phi i32 [ %x.next, %loop ] <...>
-///     <...>
-/// \endcode
-bool LoopIdiomRecognize::recognizeShiftUntilBitTest() {
-  bool MadeChange = false;
-
-  Value *X, *BitMask, *BitPos, *XCurr;
-  Instruction *XNext;
-  if (!detectShiftUntilBitTestIdiom(CurLoop, X, BitMask, BitPos, XCurr,
-                                    XNext)) {
-    LLVM_DEBUG(dbgs() << DEBUG_TYPE
-               " shift-until-bittest idiom detection failed.\n");
-    return MadeChange;
-  }
-  LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom detected!\n");
-
-  // Ok, it is the idiom we were looking for, we *could* transform this loop,
-  // but is it profitable to transform?
-
-  BasicBlock *LoopHeaderBB = CurLoop->getHeader();
-  BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
-  assert(LoopPreheaderBB && "There is always a loop preheader.");
-
-  BasicBlock *SuccessorBB = CurLoop->getExitBlock();
-  assert(LoopPreheaderBB && "There is only a single successor.");
-
-  IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
-  Builder.SetCurrentDebugLocation(cast<Instruction>(XCurr)->getDebugLoc());
-
-  Intrinsic::ID IntrID = Intrinsic::ctlz;
-  Type *Ty = X->getType();
-
-  TargetTransformInfo::TargetCostKind CostKind =
-      TargetTransformInfo::TCK_SizeAndLatency;
-
-  // The rewrite is considered to be unprofitable iff and only iff the
-  // intrinsic/shift we'll use are not cheap. Note that we are okay with *just*
-  // making the loop countable, even if nothing else changes.
-  IntrinsicCostAttributes Attrs(
-      IntrID, Ty, {UndefValue::get(Ty), /*is_zero_undef=*/Builder.getTrue()});
-  int Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind);
-  if (Cost > TargetTransformInfo::TCC_Basic) {
-    LLVM_DEBUG(dbgs() << DEBUG_TYPE
-               " Intrinsic is too costly, not beneficial\n");
-    return MadeChange;
-  }
-  if (TTI->getArithmeticInstrCost(Instruction::Shl, Ty, CostKind) >
-      TargetTransformInfo::TCC_Basic) {
-    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Shift is too costly, not beneficial\n");
-    return MadeChange;
-  }
-
-  // Ok, transform appears worthwhile.
-  MadeChange = true;
-
-  // Step 1: Compute the loop trip count.
-
-  Value *LowBitMask = Builder.CreateAdd(BitMask, Constant::getAllOnesValue(Ty),
-                                        BitPos->getName() + ".lowbitmask");
-  Value *Mask =
-      Builder.CreateOr(LowBitMask, BitMask, BitPos->getName() + ".mask");
-  Value *XMasked = Builder.CreateAnd(X, Mask, X->getName() + ".masked");
-  CallInst *XMaskedNumLeadingZeros = Builder.CreateIntrinsic(
-      IntrID, Ty, {XMasked, /*is_zero_undef=*/Builder.getTrue()},
-      /*FMFSource=*/nullptr, XMasked->getName() + ".numleadingzeros");
-  Value *XMaskedNumActiveBits = Builder.CreateSub(
-      ConstantInt::get(Ty, Ty->getScalarSizeInBits()), XMaskedNumLeadingZeros,
-      XMasked->getName() + ".numactivebits");
-  Value *XMaskedLeadingOnePos =
-      Builder.CreateAdd(XMaskedNumActiveBits, Constant::getAllOnesValue(Ty),
-                        XMasked->getName() + ".leadingonepos");
-
-  Value *LoopBackedgeTakenCount = Builder.CreateSub(
-      BitPos, XMaskedLeadingOnePos, CurLoop->getName() + ".backedgetakencount");
-  // We know loop's backedge-taken count, but what's loop's trip count?
-  // Note that while NUW is always safe, while NSW is only for bitwidths != 2.
-  Value *LoopTripCount =
-      Builder.CreateNUWAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1),
-                           CurLoop->getName() + ".tripcount");
-
-  // Step 2: Compute the recurrence's final value without a loop.
-
-  // NewX is always safe to compute, because `LoopBackedgeTakenCount`
-  // will always be smaller than `bitwidth(X)`, i.e. we never get poison.
-  Value *NewX = Builder.CreateShl(X, LoopBackedgeTakenCount);
-  NewX->takeName(XCurr);
-  if (auto *I = dyn_cast<Instruction>(NewX))
-    I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true);
-
-  Value *NewXNext;
-  // Rewriting XNext is more complicated, however, because `X << LoopTripCount`
-  // will be poison iff `LoopTripCount == bitwidth(X)` (which will happen
-  // iff `BitPos` is `bitwidth(x) - 1` and `X` is `1`). So unless we know
-  // that isn't the case, we'll need to emit an alternative, safe IR.
-  if (XNext->hasNoSignedWrap() || XNext->hasNoUnsignedWrap() ||
-      PatternMatch::match(
-          BitPos, PatternMatch::m_SpecificInt_ICMP(
-                      ICmpInst::ICMP_NE, APInt(Ty->getScalarSizeInBits(),
-                                               Ty->getScalarSizeInBits() - 1))))
-    NewXNext = Builder.CreateShl(X, LoopTripCount);
-  else {
-    // Otherwise, just additionally shift by one. It's the smallest solution,
-    // alternatively, we could check that NewX is INT_MIN (or BitPos is )
-    // and select 0 instead.
-    NewXNext = Builder.CreateShl(NewX, ConstantInt::get(Ty, 1));
-  }
-
-  NewXNext->takeName(XNext);
-  if (auto *I = dyn_cast<Instruction>(NewXNext))
-    I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true);
-
-  // Step 3: Adjust the successor basic block to recieve the computed
-  //         recurrence's final value instead of the recurrence itself.
-
-  XCurr->replaceUsesOutsideBlock(NewX, LoopHeaderBB);
-  XNext->replaceUsesOutsideBlock(NewXNext, LoopHeaderBB);
-
-  // Step 4: Rewrite the loop into a countable form, with canonical IV.
-
-  // The new canonical induction variable.
-  Builder.SetInsertPoint(&LoopHeaderBB->front());
-  auto *IV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv");
-
-  // The induction itself.
-  // Note that while NUW is always safe, while NSW is only for bitwidths != 2.
-  Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
-  auto *IVNext = Builder.CreateNUWAdd(IV, ConstantInt::get(Ty, 1),
-                                      IV->getName() + ".next");
-
-  // The loop trip count check.
-  auto *IVCheck = Builder.CreateICmpEQ(IVNext, LoopTripCount,
-                                       CurLoop->getName() + ".ivcheck");
-  Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB);
-  LoopHeaderBB->getTerminator()->eraseFromParent();
-
-  // Populate the IV PHI.
-  IV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB);
-  IV->addIncoming(IVNext, LoopHeaderBB);
-
-  // Step 5: Forget the "non-computable" trip-count SCEV associated with the
-  //   loop. The loop would otherwise not be deleted even if it becomes empty.
-
-  SE->forgetLoop(CurLoop);
-
-  // Other passes will take care of actually deleting the loop if possible.
-
-  LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom optimized!\n");
-
-  ++NumShiftUntilBitTest;
-  return MadeChange;
-}
+ 
+/// Match loop-invariant value. 
+template <typename SubPattern_t> struct match_LoopInvariant { 
+  SubPattern_t SubPattern; 
+  const Loop *L; 
+ 
+  match_LoopInvariant(const SubPattern_t &SP, const Loop *L) 
+      : SubPattern(SP), L(L) {} 
+ 
+  template <typename ITy> bool match(ITy *V) { 
+    return L->isLoopInvariant(V) && SubPattern.match(V); 
+  } 
+}; 
+ 
+/// Matches if the value is loop-invariant. 
+template <typename Ty> 
+inline match_LoopInvariant<Ty> m_LoopInvariant(const Ty &M, const Loop *L) { 
+  return match_LoopInvariant<Ty>(M, L); 
+} 
+ 
+/// Return true if the idiom is detected in the loop. 
+/// 
+/// The core idiom we are trying to detect is: 
+/// \code 
+///   entry: 
+///     <...> 
+///     %bitmask = shl i32 1, %bitpos 
+///     br label %loop 
+/// 
+///   loop: 
+///     %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ] 
+///     %x.curr.bitmasked = and i32 %x.curr, %bitmask 
+///     %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0 
+///     %x.next = shl i32 %x.curr, 1 
+///     <...> 
+///     br i1 %x.curr.isbitunset, label %loop, label %end 
+/// 
+///   end: 
+///     %x.curr.res = phi i32 [ %x.curr, %loop ] <...> 
+///     %x.next.res = phi i32 [ %x.next, %loop ] <...> 
+///     <...> 
+/// \endcode 
+static bool detectShiftUntilBitTestIdiom(Loop *CurLoop, Value *&BaseX, 
+                                         Value *&BitMask, Value *&BitPos, 
+                                         Value *&CurrX, Instruction *&NextX) { 
+  LLVM_DEBUG(dbgs() << DEBUG_TYPE 
+             " Performing shift-until-bittest idiom detection.\n"); 
+ 
+  // Give up if the loop has multiple blocks or multiple backedges. 
+  if (CurLoop->getNumBlocks() != 1 || CurLoop->getNumBackEdges() != 1) { 
+    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n"); 
+    return false; 
+  } 
+ 
+  BasicBlock *LoopHeaderBB = CurLoop->getHeader(); 
+  BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader(); 
+  assert(LoopPreheaderBB && "There is always a loop preheader."); 
+ 
+  using namespace PatternMatch; 
+ 
+  // Step 1: Check if the loop backedge is in desirable form. 
+ 
+  ICmpInst::Predicate Pred; 
+  Value *CmpLHS, *CmpRHS; 
+  BasicBlock *TrueBB, *FalseBB; 
+  if (!match(LoopHeaderBB->getTerminator(), 
+             m_Br(m_ICmp(Pred, m_Value(CmpLHS), m_Value(CmpRHS)), 
+                  m_BasicBlock(TrueBB), m_BasicBlock(FalseBB)))) { 
+    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n"); 
+    return false; 
+  } 
+ 
+  // Step 2: Check if the backedge's condition is in desirable form. 
+ 
+  auto MatchVariableBitMask = [&]() { 
+    return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) && 
+           match(CmpLHS, 
+                 m_c_And(m_Value(CurrX), 
+                         m_CombineAnd( 
+                             m_Value(BitMask), 
+                             m_LoopInvariant(m_Shl(m_One(), m_Value(BitPos)), 
+                                             CurLoop)))); 
+  }; 
+  auto MatchConstantBitMask = [&]() { 
+    return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) && 
+           match(CmpLHS, m_And(m_Value(CurrX), 
+                               m_CombineAnd(m_Value(BitMask), m_Power2()))) && 
+           (BitPos = ConstantExpr::getExactLogBase2(cast<Constant>(BitMask))); 
+  }; 
+  auto MatchDecomposableConstantBitMask = [&]() { 
+    APInt Mask; 
+    return llvm::decomposeBitTestICmp(CmpLHS, CmpRHS, Pred, CurrX, Mask) && 
+           ICmpInst::isEquality(Pred) && Mask.isPowerOf2() && 
+           (BitMask = ConstantInt::get(CurrX->getType(), Mask)) && 
+           (BitPos = ConstantInt::get(CurrX->getType(), Mask.logBase2())); 
+  }; 
+ 
+  if (!MatchVariableBitMask() && !MatchConstantBitMask() && 
+      !MatchDecomposableConstantBitMask()) { 
+    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge comparison.\n"); 
+    return false; 
+  } 
+ 
+  // Step 3: Check if the recurrence is in desirable form. 
+  auto *CurrXPN = dyn_cast<PHINode>(CurrX); 
+  if (!CurrXPN || CurrXPN->getParent() != LoopHeaderBB) { 
+    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n"); 
+    return false; 
+  } 
+ 
+  BaseX = CurrXPN->getIncomingValueForBlock(LoopPreheaderBB); 
+  NextX = 
+      dyn_cast<Instruction>(CurrXPN->getIncomingValueForBlock(LoopHeaderBB)); 
+ 
+  if (!NextX || !match(NextX, m_Shl(m_Specific(CurrX), m_One()))) { 
+    // FIXME: support right-shift? 
+    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n"); 
+    return false; 
+  } 
+ 
+  // Step 4: Check if the backedge's destinations are in desirable form. 
+ 
+  assert(ICmpInst::isEquality(Pred) && 
+         "Should only get equality predicates here."); 
+ 
+  // cmp-br is commutative, so canonicalize to a single variant. 
+  if (Pred != ICmpInst::Predicate::ICMP_EQ) { 
+    Pred = ICmpInst::getInversePredicate(Pred); 
+    std::swap(TrueBB, FalseBB); 
+  } 
+ 
+  // We expect to exit loop when comparison yields false, 
+  // so when it yields true we should branch back to loop header. 
+  if (TrueBB != LoopHeaderBB) { 
+    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n"); 
+    return false; 
+  } 
+ 
+  // Okay, idiom checks out. 
+  return true; 
+} 
+ 
+/// Look for the following loop: 
+/// \code 
+///   entry: 
+///     <...> 
+///     %bitmask = shl i32 1, %bitpos 
+///     br label %loop 
+/// 
+///   loop: 
+///     %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ] 
+///     %x.curr.bitmasked = and i32 %x.curr, %bitmask 
+///     %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0 
+///     %x.next = shl i32 %x.curr, 1 
+///     <...> 
+///     br i1 %x.curr.isbitunset, label %loop, label %end 
+/// 
+///   end: 
+///     %x.curr.res = phi i32 [ %x.curr, %loop ] <...> 
+///     %x.next.res = phi i32 [ %x.next, %loop ] <...> 
+///     <...> 
+/// \endcode 
+/// 
+/// And transform it into: 
+/// \code 
+///   entry: 
+///     %bitmask = shl i32 1, %bitpos 
+///     %lowbitmask = add i32 %bitmask, -1 
+///     %mask = or i32 %lowbitmask, %bitmask 
+///     %x.masked = and i32 %x, %mask 
+///     %x.masked.numleadingzeros = call i32 @llvm.ctlz.i32(i32 %x.masked, 
+///                                                         i1 true) 
+///     %x.masked.numactivebits = sub i32 32, %x.masked.numleadingzeros 
+///     %x.masked.leadingonepos = add i32 %x.masked.numactivebits, -1 
+///     %backedgetakencount = sub i32 %bitpos, %x.masked.leadingonepos 
+///     %tripcount = add i32 %backedgetakencount, 1 
+///     %x.curr = shl i32 %x, %backedgetakencount 
+///     %x.next = shl i32 %x, %tripcount 
+///     br label %loop 
+/// 
+///   loop: 
+///     %loop.iv = phi i32 [ 0, %entry ], [ %loop.iv.next, %loop ] 
+///     %loop.iv.next = add nuw i32 %loop.iv, 1 
+///     %loop.ivcheck = icmp eq i32 %loop.iv.next, %tripcount 
+///     <...> 
+///     br i1 %loop.ivcheck, label %end, label %loop 
+/// 
+///   end: 
+///     %x.curr.res = phi i32 [ %x.curr, %loop ] <...> 
+///     %x.next.res = phi i32 [ %x.next, %loop ] <...> 
+///     <...> 
+/// \endcode 
+bool LoopIdiomRecognize::recognizeShiftUntilBitTest() { 
+  bool MadeChange = false; 
+ 
+  Value *X, *BitMask, *BitPos, *XCurr; 
+  Instruction *XNext; 
+  if (!detectShiftUntilBitTestIdiom(CurLoop, X, BitMask, BitPos, XCurr, 
+                                    XNext)) { 
+    LLVM_DEBUG(dbgs() << DEBUG_TYPE 
+               " shift-until-bittest idiom detection failed.\n"); 
+    return MadeChange; 
+  } 
+  LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom detected!\n"); 
+ 
+  // Ok, it is the idiom we were looking for, we *could* transform this loop, 
+  // but is it profitable to transform? 
+ 
+  BasicBlock *LoopHeaderBB = CurLoop->getHeader(); 
+  BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader(); 
+  assert(LoopPreheaderBB && "There is always a loop preheader."); 
+ 
+  BasicBlock *SuccessorBB = CurLoop->getExitBlock(); 
+  assert(LoopPreheaderBB && "There is only a single successor."); 
+ 
+  IRBuilder<> Builder(LoopPreheaderBB->getTerminator()); 
+  Builder.SetCurrentDebugLocation(cast<Instruction>(XCurr)->getDebugLoc()); 
+ 
+  Intrinsic::ID IntrID = Intrinsic::ctlz; 
+  Type *Ty = X->getType(); 
+ 
+  TargetTransformInfo::TargetCostKind CostKind = 
+      TargetTransformInfo::TCK_SizeAndLatency; 
+ 
+  // The rewrite is considered to be unprofitable iff and only iff the 
+  // intrinsic/shift we'll use are not cheap. Note that we are okay with *just* 
+  // making the loop countable, even if nothing else changes. 
+  IntrinsicCostAttributes Attrs( 
+      IntrID, Ty, {UndefValue::get(Ty), /*is_zero_undef=*/Builder.getTrue()}); 
+  int Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind); 
+  if (Cost > TargetTransformInfo::TCC_Basic) { 
+    LLVM_DEBUG(dbgs() << DEBUG_TYPE 
+               " Intrinsic is too costly, not beneficial\n"); 
+    return MadeChange; 
+  } 
+  if (TTI->getArithmeticInstrCost(Instruction::Shl, Ty, CostKind) > 
+      TargetTransformInfo::TCC_Basic) { 
+    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Shift is too costly, not beneficial\n"); 
+    return MadeChange; 
+  } 
+ 
+  // Ok, transform appears worthwhile. 
+  MadeChange = true; 
+ 
+  // Step 1: Compute the loop trip count. 
+ 
+  Value *LowBitMask = Builder.CreateAdd(BitMask, Constant::getAllOnesValue(Ty), 
+                                        BitPos->getName() + ".lowbitmask"); 
+  Value *Mask = 
+      Builder.CreateOr(LowBitMask, BitMask, BitPos->getName() + ".mask"); 
+  Value *XMasked = Builder.CreateAnd(X, Mask, X->getName() + ".masked"); 
+  CallInst *XMaskedNumLeadingZeros = Builder.CreateIntrinsic( 
+      IntrID, Ty, {XMasked, /*is_zero_undef=*/Builder.getTrue()}, 
+      /*FMFSource=*/nullptr, XMasked->getName() + ".numleadingzeros"); 
+  Value *XMaskedNumActiveBits = Builder.CreateSub( 
+      ConstantInt::get(Ty, Ty->getScalarSizeInBits()), XMaskedNumLeadingZeros, 
+      XMasked->getName() + ".numactivebits"); 
+  Value *XMaskedLeadingOnePos = 
+      Builder.CreateAdd(XMaskedNumActiveBits, Constant::getAllOnesValue(Ty), 
+                        XMasked->getName() + ".leadingonepos"); 
+ 
+  Value *LoopBackedgeTakenCount = Builder.CreateSub( 
+      BitPos, XMaskedLeadingOnePos, CurLoop->getName() + ".backedgetakencount"); 
+  // We know loop's backedge-taken count, but what's loop's trip count? 
+  // Note that while NUW is always safe, while NSW is only for bitwidths != 2. 
+  Value *LoopTripCount = 
+      Builder.CreateNUWAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1), 
+                           CurLoop->getName() + ".tripcount"); 
+ 
+  // Step 2: Compute the recurrence's final value without a loop. 
+ 
+  // NewX is always safe to compute, because `LoopBackedgeTakenCount` 
+  // will always be smaller than `bitwidth(X)`, i.e. we never get poison. 
+  Value *NewX = Builder.CreateShl(X, LoopBackedgeTakenCount); 
+  NewX->takeName(XCurr); 
+  if (auto *I = dyn_cast<Instruction>(NewX)) 
+    I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true); 
+ 
+  Value *NewXNext; 
+  // Rewriting XNext is more complicated, however, because `X << LoopTripCount` 
+  // will be poison iff `LoopTripCount == bitwidth(X)` (which will happen 
+  // iff `BitPos` is `bitwidth(x) - 1` and `X` is `1`). So unless we know 
+  // that isn't the case, we'll need to emit an alternative, safe IR. 
+  if (XNext->hasNoSignedWrap() || XNext->hasNoUnsignedWrap() || 
+      PatternMatch::match( 
+          BitPos, PatternMatch::m_SpecificInt_ICMP( 
+                      ICmpInst::ICMP_NE, APInt(Ty->getScalarSizeInBits(), 
+                                               Ty->getScalarSizeInBits() - 1)))) 
+    NewXNext = Builder.CreateShl(X, LoopTripCount); 
+  else { 
+    // Otherwise, just additionally shift by one. It's the smallest solution, 
+    // alternatively, we could check that NewX is INT_MIN (or BitPos is ) 
+    // and select 0 instead. 
+    NewXNext = Builder.CreateShl(NewX, ConstantInt::get(Ty, 1)); 
+  } 
+ 
+  NewXNext->takeName(XNext); 
+  if (auto *I = dyn_cast<Instruction>(NewXNext)) 
+    I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true); 
+ 
+  // Step 3: Adjust the successor basic block to recieve the computed 
+  //         recurrence's final value instead of the recurrence itself. 
+ 
+  XCurr->replaceUsesOutsideBlock(NewX, LoopHeaderBB); 
+  XNext->replaceUsesOutsideBlock(NewXNext, LoopHeaderBB); 
+ 
+  // Step 4: Rewrite the loop into a countable form, with canonical IV. 
+ 
+  // The new canonical induction variable. 
+  Builder.SetInsertPoint(&LoopHeaderBB->front()); 
+  auto *IV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv"); 
+ 
+  // The induction itself. 
+  // Note that while NUW is always safe, while NSW is only for bitwidths != 2. 
+  Builder.SetInsertPoint(LoopHeaderBB->getTerminator()); 
+  auto *IVNext = Builder.CreateNUWAdd(IV, ConstantInt::get(Ty, 1), 
+                                      IV->getName() + ".next"); 
+ 
+  // The loop trip count check. 
+  auto *IVCheck = Builder.CreateICmpEQ(IVNext, LoopTripCount, 
+                                       CurLoop->getName() + ".ivcheck"); 
+  Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB); 
+  LoopHeaderBB->getTerminator()->eraseFromParent(); 
+ 
+  // Populate the IV PHI. 
+  IV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB); 
+  IV->addIncoming(IVNext, LoopHeaderBB); 
+ 
+  // Step 5: Forget the "non-computable" trip-count SCEV associated with the 
+  //   loop. The loop would otherwise not be deleted even if it becomes empty. 
+ 
+  SE->forgetLoop(CurLoop); 
+ 
+  // Other passes will take care of actually deleting the loop if possible. 
+ 
+  LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom optimized!\n"); 
+ 
+  ++NumShiftUntilBitTest; 
+  return MadeChange; 
+}
author	shadchin <shadchin@yandex-team.ru>	2022-02-10 16:44:30 +0300
committer	Daniil Cherednik <dcherednik@yandex-team.ru>	2022-02-10 16:44:30 +0300
commit	2598ef1d0aee359b4b6d5fdd1758916d5907d04f (patch)
tree	012bb94d777798f1f56ac1cec429509766d05181 /contrib/libs/llvm12/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
parent	6751af0b0c1b952fede40b19b71da8025b5d8bcf (diff)
download	ydb-2598ef1d0aee359b4b6d5fdd1758916d5907d04f.tar.gz